From 86dd6c04ef9f213e14d60c9f64bce1cc019f816e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:08 +0100 Subject: sched/balancing: Rename scheduler_tick() => sched_tick() - Standardize on prefixing scheduler-internal functions defined in with sched_*() prefix. scheduler_tick() was the only function using the scheduler_ prefix. Harmonize it. - The other reason to rename it is the NOHZ scheduler tick handling functions are already named sched_tick_*(). Make the 'git grep sched_tick' more meaningful. Signed-off-by: Ingo Molnar Acked-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-3-mingo@kernel.org --- tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 25432b8cd5bd..073a748b9380 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,7 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="scheduler_tick" +FUNC2="sched_tick" ALL_FUNCS="#### all functions enabled ####" -- cgit v1.2.3 From c2a0257c1edf16c6acd2afac7572d7e9043b6577 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 11 Mar 2024 18:37:26 -0700 Subject: bpftool: Cast pointers for shadow types explicitly. According to a report, skeletons fail to assign shadow pointers when being compiled with C++ programs. Unlike C doing implicit casting for void pointers, C++ requires an explicit casting. To support C++, we do explicit casting for each shadow pointer. Also add struct_ops_module.skel.h to test_cpp to validate C++ compilation as part of BPF selftests. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240312013726.1780720-1-thinker.li@gmail.com --- tools/bpf/bpftool/gen.c | 3 ++- tools/testing/selftests/bpf/test_cpp.cpp | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 4fa4ade1ce74..3ce277544c24 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1131,7 +1131,8 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ + obj->struct_ops.%1$s = (typeof(obj->struct_ops.%1$s))\n\ + bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ \n\ ", ident); } diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index f4936834f76f..dde0bb16e782 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -7,6 +7,7 @@ #include #include #include "test_core_extern.skel.h" +#include "struct_ops_module.skel.h" template class Skeleton { @@ -98,6 +99,7 @@ int main(int argc, char *argv[]) { struct btf_dump_opts opts = { }; struct test_core_extern *skel; + struct struct_ops_module *skel2; struct btf *btf; int fd; @@ -118,6 +120,9 @@ int main(int argc, char *argv[]) skel = test_core_extern__open_and_load(); test_core_extern__destroy(skel); + skel2 = struct_ops_module__open_and_load(); + struct_ops_module__destroy(skel2); + fd = bpf_enable_stats(BPF_STATS_RUN_TIME); if (fd < 0) std::cout << "FAILED to enable stats: " << fd << std::endl; -- cgit v1.2.3 From fe879bb42f8a6513ed18e9d22efb99cb35590201 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 11 Mar 2024 19:32:49 -0700 Subject: bpftool: Fix missing pids during link show Current 'bpftool link' command does not show pids, e.g., $ tools/build/bpftool/bpftool link ... 4: tracing prog 23 prog_type lsm attach_type lsm_mac target_obj_id 1 target_btf_id 31320 Hack the following change to enable normal libbpf debug output, --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -121,9 +121,9 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) /* we don't want output polluted with libbpf errors if bpf_iter is not * supported */ - default_print = libbpf_set_print(libbpf_print_none); + /* default_print = libbpf_set_print(libbpf_print_none); */ err = pid_iter_bpf__load(skel); - libbpf_set_print(default_print); + /* libbpf_set_print(default_print); */ Rerun the above bpftool command: $ tools/build/bpftool/bpftool link libbpf: prog 'iter': BPF program load failed: Permission denied libbpf: prog 'iter': -- BEGIN PROG LOAD LOG -- 0: R1=ctx() R10=fp0 ; struct task_struct *task = ctx->task; @ pid_iter.bpf.c:69 0: (79) r6 = *(u64 *)(r1 +8) ; R1=ctx() R6_w=ptr_or_null_task_struct(id=1) ; struct file *file = ctx->file; @ pid_iter.bpf.c:68 ... ; struct bpf_link *link = (struct bpf_link *) file->private_data; @ pid_iter.bpf.c:103 80: (79) r3 = *(u64 *)(r8 +432) ; R3_w=scalar() R8=ptr_file() ; if (link->type == bpf_core_enum_value(enum bpf_link_type___local, @ pid_iter.bpf.c:105 81: (61) r1 = *(u32 *)(r3 +12) R3 invalid mem access 'scalar' processed 39 insns (limit 1000000) max_states_per_insn 0 total_states 3 peak_states 3 mark_read 2 -- END PROG LOAD LOG -- libbpf: prog 'iter': failed to load: -13 ... The 'file->private_data' returns a 'void' type and this caused subsequent 'link->type' (insn #81) failed in verification. To fix the issue, restore the previous BPF_CORE_READ so old kernels can also work. With this patch, the 'bpftool link' runs successfully with 'pids'. $ tools/build/bpftool/bpftool link ... 4: tracing prog 23 prog_type lsm attach_type lsm_mac target_obj_id 1 target_btf_id 31320 pids systemd(1) Fixes: 44ba7b30e84f ("bpftool: Use a local copy of BPF_LINK_TYPE_PERF_EVENT in pid_iter.bpf.c") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Tested-by: Quentin Monnet Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240312023249.3776718-1-yonghong.song@linux.dev --- tools/bpf/bpftool/skeleton/pid_iter.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c index 26004f0c5a6a..7bdbcac3cf62 100644 --- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c +++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c @@ -102,8 +102,8 @@ int iter(struct bpf_iter__task_file *ctx) BPF_LINK_TYPE_PERF_EVENT___local)) { struct bpf_link *link = (struct bpf_link *) file->private_data; - if (link->type == bpf_core_enum_value(enum bpf_link_type___local, - BPF_LINK_TYPE_PERF_EVENT___local)) { + if (BPF_CORE_READ(link, type) == bpf_core_enum_value(enum bpf_link_type___local, + BPF_LINK_TYPE_PERF_EVENT___local)) { e.has_bpf_cookie = true; e.bpf_cookie = get_bpf_cookie(link); } -- cgit v1.2.3 From 9bf48fa19a4b1d186e08b20bf7e5de26a15644fb Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 14 Mar 2024 15:04:38 +0000 Subject: libbpf: Prevent null-pointer dereference when prog to load has no BTF In bpf_objec_load_prog(), there's no guarantee that obj->btf is non-NULL when passing it to btf__fd(), and this function does not perform any check before dereferencing its argument (as bpf_object__btf_fd() used to do). As a consequence, we get segmentation fault errors in bpftool (for example) when trying to load programs that come without BTF information. v2: Keep btf__fd() in the fix instead of reverting to bpf_object__btf_fd(). Fixes: df7c3f7d3a3d ("libbpf: make uniform use of btf__fd() accessor inside libbpf") Suggested-by: Andrii Nakryiko Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240314150438.232462-1-qmo@kernel.org --- tools/lib/bpf/libbpf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index efab29b8935b..124883eaa0cf 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7317,9 +7317,9 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog char *cp, errmsg[STRERR_BUFSIZE]; size_t log_buf_size = 0; char *log_buf = NULL, *tmp; - int btf_fd, ret, err; bool own_log_buf = true; __u32 log_level = prog->log_level; + int ret, err; if (prog->type == BPF_PROG_TYPE_UNSPEC) { /* @@ -7343,9 +7343,8 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog load_attr.prog_ifindex = prog->prog_ifindex; /* specify func_info/line_info only if kernel supports them */ - btf_fd = btf__fd(obj->btf); - if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { - load_attr.prog_btf_fd = btf_fd; + if (obj->btf && btf__fd(obj->btf) >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { + load_attr.prog_btf_fd = btf__fd(obj->btf); load_attr.func_info = prog->func_info; load_attr.func_info_rec_size = prog->func_info_rec_size; load_attr.func_info_cnt = prog->func_info_cnt; -- cgit v1.2.3 From c911fc61a7ce367f9ea48e457f31bb171e80ca4d Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 13 Mar 2024 14:41:37 -0700 Subject: libbpf: Skip zeroed or null fields if not found in the kernel type. Accept additional fields of a struct_ops type with all zero values even if these fields are not in the corresponding type in the kernel. This provides a way to be backward compatible. User space programs can use the same map on a machine running an old kernel by clearing fields that do not exist in the kernel. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240313214139.685112-2-thinker.li@gmail.com --- tools/lib/bpf/libbpf.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 124883eaa0cf..604368cfbf02 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1132,8 +1132,26 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const char *mname; mname = btf__name_by_offset(btf, member->name_off); + moff = member->offset / 8; + mdata = data + moff; + msize = btf__resolve_size(btf, member->type); + if (msize < 0) { + pr_warn("struct_ops init_kern %s: failed to resolve the size of member %s\n", + map->name, mname); + return msize; + } + kern_member = find_member_by_name(kern_btf, kern_type, mname); if (!kern_member) { + /* Skip all zeros or null fields if they are not + * presented in the kernel BTF. + */ + if (libbpf_is_mem_zeroed(mdata, msize)) { + pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", + map->name, mname); + continue; + } + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", map->name, mname); return -ENOTSUP; @@ -1147,10 +1165,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) return -ENOTSUP; } - moff = member->offset / 8; kern_moff = kern_member->offset / 8; - - mdata = data + moff; kern_mdata = kern_data + kern_moff; mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); @@ -1230,9 +1245,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) continue; } - msize = btf__resolve_size(btf, mtype_id); kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); - if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + if (kern_msize < 0 || msize != kern_msize) { pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", map->name, mname, (ssize_t)msize, (ssize_t)kern_msize); -- cgit v1.2.3 From 26a7cf2bbea656837583f9a1a0f9390db63d6cc3 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 13 Mar 2024 14:41:38 -0700 Subject: selftests/bpf: Ensure libbpf skip all-zeros fields of struct_ops maps. A new version of a type may have additional fields that do not exist in older versions. Previously, libbpf would reject struct_ops maps with a new version containing extra fields when running on a machine with an old kernel. However, we have updated libbpf to ignore these fields if their values are all zeros or null in order to provide backward compatibility. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240313214139.685112-3-thinker.li@gmail.com --- .../bpf/prog_tests/test_struct_ops_module.c | 47 ++++++++++++++++++++++ .../selftests/bpf/progs/struct_ops_module.c | 16 +++++++- 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index ee5372c7f2c7..098776d00ab4 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -93,9 +93,56 @@ cleanup: struct_ops_module__destroy(skel); } +static void test_struct_ops_not_zeroed(void) +{ + struct struct_ops_module *skel; + int err; + + /* zeroed is 0, and zeroed_op is null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) + return; + + err = struct_ops_module__load(skel); + ASSERT_OK(err, "struct_ops_module_load"); + + struct_ops_module__destroy(skel); + + /* zeroed is not 0 */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed")) + return; + + /* libbpf should reject the testmod_zeroed since struct + * bpf_testmod_ops in the kernel has no "zeroed" field and the + * value of "zeroed" is non-zero. + */ + skel->struct_ops.testmod_zeroed->zeroed = 0xdeadbeef; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed"); + + struct_ops_module__destroy(skel); + + /* zeroed_op is not null */ + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed_op")) + return; + + /* libbpf should reject the testmod_zeroed since the value of its + * "zeroed_op" is not null. + */ + skel->struct_ops.testmod_zeroed->zeroed_op = skel->progs.test_3; + err = struct_ops_module__load(skel); + ASSERT_ERR(err, "struct_ops_module_load_not_zeroed_op"); + + struct_ops_module__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) test_struct_ops_load(); + if (test__start_subtest("test_struct_ops_not_zeroed")) + test_struct_ops_not_zeroed(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 026cabfa7f1f..86e1e50c5531 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -23,7 +23,7 @@ void BPF_PROG(test_2, int a, int b) test_2_result = a + b; } -SEC("struct_ops/test_3") +SEC("?struct_ops/test_3") int BPF_PROG(test_3, int a, int b) { test_2_result = a + b + 3; @@ -54,3 +54,17 @@ struct bpf_testmod_ops___v2 testmod_2 = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, }; + +struct bpf_testmod_ops___zeroed { + int (*test_1)(void); + void (*test_2)(int a, int b); + int (*test_maybe_null)(int dummy, struct task_struct *task); + void (*zeroed_op)(int a, int b); + int zeroed; +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops___zeroed testmod_zeroed = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2_v2, +}; -- cgit v1.2.3 From 4c8644f86c854c214aaabbcc24a27fa4c7e6a951 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Mar 2024 09:26:54 +0000 Subject: selftests/bpf: Remove second semicolon There are statements with two semicolons. Remove the second one, it is redundant. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240315092654.2431062-1-colin.i.king@gmail.com --- tools/testing/selftests/bpf/benchs/bench_local_storage_create.c | 2 +- tools/testing/selftests/bpf/progs/iters.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c index b36de42ee4d9..e2ff8ea1cb79 100644 --- a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c +++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c @@ -186,7 +186,7 @@ static void *task_producer(void *input) for (i = 0; i < batch_sz; i++) { if (!pthd_results[i]) - pthread_join(pthds[i], NULL);; + pthread_join(pthds[i], NULL); } } diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 3db416606f2f..fe65e0952a1e 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -673,7 +673,7 @@ static __noinline void fill(struct bpf_iter_num *it, int *arr, __u32 n, int mul) static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n) { - int *t, i, sum = 0;; + int *t, i, sum = 0; while ((t = bpf_iter_num_next(it))) { i = *t; -- cgit v1.2.3 From 7b30c296af6525571fc967f6a8661f6e1127369e Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 18 Mar 2024 13:18:08 +0000 Subject: libbpbpf: Check bpf_map/bpf_program fd validity libbpf creates bpf_program/bpf_map structs for each program/map that user defines, but it allows to disable creating/loading those objects in kernel, in that case they won't have associated file descriptor (fd < 0). Such functionality is used for backward compatibility with some older kernels. Nothing prevents users from passing these maps or programs with no kernel counterpart to libbpf APIs. This change introduces explicit checks for kernel objects existence, aiming to improve visibility of those edge cases and provide meaningful warnings to users. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240318131808.95959-1-yatsenko@meta.com --- tools/lib/bpf/libbpf.c | 59 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 604368cfbf02..3a756d61c120 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8572,6 +8572,11 @@ int bpf_map__pin(struct bpf_map *map, const char *path) return libbpf_err(-EINVAL); } + if (map->fd < 0) { + pr_warn("map '%s': can't pin BPF map without FD (was it created?)\n", map->name); + return libbpf_err(-EINVAL); + } + if (map->pin_path) { if (path && strcmp(path, map->pin_path)) { pr_warn("map '%s' already has pin path '%s' different from '%s'\n", @@ -10316,6 +10321,11 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz, return -EINVAL; } + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + if (!check_value_sz) return 0; @@ -10428,8 +10438,15 @@ long libbpf_get_error(const void *ptr) int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { int ret; + int prog_fd = bpf_program__fd(prog); + + if (prog_fd < 0) { + pr_warn("prog '%s': can't use BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } - ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + ret = bpf_link_update(bpf_link__fd(link), prog_fd, NULL); return libbpf_err_errno(ret); } @@ -10623,7 +10640,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -11347,6 +11364,13 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, false); addrs = OPTS_GET(opts, addrs, false); cnt = OPTS_GET(opts, cnt, false); @@ -11387,7 +11411,6 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); if (link_fd < 0) { err = -errno; @@ -11770,6 +11793,13 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_uprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, NULL); offsets = OPTS_GET(opts, offsets, NULL); ref_ctr_offsets = OPTS_GET(opts, ref_ctr_offsets, NULL); @@ -11845,7 +11875,6 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &lopts); if (link_fd < 0) { err = -errno; @@ -12089,7 +12118,7 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, return libbpf_err_ptr(-EINVAL); if (bpf_program__fd(prog) < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -12671,6 +12700,12 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog) if (!prog->sec_def || !prog->sec_def->prog_attach_fn) return libbpf_err_ptr(-EOPNOTSUPP); + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); if (err) return libbpf_err_ptr(err); @@ -12711,9 +12746,14 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) __u32 zero = 0; int err, fd; - if (!bpf_map__is_struct_ops(map) || map->fd == -1) + if (!bpf_map__is_struct_ops(map)) return libbpf_err_ptr(-EINVAL); + if (map->fd < 0) { + pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name); + return libbpf_err_ptr(-EINVAL); + } + link = calloc(1, sizeof(*link)); if (!link) return libbpf_err_ptr(-EINVAL); @@ -12760,8 +12800,13 @@ int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) __u32 zero = 0; int err; - if (!bpf_map__is_struct_ops(map) || !map_is_created(map)) + if (!bpf_map__is_struct_ops(map)) + return -EINVAL; + + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); return -EINVAL; + } st_ops_link = container_of(link, struct bpf_link_struct_ops, link); /* Ensure the type of a link is correct */ -- cgit v1.2.3 From 84239a24d10174fcfc7d6760cb120435a6ff69af Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:48:59 -0700 Subject: selftests/bpf: Replace CHECK with ASSERT_* in ns_current_pid_tgid test Replace CHECK in selftest ns_current_pid_tgid with recommended ASSERT_* style. I also shortened subtest name as the prefix of subtest name is covered by the test name already. This patch does fix a testing issue. Currently even if bss->user_{pid,tgid} is not correct, the test still passed since the clone func returns 0. I fixed it to return a non-zero value if bss->user_{pid,tgid} is incorrect. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240315184859.2975543-1-yonghong.song@linux.dev --- .../selftests/bpf/prog_tests/ns_current_pid_tgid.c | 36 ++++++++++++---------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 24d493482ffc..3a0664a86243 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -20,19 +20,19 @@ static int test_current_pid_tgid(void *args) { struct test_ns_current_pid_tgid__bss *bss; struct test_ns_current_pid_tgid *skel; - int err = -1, duration = 0; + int ret = -1, err; pid_t tgid, pid; struct stat st; skel = test_ns_current_pid_tgid__open_and_load(); - if (CHECK(!skel, "skel_open_load", "failed to load skeleton\n")) - goto cleanup; + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open_and_load")) + goto out; pid = syscall(SYS_gettid); tgid = getpid(); err = stat("/proc/self/ns/pid", &st); - if (CHECK(err, "stat", "failed /proc/self/ns/pid: %d\n", err)) + if (!ASSERT_OK(err, "stat /proc/self/ns/pid")) goto cleanup; bss = skel->bss; @@ -42,24 +42,26 @@ static int test_current_pid_tgid(void *args) bss->user_tgid = 0; err = test_ns_current_pid_tgid__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); - ASSERT_EQ(bss->user_pid, pid, "pid"); - ASSERT_EQ(bss->user_tgid, tgid, "tgid"); - err = 0; + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; cleanup: - test_ns_current_pid_tgid__destroy(skel); - - return err; + test_ns_current_pid_tgid__destroy(skel); +out: + return ret; } static void test_ns_current_pid_tgid_new_ns(void) { - int wstatus, duration = 0; + int wstatus; pid_t cpid; /* Create a process in a new namespace, this process @@ -68,21 +70,21 @@ static void test_ns_current_pid_tgid_new_ns(void) cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, CLONE_NEWPID | SIGCHLD, NULL); - if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(cpid, -1, "clone")) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) + if (!ASSERT_NEQ(waitpid(cpid, &wstatus, 0), -1, "waitpid")) return; - if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) + if (!ASSERT_OK(WEXITSTATUS(wstatus), "newns_pidtgid")) return; } /* TODO: use a different tracepoint */ void serial_test_ns_current_pid_tgid(void) { - if (test__start_subtest("ns_current_pid_tgid_root_ns")) + if (test__start_subtest("root_ns_tp")) test_current_pid_tgid(NULL); - if (test__start_subtest("ns_current_pid_tgid_new_ns")) + if (test__start_subtest("new_ns_tp")) test_ns_current_pid_tgid_new_ns(); } -- cgit v1.2.3 From 4d4bd29e363c467752536f874a2cba10a5923c59 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:49:04 -0700 Subject: selftests/bpf: Refactor out some functions in ns_current_pid_tgid test Refactor some functions in both user space code and bpf program as these functions are used by later cgroup/sk_msg tests. Another change is to mark tp program optional loading as later patches will use optional loading as well since they have quite different attachment and testing logic. There is no functionality change. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240315184904.2976123-1-yonghong.song@linux.dev --- .../selftests/bpf/prog_tests/ns_current_pid_tgid.c | 53 ++++++++++++++-------- .../selftests/bpf/progs/test_ns_current_pid_tgid.c | 10 ++-- 2 files changed, 41 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 3a0664a86243..847d7b70e290 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -16,30 +16,46 @@ #define STACK_SIZE (1024 * 1024) static char child_stack[STACK_SIZE]; -static int test_current_pid_tgid(void *args) +static int get_pid_tgid(pid_t *pid, pid_t *tgid, + struct test_ns_current_pid_tgid__bss *bss) { - struct test_ns_current_pid_tgid__bss *bss; - struct test_ns_current_pid_tgid *skel; - int ret = -1, err; - pid_t tgid, pid; struct stat st; + int err; - skel = test_ns_current_pid_tgid__open_and_load(); - if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open_and_load")) - goto out; - - pid = syscall(SYS_gettid); - tgid = getpid(); + *pid = syscall(SYS_gettid); + *tgid = getpid(); err = stat("/proc/self/ns/pid", &st); if (!ASSERT_OK(err, "stat /proc/self/ns/pid")) - goto cleanup; + return err; - bss = skel->bss; bss->dev = st.st_dev; bss->ino = st.st_ino; bss->user_pid = 0; bss->user_tgid = 0; + return 0; +} + +static int test_current_pid_tgid_tp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.tp_handler, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; err = test_ns_current_pid_tgid__attach(skel); if (!ASSERT_OK(err, "test_ns_current_pid_tgid__attach")) @@ -55,11 +71,10 @@ static int test_current_pid_tgid(void *args) cleanup: test_ns_current_pid_tgid__destroy(skel); -out: return ret; } -static void test_ns_current_pid_tgid_new_ns(void) +static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg) { int wstatus; pid_t cpid; @@ -67,8 +82,8 @@ static void test_ns_current_pid_tgid_new_ns(void) /* Create a process in a new namespace, this process * will be the init process of this new namespace hence will be pid 1. */ - cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, - CLONE_NEWPID | SIGCHLD, NULL); + cpid = clone(fn, child_stack + STACK_SIZE, + CLONE_NEWPID | SIGCHLD, arg); if (!ASSERT_NEQ(cpid, -1, "clone")) return; @@ -84,7 +99,7 @@ static void test_ns_current_pid_tgid_new_ns(void) void serial_test_ns_current_pid_tgid(void) { if (test__start_subtest("root_ns_tp")) - test_current_pid_tgid(NULL); + test_current_pid_tgid_tp(NULL); if (test__start_subtest("new_ns_tp")) - test_ns_current_pid_tgid_new_ns(); + test_ns_current_pid_tgid_new_ns(test_current_pid_tgid_tp, NULL); } diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c index 0763d49f9c42..aa3ec7ca16d9 100644 --- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -10,17 +10,21 @@ __u64 user_tgid = 0; __u64 dev = 0; __u64 ino = 0; -SEC("tracepoint/syscalls/sys_enter_nanosleep") -int handler(const void *ctx) +static void get_pid_tgid(void) { struct bpf_pidns_info nsdata; if (bpf_get_ns_current_pid_tgid(dev, ino, &nsdata, sizeof(struct bpf_pidns_info))) - return 0; + return; user_pid = nsdata.pid; user_tgid = nsdata.tgid; +} +SEC("?tracepoint/syscalls/sys_enter_nanosleep") +int tp_handler(const void *ctx) +{ + get_pid_tgid(); return 0; } -- cgit v1.2.3 From 87ade6cd859ea9dbde6e80b3fcf717ed9a73b4a9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:49:10 -0700 Subject: selftests/bpf: Add a cgroup prog bpf_get_ns_current_pid_tgid() test Add a cgroup bpf program test where the bpf program is running in a pid namespace. The test is successfully: #165/3 ns_current_pid_tgid/new_ns_cgrp:OK Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240315184910.2976522-1-yonghong.song@linux.dev --- .../selftests/bpf/prog_tests/ns_current_pid_tgid.c | 73 ++++++++++++++++++++++ .../selftests/bpf/progs/test_ns_current_pid_tgid.c | 7 +++ 2 files changed, 80 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 847d7b70e290..fded38d24aae 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -12,6 +12,7 @@ #include #include #include +#include "network_helpers.h" #define STACK_SIZE (1024 * 1024) static char child_stack[STACK_SIZE]; @@ -74,6 +75,50 @@ cleanup: return ret; } +static int test_current_pid_tgid_cgrp(void *args) +{ + struct test_ns_current_pid_tgid__bss *bss; + struct test_ns_current_pid_tgid *skel; + int server_fd = -1, ret = -1, err; + int cgroup_fd = *(int *)args; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.cgroup_bind4, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, bss)) + goto cleanup; + + skel->links.cgroup_bind4 = bpf_program__attach_cgroup( + skel->progs.cgroup_bind4, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.cgroup_bind4, "bpf_program__attach_cgroup")) + goto cleanup; + + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + if (server_fd >= 0) + close(server_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg) { int wstatus; @@ -95,6 +140,25 @@ static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg) return; } +static void test_in_netns(int (*fn)(void *), void *arg) +{ + struct nstoken *nstoken = NULL; + + SYS(cleanup, "ip netns add ns_current_pid_tgid"); + SYS(cleanup, "ip -net ns_current_pid_tgid link set dev lo up"); + + nstoken = open_netns("ns_current_pid_tgid"); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto cleanup; + + test_ns_current_pid_tgid_new_ns(fn, arg); + +cleanup: + if (nstoken) + close_netns(nstoken); + SYS_NOFAIL("ip netns del ns_current_pid_tgid"); +} + /* TODO: use a different tracepoint */ void serial_test_ns_current_pid_tgid(void) { @@ -102,4 +166,13 @@ void serial_test_ns_current_pid_tgid(void) test_current_pid_tgid_tp(NULL); if (test__start_subtest("new_ns_tp")) test_ns_current_pid_tgid_new_ns(test_current_pid_tgid_tp, NULL); + if (test__start_subtest("new_ns_cgrp")) { + int cgroup_fd = -1; + + cgroup_fd = test__join_cgroup("/sock_addr"); + if (ASSERT_GE(cgroup_fd, 0, "join_cgroup")) { + test_in_netns(test_current_pid_tgid_cgrp, &cgroup_fd); + close(cgroup_fd); + } + } } diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c index aa3ec7ca16d9..d0010e698f66 100644 --- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -28,4 +28,11 @@ int tp_handler(const void *ctx) return 0; } +SEC("?cgroup/bind4") +int cgroup_bind4(struct bpf_sock_addr *ctx) +{ + get_pid_tgid(); + return 1; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 4c195ee4865d57786b3a63018d489b8a07877354 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:49:15 -0700 Subject: selftests/bpf: Add a sk_msg prog bpf_get_ns_current_pid_tgid() test Add a sk_msg bpf program test where the program is running in a pid namespace. The test is successful: #165/4 ns_current_pid_tgid/new_ns_sk_msg:OK Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240315184915.2976718-1-yonghong.song@linux.dev --- .../selftests/bpf/prog_tests/ns_current_pid_tgid.c | 62 ++++++++++++++++++++++ .../selftests/bpf/progs/test_ns_current_pid_tgid.c | 14 +++++ 2 files changed, 76 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index fded38d24aae..e72d75d6baa7 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -119,6 +119,66 @@ cleanup: return ret; } +static int test_current_pid_tgid_sk_msg(void *args) +{ + int verdict, map, server_fd = -1, client_fd = -1; + struct test_ns_current_pid_tgid__bss *bss; + static const char send_msg[] = "message"; + struct test_ns_current_pid_tgid *skel; + int ret = -1, err, key = 0; + pid_t tgid, pid; + + skel = test_ns_current_pid_tgid__open(); + if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open")) + return ret; + + bpf_program__set_autoload(skel->progs.sk_msg, true); + + err = test_ns_current_pid_tgid__load(skel); + if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load")) + goto cleanup; + + bss = skel->bss; + if (get_pid_tgid(&pid, &tgid, skel->bss)) + goto cleanup; + + verdict = bpf_program__fd(skel->progs.sk_msg); + map = bpf_map__fd(skel->maps.sock_map); + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "prog_attach")) + goto cleanup; + + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto cleanup; + + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto cleanup; + + err = bpf_map_update_elem(map, &key, &client_fd, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto cleanup; + + err = send(client_fd, send_msg, sizeof(send_msg), 0); + if (!ASSERT_EQ(err, sizeof(send_msg), "send(msg)")) + goto cleanup; + + if (!ASSERT_EQ(bss->user_pid, pid, "pid")) + goto cleanup; + if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid")) + goto cleanup; + ret = 0; + +cleanup: + if (server_fd >= 0) + close(server_fd); + if (client_fd >= 0) + close(client_fd); + test_ns_current_pid_tgid__destroy(skel); + return ret; +} + static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg) { int wstatus; @@ -175,4 +235,6 @@ void serial_test_ns_current_pid_tgid(void) close(cgroup_fd); } } + if (test__start_subtest("new_ns_sk_msg")) + test_in_netns(test_current_pid_tgid_sk_msg, NULL); } diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c index d0010e698f66..386315afad65 100644 --- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -5,6 +5,13 @@ #include #include +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u32); +} sock_map SEC(".maps"); + __u64 user_pid = 0; __u64 user_tgid = 0; __u64 dev = 0; @@ -35,4 +42,11 @@ int cgroup_bind4(struct bpf_sock_addr *ctx) return 1; } +SEC("?sk_msg") +int sk_msg(struct sk_msg_md *msg) +{ + get_pid_tgid(); + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From f803bcf9208a2540acb4c32bdc3616673169f490 Mon Sep 17 00:00:00 2001 From: "Alessandro Carminati (Red Hat)" Date: Thu, 14 Mar 2024 10:59:11 +0000 Subject: selftests/bpf: Prevent client connect before server bind in test_tc_tunnel.sh In some systems, the netcat server can incur in delay to start listening. When this happens, the test can randomly fail in various points. This is an example error message: # ip gre none gso # encap 192.168.1.1 to 192.168.1.2, type gre, mac none len 2000 # test basic connectivity # Ncat: Connection refused. The issue stems from a race condition between the netcat client and server. The test author had addressed this problem by implementing a sleep, which I have removed in this patch. This patch introduces a function capable of sleeping for up to two seconds. However, it can terminate the waiting period early if the port is reported to be listening. Signed-off-by: Alessandro Carminati (Red Hat) Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240314105911.213411-1-alessandro.carminati@gmail.com --- tools/testing/selftests/bpf/test_tc_tunnel.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 910044f08908..7989ec608454 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -72,7 +72,6 @@ cleanup() { server_listen() { ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & server_pid=$! - sleep 0.2 } client_connect() { @@ -93,6 +92,16 @@ verify_data() { fi } +wait_for_port() { + for i in $(seq 20); do + if ip netns exec "${ns2}" ss ${2:--4}OHntl | grep -q "$1"; then + return 0 + fi + sleep 0.1 + done + return 1 +} + set -e # no arguments: automated test, run all @@ -193,6 +202,7 @@ setup # basic communication works echo "test basic connectivity" server_listen +wait_for_port ${port} ${netcat_opt} client_connect verify_data @@ -204,6 +214,7 @@ ip netns exec "${ns1}" tc filter add dev veth1 egress \ section "encap_${tuntype}_${mac}" echo "test bpf encap without decap (expect failure)" server_listen +wait_for_port ${port} ${netcat_opt} ! client_connect if [[ "$tuntype" =~ "udp" ]]; then -- cgit v1.2.3 From 68ca5d4eebb8c4de246ee5f634eee26bc689562d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:50 -0700 Subject: bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs Wire up BPF cookie for raw tracepoint programs (both BTF and non-BTF aware variants). This brings them up to part w.r.t. BPF cookie usage with classic tracepoint and fentry/fexit programs. Acked-by: Stanislav Fomichev Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-4-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++-- kernel/bpf/syscall.c | 13 +++++++++---- kernel/trace/bpf_trace.c | 13 +++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 28 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2ea8ce59f582..62762390c93d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1610,6 +1610,7 @@ struct bpf_tracing_link { struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; + u64 cookie; }; struct bpf_link_primer { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c42b9f1bada..9585f5345353 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1662,8 +1662,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1cb4c3809af4..e44c276e8617 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3774,7 +3774,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name) + const char __user *user_tp_name, u64 cookie) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3821,6 +3821,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog); link->btp = btp; + link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -3841,11 +3842,13 @@ out_put_btp: return err; } -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; + void __user *tp_name; + __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) @@ -3855,7 +3858,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); + cookie = attr->raw_tracepoint.cookie; + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); if (fd < 0) bpf_prog_put(prog); return fd; @@ -5193,7 +5198,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 17de91ad4a1f..434e3ece6688 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2004,6 +2004,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2066,6 +2068,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); @@ -2369,15 +2374,23 @@ static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { struct bpf_prog *prog = link->link.prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } + + run_ctx.bpf_cookie = link->cookie; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3c42b9f1bada..bf80b614c4db 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1664,6 +1664,7 @@ union bpf_attr { struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ -- cgit v1.2.3 From 36ffb2023e3703a64266ca5fed30f710b1263c70 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:51 -0700 Subject: libbpf: add support for BPF cookie for raw_tp/tp_btf programs Wire up BPF cookie passing or raw_tp and tp_btf programs, both in low-level and high-level APIs. Acked-by: Stanislav Fomichev Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-5-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 16 ++++++++++++++-- tools/lib/bpf/bpf.h | 9 +++++++++ tools/lib/bpf/libbpf.c | 20 +++++++++++++++++--- tools/lib/bpf/libbpf.h | 11 +++++++++++ tools/lib/bpf/libbpf.map | 2 ++ 5 files changed, 53 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 97ec005c3c47..c9f4e04f38fe 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -785,6 +785,7 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, uprobe_multi)) return libbpf_err(-EINVAL); break; + case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: @@ -1173,20 +1174,31 @@ int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info return bpf_obj_get_info_by_fd(link_fd, info, info_len); } -int bpf_raw_tracepoint_open(const char *name, int prog_fd) +int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); union bpf_attr attr; int fd; + if (!OPTS_VALID(opts, bpf_raw_tp_opts)) + return libbpf_err(-EINVAL); + memset(&attr, 0, attr_sz); - attr.raw_tracepoint.name = ptr_to_u64(name); attr.raw_tracepoint.prog_fd = prog_fd; + attr.raw_tracepoint.name = ptr_to_u64(OPTS_GET(opts, tp_name, NULL)); + attr.raw_tracepoint.cookie = OPTS_GET(opts, cookie, 0); fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); return libbpf_err_errno(fd); } +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + LIBBPF_OPTS(bpf_raw_tp_opts, opts, .tp_name = name); + + return bpf_raw_tracepoint_open_opts(prog_fd, &opts); +} + int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index df0db2f0cdb7..972e17ec0c09 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -617,6 +617,15 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); +struct bpf_raw_tp_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const char *tp_name; + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tp_opts__last_field cookie + +LIBBPF_API int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts); LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3a756d61c120..86df0d50cba7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -12309,13 +12309,19 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin return libbpf_get_error(*link); } -struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, - const char *tp_name) +struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts) { + LIBBPF_OPTS(bpf_raw_tp_opts, raw_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int prog_fd, pfd; + if (!OPTS_VALID(opts, bpf_raw_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { pr_warn("prog '%s': can't attach before loaded\n", prog->name); @@ -12327,7 +12333,9 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return libbpf_err_ptr(-ENOMEM); link->detach = &bpf_link__detach_fd; - pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + raw_opts.tp_name = tp_name; + raw_opts.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_opts); if (pfd < 0) { pfd = -errno; free(link); @@ -12339,6 +12347,12 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return link; } +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + return bpf_program__attach_raw_tracepoint_opts(prog, tp_name, NULL); +} + static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) { static const char *const prefixes[] = { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 7b510761f545..f88ab50c0229 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -760,9 +760,20 @@ bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, const char *tp_name, const struct bpf_tracepoint_opts *opts); +struct bpf_raw_tracepoint_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tracepoint_opts__last_field cookie + LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts); struct bpf_trace_opts { /* size of this struct, for forward/backward compatibility */ diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 86804fd90dd1..51732ecb1385 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -410,6 +410,8 @@ LIBBPF_1.3.0 { LIBBPF_1.4.0 { global: + bpf_program__attach_raw_tracepoint_opts; + bpf_raw_tracepoint_open_opts; bpf_token_create; btf__new_split; btf_ext__raw_data; -- cgit v1.2.3 From 51146ff0fae309a558bc8ab6cbf6cfda17356993 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:52 -0700 Subject: selftests/bpf: add raw_tp/tp_btf BPF cookie subtests Add test validating BPF cookie can be passed during raw_tp/tp_btf attachment and can be retried at runtime with bpf_get_attach_cookie() helper. Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-6-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/bpf_cookie.c | 114 ++++++++++++++++++++- .../testing/selftests/bpf/progs/test_bpf_cookie.c | 16 +++ 2 files changed, 129 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 1454cebc262b..4407ea428e77 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -573,6 +573,115 @@ cleanup: close(lsm_fd); } +static void tp_btf_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_trace_opts, trace_opts); + + /* There are three different ways to attach tp_btf (BTF-aware raw + * tracepoint) programs. Let's test all of them. + */ + prog_fd = bpf_program__fd(skel->progs.handle_tp_btf); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->tp_btf_res = 0; + + raw_tp_opts.cookie = cookie = 0x11000000000000L; + link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "raw_tp_open_res"); + + /* low-level generic bpf_link_create() API */ + skel->bss->tp_btf_res = 0; + + link_opts.tracing.cookie = cookie = 0x22000000000000L; + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_RAW_TP, &link_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "link_create_res"); + + /* high-level bpf_link-based bpf_program__attach_trace_opts() API */ + skel->bss->tp_btf_res = 0; + + trace_opts.cookie = cookie = 0x33000000000000L; + link = bpf_program__attach_trace_opts(skel->progs.handle_tp_btf, &trace_opts); + if (!ASSERT_OK_PTR(link, "attach_trace_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->tp_btf_res, cookie, "attach_trace_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + +static void raw_tp_subtest(struct test_bpf_cookie *skel) +{ + __u64 cookie; + int prog_fd, link_fd = -1; + struct bpf_link *link = NULL; + LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts); + LIBBPF_OPTS(bpf_raw_tracepoint_opts, opts); + + /* There are two different ways to attach raw_tp programs */ + prog_fd = bpf_program__fd(skel->progs.handle_raw_tp); + + /* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */ + skel->bss->raw_tp_res = 0; + + raw_tp_opts.tp_name = "sys_enter"; + raw_tp_opts.cookie = cookie = 0x55000000000000L; + link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts); + if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts")) + goto cleanup; + + usleep(1); /* trigger */ + close(link_fd); /* detach */ + link_fd = -1; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "raw_tp_open_res"); + + /* high-level bpf_link-based bpf_program__attach_raw_tracepoint_opts() API */ + skel->bss->raw_tp_res = 0; + + opts.cookie = cookie = 0x66000000000000L; + link = bpf_program__attach_raw_tracepoint_opts(skel->progs.handle_raw_tp, + "sys_enter", &opts); + if (!ASSERT_OK_PTR(link, "attach_raw_tp_opts")) + goto cleanup; + + usleep(1); /* trigger */ + bpf_link__destroy(link); /* detach */ + link = NULL; + + ASSERT_EQ(skel->bss->raw_tp_res, cookie, "attach_raw_tp_opts_res"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + bpf_link__destroy(link); +} + void test_bpf_cookie(void) { struct test_bpf_cookie *skel; @@ -601,6 +710,9 @@ void test_bpf_cookie(void) tracing_subtest(skel); if (test__start_subtest("lsm")) lsm_subtest(skel); - + if (test__start_subtest("tp_btf")) + tp_btf_subtest(skel); + if (test__start_subtest("raw_tp")) + raw_tp_subtest(skel); test_bpf_cookie__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c index 5a3a80f751c4..c83142b55f47 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -15,6 +15,8 @@ __u64 uprobe_res; __u64 uretprobe_res; __u64 tp_res; __u64 pe_res; +__u64 raw_tp_res; +__u64 tp_btf_res; __u64 fentry_res; __u64 fexit_res; __u64 fmod_ret_res; @@ -87,6 +89,20 @@ int handle_pe(struct pt_regs *ctx) return 0; } +SEC("raw_tp/sys_enter") +int handle_raw_tp(void *ctx) +{ + update(ctx, &raw_tp_res); + return 0; +} + +SEC("tp_btf/sys_enter") +int handle_tp_btf(void *ctx) +{ + update(ctx, &tp_btf_res); + return 0; +} + SEC("fentry/bpf_fentry_test1") int BPF_PROG(fentry_test1, int a) { -- cgit v1.2.3 From be24a895149b6df4c474848e3928c237ad10fdc4 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 20 Mar 2024 01:22:41 +0000 Subject: bpftool: Enable libbpf logs when loading pid_iter in debug mode When trying to load the pid_iter BPF program used to iterate over the PIDs of the processes holding file descriptors to BPF links, we would unconditionally silence libbpf in order to keep the output clean if the kernel does not support iterators and loading fails. Although this is the desirable behaviour in most cases, this may hide bugs in the pid_iter program that prevent it from loading, and it makes it hard to debug such load failures, even in "debug" mode. Instead, it makes more sense to print libbpf's logs when we pass the -d|--debug flag to bpftool, so that users get the logs to investigate failures without having to edit bpftool's source code. Signed-off-by: Quentin Monnet Message-ID: <20240320012241.42991-1-qmo@kernel.org> Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/pids.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index 00c77edb6331..9b898571b49e 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -101,7 +101,6 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) char buf[4096 / sizeof(*e) * sizeof(*e)]; struct pid_iter_bpf *skel; int err, ret, fd = -1, i; - libbpf_print_fn_t default_print; *map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL); if (IS_ERR(*map)) { @@ -118,12 +117,18 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) skel->rodata->obj_type = type; - /* we don't want output polluted with libbpf errors if bpf_iter is not - * supported - */ - default_print = libbpf_set_print(libbpf_print_none); - err = pid_iter_bpf__load(skel); - libbpf_set_print(default_print); + if (!verifier_logs) { + libbpf_print_fn_t default_print; + + /* Unless debug information is on, we don't want the output to + * be polluted with libbpf errors if bpf_iter is not supported. + */ + default_print = libbpf_set_print(libbpf_print_none); + err = pid_iter_bpf__load(skel); + libbpf_set_print(default_print); + } else { + err = pid_iter_bpf__load(skel); + } if (err) { /* too bad, kernel doesn't support BPF iterators yet */ err = 0; -- cgit v1.2.3 From e9a826dd145bf2c19888aee1b974214cefc74a2e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 20 Mar 2024 01:34:57 +0000 Subject: bpftool: Remove unnecessary source files from bootstrap version Commit d510296d331a ("bpftool: Use syscall/loader program in "prog load" and "gen skeleton" command.") added new files to the list of objects to compile in order to build the bootstrap version of bpftool. As far as I can tell, these objects are unnecessary and were added by mistake; maybe a draft version intended to add support for loading loader programs from the bootstrap version. Anyway, we can remove these object files from the list to make the bootstrap bpftool binary a tad smaller and faster to build. Fixes: d510296d331a ("bpftool: Use syscall/loader program in "prog load" and "gen skeleton" command.") Signed-off-by: Quentin Monnet Message-ID: <20240320013457.44808-1-qmo@kernel.org> Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Makefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index e9154ace80ff..00c704423539 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -183,7 +183,7 @@ HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o @@ -231,9 +231,6 @@ endif CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) -$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c - $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ - $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ -- cgit v1.2.3 From 520fad2e3206b2476f201a283ecf096cfb671902 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 15 Mar 2024 14:33:29 -0700 Subject: selftests/bpf: scale benchmark counting by using per-CPU counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When benchmarking with multiple threads (-pN, where N>1), we start contending on single atomic counter that both BPF trigger benchmarks are using, as well as "baseline" tests in user space (trig-base and trig-uprobe-base benchmarks). As such, we start bottlenecking on something completely irrelevant to benchmark at hand. Scale counting up by using per-CPU counters on BPF side. On use space side we do the next best thing: hash thread ID to approximate per-CPU behavior. It seems to work quite well in practice. To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8, 16, and 32 threads: - trig-uprobe-base (no syscalls, pure tight counting loop in user-space); - trig-base (get_pgid() syscall, atomic counter in user-space); - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU counter on BPF side). Command used: for b in uprobe-base base fentry; do \ for p in 1 2 4 8 16 32; do \ printf "%-11s %2d: %s\n" $b $p \ "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \ done; \ done Before these changes, aggregate throughput across all threads doesn't scale well with number of threads, it actually even falls sharply for uprobe-base due to a very high contention: uprobe-base 1: 138.998 ± 0.650M/s uprobe-base 2: 70.526 ± 1.147M/s uprobe-base 4: 63.114 ± 0.302M/s uprobe-base 8: 54.177 ± 0.138M/s uprobe-base 16: 45.439 ± 0.057M/s uprobe-base 32: 37.163 ± 0.242M/s base 1: 16.940 ± 0.182M/s base 2: 19.231 ± 0.105M/s base 4: 21.479 ± 0.038M/s base 8: 23.030 ± 0.037M/s base 16: 22.034 ± 0.004M/s base 32: 18.152 ± 0.013M/s fentry 1: 14.794 ± 0.054M/s fentry 2: 17.341 ± 0.055M/s fentry 4: 23.792 ± 0.024M/s fentry 8: 21.557 ± 0.047M/s fentry 16: 21.121 ± 0.004M/s fentry 32: 17.067 ± 0.023M/s After these changes, we see almost perfect linear scaling, as expected. The sub-linear scaling when going from 8 to 16 threads is interesting and consistent on my test machine, but I haven't investigated what is causing it this peculiar slowdown (across all benchmarks, could be due to hyperthreading effects, not sure). uprobe-base 1: 139.980 ± 0.648M/s uprobe-base 2: 270.244 ± 0.379M/s uprobe-base 4: 532.044 ± 1.519M/s uprobe-base 8: 1004.571 ± 3.174M/s uprobe-base 16: 1720.098 ± 0.744M/s uprobe-base 32: 3506.659 ± 8.549M/s base 1: 16.869 ± 0.071M/s base 2: 33.007 ± 0.092M/s base 4: 64.670 ± 0.203M/s base 8: 121.969 ± 0.210M/s base 16: 207.832 ± 0.112M/s base 32: 424.227 ± 1.477M/s fentry 1: 14.777 ± 0.087M/s fentry 2: 28.575 ± 0.146M/s fentry 4: 56.234 ± 0.176M/s fentry 8: 106.095 ± 0.385M/s fentry 16: 181.440 ± 0.032M/s fentry 32: 369.131 ± 0.693M/s Signed-off-by: Andrii Nakryiko Message-ID: <20240315213329.1161589-1-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/benchs/bench_trigger.c | 40 +++++++++++++++++++--- tools/testing/selftests/bpf/progs/trigger_bench.c | 39 ++++++++++++++------- 2 files changed, 62 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index ace0d1011a8e..b7aea79495ba 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -1,15 +1,45 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define _GNU_SOURCE +#include #include "bench.h" #include "trigger_bench.skel.h" #include "trace_helpers.h" +/* adjust slot shift in inc_hits() if changing */ +#define MAX_BUCKETS 256 + /* BPF triggering benchmarks */ static struct trigger_ctx { struct trigger_bench *skel; } ctx; -static struct counter base_hits; +static struct counter base_hits[MAX_BUCKETS]; + +static __always_inline void inc_counter(struct counter *counters) +{ + static __thread int tid = 0; + unsigned slot; + + if (unlikely(tid == 0)) + tid = gettid(); + + /* multiplicative hashing, it's fast */ + slot = 2654435769U * tid; + slot >>= 24; + + atomic_inc(&base_hits[slot].value); /* use highest byte as an index */ +} + +static long sum_and_reset_counters(struct counter *counters) +{ + int i; + long sum = 0; + + for (i = 0; i < MAX_BUCKETS; i++) + sum += atomic_swap(&counters[i].value, 0); + return sum; +} static void trigger_validate(void) { @@ -23,14 +53,14 @@ static void *trigger_base_producer(void *input) { while (true) { (void)syscall(__NR_getpgid); - atomic_inc(&base_hits.value); + inc_counter(base_hits); } return NULL; } static void trigger_base_measure(struct bench_res *res) { - res->hits = atomic_swap(&base_hits.value, 0); + res->hits = sum_and_reset_counters(base_hits); } static void *trigger_producer(void *input) @@ -42,7 +72,7 @@ static void *trigger_producer(void *input) static void trigger_measure(struct bench_res *res) { - res->hits = atomic_swap(&ctx.skel->bss->hits, 0); + res->hits = sum_and_reset_counters(ctx.skel->bss->hits); } static void setup_ctx(void) @@ -164,7 +194,7 @@ static void *uprobe_base_producer(void *input) { while (true) { uprobe_target_nop(); - atomic_inc(&base_hits.value); + inc_counter(base_hits); } return NULL; } diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 5fda43901033..42ec202015ed 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -9,12 +9,27 @@ char _license[] SEC("license") = "GPL"; -long hits = 0; +#define CPU_MASK 255 +#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */ + +/* matches struct counter in bench.h */ +struct counter { + long value; +} __attribute__((aligned(128))); + +struct counter hits[MAX_CPUS]; + +static __always_inline void inc_counter(void) +{ + int cpu = bpf_get_smp_processor_id(); + + __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); +} SEC("tp/syscalls/sys_enter_getpgid") int bench_trigger_tp(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } @@ -22,69 +37,69 @@ SEC("raw_tp/sys_enter") int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) { if (id == __NR_getpgid) - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("kprobe/" SYS_PREFIX "sys_getpgid") int bench_trigger_kprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("kretprobe/" SYS_PREFIX "sys_getpgid") int bench_trigger_kretprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid") int bench_trigger_kprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid") int bench_trigger_kretprobe_multi(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("fentry/" SYS_PREFIX "sys_getpgid") int bench_trigger_fentry(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("fexit/" SYS_PREFIX "sys_getpgid") int bench_trigger_fexit(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("fentry.s/" SYS_PREFIX "sys_getpgid") int bench_trigger_fentry_sleep(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } SEC("fmod_ret/" SYS_PREFIX "sys_getpgid") int bench_trigger_fmodret(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return -22; } SEC("uprobe") int bench_trigger_uprobe(void *ctx) { - __sync_add_and_fetch(&hits, 1); + inc_counter(); return 0; } -- cgit v1.2.3 From cc9b22dfa735800980e7362f02aff6f1c2280996 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 20 Mar 2024 01:41:03 +0000 Subject: bpftool: Clean up HOST_CFLAGS, HOST_LDFLAGS for bootstrap bpftool Bpftool's Makefile uses $(HOST_CFLAGS) to build the bootstrap version of bpftool, in order to pick the flags for the host (where we run the bootstrap version) and not for the target system (where we plan to run the full bpftool binary). But we pass too much information through this variable. In particular, we set HOST_CFLAGS by copying most of the $(CFLAGS); but we do this after the feature detection for bpftool, which means that $(CFLAGS), hence $(HOST_CFLAGS), contain all macro definitions for using the different optional features. For example, -DHAVE_LLVM_SUPPORT may be passed to the $(HOST_CFLAGS), even though the LLVM disassembler is not used in the bootstrap version, and the related library may even be missing for the host architecture. A similar thing happens with the $(LDFLAGS), that we use unchanged for linking the bootstrap version even though they may contains flags to link against additional libraries. To address the $(HOST_CFLAGS) issue, we move the definition of $(HOST_CFLAGS) earlier in the Makefile, before the $(CFLAGS) update resulting from the feature probing - none of which being relevant to the bootstrap version. To clean up the $(LDFLAGS) for the bootstrap version, we introduce a dedicated $(HOST_LDFLAGS) variable that we base on $(LDFLAGS), before the feature probing as well. On my setup, the following macro and libraries are removed from the compiler invocation to build bpftool after this patch: -DUSE_LIBCAP -DHAVE_LLVM_SUPPORT -I/usr/lib/llvm-17/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -lLLVM-17 -L/usr/lib/llvm-17/lib Another advantage of cleaning up these flags is that displaying available features with "bpftool version" becomes more accurate for the bootstrap bpftool, and no longer reflects the features detected (and available only) for the final binary. Cc: Jean-Philippe Brucker Signed-off-by: Quentin Monnet Acked-by: Jiri Olsa Message-ID: <20240320014103.45641-1-qmo@kernel.org> Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 00c704423539..b67454b45a49 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -89,6 +89,10 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif +HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ + $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) +HOST_LDFLAGS := $(LDFLAGS) + INSTALL ?= install RM ?= rm -f @@ -178,9 +182,6 @@ ifeq ($(filter -DHAVE_LLVM_SUPPORT -DHAVE_LIBBFD_SUPPORT,$(CFLAGS)),) SRCS := $(filter-out jit_disasm.c,$(SRCS)) endif -HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ - $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) - BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) @@ -235,7 +236,7 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) - $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ + $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(HOST_LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ -- cgit v1.2.3 From a9f4c6c999008c92e2aba6d4f50f2b2d10ed7fd0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 31 Jan 2024 17:57:54 -0300 Subject: perf trace: Collect sys_nanosleep first argument That is a 'struct timespec' passed from userspace to the kernel as we can see with a system wide syscall tracing: root@number:~# perf trace -e nanosleep 0.000 (10.102 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 38.924 (10.077 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 100.177 (10.107 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 139.171 (10.063 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 200.603 (10.105 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 239.399 (10.064 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 300.994 (10.096 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 339.584 (10.067 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 401.335 (10.057 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 439.758 (10.166 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 501.814 (10.110 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 539.983 (10.227 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 602.284 (10.199 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 640.208 (10.105 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 702.662 (10.163 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 740.440 (10.107 ms): podman/2195174 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 802.993 (10.159 ms): podman/9150 nanosleep(rqtp: { .tv_sec: 0, .tv_nsec: 10000000 }) = 0 ^Croot@number:~# strace -p 9150 -e nanosleep If we then use the ptrace method to look at that podman process: root@number:~# strace -p 9150 -e nanosleep strace: Process 9150 attached nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0 nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0 nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0 nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0 nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0 nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0 nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0 ^Cstrace: Process 9150 detached root@number:~# With some changes we can get something closer to the strace output, still in system wide mode: root@number:~# perf config trace.show_arg_names=false root@number:~# perf config trace.show_duration=false root@number:~# perf config trace.show_timestamp=false root@number:~# perf config trace.show_zeros=true root@number:~# perf config trace.args_alignment=0 root@number:~# perf trace -e nanosleep --max-events=10 podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/2195174 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 podman/9150 nanosleep({ .tv_sec: 0, .tv_nsec: 10000000 }, NULL) = 0 root@number:~# root@number:~# perf config trace.show_arg_names=false trace.show_duration=false trace.show_timestamp=false trace.show_zeros=true trace.args_alignment=0 root@number:~# cat ~/.perfconfig # this file is auto-generated. [trace] show_arg_names = false show_duration = false show_timestamp = false show_zeros = true args_alignment = 0 root@number:~# This will not get reused by any other syscall as nanosleep is the only one to have as its first argument a 'struct timespec" pointer argument passed from userspace to the kernel: root@number:~# grep timespec /sys/kernel/tracing/events/syscalls/sys_enter_*/format | grep offset:16 /sys/kernel/tracing/events/syscalls/sys_enter_nanosleep/format: field:struct __kernel_timespec * rqtp; offset:16; size:8; signed:0; root@number:~# BTF based pretty printing will simplify all this, but then lets just get the low hanging fruits first. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zbq72dJRpOlfRWnf@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 ++ .../perf/util/bpf_skel/augmented_raw_syscalls.bpf.c | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 90eaff8c0f6e..8fb032caeaf5 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1049,6 +1049,8 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, }, { .name = "name_to_handle_at", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + { .name = "nanosleep", + .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, }, { .name = "newfstatat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "open", diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 2872f9bc0785..0acbd74e8c76 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -341,6 +341,27 @@ failure: return 1; /* Failure: don't filter */ } +SEC("tp/syscalls/sys_enter_nanosleep") +int sys_enter_nanosleep(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *req_arg = (const void *)args->args[0]; + unsigned int len = sizeof(augmented_args->args); + __u32 size = sizeof(struct timespec64); + + if (augmented_args == NULL) + goto failure; + + if (size > sizeof(augmented_args->__data)) + goto failure; + + bpf_probe_read_user(&augmented_args->__data, size, req_arg); + + return augmented__output(args, augmented_args, len + size); +failure: + return 1; /* Failure: don't filter */ +} + static pid_t getpid(void) { return bpf_get_current_pid_tgid(); -- cgit v1.2.3 From 4b3761eebb1c5c1bacec66fafff51eb65c4139bc Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sat, 20 Mar 2021 04:58:24 +0530 Subject: perf c2c: Fix a punctuation s/dont/don\'t/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210319232824.742-1-unixbhaskar@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 16b40f5d43db..14f228c7c869 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2050,7 +2050,7 @@ static int hpp_list__parse(struct perf_hpp_list *hpp_list, perf_hpp__setup_output_field(hpp_list); /* - * We dont need other sorting keys other than those + * We don't need other sorting keys other than those * we already specified. It also really slows down * the processing a lot with big number of output * fields, so switching this off for c2c. -- cgit v1.2.3 From 5d8c646038f2f173db79692607c313b195062759 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:49:01 -0300 Subject: perf beauty: Fix dependency of tables using uapi/linux/mount.h Several such tables were depending on uapi/linux/fs.h, cut and paste error when they were introduced, fix it. Cc: Ian Rogers Cc: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Ze9vjxv42PN_QGZv@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 04d89d2ed209..d3a568abc389 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -499,19 +499,19 @@ $(fadvise_advice_array): $(linux_uapi_dir)/in.h $(fadvise_advice_tbl) fsmount_arrays := $(beauty_outdir)/fsmount_arrays.c fsmount_tbls := $(srctree)/tools/perf/trace/beauty/fsmount.sh -$(fsmount_arrays): $(linux_uapi_dir)/fs.h $(fsmount_tbls) +$(fsmount_arrays): $(linux_uapi_dir)/mount.h $(fsmount_tbls) $(Q)$(SHELL) '$(fsmount_tbls)' $(linux_uapi_dir) > $@ fspick_arrays := $(beauty_outdir)/fspick_arrays.c fspick_tbls := $(srctree)/tools/perf/trace/beauty/fspick.sh -$(fspick_arrays): $(linux_uapi_dir)/fs.h $(fspick_tbls) +$(fspick_arrays): $(linux_uapi_dir)/mount.h $(fspick_tbls) $(Q)$(SHELL) '$(fspick_tbls)' $(linux_uapi_dir) > $@ fsconfig_arrays := $(beauty_outdir)/fsconfig_arrays.c fsconfig_tbls := $(srctree)/tools/perf/trace/beauty/fsconfig.sh -$(fsconfig_arrays): $(linux_uapi_dir)/fs.h $(fsconfig_tbls) +$(fsconfig_arrays): $(linux_uapi_dir)/mount.h $(fsconfig_tbls) $(Q)$(SHELL) '$(fsconfig_tbls)' $(linux_uapi_dir) > $@ pkey_alloc_access_rights_array := $(beauty_outdir)/pkey_alloc_access_rights_array.c @@ -597,13 +597,13 @@ $(mremap_flags_array): $(linux_uapi_dir)/mman.h $(mremap_flags_tbl) mount_flags_array := $(beauty_outdir)/mount_flags_array.c mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/mount_flags.sh -$(mount_flags_array): $(linux_uapi_dir)/fs.h $(mount_flags_tbl) +$(mount_flags_array): $(linux_uapi_dir)/mount.h $(mount_flags_tbl) $(Q)$(SHELL) '$(mount_flags_tbl)' $(linux_uapi_dir) > $@ move_mount_flags_array := $(beauty_outdir)/move_mount_flags_array.c move_mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/move_mount_flags.sh -$(move_mount_flags_array): $(linux_uapi_dir)/fs.h $(move_mount_flags_tbl) +$(move_mount_flags_array): $(linux_uapi_dir)/mount.h $(move_mount_flags_tbl) $(Q)$(SHELL) '$(move_mount_flags_tbl)' $(linux_uapi_dir) > $@ -- cgit v1.2.3 From faf7217a397f041f146a4cf6d9f1c9bd99cd5ca4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:07:33 -0300 Subject: perf beauty: Move uapi/linux/fs.h copy out of the directory used to build perf It is mostly used only to generate string tables, not to build perf, so move it to the tools/perf/trace/beauty/include/ hierarchy, that is used just for scraping. The only case where it was being used to build was in tools/perf/trace/beauty/sync_file_range.c, because some older systems doesn't have the SYNC_FILE_RANGE_WRITE_AND_WAIT define, just use the system's linux/fs.h header instead, defining it if not available. This is a something that should've have happened, as happened with the linux/socket.h scrapper, do it now as Ian suggested while doing an audit/refactor session in the headers used by perf. No other tools/ living code uses it, just coming from either 'make install_headers' or from the system /usr/include/ directory. Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fWZVrpRufO4w-S4EcSi9STXcTAN2ERLwTSN7yrSSA-otQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fs.h | 368 ------------------------ tools/perf/Makefile.perf | 9 +- tools/perf/check-headers.sh | 2 +- tools/perf/trace/beauty/include/uapi/linux/fs.h | 368 ++++++++++++++++++++++++ tools/perf/trace/beauty/rename_flags.sh | 2 +- tools/perf/trace/beauty/sync_file_range.c | 11 +- tools/perf/trace/beauty/sync_file_range.sh | 2 +- 7 files changed, 386 insertions(+), 376 deletions(-) delete mode 100644 tools/include/uapi/linux/fs.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/fs.h (limited to 'tools') diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h deleted file mode 100644 index 48ad69f7722e..000000000000 --- a/tools/include/uapi/linux/fs.h +++ /dev/null @@ -1,368 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_FS_H -#define _UAPI_LINUX_FS_H - -/* - * This file has definitions for some important file table structures - * and constants and structures used by various generic file system - * ioctl's. Please do not make any changes in this file before - * sending patches for review to linux-fsdevel@vger.kernel.org and - * linux-api@vger.kernel.org. - */ - -#include -#include -#include -#ifndef __KERNEL__ -#include -#endif - -/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */ -#if !defined(__KERNEL__) -#include -#endif - -/* - * It's silly to have NR_OPEN bigger than NR_FILE, but you can change - * the file limit at runtime and only root can increase the per-process - * nr_file rlimit, so it's safe to set up a ridiculously high absolute - * upper limit on files-per-process. - * - * Some programs (notably those using select()) may have to be - * recompiled to take full advantage of the new limits.. - */ - -/* Fixed constants first: */ -#undef NR_OPEN -#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ -#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ - -#define BLOCK_SIZE_BITS 10 -#define BLOCK_SIZE (1< /dev/null beauty_linux_dir := $(srctree)/tools/perf/trace/beauty/include/linux/ +beauty_uapi_linux_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/linux/ linux_uapi_dir := $(srctree)/tools/include/uapi/linux asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic arch_asm_uapi_dir := $(srctree)/tools/arch/$(SRCARCH)/include/uapi/asm/ @@ -647,8 +648,8 @@ $(x86_arch_MSRs_array): $(x86_arch_asm_dir)/msr-index.h $(x86_arch_MSRs_tbl) rename_flags_array := $(beauty_outdir)/rename_flags_array.c rename_flags_tbl := $(srctree)/tools/perf/trace/beauty/rename_flags.sh -$(rename_flags_array): $(linux_uapi_dir)/fs.h $(rename_flags_tbl) - $(Q)$(SHELL) '$(rename_flags_tbl)' $(linux_uapi_dir) > $@ +$(rename_flags_array): $(beauty_uapi_linux_dir)/fs.h $(rename_flags_tbl) + $(Q)$(SHELL) '$(rename_flags_tbl)' $(beauty_uapi_linux_dir) > $@ arch_errno_name_array := $(beauty_outdir)/arch_errno_name_array.c arch_errno_hdr_dir := $(srctree)/tools @@ -660,8 +661,8 @@ $(arch_errno_name_array): $(arch_errno_tbl) sync_file_range_arrays := $(beauty_outdir)/sync_file_range_arrays.c sync_file_range_tbls := $(srctree)/tools/perf/trace/beauty/sync_file_range.sh -$(sync_file_range_arrays): $(linux_uapi_dir)/fs.h $(sync_file_range_tbls) - $(Q)$(SHELL) '$(sync_file_range_tbls)' $(linux_uapi_dir) > $@ +$(sync_file_range_arrays): $(beauty_uapi_linux_dir)/fs.h $(sync_file_range_tbls) + $(Q)$(SHELL) '$(sync_file_range_tbls)' $(beauty_uapi_linux_dir) > $@ TESTS_CORESIGHT_DIR := $(srctree)/tools/perf/tests/shell/coresight diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 66ba33dbcef2..015f74137b75 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -11,7 +11,6 @@ FILES=( "include/uapi/drm/i915_drm.h" "include/uapi/linux/fadvise.h" "include/uapi/linux/fcntl.h" - "include/uapi/linux/fs.h" "include/uapi/linux/fscrypt.h" "include/uapi/linux/kcmp.h" "include/uapi/linux/kvm.h" @@ -98,6 +97,7 @@ SYNC_CHECK_FILES=( declare -a BEAUTY_FILES BEAUTY_FILES=( "include/linux/socket.h" + "include/uapi/linux/fs.h" ) declare -a FAILURES diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h new file mode 100644 index 000000000000..48ad69f7722e --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FS_H +#define _UAPI_LINUX_FS_H + +/* + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. + */ + +#include +#include +#include +#ifndef __KERNEL__ +#include +#endif + +/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */ +#if !defined(__KERNEL__) +#include +#endif + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1< # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/perf/trace/beauty/include/uapi/linux/ fs_header=${header_dir}/fs.h diff --git a/tools/perf/trace/beauty/sync_file_range.c b/tools/perf/trace/beauty/sync_file_range.c index 1c425f04047d..3e8f50ff4fc7 100644 --- a/tools/perf/trace/beauty/sync_file_range.c +++ b/tools/perf/trace/beauty/sync_file_range.c @@ -7,7 +7,16 @@ #include "trace/beauty/beauty.h" #include -#include +#include + +#ifndef SYNC_FILE_RANGE_WRITE_AND_WAIT +#define SYNC_FILE_RANGE_WAIT_BEFORE 1 +#define SYNC_FILE_RANGE_WRITE 2 +#define SYNC_FILE_RANGE_WAIT_AFTER 4 +#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \ + SYNC_FILE_RANGE_WAIT_BEFORE | \ + SYNC_FILE_RANGE_WAIT_AFTER) +#endif static size_t sync_file_range__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) { diff --git a/tools/perf/trace/beauty/sync_file_range.sh b/tools/perf/trace/beauty/sync_file_range.sh index 90bf633be879..b1084c4cab63 100755 --- a/tools/perf/trace/beauty/sync_file_range.sh +++ b/tools/perf/trace/beauty/sync_file_range.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + linux_header_dir=tools/perf/trace/beauty/include/uapi/linux/ else linux_header_dir=$1 fi -- cgit v1.2.3 From 22916d2cbad9a20d3fbca7cda015efcf10427297 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Mar 2024 13:53:39 -0300 Subject: perf beauty: Don't include uapi/linux/mount.h, use sys/mount.h instead The tools/include/uapi/linux/mount.h file is mostly used for scrapping defines into id->string tables, this is the only place were it is being directly used, stop doing so. Define MOUNT_ATTR_RELATIME and MOUNT_ATTR__ATIME if not available in the system's headers. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/fsmount.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/fsmount.c b/tools/perf/trace/beauty/fsmount.c index 30c8c082a3c3..28c2c16fc1a8 100644 --- a/tools/perf/trace/beauty/fsmount.c +++ b/tools/perf/trace/beauty/fsmount.c @@ -7,7 +7,14 @@ #include "trace/beauty/beauty.h" #include -#include +#include + +#ifndef MOUNT_ATTR__ATIME +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#endif +#ifndef MOUNT_ATTR_RELATIME +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ +#endif static size_t fsmount__scnprintf_attr_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) { -- cgit v1.2.3 From ab3316119f9d0b3a1c4b90809ea37ab7927de72b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:07:33 -0300 Subject: perf beauty: Move uapi/linux/mount.h copy out of the directory used to build perf It is mostly used only to generate string tables, not to build perf, so move it to the tools/perf/trace/beauty/include/ hierarchy, that is used just for scraping. This is a something that should've have happened, as happened with the linux/socket.h scrapper, do it now as Ian suggested while doing an audit/refactor session in the headers used by perf. No other tools/ living code uses it, just coming from either 'make install_headers' or from the system /usr/include/ directory. Suggested-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fWZVrpRufO4w-S4EcSi9STXcTAN2ERLwTSN7yrSSA-otQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/mount.h | 211 --------------------- tools/perf/Makefile.perf | 21 +- tools/perf/check-headers.sh | 2 +- tools/perf/trace/beauty/fsconfig.sh | 6 +- tools/perf/trace/beauty/fsmount.sh | 6 +- tools/perf/trace/beauty/fspick.sh | 6 +- tools/perf/trace/beauty/include/uapi/linux/mount.h | 211 +++++++++++++++++++++ tools/perf/trace/beauty/mount_flags.sh | 6 +- tools/perf/trace/beauty/move_mount_flags.sh | 6 +- 9 files changed, 237 insertions(+), 238 deletions(-) delete mode 100644 tools/include/uapi/linux/mount.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/mount.h (limited to 'tools') diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h deleted file mode 100644 index ad5478dbad00..000000000000 --- a/tools/include/uapi/linux/mount.h +++ /dev/null @@ -1,211 +0,0 @@ -#ifndef _UAPI_LINUX_MOUNT_H -#define _UAPI_LINUX_MOUNT_H - -#include - -/* - * These are the fs-independent mount-flags: up to 32 flags are supported - * - * Usage of these is restricted within the kernel to core mount(2) code and - * callers of sys_mount() only. Filesystems should be using the SB_* - * equivalent instead. - */ -#define MS_RDONLY 1 /* Mount read-only */ -#define MS_NOSUID 2 /* Ignore suid and sgid bits */ -#define MS_NODEV 4 /* Disallow access to device special files */ -#define MS_NOEXEC 8 /* Disallow program execution */ -#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ -#define MS_REMOUNT 32 /* Alter flags of a mounted FS */ -#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ -#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ -#define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */ -#define MS_NOATIME 1024 /* Do not update access times. */ -#define MS_NODIRATIME 2048 /* Do not update directory access times */ -#define MS_BIND 4096 -#define MS_MOVE 8192 -#define MS_REC 16384 -#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. - MS_VERBOSE is deprecated. */ -#define MS_SILENT 32768 -#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ -#define MS_UNBINDABLE (1<<17) /* change to unbindable */ -#define MS_PRIVATE (1<<18) /* change to private */ -#define MS_SLAVE (1<<19) /* change to slave */ -#define MS_SHARED (1<<20) /* change to shared */ -#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ -#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ -#define MS_I_VERSION (1<<23) /* Update inode I_version field */ -#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ -#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ - -/* These sb flags are internal to the kernel */ -#define MS_SUBMOUNT (1<<26) -#define MS_NOREMOTELOCK (1<<27) -#define MS_NOSEC (1<<28) -#define MS_BORN (1<<29) -#define MS_ACTIVE (1<<30) -#define MS_NOUSER (1<<31) - -/* - * Superblock flags that can be altered by MS_REMOUNT - */ -#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ - MS_LAZYTIME) - -/* - * Old magic mount flag and mask - */ -#define MS_MGC_VAL 0xC0ED0000 -#define MS_MGC_MSK 0xffff0000 - -/* - * open_tree() flags. - */ -#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ -#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ - -/* - * move_mount() flags. - */ -#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ -#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ -#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ -#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ -#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ -#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ -#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ -#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ -#define MOVE_MOUNT__MASK 0x00000377 - -/* - * fsopen() flags. - */ -#define FSOPEN_CLOEXEC 0x00000001 - -/* - * fspick() flags. - */ -#define FSPICK_CLOEXEC 0x00000001 -#define FSPICK_SYMLINK_NOFOLLOW 0x00000002 -#define FSPICK_NO_AUTOMOUNT 0x00000004 -#define FSPICK_EMPTY_PATH 0x00000008 - -/* - * The type of fsconfig() call made. - */ -enum fsconfig_command { - FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ - FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ - FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ - FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ - FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ - FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Create new or reuse existing superblock */ - FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ - FSCONFIG_CMD_CREATE_EXCL = 8, /* Create new superblock, fail if reusing existing superblock */ -}; - -/* - * fsmount() flags. - */ -#define FSMOUNT_CLOEXEC 0x00000001 - -/* - * Mount attributes. - */ -#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ -#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ -#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ -#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ -#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ -#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ -#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ -#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ -#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ -#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ -#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */ - -/* - * mount_setattr() - */ -struct mount_attr { - __u64 attr_set; - __u64 attr_clr; - __u64 propagation; - __u64 userns_fd; -}; - -/* List of all mount_attr versions. */ -#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ - - -/* - * Structure for getting mount/superblock/filesystem info with statmount(2). - * - * The interface is similar to statx(2): individual fields or groups can be - * selected with the @mask argument of statmount(). Kernel will set the @mask - * field according to the supported fields. - * - * If string fields are selected, then the caller needs to pass a buffer that - * has space after the fixed part of the structure. Nul terminated strings are - * copied there and offsets relative to @str are stored in the relevant fields. - * If the buffer is too small, then EOVERFLOW is returned. The actually used - * size is returned in @size. - */ -struct statmount { - __u32 size; /* Total size, including strings */ - __u32 __spare1; - __u64 mask; /* What results were written */ - __u32 sb_dev_major; /* Device ID */ - __u32 sb_dev_minor; - __u64 sb_magic; /* ..._SUPER_MAGIC */ - __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */ - __u32 fs_type; /* [str] Filesystem type */ - __u64 mnt_id; /* Unique ID of mount */ - __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */ - __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */ - __u32 mnt_parent_id_old; - __u64 mnt_attr; /* MOUNT_ATTR_... */ - __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */ - __u64 mnt_peer_group; /* ID of shared peer group */ - __u64 mnt_master; /* Mount receives propagation from this ID */ - __u64 propagate_from; /* Propagation from in current namespace */ - __u32 mnt_root; /* [str] Root of mount relative to root of fs */ - __u32 mnt_point; /* [str] Mountpoint relative to current root */ - __u64 __spare2[50]; - char str[]; /* Variable size part containing strings */ -}; - -/* - * Structure for passing mount ID and miscellaneous parameters to statmount(2) - * and listmount(2). - * - * For statmount(2) @param represents the request mask. - * For listmount(2) @param represents the last listed mount id (or zero). - */ -struct mnt_id_req { - __u32 size; - __u32 spare; - __u64 mnt_id; - __u64 param; -}; - -/* List of all mnt_id_req versions. */ -#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ - -/* - * @mask bits for statmount(2) - */ -#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ -#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ -#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ -#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ -#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ -#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ - -/* - * Special @mnt_id values that can be passed to listmount - */ -#define LSMT_ROOT 0xffffffffffffffff /* root mount */ - -#endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 643e9fa6ec89..523c3b7d6c9d 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -500,20 +500,20 @@ $(fadvise_advice_array): $(linux_uapi_dir)/in.h $(fadvise_advice_tbl) fsmount_arrays := $(beauty_outdir)/fsmount_arrays.c fsmount_tbls := $(srctree)/tools/perf/trace/beauty/fsmount.sh -$(fsmount_arrays): $(linux_uapi_dir)/mount.h $(fsmount_tbls) - $(Q)$(SHELL) '$(fsmount_tbls)' $(linux_uapi_dir) > $@ +$(fsmount_arrays): $(beauty_uapi_linux_dir)/mount.h $(fsmount_tbls) + $(Q)$(SHELL) '$(fsmount_tbls)' $(beauty_uapi_linux_dir) > $@ fspick_arrays := $(beauty_outdir)/fspick_arrays.c fspick_tbls := $(srctree)/tools/perf/trace/beauty/fspick.sh -$(fspick_arrays): $(linux_uapi_dir)/mount.h $(fspick_tbls) - $(Q)$(SHELL) '$(fspick_tbls)' $(linux_uapi_dir) > $@ +$(fspick_arrays): $(beauty_uapi_linux_dir)/mount.h $(fspick_tbls) + $(Q)$(SHELL) '$(fspick_tbls)' $(beauty_uapi_linux_dir) > $@ fsconfig_arrays := $(beauty_outdir)/fsconfig_arrays.c fsconfig_tbls := $(srctree)/tools/perf/trace/beauty/fsconfig.sh -$(fsconfig_arrays): $(linux_uapi_dir)/mount.h $(fsconfig_tbls) - $(Q)$(SHELL) '$(fsconfig_tbls)' $(linux_uapi_dir) > $@ +$(fsconfig_arrays): $(beauty_uapi_linux_dir)/mount.h $(fsconfig_tbls) + $(Q)$(SHELL) '$(fsconfig_tbls)' $(beauty_uapi_linux_dir) > $@ pkey_alloc_access_rights_array := $(beauty_outdir)/pkey_alloc_access_rights_array.c asm_generic_hdr_dir := $(srctree)/tools/include/uapi/asm-generic/ @@ -598,15 +598,14 @@ $(mremap_flags_array): $(linux_uapi_dir)/mman.h $(mremap_flags_tbl) mount_flags_array := $(beauty_outdir)/mount_flags_array.c mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/mount_flags.sh -$(mount_flags_array): $(linux_uapi_dir)/mount.h $(mount_flags_tbl) - $(Q)$(SHELL) '$(mount_flags_tbl)' $(linux_uapi_dir) > $@ +$(mount_flags_array): $(beauty_uapi_linux_dir)/mount.h $(mount_flags_tbl) + $(Q)$(SHELL) '$(mount_flags_tbl)' $(beauty_uapi_linux_dir) > $@ move_mount_flags_array := $(beauty_outdir)/move_mount_flags_array.c move_mount_flags_tbl := $(srctree)/tools/perf/trace/beauty/move_mount_flags.sh -$(move_mount_flags_array): $(linux_uapi_dir)/mount.h $(move_mount_flags_tbl) - $(Q)$(SHELL) '$(move_mount_flags_tbl)' $(linux_uapi_dir) > $@ - +$(move_mount_flags_array): $(beauty_uapi_linux_dir)/mount.h $(move_mount_flags_tbl) + $(Q)$(SHELL) '$(move_mount_flags_tbl)' $(beauty_uapi_linux_dir) > $@ mmap_prot_array := $(beauty_outdir)/mmap_prot_array.c mmap_prot_tbl := $(srctree)/tools/perf/trace/beauty/mmap_prot.sh diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 015f74137b75..c2c26d6b87ef 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -15,7 +15,6 @@ FILES=( "include/uapi/linux/kcmp.h" "include/uapi/linux/kvm.h" "include/uapi/linux/in.h" - "include/uapi/linux/mount.h" "include/uapi/linux/openat2.h" "include/uapi/linux/perf_event.h" "include/uapi/linux/prctl.h" @@ -98,6 +97,7 @@ declare -a BEAUTY_FILES BEAUTY_FILES=( "include/linux/socket.h" "include/uapi/linux/fs.h" + "include/uapi/linux/mount.h" ) declare -a FAILURES diff --git a/tools/perf/trace/beauty/fsconfig.sh b/tools/perf/trace/beauty/fsconfig.sh index bc6ef7bb7a5f..09cee79de00c 100755 --- a/tools/perf/trace/beauty/fsconfig.sh +++ b/tools/perf/trace/beauty/fsconfig.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h printf "static const char *fsconfig_cmds[] = {\n" ms='[[:space:]]*' diff --git a/tools/perf/trace/beauty/fsmount.sh b/tools/perf/trace/beauty/fsmount.sh index cba8897a751f..6b67a54cdeee 100755 --- a/tools/perf/trace/beauty/fsmount.sh +++ b/tools/perf/trace/beauty/fsmount.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h # Remove MOUNT_ATTR_RELATIME as it is zeros, handle it a special way in the beautifier # Only handle MOUNT_ATTR_ followed by a capital letter/num as __ is special case diff --git a/tools/perf/trace/beauty/fspick.sh b/tools/perf/trace/beauty/fspick.sh index 1f088329b96e..0d9951c22b95 100755 --- a/tools/perf/trace/beauty/fspick.sh +++ b/tools/perf/trace/beauty/fspick.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h printf "static const char *fspick_flags[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+FSPICK_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' diff --git a/tools/perf/trace/beauty/include/uapi/linux/mount.h b/tools/perf/trace/beauty/include/uapi/linux/mount.h new file mode 100644 index 000000000000..ad5478dbad00 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/mount.h @@ -0,0 +1,211 @@ +#ifndef _UAPI_LINUX_MOUNT_H +#define _UAPI_LINUX_MOUNT_H + +#include + +/* + * These are the fs-independent mount-flags: up to 32 flags are supported + * + * Usage of these is restricted within the kernel to core mount(2) code and + * callers of sys_mount() only. Filesystems should be using the SB_* + * equivalent instead. + */ +#define MS_RDONLY 1 /* Mount read-only */ +#define MS_NOSUID 2 /* Ignore suid and sgid bits */ +#define MS_NODEV 4 /* Disallow access to device special files */ +#define MS_NOEXEC 8 /* Disallow program execution */ +#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ +#define MS_REMOUNT 32 /* Alter flags of a mounted FS */ +#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ +#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */ +#define MS_NOATIME 1024 /* Do not update access times. */ +#define MS_NODIRATIME 2048 /* Do not update directory access times */ +#define MS_BIND 4096 +#define MS_MOVE 8192 +#define MS_REC 16384 +#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. + MS_VERBOSE is deprecated. */ +#define MS_SILENT 32768 +#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ +#define MS_UNBINDABLE (1<<17) /* change to unbindable */ +#define MS_PRIVATE (1<<18) /* change to private */ +#define MS_SLAVE (1<<19) /* change to slave */ +#define MS_SHARED (1<<20) /* change to shared */ +#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ +#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ +#define MS_I_VERSION (1<<23) /* Update inode I_version field */ +#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define MS_SUBMOUNT (1<<26) +#define MS_NOREMOTELOCK (1<<27) +#define MS_NOSEC (1<<28) +#define MS_BORN (1<<29) +#define MS_ACTIVE (1<<30) +#define MS_NOUSER (1<<31) + +/* + * Superblock flags that can be altered by MS_REMOUNT + */ +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ + MS_LAZYTIME) + +/* + * Old magic mount flag and mask + */ +#define MS_MGC_VAL 0xC0ED0000 +#define MS_MGC_MSK 0xffff0000 + +/* + * open_tree() flags. + */ +#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ + +/* + * move_mount() flags. + */ +#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ +#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ +#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ +#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ +#define MOVE_MOUNT__MASK 0x00000377 + +/* + * fsopen() flags. + */ +#define FSOPEN_CLOEXEC 0x00000001 + +/* + * fspick() flags. + */ +#define FSPICK_CLOEXEC 0x00000001 +#define FSPICK_SYMLINK_NOFOLLOW 0x00000002 +#define FSPICK_NO_AUTOMOUNT 0x00000004 +#define FSPICK_EMPTY_PATH 0x00000008 + +/* + * The type of fsconfig() call made. + */ +enum fsconfig_command { + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ + FSCONFIG_CMD_CREATE = 6, /* Create new or reuse existing superblock */ + FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ + FSCONFIG_CMD_CREATE_EXCL = 8, /* Create new superblock, fail if reusing existing superblock */ +}; + +/* + * fsmount() flags. + */ +#define FSMOUNT_CLOEXEC 0x00000001 + +/* + * Mount attributes. + */ +#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ +#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ +#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ +#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ +#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ +#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ +#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ +#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ +#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */ + +/* + * mount_setattr() + */ +struct mount_attr { + __u64 attr_set; + __u64 attr_clr; + __u64 propagation; + __u64 userns_fd; +}; + +/* List of all mount_attr versions. */ +#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ + + +/* + * Structure for getting mount/superblock/filesystem info with statmount(2). + * + * The interface is similar to statx(2): individual fields or groups can be + * selected with the @mask argument of statmount(). Kernel will set the @mask + * field according to the supported fields. + * + * If string fields are selected, then the caller needs to pass a buffer that + * has space after the fixed part of the structure. Nul terminated strings are + * copied there and offsets relative to @str are stored in the relevant fields. + * If the buffer is too small, then EOVERFLOW is returned. The actually used + * size is returned in @size. + */ +struct statmount { + __u32 size; /* Total size, including strings */ + __u32 __spare1; + __u64 mask; /* What results were written */ + __u32 sb_dev_major; /* Device ID */ + __u32 sb_dev_minor; + __u64 sb_magic; /* ..._SUPER_MAGIC */ + __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */ + __u32 fs_type; /* [str] Filesystem type */ + __u64 mnt_id; /* Unique ID of mount */ + __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */ + __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */ + __u32 mnt_parent_id_old; + __u64 mnt_attr; /* MOUNT_ATTR_... */ + __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */ + __u64 mnt_peer_group; /* ID of shared peer group */ + __u64 mnt_master; /* Mount receives propagation from this ID */ + __u64 propagate_from; /* Propagation from in current namespace */ + __u32 mnt_root; /* [str] Root of mount relative to root of fs */ + __u32 mnt_point; /* [str] Mountpoint relative to current root */ + __u64 __spare2[50]; + char str[]; /* Variable size part containing strings */ +}; + +/* + * Structure for passing mount ID and miscellaneous parameters to statmount(2) + * and listmount(2). + * + * For statmount(2) @param represents the request mask. + * For listmount(2) @param represents the last listed mount id (or zero). + */ +struct mnt_id_req { + __u32 size; + __u32 spare; + __u64 mnt_id; + __u64 param; +}; + +/* List of all mnt_id_req versions. */ +#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ + +/* + * @mask bits for statmount(2) + */ +#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ +#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ +#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ +#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ +#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ +#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ + +/* + * Special @mnt_id values that can be passed to listmount + */ +#define LSMT_ROOT 0xffffffffffffffff /* root mount */ + +#endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/perf/trace/beauty/mount_flags.sh b/tools/perf/trace/beauty/mount_flags.sh index 730099a9a67c..ff578f7b451b 100755 --- a/tools/perf/trace/beauty/mount_flags.sh +++ b/tools/perf/trace/beauty/mount_flags.sh @@ -1,15 +1,15 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ printf "static const char *mount_flags[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+([[:digit:]]+)[[:space:]]*.*' -grep -E $regex ${header_dir}/mount.h | grep -E -v '(MSK|VERBOSE|MGC_VAL)\>' | \ +grep -E $regex ${beauty_uapi_linux_dir}/mount.h | grep -E -v '(MSK|VERBOSE|MGC_VAL)\>' | \ sed -r "s/$regex/\2 \2 \1/g" | sort -n | \ xargs printf "\t[%s ? (ilog2(%s) + 1) : 0] = \"%s\",\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+\(1<<([[:digit:]]+)\)[[:space:]]*.*' -grep -E $regex ${header_dir}/mount.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/mount.h | \ sed -r "s/$regex/\2 \1/g" | \ xargs printf "\t[%s + 1] = \"%s\",\n" printf "};\n" diff --git a/tools/perf/trace/beauty/move_mount_flags.sh b/tools/perf/trace/beauty/move_mount_flags.sh index ce5e632d1448..c0dde9020bc3 100755 --- a/tools/perf/trace/beauty/move_mount_flags.sh +++ b/tools/perf/trace/beauty/move_mount_flags.sh @@ -2,12 +2,12 @@ # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then - linux_header_dir=tools/include/uapi/linux + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ else - linux_header_dir=$1 + beauty_uapi_linux_dir=$1 fi -linux_mount=${linux_header_dir}/mount.h +linux_mount=${beauty_uapi_linux_dir}/mount.h printf "static const char *move_mount_flags[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MOVE_MOUNT_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' -- cgit v1.2.3 From 44512bd6136ec7bb31e4dfaaf49313d91539af6f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:07:33 -0300 Subject: perf beauty: Move uapi/linux/usbdevice_fs.h copy out of the directory used to build perf It is mostly used only to generate string tables, not to build perf, so move it to the tools/perf/trace/beauty/include/ hierarchy, that is used just for scraping. This is a something that should've have happened, as happened with the linux/socket.h scrapper, do it now as Ian suggested while doing an audit/refactor session in the headers used by perf. No other tools/ living code uses it, just coming from either 'make install_headers' or from the system /usr/include/ directory. Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fWZVrpRufO4w-S4EcSi9STXcTAN2ERLwTSN7yrSSA-otQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/usbdevice_fs.h | 231 --------------------- tools/perf/Makefile.perf | 4 +- tools/perf/check-headers.sh | 2 +- .../trace/beauty/include/uapi/linux/usbdevice_fs.h | 231 +++++++++++++++++++++ tools/perf/trace/beauty/usbdevfs_ioctl.sh | 6 +- 5 files changed, 237 insertions(+), 237 deletions(-) delete mode 100644 tools/include/uapi/linux/usbdevice_fs.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h (limited to 'tools') diff --git a/tools/include/uapi/linux/usbdevice_fs.h b/tools/include/uapi/linux/usbdevice_fs.h deleted file mode 100644 index 74a84e02422a..000000000000 --- a/tools/include/uapi/linux/usbdevice_fs.h +++ /dev/null @@ -1,231 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/*****************************************************************************/ - -/* - * usbdevice_fs.h -- USB device file system. - * - * Copyright (C) 2000 - * Thomas Sailer (sailer@ife.ee.ethz.ch) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * History: - * 0.1 04.01.2000 Created - */ - -/*****************************************************************************/ - -#ifndef _UAPI_LINUX_USBDEVICE_FS_H -#define _UAPI_LINUX_USBDEVICE_FS_H - -#include -#include - -/* --------------------------------------------------------------------- */ - -/* usbdevfs ioctl codes */ - -struct usbdevfs_ctrltransfer { - __u8 bRequestType; - __u8 bRequest; - __u16 wValue; - __u16 wIndex; - __u16 wLength; - __u32 timeout; /* in milliseconds */ - void __user *data; -}; - -struct usbdevfs_bulktransfer { - unsigned int ep; - unsigned int len; - unsigned int timeout; /* in milliseconds */ - void __user *data; -}; - -struct usbdevfs_setinterface { - unsigned int interface; - unsigned int altsetting; -}; - -struct usbdevfs_disconnectsignal { - unsigned int signr; - void __user *context; -}; - -#define USBDEVFS_MAXDRIVERNAME 255 - -struct usbdevfs_getdriver { - unsigned int interface; - char driver[USBDEVFS_MAXDRIVERNAME + 1]; -}; - -struct usbdevfs_connectinfo { - unsigned int devnum; - unsigned char slow; -}; - -struct usbdevfs_conninfo_ex { - __u32 size; /* Size of the structure from the kernel's */ - /* point of view. Can be used by userspace */ - /* to determine how much data can be */ - /* used/trusted. */ - __u32 busnum; /* USB bus number, as enumerated by the */ - /* kernel, the device is connected to. */ - __u32 devnum; /* Device address on the bus. */ - __u32 speed; /* USB_SPEED_* constants from ch9.h */ - __u8 num_ports; /* Number of ports the device is connected */ - /* to on the way to the root hub. It may */ - /* be bigger than size of 'ports' array so */ - /* userspace can detect overflows. */ - __u8 ports[7]; /* List of ports on the way from the root */ - /* hub to the device. Current limit in */ - /* USB specification is 7 tiers (root hub, */ - /* 5 intermediate hubs, device), which */ - /* gives at most 6 port entries. */ -}; - -#define USBDEVFS_URB_SHORT_NOT_OK 0x01 -#define USBDEVFS_URB_ISO_ASAP 0x02 -#define USBDEVFS_URB_BULK_CONTINUATION 0x04 -#define USBDEVFS_URB_NO_FSBR 0x20 /* Not used */ -#define USBDEVFS_URB_ZERO_PACKET 0x40 -#define USBDEVFS_URB_NO_INTERRUPT 0x80 - -#define USBDEVFS_URB_TYPE_ISO 0 -#define USBDEVFS_URB_TYPE_INTERRUPT 1 -#define USBDEVFS_URB_TYPE_CONTROL 2 -#define USBDEVFS_URB_TYPE_BULK 3 - -struct usbdevfs_iso_packet_desc { - unsigned int length; - unsigned int actual_length; - unsigned int status; -}; - -struct usbdevfs_urb { - unsigned char type; - unsigned char endpoint; - int status; - unsigned int flags; - void __user *buffer; - int buffer_length; - int actual_length; - int start_frame; - union { - int number_of_packets; /* Only used for isoc urbs */ - unsigned int stream_id; /* Only used with bulk streams */ - }; - int error_count; - unsigned int signr; /* signal to be sent on completion, - or 0 if none should be sent. */ - void __user *usercontext; - struct usbdevfs_iso_packet_desc iso_frame_desc[]; -}; - -/* ioctls for talking directly to drivers */ -struct usbdevfs_ioctl { - int ifno; /* interface 0..N ; negative numbers reserved */ - int ioctl_code; /* MUST encode size + direction of data so the - * macros in give correct values */ - void __user *data; /* param buffer (in, or out) */ -}; - -/* You can do most things with hubs just through control messages, - * except find out what device connects to what port. */ -struct usbdevfs_hub_portinfo { - char nports; /* number of downstream ports in this hub */ - char port [127]; /* e.g. port 3 connects to device 27 */ -}; - -/* System and bus capability flags */ -#define USBDEVFS_CAP_ZERO_PACKET 0x01 -#define USBDEVFS_CAP_BULK_CONTINUATION 0x02 -#define USBDEVFS_CAP_NO_PACKET_SIZE_LIM 0x04 -#define USBDEVFS_CAP_BULK_SCATTER_GATHER 0x08 -#define USBDEVFS_CAP_REAP_AFTER_DISCONNECT 0x10 -#define USBDEVFS_CAP_MMAP 0x20 -#define USBDEVFS_CAP_DROP_PRIVILEGES 0x40 -#define USBDEVFS_CAP_CONNINFO_EX 0x80 -#define USBDEVFS_CAP_SUSPEND 0x100 - -/* USBDEVFS_DISCONNECT_CLAIM flags & struct */ - -/* disconnect-and-claim if the driver matches the driver field */ -#define USBDEVFS_DISCONNECT_CLAIM_IF_DRIVER 0x01 -/* disconnect-and-claim except when the driver matches the driver field */ -#define USBDEVFS_DISCONNECT_CLAIM_EXCEPT_DRIVER 0x02 - -struct usbdevfs_disconnect_claim { - unsigned int interface; - unsigned int flags; - char driver[USBDEVFS_MAXDRIVERNAME + 1]; -}; - -struct usbdevfs_streams { - unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */ - unsigned int num_eps; - unsigned char eps[]; -}; - -/* - * USB_SPEED_* values returned by USBDEVFS_GET_SPEED are defined in - * linux/usb/ch9.h - */ - -#define USBDEVFS_CONTROL _IOWR('U', 0, struct usbdevfs_ctrltransfer) -#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) -#define USBDEVFS_BULK _IOWR('U', 2, struct usbdevfs_bulktransfer) -#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32) -#define USBDEVFS_RESETEP _IOR('U', 3, unsigned int) -#define USBDEVFS_SETINTERFACE _IOR('U', 4, struct usbdevfs_setinterface) -#define USBDEVFS_SETCONFIGURATION _IOR('U', 5, unsigned int) -#define USBDEVFS_GETDRIVER _IOW('U', 8, struct usbdevfs_getdriver) -#define USBDEVFS_SUBMITURB _IOR('U', 10, struct usbdevfs_urb) -#define USBDEVFS_SUBMITURB32 _IOR('U', 10, struct usbdevfs_urb32) -#define USBDEVFS_DISCARDURB _IO('U', 11) -#define USBDEVFS_REAPURB _IOW('U', 12, void *) -#define USBDEVFS_REAPURB32 _IOW('U', 12, __u32) -#define USBDEVFS_REAPURBNDELAY _IOW('U', 13, void *) -#define USBDEVFS_REAPURBNDELAY32 _IOW('U', 13, __u32) -#define USBDEVFS_DISCSIGNAL _IOR('U', 14, struct usbdevfs_disconnectsignal) -#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) -#define USBDEVFS_CLAIMINTERFACE _IOR('U', 15, unsigned int) -#define USBDEVFS_RELEASEINTERFACE _IOR('U', 16, unsigned int) -#define USBDEVFS_CONNECTINFO _IOW('U', 17, struct usbdevfs_connectinfo) -#define USBDEVFS_IOCTL _IOWR('U', 18, struct usbdevfs_ioctl) -#define USBDEVFS_IOCTL32 _IOWR('U', 18, struct usbdevfs_ioctl32) -#define USBDEVFS_HUB_PORTINFO _IOR('U', 19, struct usbdevfs_hub_portinfo) -#define USBDEVFS_RESET _IO('U', 20) -#define USBDEVFS_CLEAR_HALT _IOR('U', 21, unsigned int) -#define USBDEVFS_DISCONNECT _IO('U', 22) -#define USBDEVFS_CONNECT _IO('U', 23) -#define USBDEVFS_CLAIM_PORT _IOR('U', 24, unsigned int) -#define USBDEVFS_RELEASE_PORT _IOR('U', 25, unsigned int) -#define USBDEVFS_GET_CAPABILITIES _IOR('U', 26, __u32) -#define USBDEVFS_DISCONNECT_CLAIM _IOR('U', 27, struct usbdevfs_disconnect_claim) -#define USBDEVFS_ALLOC_STREAMS _IOR('U', 28, struct usbdevfs_streams) -#define USBDEVFS_FREE_STREAMS _IOR('U', 29, struct usbdevfs_streams) -#define USBDEVFS_DROP_PRIVILEGES _IOW('U', 30, __u32) -#define USBDEVFS_GET_SPEED _IO('U', 31) -/* - * Returns struct usbdevfs_conninfo_ex; length is variable to allow - * extending size of the data returned. - */ -#define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) -#define USBDEVFS_FORBID_SUSPEND _IO('U', 33) -#define USBDEVFS_ALLOW_SUSPEND _IO('U', 34) -#define USBDEVFS_WAIT_FOR_RESUME _IO('U', 35) - -#endif /* _UAPI_LINUX_USBDEVICE_FS_H */ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 523c3b7d6c9d..53ec3765b4b2 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -623,8 +623,8 @@ $(prctl_option_array): $(prctl_hdr_dir)/prctl.h $(prctl_option_tbl) usbdevfs_ioctl_array := $(beauty_ioctl_outdir)/usbdevfs_ioctl_array.c usbdevfs_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/usbdevfs_ioctl.sh -$(usbdevfs_ioctl_array): $(linux_uapi_dir)/usbdevice_fs.h $(usbdevfs_ioctl_tbl) - $(Q)$(SHELL) '$(usbdevfs_ioctl_tbl)' $(linux_uapi_dir) > $@ +$(usbdevfs_ioctl_array): $(beauty_uapi_linux_dir)/usbdevice_fs.h $(usbdevfs_ioctl_tbl) + $(Q)$(SHELL) '$(usbdevfs_ioctl_tbl)' $(beauty_uapi_linux_dir) > $@ x86_arch_prctl_code_array := $(beauty_outdir)/x86_arch_prctl_code_array.c x86_arch_prctl_code_tbl := $(srctree)/tools/perf/trace/beauty/x86_arch_prctl.sh diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index c2c26d6b87ef..356ddb76a954 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -21,7 +21,6 @@ FILES=( "include/uapi/linux/sched.h" "include/uapi/linux/seccomp.h" "include/uapi/linux/stat.h" - "include/uapi/linux/usbdevice_fs.h" "include/uapi/linux/vhost.h" "include/uapi/sound/asound.h" "include/linux/bits.h" @@ -98,6 +97,7 @@ BEAUTY_FILES=( "include/linux/socket.h" "include/uapi/linux/fs.h" "include/uapi/linux/mount.h" + "include/uapi/linux/usbdevice_fs.h" ) declare -a FAILURES diff --git a/tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h b/tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h new file mode 100644 index 000000000000..74a84e02422a --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/usbdevice_fs.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/*****************************************************************************/ + +/* + * usbdevice_fs.h -- USB device file system. + * + * Copyright (C) 2000 + * Thomas Sailer (sailer@ife.ee.ethz.ch) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * History: + * 0.1 04.01.2000 Created + */ + +/*****************************************************************************/ + +#ifndef _UAPI_LINUX_USBDEVICE_FS_H +#define _UAPI_LINUX_USBDEVICE_FS_H + +#include +#include + +/* --------------------------------------------------------------------- */ + +/* usbdevfs ioctl codes */ + +struct usbdevfs_ctrltransfer { + __u8 bRequestType; + __u8 bRequest; + __u16 wValue; + __u16 wIndex; + __u16 wLength; + __u32 timeout; /* in milliseconds */ + void __user *data; +}; + +struct usbdevfs_bulktransfer { + unsigned int ep; + unsigned int len; + unsigned int timeout; /* in milliseconds */ + void __user *data; +}; + +struct usbdevfs_setinterface { + unsigned int interface; + unsigned int altsetting; +}; + +struct usbdevfs_disconnectsignal { + unsigned int signr; + void __user *context; +}; + +#define USBDEVFS_MAXDRIVERNAME 255 + +struct usbdevfs_getdriver { + unsigned int interface; + char driver[USBDEVFS_MAXDRIVERNAME + 1]; +}; + +struct usbdevfs_connectinfo { + unsigned int devnum; + unsigned char slow; +}; + +struct usbdevfs_conninfo_ex { + __u32 size; /* Size of the structure from the kernel's */ + /* point of view. Can be used by userspace */ + /* to determine how much data can be */ + /* used/trusted. */ + __u32 busnum; /* USB bus number, as enumerated by the */ + /* kernel, the device is connected to. */ + __u32 devnum; /* Device address on the bus. */ + __u32 speed; /* USB_SPEED_* constants from ch9.h */ + __u8 num_ports; /* Number of ports the device is connected */ + /* to on the way to the root hub. It may */ + /* be bigger than size of 'ports' array so */ + /* userspace can detect overflows. */ + __u8 ports[7]; /* List of ports on the way from the root */ + /* hub to the device. Current limit in */ + /* USB specification is 7 tiers (root hub, */ + /* 5 intermediate hubs, device), which */ + /* gives at most 6 port entries. */ +}; + +#define USBDEVFS_URB_SHORT_NOT_OK 0x01 +#define USBDEVFS_URB_ISO_ASAP 0x02 +#define USBDEVFS_URB_BULK_CONTINUATION 0x04 +#define USBDEVFS_URB_NO_FSBR 0x20 /* Not used */ +#define USBDEVFS_URB_ZERO_PACKET 0x40 +#define USBDEVFS_URB_NO_INTERRUPT 0x80 + +#define USBDEVFS_URB_TYPE_ISO 0 +#define USBDEVFS_URB_TYPE_INTERRUPT 1 +#define USBDEVFS_URB_TYPE_CONTROL 2 +#define USBDEVFS_URB_TYPE_BULK 3 + +struct usbdevfs_iso_packet_desc { + unsigned int length; + unsigned int actual_length; + unsigned int status; +}; + +struct usbdevfs_urb { + unsigned char type; + unsigned char endpoint; + int status; + unsigned int flags; + void __user *buffer; + int buffer_length; + int actual_length; + int start_frame; + union { + int number_of_packets; /* Only used for isoc urbs */ + unsigned int stream_id; /* Only used with bulk streams */ + }; + int error_count; + unsigned int signr; /* signal to be sent on completion, + or 0 if none should be sent. */ + void __user *usercontext; + struct usbdevfs_iso_packet_desc iso_frame_desc[]; +}; + +/* ioctls for talking directly to drivers */ +struct usbdevfs_ioctl { + int ifno; /* interface 0..N ; negative numbers reserved */ + int ioctl_code; /* MUST encode size + direction of data so the + * macros in give correct values */ + void __user *data; /* param buffer (in, or out) */ +}; + +/* You can do most things with hubs just through control messages, + * except find out what device connects to what port. */ +struct usbdevfs_hub_portinfo { + char nports; /* number of downstream ports in this hub */ + char port [127]; /* e.g. port 3 connects to device 27 */ +}; + +/* System and bus capability flags */ +#define USBDEVFS_CAP_ZERO_PACKET 0x01 +#define USBDEVFS_CAP_BULK_CONTINUATION 0x02 +#define USBDEVFS_CAP_NO_PACKET_SIZE_LIM 0x04 +#define USBDEVFS_CAP_BULK_SCATTER_GATHER 0x08 +#define USBDEVFS_CAP_REAP_AFTER_DISCONNECT 0x10 +#define USBDEVFS_CAP_MMAP 0x20 +#define USBDEVFS_CAP_DROP_PRIVILEGES 0x40 +#define USBDEVFS_CAP_CONNINFO_EX 0x80 +#define USBDEVFS_CAP_SUSPEND 0x100 + +/* USBDEVFS_DISCONNECT_CLAIM flags & struct */ + +/* disconnect-and-claim if the driver matches the driver field */ +#define USBDEVFS_DISCONNECT_CLAIM_IF_DRIVER 0x01 +/* disconnect-and-claim except when the driver matches the driver field */ +#define USBDEVFS_DISCONNECT_CLAIM_EXCEPT_DRIVER 0x02 + +struct usbdevfs_disconnect_claim { + unsigned int interface; + unsigned int flags; + char driver[USBDEVFS_MAXDRIVERNAME + 1]; +}; + +struct usbdevfs_streams { + unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */ + unsigned int num_eps; + unsigned char eps[]; +}; + +/* + * USB_SPEED_* values returned by USBDEVFS_GET_SPEED are defined in + * linux/usb/ch9.h + */ + +#define USBDEVFS_CONTROL _IOWR('U', 0, struct usbdevfs_ctrltransfer) +#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) +#define USBDEVFS_BULK _IOWR('U', 2, struct usbdevfs_bulktransfer) +#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32) +#define USBDEVFS_RESETEP _IOR('U', 3, unsigned int) +#define USBDEVFS_SETINTERFACE _IOR('U', 4, struct usbdevfs_setinterface) +#define USBDEVFS_SETCONFIGURATION _IOR('U', 5, unsigned int) +#define USBDEVFS_GETDRIVER _IOW('U', 8, struct usbdevfs_getdriver) +#define USBDEVFS_SUBMITURB _IOR('U', 10, struct usbdevfs_urb) +#define USBDEVFS_SUBMITURB32 _IOR('U', 10, struct usbdevfs_urb32) +#define USBDEVFS_DISCARDURB _IO('U', 11) +#define USBDEVFS_REAPURB _IOW('U', 12, void *) +#define USBDEVFS_REAPURB32 _IOW('U', 12, __u32) +#define USBDEVFS_REAPURBNDELAY _IOW('U', 13, void *) +#define USBDEVFS_REAPURBNDELAY32 _IOW('U', 13, __u32) +#define USBDEVFS_DISCSIGNAL _IOR('U', 14, struct usbdevfs_disconnectsignal) +#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) +#define USBDEVFS_CLAIMINTERFACE _IOR('U', 15, unsigned int) +#define USBDEVFS_RELEASEINTERFACE _IOR('U', 16, unsigned int) +#define USBDEVFS_CONNECTINFO _IOW('U', 17, struct usbdevfs_connectinfo) +#define USBDEVFS_IOCTL _IOWR('U', 18, struct usbdevfs_ioctl) +#define USBDEVFS_IOCTL32 _IOWR('U', 18, struct usbdevfs_ioctl32) +#define USBDEVFS_HUB_PORTINFO _IOR('U', 19, struct usbdevfs_hub_portinfo) +#define USBDEVFS_RESET _IO('U', 20) +#define USBDEVFS_CLEAR_HALT _IOR('U', 21, unsigned int) +#define USBDEVFS_DISCONNECT _IO('U', 22) +#define USBDEVFS_CONNECT _IO('U', 23) +#define USBDEVFS_CLAIM_PORT _IOR('U', 24, unsigned int) +#define USBDEVFS_RELEASE_PORT _IOR('U', 25, unsigned int) +#define USBDEVFS_GET_CAPABILITIES _IOR('U', 26, __u32) +#define USBDEVFS_DISCONNECT_CLAIM _IOR('U', 27, struct usbdevfs_disconnect_claim) +#define USBDEVFS_ALLOC_STREAMS _IOR('U', 28, struct usbdevfs_streams) +#define USBDEVFS_FREE_STREAMS _IOR('U', 29, struct usbdevfs_streams) +#define USBDEVFS_DROP_PRIVILEGES _IOW('U', 30, __u32) +#define USBDEVFS_GET_SPEED _IO('U', 31) +/* + * Returns struct usbdevfs_conninfo_ex; length is variable to allow + * extending size of the data returned. + */ +#define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) +#define USBDEVFS_FORBID_SUSPEND _IO('U', 33) +#define USBDEVFS_ALLOW_SUSPEND _IO('U', 34) +#define USBDEVFS_WAIT_FOR_RESUME _IO('U', 35) + +#endif /* _UAPI_LINUX_USBDEVICE_FS_H */ diff --git a/tools/perf/trace/beauty/usbdevfs_ioctl.sh b/tools/perf/trace/beauty/usbdevfs_ioctl.sh index b39cfb3720b8..12a30a9a8e0c 100755 --- a/tools/perf/trace/beauty/usbdevfs_ioctl.sh +++ b/tools/perf/trace/beauty/usbdevfs_ioctl.sh @@ -1,21 +1,21 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ # also as: # #define USBDEVFS_CONNINFO_EX(len) _IOC(_IOC_READ, 'U', 32, len) printf "static const char *usbdevfs_ioctl_cmds[] = {\n" regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)(\(\w+\))?[[:space:]]+_IO[CWR]{0,2}\([[:space:]]*(_IOC_\w+,[[:space:]]*)?'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*" -grep -E "$regex" ${header_dir}/usbdevice_fs.h | grep -E -v 'USBDEVFS_\w+32[[:space:]]' | \ +grep -E "$regex" ${beauty_uapi_linux_dir}/usbdevice_fs.h | grep -E -v 'USBDEVFS_\w+32[[:space:]]' | \ sed -r "s/$regex/\4 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n\n" printf "#if 0\n" printf "static const char *usbdevfs_ioctl_32_cmds[] = {\n" regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)[[:space:]]+_IO[WR]{0,2}\([[:space:]]*'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*" -grep -E $regex ${header_dir}/usbdevice_fs.h | grep -E 'USBDEVFS_\w+32[[:space:]]' | \ +grep -E $regex ${beauty_uapi_linux_dir}/usbdevice_fs.h | grep -E 'USBDEVFS_\w+32[[:space:]]' | \ sed -r "s/$regex/\2 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" -- cgit v1.2.3 From 7050e33e86ad03d26d7b969bba1d48ee159be496 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:07:33 -0300 Subject: perf beauty: Move uapi/sound/asound.h copy out of the directory used to build perf It is used only to generate string tables, not to build perf, so move it to the tools/perf/trace/beauty/include/ hierarchy, that is used just for scraping. This is a something that should've have happened, as happened with the linux/socket.h scrapper, do it now as Ian suggested while doing an audit/refactor session in the headers used by perf. Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fWZVrpRufO4w-S4EcSi9STXcTAN2ERLwTSN7yrSSA-otQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 1252 -------------------- tools/perf/Makefile.perf | 9 +- tools/perf/check-headers.sh | 2 +- .../perf/trace/beauty/include/uapi/sound/asound.h | 1252 ++++++++++++++++++++ tools/perf/trace/beauty/sndrv_ctl_ioctl.sh | 4 +- tools/perf/trace/beauty/sndrv_pcm_ioctl.sh | 4 +- 6 files changed, 1262 insertions(+), 1261 deletions(-) delete mode 100644 tools/include/uapi/sound/asound.h create mode 100644 tools/perf/trace/beauty/include/uapi/sound/asound.h (limited to 'tools') diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h deleted file mode 100644 index d5b9cfbd9cea..000000000000 --- a/tools/include/uapi/sound/asound.h +++ /dev/null @@ -1,1252 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -/* - * Advanced Linux Sound Architecture - ALSA - Driver - * Copyright (c) 1994-2003 by Jaroslav Kysela , - * Abramo Bagnara - */ - -#ifndef _UAPI__SOUND_ASOUND_H -#define _UAPI__SOUND_ASOUND_H - -#if defined(__KERNEL__) || defined(__linux__) -#include -#include -#else -#include -#include -#endif - -#ifndef __KERNEL__ -#include -#include -#endif - -/* - * protocol version - */ - -#define SNDRV_PROTOCOL_VERSION(major, minor, subminor) (((major)<<16)|((minor)<<8)|(subminor)) -#define SNDRV_PROTOCOL_MAJOR(version) (((version)>>16)&0xffff) -#define SNDRV_PROTOCOL_MINOR(version) (((version)>>8)&0xff) -#define SNDRV_PROTOCOL_MICRO(version) ((version)&0xff) -#define SNDRV_PROTOCOL_INCOMPATIBLE(kversion, uversion) \ - (SNDRV_PROTOCOL_MAJOR(kversion) != SNDRV_PROTOCOL_MAJOR(uversion) || \ - (SNDRV_PROTOCOL_MAJOR(kversion) == SNDRV_PROTOCOL_MAJOR(uversion) && \ - SNDRV_PROTOCOL_MINOR(kversion) != SNDRV_PROTOCOL_MINOR(uversion))) - -/**************************************************************************** - * * - * Digital audio interface * - * * - ****************************************************************************/ - -#define AES_IEC958_STATUS_SIZE 24 - -struct snd_aes_iec958 { - unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ - unsigned char subcode[147]; /* AES/IEC958 subcode bits */ - unsigned char pad; /* nothing */ - unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ -}; - -/**************************************************************************** - * * - * CEA-861 Audio InfoFrame. Used in HDMI and DisplayPort * - * * - ****************************************************************************/ - -struct snd_cea_861_aud_if { - unsigned char db1_ct_cc; /* coding type and channel count */ - unsigned char db2_sf_ss; /* sample frequency and size */ - unsigned char db3; /* not used, all zeros */ - unsigned char db4_ca; /* channel allocation code */ - unsigned char db5_dminh_lsv; /* downmix inhibit & level-shit values */ -}; - -/**************************************************************************** - * * - * Section for driver hardware dependent interface - /dev/snd/hw? * - * * - ****************************************************************************/ - -#define SNDRV_HWDEP_VERSION SNDRV_PROTOCOL_VERSION(1, 0, 1) - -enum { - SNDRV_HWDEP_IFACE_OPL2 = 0, - SNDRV_HWDEP_IFACE_OPL3, - SNDRV_HWDEP_IFACE_OPL4, - SNDRV_HWDEP_IFACE_SB16CSP, /* Creative Signal Processor */ - SNDRV_HWDEP_IFACE_EMU10K1, /* FX8010 processor in EMU10K1 chip */ - SNDRV_HWDEP_IFACE_YSS225, /* Yamaha FX processor */ - SNDRV_HWDEP_IFACE_ICS2115, /* Wavetable synth */ - SNDRV_HWDEP_IFACE_SSCAPE, /* Ensoniq SoundScape ISA card (MC68EC000) */ - SNDRV_HWDEP_IFACE_VX, /* Digigram VX cards */ - SNDRV_HWDEP_IFACE_MIXART, /* Digigram miXart cards */ - SNDRV_HWDEP_IFACE_USX2Y, /* Tascam US122, US224 & US428 usb */ - SNDRV_HWDEP_IFACE_EMUX_WAVETABLE, /* EmuX wavetable */ - SNDRV_HWDEP_IFACE_BLUETOOTH, /* Bluetooth audio */ - SNDRV_HWDEP_IFACE_USX2Y_PCM, /* Tascam US122, US224 & US428 rawusb pcm */ - SNDRV_HWDEP_IFACE_PCXHR, /* Digigram PCXHR */ - SNDRV_HWDEP_IFACE_SB_RC, /* SB Extigy/Audigy2NX remote control */ - SNDRV_HWDEP_IFACE_HDA, /* HD-audio */ - SNDRV_HWDEP_IFACE_USB_STREAM, /* direct access to usb stream */ - SNDRV_HWDEP_IFACE_FW_DICE, /* TC DICE FireWire device */ - SNDRV_HWDEP_IFACE_FW_FIREWORKS, /* Echo Audio Fireworks based device */ - SNDRV_HWDEP_IFACE_FW_BEBOB, /* BridgeCo BeBoB based device */ - SNDRV_HWDEP_IFACE_FW_OXFW, /* Oxford OXFW970/971 based device */ - SNDRV_HWDEP_IFACE_FW_DIGI00X, /* Digidesign Digi 002/003 family */ - SNDRV_HWDEP_IFACE_FW_TASCAM, /* TASCAM FireWire series */ - SNDRV_HWDEP_IFACE_LINE6, /* Line6 USB processors */ - SNDRV_HWDEP_IFACE_FW_MOTU, /* MOTU FireWire series */ - SNDRV_HWDEP_IFACE_FW_FIREFACE, /* RME Fireface series */ - - /* Don't forget to change the following: */ - SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_FIREFACE -}; - -struct snd_hwdep_info { - unsigned int device; /* WR: device number */ - int card; /* R: card number */ - unsigned char id[64]; /* ID (user selectable) */ - unsigned char name[80]; /* hwdep name */ - int iface; /* hwdep interface */ - unsigned char reserved[64]; /* reserved for future */ -}; - -/* generic DSP loader */ -struct snd_hwdep_dsp_status { - unsigned int version; /* R: driver-specific version */ - unsigned char id[32]; /* R: driver-specific ID string */ - unsigned int num_dsps; /* R: number of DSP images to transfer */ - unsigned int dsp_loaded; /* R: bit flags indicating the loaded DSPs */ - unsigned int chip_ready; /* R: 1 = initialization finished */ - unsigned char reserved[16]; /* reserved for future use */ -}; - -struct snd_hwdep_dsp_image { - unsigned int index; /* W: DSP index */ - unsigned char name[64]; /* W: ID (e.g. file name) */ - unsigned char __user *image; /* W: binary image */ - size_t length; /* W: size of image in bytes */ - unsigned long driver_data; /* W: driver-specific data */ -}; - -#define SNDRV_HWDEP_IOCTL_PVERSION _IOR ('H', 0x00, int) -#define SNDRV_HWDEP_IOCTL_INFO _IOR ('H', 0x01, struct snd_hwdep_info) -#define SNDRV_HWDEP_IOCTL_DSP_STATUS _IOR('H', 0x02, struct snd_hwdep_dsp_status) -#define SNDRV_HWDEP_IOCTL_DSP_LOAD _IOW('H', 0x03, struct snd_hwdep_dsp_image) - -/***************************************************************************** - * * - * Digital Audio (PCM) interface - /dev/snd/pcm?? * - * * - *****************************************************************************/ - -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 16) - -typedef unsigned long snd_pcm_uframes_t; -typedef signed long snd_pcm_sframes_t; - -enum { - SNDRV_PCM_CLASS_GENERIC = 0, /* standard mono or stereo device */ - SNDRV_PCM_CLASS_MULTI, /* multichannel device */ - SNDRV_PCM_CLASS_MODEM, /* software modem class */ - SNDRV_PCM_CLASS_DIGITIZER, /* digitizer class */ - /* Don't forget to change the following: */ - SNDRV_PCM_CLASS_LAST = SNDRV_PCM_CLASS_DIGITIZER, -}; - -enum { - SNDRV_PCM_SUBCLASS_GENERIC_MIX = 0, /* mono or stereo subdevices are mixed together */ - SNDRV_PCM_SUBCLASS_MULTI_MIX, /* multichannel subdevices are mixed together */ - /* Don't forget to change the following: */ - SNDRV_PCM_SUBCLASS_LAST = SNDRV_PCM_SUBCLASS_MULTI_MIX, -}; - -enum { - SNDRV_PCM_STREAM_PLAYBACK = 0, - SNDRV_PCM_STREAM_CAPTURE, - SNDRV_PCM_STREAM_LAST = SNDRV_PCM_STREAM_CAPTURE, -}; - -typedef int __bitwise snd_pcm_access_t; -#define SNDRV_PCM_ACCESS_MMAP_INTERLEAVED ((__force snd_pcm_access_t) 0) /* interleaved mmap */ -#define SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED ((__force snd_pcm_access_t) 1) /* noninterleaved mmap */ -#define SNDRV_PCM_ACCESS_MMAP_COMPLEX ((__force snd_pcm_access_t) 2) /* complex mmap */ -#define SNDRV_PCM_ACCESS_RW_INTERLEAVED ((__force snd_pcm_access_t) 3) /* readi/writei */ -#define SNDRV_PCM_ACCESS_RW_NONINTERLEAVED ((__force snd_pcm_access_t) 4) /* readn/writen */ -#define SNDRV_PCM_ACCESS_LAST SNDRV_PCM_ACCESS_RW_NONINTERLEAVED - -typedef int __bitwise snd_pcm_format_t; -#define SNDRV_PCM_FORMAT_S8 ((__force snd_pcm_format_t) 0) -#define SNDRV_PCM_FORMAT_U8 ((__force snd_pcm_format_t) 1) -#define SNDRV_PCM_FORMAT_S16_LE ((__force snd_pcm_format_t) 2) -#define SNDRV_PCM_FORMAT_S16_BE ((__force snd_pcm_format_t) 3) -#define SNDRV_PCM_FORMAT_U16_LE ((__force snd_pcm_format_t) 4) -#define SNDRV_PCM_FORMAT_U16_BE ((__force snd_pcm_format_t) 5) -#define SNDRV_PCM_FORMAT_S24_LE ((__force snd_pcm_format_t) 6) /* low three bytes */ -#define SNDRV_PCM_FORMAT_S24_BE ((__force snd_pcm_format_t) 7) /* low three bytes */ -#define SNDRV_PCM_FORMAT_U24_LE ((__force snd_pcm_format_t) 8) /* low three bytes */ -#define SNDRV_PCM_FORMAT_U24_BE ((__force snd_pcm_format_t) 9) /* low three bytes */ -/* - * For S32/U32 formats, 'msbits' hardware parameter is often used to deliver information about the - * available bit count in most significant bit. It's for the case of so-called 'left-justified' or - * `right-padding` sample which has less width than 32 bit. - */ -#define SNDRV_PCM_FORMAT_S32_LE ((__force snd_pcm_format_t) 10) -#define SNDRV_PCM_FORMAT_S32_BE ((__force snd_pcm_format_t) 11) -#define SNDRV_PCM_FORMAT_U32_LE ((__force snd_pcm_format_t) 12) -#define SNDRV_PCM_FORMAT_U32_BE ((__force snd_pcm_format_t) 13) -#define SNDRV_PCM_FORMAT_FLOAT_LE ((__force snd_pcm_format_t) 14) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_FLOAT_BE ((__force snd_pcm_format_t) 15) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_FLOAT64_LE ((__force snd_pcm_format_t) 16) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_FLOAT64_BE ((__force snd_pcm_format_t) 17) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE ((__force snd_pcm_format_t) 18) /* IEC-958 subframe, Little Endian */ -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE ((__force snd_pcm_format_t) 19) /* IEC-958 subframe, Big Endian */ -#define SNDRV_PCM_FORMAT_MU_LAW ((__force snd_pcm_format_t) 20) -#define SNDRV_PCM_FORMAT_A_LAW ((__force snd_pcm_format_t) 21) -#define SNDRV_PCM_FORMAT_IMA_ADPCM ((__force snd_pcm_format_t) 22) -#define SNDRV_PCM_FORMAT_MPEG ((__force snd_pcm_format_t) 23) -#define SNDRV_PCM_FORMAT_GSM ((__force snd_pcm_format_t) 24) -#define SNDRV_PCM_FORMAT_S20_LE ((__force snd_pcm_format_t) 25) /* in four bytes, LSB justified */ -#define SNDRV_PCM_FORMAT_S20_BE ((__force snd_pcm_format_t) 26) /* in four bytes, LSB justified */ -#define SNDRV_PCM_FORMAT_U20_LE ((__force snd_pcm_format_t) 27) /* in four bytes, LSB justified */ -#define SNDRV_PCM_FORMAT_U20_BE ((__force snd_pcm_format_t) 28) /* in four bytes, LSB justified */ -/* gap in the numbering for a future standard linear format */ -#define SNDRV_PCM_FORMAT_SPECIAL ((__force snd_pcm_format_t) 31) -#define SNDRV_PCM_FORMAT_S24_3LE ((__force snd_pcm_format_t) 32) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S24_3BE ((__force snd_pcm_format_t) 33) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U24_3LE ((__force snd_pcm_format_t) 34) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U24_3BE ((__force snd_pcm_format_t) 35) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S20_3LE ((__force snd_pcm_format_t) 36) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S20_3BE ((__force snd_pcm_format_t) 37) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U20_3LE ((__force snd_pcm_format_t) 38) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U20_3BE ((__force snd_pcm_format_t) 39) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S18_3LE ((__force snd_pcm_format_t) 40) /* in three bytes */ -#define SNDRV_PCM_FORMAT_S18_3BE ((__force snd_pcm_format_t) 41) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U18_3LE ((__force snd_pcm_format_t) 42) /* in three bytes */ -#define SNDRV_PCM_FORMAT_U18_3BE ((__force snd_pcm_format_t) 43) /* in three bytes */ -#define SNDRV_PCM_FORMAT_G723_24 ((__force snd_pcm_format_t) 44) /* 8 samples in 3 bytes */ -#define SNDRV_PCM_FORMAT_G723_24_1B ((__force snd_pcm_format_t) 45) /* 1 sample in 1 byte */ -#define SNDRV_PCM_FORMAT_G723_40 ((__force snd_pcm_format_t) 46) /* 8 Samples in 5 bytes */ -#define SNDRV_PCM_FORMAT_G723_40_1B ((__force snd_pcm_format_t) 47) /* 1 sample in 1 byte */ -#define SNDRV_PCM_FORMAT_DSD_U8 ((__force snd_pcm_format_t) 48) /* DSD, 1-byte samples DSD (x8) */ -#define SNDRV_PCM_FORMAT_DSD_U16_LE ((__force snd_pcm_format_t) 49) /* DSD, 2-byte samples DSD (x16), little endian */ -#define SNDRV_PCM_FORMAT_DSD_U32_LE ((__force snd_pcm_format_t) 50) /* DSD, 4-byte samples DSD (x32), little endian */ -#define SNDRV_PCM_FORMAT_DSD_U16_BE ((__force snd_pcm_format_t) 51) /* DSD, 2-byte samples DSD (x16), big endian */ -#define SNDRV_PCM_FORMAT_DSD_U32_BE ((__force snd_pcm_format_t) 52) /* DSD, 4-byte samples DSD (x32), big endian */ -#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U32_BE -#define SNDRV_PCM_FORMAT_FIRST SNDRV_PCM_FORMAT_S8 - -#ifdef SNDRV_LITTLE_ENDIAN -#define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_LE -#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_LE -#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_LE -#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_LE -#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_LE -#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_LE -#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_LE -#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_LE -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE -#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_LE -#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_LE -#endif -#ifdef SNDRV_BIG_ENDIAN -#define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_BE -#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_BE -#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_BE -#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_BE -#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_BE -#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_BE -#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_BE -#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_BE -#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE -#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_BE -#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_BE -#endif - -typedef int __bitwise snd_pcm_subformat_t; -#define SNDRV_PCM_SUBFORMAT_STD ((__force snd_pcm_subformat_t) 0) -#define SNDRV_PCM_SUBFORMAT_MSBITS_MAX ((__force snd_pcm_subformat_t) 1) -#define SNDRV_PCM_SUBFORMAT_MSBITS_20 ((__force snd_pcm_subformat_t) 2) -#define SNDRV_PCM_SUBFORMAT_MSBITS_24 ((__force snd_pcm_subformat_t) 3) -#define SNDRV_PCM_SUBFORMAT_LAST SNDRV_PCM_SUBFORMAT_MSBITS_24 - -#define SNDRV_PCM_INFO_MMAP 0x00000001 /* hardware supports mmap */ -#define SNDRV_PCM_INFO_MMAP_VALID 0x00000002 /* period data are valid during transfer */ -#define SNDRV_PCM_INFO_DOUBLE 0x00000004 /* Double buffering needed for PCM start/stop */ -#define SNDRV_PCM_INFO_BATCH 0x00000010 /* double buffering */ -#define SNDRV_PCM_INFO_SYNC_APPLPTR 0x00000020 /* need the explicit sync of appl_ptr update */ -#define SNDRV_PCM_INFO_PERFECT_DRAIN 0x00000040 /* silencing at the end of stream is not required */ -#define SNDRV_PCM_INFO_INTERLEAVED 0x00000100 /* channels are interleaved */ -#define SNDRV_PCM_INFO_NONINTERLEAVED 0x00000200 /* channels are not interleaved */ -#define SNDRV_PCM_INFO_COMPLEX 0x00000400 /* complex frame organization (mmap only) */ -#define SNDRV_PCM_INFO_BLOCK_TRANSFER 0x00010000 /* hardware transfer block of samples */ -#define SNDRV_PCM_INFO_OVERRANGE 0x00020000 /* hardware supports ADC (capture) overrange detection */ -#define SNDRV_PCM_INFO_RESUME 0x00040000 /* hardware supports stream resume after suspend */ -#define SNDRV_PCM_INFO_PAUSE 0x00080000 /* pause ioctl is supported */ -#define SNDRV_PCM_INFO_HALF_DUPLEX 0x00100000 /* only half duplex */ -#define SNDRV_PCM_INFO_JOINT_DUPLEX 0x00200000 /* playback and capture stream are somewhat correlated */ -#define SNDRV_PCM_INFO_SYNC_START 0x00400000 /* pcm support some kind of sync go */ -#define SNDRV_PCM_INFO_NO_PERIOD_WAKEUP 0x00800000 /* period wakeup can be disabled */ -#define SNDRV_PCM_INFO_HAS_WALL_CLOCK 0x01000000 /* (Deprecated)has audio wall clock for audio/system time sync */ -#define SNDRV_PCM_INFO_HAS_LINK_ATIME 0x01000000 /* report hardware link audio time, reset on startup */ -#define SNDRV_PCM_INFO_HAS_LINK_ABSOLUTE_ATIME 0x02000000 /* report absolute hardware link audio time, not reset on startup */ -#define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ -#define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ -#define SNDRV_PCM_INFO_EXPLICIT_SYNC 0x10000000 /* needs explicit sync of pointers and data */ -#define SNDRV_PCM_INFO_NO_REWINDS 0x20000000 /* hardware can only support monotonic changes of appl_ptr */ -#define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ -#define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ - -#if (__BITS_PER_LONG == 32 && defined(__USE_TIME_BITS64)) || defined __KERNEL__ -#define __SND_STRUCT_TIME64 -#endif - -typedef int __bitwise snd_pcm_state_t; -#define SNDRV_PCM_STATE_OPEN ((__force snd_pcm_state_t) 0) /* stream is open */ -#define SNDRV_PCM_STATE_SETUP ((__force snd_pcm_state_t) 1) /* stream has a setup */ -#define SNDRV_PCM_STATE_PREPARED ((__force snd_pcm_state_t) 2) /* stream is ready to start */ -#define SNDRV_PCM_STATE_RUNNING ((__force snd_pcm_state_t) 3) /* stream is running */ -#define SNDRV_PCM_STATE_XRUN ((__force snd_pcm_state_t) 4) /* stream reached an xrun */ -#define SNDRV_PCM_STATE_DRAINING ((__force snd_pcm_state_t) 5) /* stream is draining */ -#define SNDRV_PCM_STATE_PAUSED ((__force snd_pcm_state_t) 6) /* stream is paused */ -#define SNDRV_PCM_STATE_SUSPENDED ((__force snd_pcm_state_t) 7) /* hardware is suspended */ -#define SNDRV_PCM_STATE_DISCONNECTED ((__force snd_pcm_state_t) 8) /* hardware is disconnected */ -#define SNDRV_PCM_STATE_LAST SNDRV_PCM_STATE_DISCONNECTED - -enum { - SNDRV_PCM_MMAP_OFFSET_DATA = 0x00000000, - SNDRV_PCM_MMAP_OFFSET_STATUS_OLD = 0x80000000, - SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD = 0x81000000, - SNDRV_PCM_MMAP_OFFSET_STATUS_NEW = 0x82000000, - SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW = 0x83000000, -#ifdef __SND_STRUCT_TIME64 - SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_NEW, - SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW, -#else - SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_OLD, - SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD, -#endif -}; - -union snd_pcm_sync_id { - unsigned char id[16]; - unsigned short id16[8]; - unsigned int id32[4]; -}; - -struct snd_pcm_info { - unsigned int device; /* RO/WR (control): device number */ - unsigned int subdevice; /* RO/WR (control): subdevice number */ - int stream; /* RO/WR (control): stream direction */ - int card; /* R: card number */ - unsigned char id[64]; /* ID (user selectable) */ - unsigned char name[80]; /* name of this device */ - unsigned char subname[32]; /* subdevice name */ - int dev_class; /* SNDRV_PCM_CLASS_* */ - int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */ - unsigned int subdevices_count; - unsigned int subdevices_avail; - union snd_pcm_sync_id sync; /* hardware synchronization ID */ - unsigned char reserved[64]; /* reserved for future... */ -}; - -typedef int snd_pcm_hw_param_t; -#define SNDRV_PCM_HW_PARAM_ACCESS 0 /* Access type */ -#define SNDRV_PCM_HW_PARAM_FORMAT 1 /* Format */ -#define SNDRV_PCM_HW_PARAM_SUBFORMAT 2 /* Subformat */ -#define SNDRV_PCM_HW_PARAM_FIRST_MASK SNDRV_PCM_HW_PARAM_ACCESS -#define SNDRV_PCM_HW_PARAM_LAST_MASK SNDRV_PCM_HW_PARAM_SUBFORMAT - -#define SNDRV_PCM_HW_PARAM_SAMPLE_BITS 8 /* Bits per sample */ -#define SNDRV_PCM_HW_PARAM_FRAME_BITS 9 /* Bits per frame */ -#define SNDRV_PCM_HW_PARAM_CHANNELS 10 /* Channels */ -#define SNDRV_PCM_HW_PARAM_RATE 11 /* Approx rate */ -#define SNDRV_PCM_HW_PARAM_PERIOD_TIME 12 /* Approx distance between - * interrupts in us - */ -#define SNDRV_PCM_HW_PARAM_PERIOD_SIZE 13 /* Approx frames between - * interrupts - */ -#define SNDRV_PCM_HW_PARAM_PERIOD_BYTES 14 /* Approx bytes between - * interrupts - */ -#define SNDRV_PCM_HW_PARAM_PERIODS 15 /* Approx interrupts per - * buffer - */ -#define SNDRV_PCM_HW_PARAM_BUFFER_TIME 16 /* Approx duration of buffer - * in us - */ -#define SNDRV_PCM_HW_PARAM_BUFFER_SIZE 17 /* Size of buffer in frames */ -#define SNDRV_PCM_HW_PARAM_BUFFER_BYTES 18 /* Size of buffer in bytes */ -#define SNDRV_PCM_HW_PARAM_TICK_TIME 19 /* Approx tick duration in us */ -#define SNDRV_PCM_HW_PARAM_FIRST_INTERVAL SNDRV_PCM_HW_PARAM_SAMPLE_BITS -#define SNDRV_PCM_HW_PARAM_LAST_INTERVAL SNDRV_PCM_HW_PARAM_TICK_TIME - -#define SNDRV_PCM_HW_PARAMS_NORESAMPLE (1<<0) /* avoid rate resampling */ -#define SNDRV_PCM_HW_PARAMS_EXPORT_BUFFER (1<<1) /* export buffer */ -#define SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP (1<<2) /* disable period wakeups */ -#define SNDRV_PCM_HW_PARAMS_NO_DRAIN_SILENCE (1<<3) /* suppress drain with the filling - * of the silence samples - */ - -struct snd_interval { - unsigned int min, max; - unsigned int openmin:1, - openmax:1, - integer:1, - empty:1; -}; - -#define SNDRV_MASK_MAX 256 - -struct snd_mask { - __u32 bits[(SNDRV_MASK_MAX+31)/32]; -}; - -struct snd_pcm_hw_params { - unsigned int flags; - struct snd_mask masks[SNDRV_PCM_HW_PARAM_LAST_MASK - - SNDRV_PCM_HW_PARAM_FIRST_MASK + 1]; - struct snd_mask mres[5]; /* reserved masks */ - struct snd_interval intervals[SNDRV_PCM_HW_PARAM_LAST_INTERVAL - - SNDRV_PCM_HW_PARAM_FIRST_INTERVAL + 1]; - struct snd_interval ires[9]; /* reserved intervals */ - unsigned int rmask; /* W: requested masks */ - unsigned int cmask; /* R: changed masks */ - unsigned int info; /* R: Info flags for returned setup */ - unsigned int msbits; /* R: used most significant bits */ - unsigned int rate_num; /* R: rate numerator */ - unsigned int rate_den; /* R: rate denominator */ - snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ - unsigned char reserved[64]; /* reserved for future */ -}; - -enum { - SNDRV_PCM_TSTAMP_NONE = 0, - SNDRV_PCM_TSTAMP_ENABLE, - SNDRV_PCM_TSTAMP_LAST = SNDRV_PCM_TSTAMP_ENABLE, -}; - -struct snd_pcm_sw_params { - int tstamp_mode; /* timestamp mode */ - unsigned int period_step; - unsigned int sleep_min; /* min ticks to sleep */ - snd_pcm_uframes_t avail_min; /* min avail frames for wakeup */ - snd_pcm_uframes_t xfer_align; /* obsolete: xfer size need to be a multiple */ - snd_pcm_uframes_t start_threshold; /* min hw_avail frames for automatic start */ - /* - * The following two thresholds alleviate playback buffer underruns; when - * hw_avail drops below the threshold, the respective action is triggered: - */ - snd_pcm_uframes_t stop_threshold; /* - stop playback */ - snd_pcm_uframes_t silence_threshold; /* - pre-fill buffer with silence */ - snd_pcm_uframes_t silence_size; /* max size of silence pre-fill; when >= boundary, - * fill played area with silence immediately */ - snd_pcm_uframes_t boundary; /* pointers wrap point */ - unsigned int proto; /* protocol version */ - unsigned int tstamp_type; /* timestamp type (req. proto >= 2.0.12) */ - unsigned char reserved[56]; /* reserved for future */ -}; - -struct snd_pcm_channel_info { - unsigned int channel; - __kernel_off_t offset; /* mmap offset */ - unsigned int first; /* offset to first sample in bits */ - unsigned int step; /* samples distance in bits */ -}; - -enum { - /* - * first definition for backwards compatibility only, - * maps to wallclock/link time for HDAudio playback and DEFAULT/DMA time for everything else - */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT = 0, - - /* timestamp definitions */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT = 1, /* DMA time, reported as per hw_ptr */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK = 2, /* link time reported by sample or wallclock counter, reset on startup */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ABSOLUTE = 3, /* link time reported by sample or wallclock counter, not reset on startup */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ESTIMATED = 4, /* link time estimated indirectly */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED = 5, /* link time synchronized with system time */ - SNDRV_PCM_AUDIO_TSTAMP_TYPE_LAST = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED -}; - -#ifndef __KERNEL__ -/* explicit padding avoids incompatibility between i386 and x86-64 */ -typedef struct { unsigned char pad[sizeof(time_t) - sizeof(int)]; } __time_pad; - -struct snd_pcm_status { - snd_pcm_state_t state; /* stream state */ - __time_pad pad1; /* align to timespec */ - struct timespec trigger_tstamp; /* time when stream was started/stopped/paused */ - struct timespec tstamp; /* reference timestamp */ - snd_pcm_uframes_t appl_ptr; /* appl ptr */ - snd_pcm_uframes_t hw_ptr; /* hw ptr */ - snd_pcm_sframes_t delay; /* current delay in frames */ - snd_pcm_uframes_t avail; /* number of frames available */ - snd_pcm_uframes_t avail_max; /* max frames available on hw since last status */ - snd_pcm_uframes_t overrange; /* count of ADC (capture) overrange detections from last status */ - snd_pcm_state_t suspended_state; /* suspended stream state */ - __u32 audio_tstamp_data; /* needed for 64-bit alignment, used for configs/report to/from userspace */ - struct timespec audio_tstamp; /* sample counter, wall clock, PHC or on-demand sync'ed */ - struct timespec driver_tstamp; /* useful in case reference system tstamp is reported with delay */ - __u32 audio_tstamp_accuracy; /* in ns units, only valid if indicated in audio_tstamp_data */ - unsigned char reserved[52-2*sizeof(struct timespec)]; /* must be filled with zero */ -}; -#endif - -/* - * For mmap operations, we need the 64-bit layout, both for compat mode, - * and for y2038 compatibility. For 64-bit applications, the two definitions - * are identical, so we keep the traditional version. - */ -#ifdef __SND_STRUCT_TIME64 -#define __snd_pcm_mmap_status64 snd_pcm_mmap_status -#define __snd_pcm_mmap_control64 snd_pcm_mmap_control -#define __snd_pcm_sync_ptr64 snd_pcm_sync_ptr -#ifdef __KERNEL__ -#define __snd_timespec64 __kernel_timespec -#else -#define __snd_timespec64 timespec -#endif -struct __snd_timespec { - __s32 tv_sec; - __s32 tv_nsec; -}; -#else -#define __snd_pcm_mmap_status snd_pcm_mmap_status -#define __snd_pcm_mmap_control snd_pcm_mmap_control -#define __snd_pcm_sync_ptr snd_pcm_sync_ptr -#define __snd_timespec timespec -struct __snd_timespec64 { - __s64 tv_sec; - __s64 tv_nsec; -}; - -#endif - -struct __snd_pcm_mmap_status { - snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ - int pad1; /* Needed for 64 bit alignment */ - snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ - struct __snd_timespec tstamp; /* Timestamp */ - snd_pcm_state_t suspended_state; /* RO: suspended stream state */ - struct __snd_timespec audio_tstamp; /* from sample counter or wall clock */ -}; - -struct __snd_pcm_mmap_control { - snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ - snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ -}; - -#define SNDRV_PCM_SYNC_PTR_HWSYNC (1<<0) /* execute hwsync */ -#define SNDRV_PCM_SYNC_PTR_APPL (1<<1) /* get appl_ptr from driver (r/w op) */ -#define SNDRV_PCM_SYNC_PTR_AVAIL_MIN (1<<2) /* get avail_min from driver */ - -struct __snd_pcm_sync_ptr { - unsigned int flags; - union { - struct __snd_pcm_mmap_status status; - unsigned char reserved[64]; - } s; - union { - struct __snd_pcm_mmap_control control; - unsigned char reserved[64]; - } c; -}; - -#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) -typedef char __pad_before_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; -typedef char __pad_after_uframe[0]; -#endif - -#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) -typedef char __pad_before_uframe[0]; -typedef char __pad_after_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; -#endif - -struct __snd_pcm_mmap_status64 { - snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ - __u32 pad1; /* Needed for 64 bit alignment */ - __pad_before_uframe __pad1; - snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ - __pad_after_uframe __pad2; - struct __snd_timespec64 tstamp; /* Timestamp */ - snd_pcm_state_t suspended_state;/* RO: suspended stream state */ - __u32 pad3; /* Needed for 64 bit alignment */ - struct __snd_timespec64 audio_tstamp; /* sample counter or wall clock */ -}; - -struct __snd_pcm_mmap_control64 { - __pad_before_uframe __pad1; - snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ - __pad_before_uframe __pad2; // This should be __pad_after_uframe, but binary - // backwards compatibility constraints prevent a fix. - - __pad_before_uframe __pad3; - snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ - __pad_after_uframe __pad4; -}; - -struct __snd_pcm_sync_ptr64 { - __u32 flags; - __u32 pad1; - union { - struct __snd_pcm_mmap_status64 status; - unsigned char reserved[64]; - } s; - union { - struct __snd_pcm_mmap_control64 control; - unsigned char reserved[64]; - } c; -}; - -struct snd_xferi { - snd_pcm_sframes_t result; - void __user *buf; - snd_pcm_uframes_t frames; -}; - -struct snd_xfern { - snd_pcm_sframes_t result; - void __user * __user *bufs; - snd_pcm_uframes_t frames; -}; - -enum { - SNDRV_PCM_TSTAMP_TYPE_GETTIMEOFDAY = 0, /* gettimeofday equivalent */ - SNDRV_PCM_TSTAMP_TYPE_MONOTONIC, /* posix_clock_monotonic equivalent */ - SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, /* monotonic_raw (no NTP) */ - SNDRV_PCM_TSTAMP_TYPE_LAST = SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, -}; - -/* channel positions */ -enum { - SNDRV_CHMAP_UNKNOWN = 0, - SNDRV_CHMAP_NA, /* N/A, silent */ - SNDRV_CHMAP_MONO, /* mono stream */ - /* this follows the alsa-lib mixer channel value + 3 */ - SNDRV_CHMAP_FL, /* front left */ - SNDRV_CHMAP_FR, /* front right */ - SNDRV_CHMAP_RL, /* rear left */ - SNDRV_CHMAP_RR, /* rear right */ - SNDRV_CHMAP_FC, /* front center */ - SNDRV_CHMAP_LFE, /* LFE */ - SNDRV_CHMAP_SL, /* side left */ - SNDRV_CHMAP_SR, /* side right */ - SNDRV_CHMAP_RC, /* rear center */ - /* new definitions */ - SNDRV_CHMAP_FLC, /* front left center */ - SNDRV_CHMAP_FRC, /* front right center */ - SNDRV_CHMAP_RLC, /* rear left center */ - SNDRV_CHMAP_RRC, /* rear right center */ - SNDRV_CHMAP_FLW, /* front left wide */ - SNDRV_CHMAP_FRW, /* front right wide */ - SNDRV_CHMAP_FLH, /* front left high */ - SNDRV_CHMAP_FCH, /* front center high */ - SNDRV_CHMAP_FRH, /* front right high */ - SNDRV_CHMAP_TC, /* top center */ - SNDRV_CHMAP_TFL, /* top front left */ - SNDRV_CHMAP_TFR, /* top front right */ - SNDRV_CHMAP_TFC, /* top front center */ - SNDRV_CHMAP_TRL, /* top rear left */ - SNDRV_CHMAP_TRR, /* top rear right */ - SNDRV_CHMAP_TRC, /* top rear center */ - /* new definitions for UAC2 */ - SNDRV_CHMAP_TFLC, /* top front left center */ - SNDRV_CHMAP_TFRC, /* top front right center */ - SNDRV_CHMAP_TSL, /* top side left */ - SNDRV_CHMAP_TSR, /* top side right */ - SNDRV_CHMAP_LLFE, /* left LFE */ - SNDRV_CHMAP_RLFE, /* right LFE */ - SNDRV_CHMAP_BC, /* bottom center */ - SNDRV_CHMAP_BLC, /* bottom left center */ - SNDRV_CHMAP_BRC, /* bottom right center */ - SNDRV_CHMAP_LAST = SNDRV_CHMAP_BRC, -}; - -#define SNDRV_CHMAP_POSITION_MASK 0xffff -#define SNDRV_CHMAP_PHASE_INVERSE (0x01 << 16) -#define SNDRV_CHMAP_DRIVER_SPEC (0x02 << 16) - -#define SNDRV_PCM_IOCTL_PVERSION _IOR('A', 0x00, int) -#define SNDRV_PCM_IOCTL_INFO _IOR('A', 0x01, struct snd_pcm_info) -#define SNDRV_PCM_IOCTL_TSTAMP _IOW('A', 0x02, int) -#define SNDRV_PCM_IOCTL_TTSTAMP _IOW('A', 0x03, int) -#define SNDRV_PCM_IOCTL_USER_PVERSION _IOW('A', 0x04, int) -#define SNDRV_PCM_IOCTL_HW_REFINE _IOWR('A', 0x10, struct snd_pcm_hw_params) -#define SNDRV_PCM_IOCTL_HW_PARAMS _IOWR('A', 0x11, struct snd_pcm_hw_params) -#define SNDRV_PCM_IOCTL_HW_FREE _IO('A', 0x12) -#define SNDRV_PCM_IOCTL_SW_PARAMS _IOWR('A', 0x13, struct snd_pcm_sw_params) -#define SNDRV_PCM_IOCTL_STATUS _IOR('A', 0x20, struct snd_pcm_status) -#define SNDRV_PCM_IOCTL_DELAY _IOR('A', 0x21, snd_pcm_sframes_t) -#define SNDRV_PCM_IOCTL_HWSYNC _IO('A', 0x22) -#define __SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct __snd_pcm_sync_ptr) -#define __SNDRV_PCM_IOCTL_SYNC_PTR64 _IOWR('A', 0x23, struct __snd_pcm_sync_ptr64) -#define SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct snd_pcm_sync_ptr) -#define SNDRV_PCM_IOCTL_STATUS_EXT _IOWR('A', 0x24, struct snd_pcm_status) -#define SNDRV_PCM_IOCTL_CHANNEL_INFO _IOR('A', 0x32, struct snd_pcm_channel_info) -#define SNDRV_PCM_IOCTL_PREPARE _IO('A', 0x40) -#define SNDRV_PCM_IOCTL_RESET _IO('A', 0x41) -#define SNDRV_PCM_IOCTL_START _IO('A', 0x42) -#define SNDRV_PCM_IOCTL_DROP _IO('A', 0x43) -#define SNDRV_PCM_IOCTL_DRAIN _IO('A', 0x44) -#define SNDRV_PCM_IOCTL_PAUSE _IOW('A', 0x45, int) -#define SNDRV_PCM_IOCTL_REWIND _IOW('A', 0x46, snd_pcm_uframes_t) -#define SNDRV_PCM_IOCTL_RESUME _IO('A', 0x47) -#define SNDRV_PCM_IOCTL_XRUN _IO('A', 0x48) -#define SNDRV_PCM_IOCTL_FORWARD _IOW('A', 0x49, snd_pcm_uframes_t) -#define SNDRV_PCM_IOCTL_WRITEI_FRAMES _IOW('A', 0x50, struct snd_xferi) -#define SNDRV_PCM_IOCTL_READI_FRAMES _IOR('A', 0x51, struct snd_xferi) -#define SNDRV_PCM_IOCTL_WRITEN_FRAMES _IOW('A', 0x52, struct snd_xfern) -#define SNDRV_PCM_IOCTL_READN_FRAMES _IOR('A', 0x53, struct snd_xfern) -#define SNDRV_PCM_IOCTL_LINK _IOW('A', 0x60, int) -#define SNDRV_PCM_IOCTL_UNLINK _IO('A', 0x61) - -/***************************************************************************** - * * - * MIDI v1.0 interface * - * * - *****************************************************************************/ - -/* - * Raw MIDI section - /dev/snd/midi?? - */ - -#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 4) - -enum { - SNDRV_RAWMIDI_STREAM_OUTPUT = 0, - SNDRV_RAWMIDI_STREAM_INPUT, - SNDRV_RAWMIDI_STREAM_LAST = SNDRV_RAWMIDI_STREAM_INPUT, -}; - -#define SNDRV_RAWMIDI_INFO_OUTPUT 0x00000001 -#define SNDRV_RAWMIDI_INFO_INPUT 0x00000002 -#define SNDRV_RAWMIDI_INFO_DUPLEX 0x00000004 -#define SNDRV_RAWMIDI_INFO_UMP 0x00000008 - -struct snd_rawmidi_info { - unsigned int device; /* RO/WR (control): device number */ - unsigned int subdevice; /* RO/WR (control): subdevice number */ - int stream; /* WR: stream */ - int card; /* R: card number */ - unsigned int flags; /* SNDRV_RAWMIDI_INFO_XXXX */ - unsigned char id[64]; /* ID (user selectable) */ - unsigned char name[80]; /* name of device */ - unsigned char subname[32]; /* name of active or selected subdevice */ - unsigned int subdevices_count; - unsigned int subdevices_avail; - unsigned char reserved[64]; /* reserved for future use */ -}; - -#define SNDRV_RAWMIDI_MODE_FRAMING_MASK (7<<0) -#define SNDRV_RAWMIDI_MODE_FRAMING_SHIFT 0 -#define SNDRV_RAWMIDI_MODE_FRAMING_NONE (0<<0) -#define SNDRV_RAWMIDI_MODE_FRAMING_TSTAMP (1<<0) -#define SNDRV_RAWMIDI_MODE_CLOCK_MASK (7<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_SHIFT 3 -#define SNDRV_RAWMIDI_MODE_CLOCK_NONE (0<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_REALTIME (1<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC (2<<3) -#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC_RAW (3<<3) - -#define SNDRV_RAWMIDI_FRAMING_DATA_LENGTH 16 - -struct snd_rawmidi_framing_tstamp { - /* For now, frame_type is always 0. Midi 2.0 is expected to add new - * types here. Applications are expected to skip unknown frame types. - */ - __u8 frame_type; - __u8 length; /* number of valid bytes in data field */ - __u8 reserved[2]; - __u32 tv_nsec; /* nanoseconds */ - __u64 tv_sec; /* seconds */ - __u8 data[SNDRV_RAWMIDI_FRAMING_DATA_LENGTH]; -} __packed; - -struct snd_rawmidi_params { - int stream; - size_t buffer_size; /* queue size in bytes */ - size_t avail_min; /* minimum avail bytes for wakeup */ - unsigned int no_active_sensing: 1; /* do not send active sensing byte in close() */ - unsigned int mode; /* For input data only, frame incoming data */ - unsigned char reserved[12]; /* reserved for future use */ -}; - -#ifndef __KERNEL__ -struct snd_rawmidi_status { - int stream; - __time_pad pad1; - struct timespec tstamp; /* Timestamp */ - size_t avail; /* available bytes */ - size_t xruns; /* count of overruns since last status (in bytes) */ - unsigned char reserved[16]; /* reserved for future use */ -}; -#endif - -/* UMP EP info flags */ -#define SNDRV_UMP_EP_INFO_STATIC_BLOCKS 0x01 - -/* UMP EP Protocol / JRTS capability bits */ -#define SNDRV_UMP_EP_INFO_PROTO_MIDI_MASK 0x0300 -#define SNDRV_UMP_EP_INFO_PROTO_MIDI1 0x0100 /* MIDI 1.0 */ -#define SNDRV_UMP_EP_INFO_PROTO_MIDI2 0x0200 /* MIDI 2.0 */ -#define SNDRV_UMP_EP_INFO_PROTO_JRTS_MASK 0x0003 -#define SNDRV_UMP_EP_INFO_PROTO_JRTS_TX 0x0001 /* JRTS Transmit */ -#define SNDRV_UMP_EP_INFO_PROTO_JRTS_RX 0x0002 /* JRTS Receive */ - -/* UMP Endpoint information */ -struct snd_ump_endpoint_info { - int card; /* card number */ - int device; /* device number */ - unsigned int flags; /* additional info */ - unsigned int protocol_caps; /* protocol capabilities */ - unsigned int protocol; /* current protocol */ - unsigned int num_blocks; /* # of function blocks */ - unsigned short version; /* UMP major/minor version */ - unsigned short family_id; /* MIDI device family ID */ - unsigned short model_id; /* MIDI family model ID */ - unsigned int manufacturer_id; /* MIDI manufacturer ID */ - unsigned char sw_revision[4]; /* software revision */ - unsigned short padding; - unsigned char name[128]; /* endpoint name string */ - unsigned char product_id[128]; /* unique product id string */ - unsigned char reserved[32]; -} __packed; - -/* UMP direction */ -#define SNDRV_UMP_DIR_INPUT 0x01 -#define SNDRV_UMP_DIR_OUTPUT 0x02 -#define SNDRV_UMP_DIR_BIDIRECTION 0x03 - -/* UMP block info flags */ -#define SNDRV_UMP_BLOCK_IS_MIDI1 (1U << 0) /* MIDI 1.0 port w/o restrict */ -#define SNDRV_UMP_BLOCK_IS_LOWSPEED (1U << 1) /* 31.25Kbps B/W MIDI1 port */ - -/* UMP block user-interface hint */ -#define SNDRV_UMP_BLOCK_UI_HINT_UNKNOWN 0x00 -#define SNDRV_UMP_BLOCK_UI_HINT_RECEIVER 0x01 -#define SNDRV_UMP_BLOCK_UI_HINT_SENDER 0x02 -#define SNDRV_UMP_BLOCK_UI_HINT_BOTH 0x03 - -/* UMP groups and blocks */ -#define SNDRV_UMP_MAX_GROUPS 16 -#define SNDRV_UMP_MAX_BLOCKS 32 - -/* UMP Block information */ -struct snd_ump_block_info { - int card; /* card number */ - int device; /* device number */ - unsigned char block_id; /* block ID (R/W) */ - unsigned char direction; /* UMP direction */ - unsigned char active; /* Activeness */ - unsigned char first_group; /* first group ID */ - unsigned char num_groups; /* number of groups */ - unsigned char midi_ci_version; /* MIDI-CI support version */ - unsigned char sysex8_streams; /* max number of sysex8 streams */ - unsigned char ui_hint; /* user interface hint */ - unsigned int flags; /* various info flags */ - unsigned char name[128]; /* block name string */ - unsigned char reserved[32]; -} __packed; - -#define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) -#define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) -#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) -#define SNDRV_RAWMIDI_IOCTL_PARAMS _IOWR('W', 0x10, struct snd_rawmidi_params) -#define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) -#define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) -#define SNDRV_RAWMIDI_IOCTL_DRAIN _IOW('W', 0x31, int) -/* Additional ioctls for UMP rawmidi devices */ -#define SNDRV_UMP_IOCTL_ENDPOINT_INFO _IOR('W', 0x40, struct snd_ump_endpoint_info) -#define SNDRV_UMP_IOCTL_BLOCK_INFO _IOR('W', 0x41, struct snd_ump_block_info) - -/* - * Timer section - /dev/snd/timer - */ - -#define SNDRV_TIMER_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 7) - -enum { - SNDRV_TIMER_CLASS_NONE = -1, - SNDRV_TIMER_CLASS_SLAVE = 0, - SNDRV_TIMER_CLASS_GLOBAL, - SNDRV_TIMER_CLASS_CARD, - SNDRV_TIMER_CLASS_PCM, - SNDRV_TIMER_CLASS_LAST = SNDRV_TIMER_CLASS_PCM, -}; - -/* slave timer classes */ -enum { - SNDRV_TIMER_SCLASS_NONE = 0, - SNDRV_TIMER_SCLASS_APPLICATION, - SNDRV_TIMER_SCLASS_SEQUENCER, /* alias */ - SNDRV_TIMER_SCLASS_OSS_SEQUENCER, /* alias */ - SNDRV_TIMER_SCLASS_LAST = SNDRV_TIMER_SCLASS_OSS_SEQUENCER, -}; - -/* global timers (device member) */ -#define SNDRV_TIMER_GLOBAL_SYSTEM 0 -#define SNDRV_TIMER_GLOBAL_RTC 1 /* unused */ -#define SNDRV_TIMER_GLOBAL_HPET 2 -#define SNDRV_TIMER_GLOBAL_HRTIMER 3 - -/* info flags */ -#define SNDRV_TIMER_FLG_SLAVE (1<<0) /* cannot be controlled */ - -struct snd_timer_id { - int dev_class; - int dev_sclass; - int card; - int device; - int subdevice; -}; - -struct snd_timer_ginfo { - struct snd_timer_id tid; /* requested timer ID */ - unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ - int card; /* card number */ - unsigned char id[64]; /* timer identification */ - unsigned char name[80]; /* timer name */ - unsigned long reserved0; /* reserved for future use */ - unsigned long resolution; /* average period resolution in ns */ - unsigned long resolution_min; /* minimal period resolution in ns */ - unsigned long resolution_max; /* maximal period resolution in ns */ - unsigned int clients; /* active timer clients */ - unsigned char reserved[32]; -}; - -struct snd_timer_gparams { - struct snd_timer_id tid; /* requested timer ID */ - unsigned long period_num; /* requested precise period duration (in seconds) - numerator */ - unsigned long period_den; /* requested precise period duration (in seconds) - denominator */ - unsigned char reserved[32]; -}; - -struct snd_timer_gstatus { - struct snd_timer_id tid; /* requested timer ID */ - unsigned long resolution; /* current period resolution in ns */ - unsigned long resolution_num; /* precise current period resolution (in seconds) - numerator */ - unsigned long resolution_den; /* precise current period resolution (in seconds) - denominator */ - unsigned char reserved[32]; -}; - -struct snd_timer_select { - struct snd_timer_id id; /* bind to timer ID */ - unsigned char reserved[32]; /* reserved */ -}; - -struct snd_timer_info { - unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ - int card; /* card number */ - unsigned char id[64]; /* timer identificator */ - unsigned char name[80]; /* timer name */ - unsigned long reserved0; /* reserved for future use */ - unsigned long resolution; /* average period resolution in ns */ - unsigned char reserved[64]; /* reserved */ -}; - -#define SNDRV_TIMER_PSFLG_AUTO (1<<0) /* auto start, otherwise one-shot */ -#define SNDRV_TIMER_PSFLG_EXCLUSIVE (1<<1) /* exclusive use, precise start/stop/pause/continue */ -#define SNDRV_TIMER_PSFLG_EARLY_EVENT (1<<2) /* write early event to the poll queue */ - -struct snd_timer_params { - unsigned int flags; /* flags - SNDRV_TIMER_PSFLG_* */ - unsigned int ticks; /* requested resolution in ticks */ - unsigned int queue_size; /* total size of queue (32-1024) */ - unsigned int reserved0; /* reserved, was: failure locations */ - unsigned int filter; /* event filter (bitmask of SNDRV_TIMER_EVENT_*) */ - unsigned char reserved[60]; /* reserved */ -}; - -#ifndef __KERNEL__ -struct snd_timer_status { - struct timespec tstamp; /* Timestamp - last update */ - unsigned int resolution; /* current period resolution in ns */ - unsigned int lost; /* counter of master tick lost */ - unsigned int overrun; /* count of read queue overruns */ - unsigned int queue; /* used queue size */ - unsigned char reserved[64]; /* reserved */ -}; -#endif - -#define SNDRV_TIMER_IOCTL_PVERSION _IOR('T', 0x00, int) -#define SNDRV_TIMER_IOCTL_NEXT_DEVICE _IOWR('T', 0x01, struct snd_timer_id) -#define SNDRV_TIMER_IOCTL_TREAD_OLD _IOW('T', 0x02, int) -#define SNDRV_TIMER_IOCTL_GINFO _IOWR('T', 0x03, struct snd_timer_ginfo) -#define SNDRV_TIMER_IOCTL_GPARAMS _IOW('T', 0x04, struct snd_timer_gparams) -#define SNDRV_TIMER_IOCTL_GSTATUS _IOWR('T', 0x05, struct snd_timer_gstatus) -#define SNDRV_TIMER_IOCTL_SELECT _IOW('T', 0x10, struct snd_timer_select) -#define SNDRV_TIMER_IOCTL_INFO _IOR('T', 0x11, struct snd_timer_info) -#define SNDRV_TIMER_IOCTL_PARAMS _IOW('T', 0x12, struct snd_timer_params) -#define SNDRV_TIMER_IOCTL_STATUS _IOR('T', 0x14, struct snd_timer_status) -/* The following four ioctls are changed since 1.0.9 due to confliction */ -#define SNDRV_TIMER_IOCTL_START _IO('T', 0xa0) -#define SNDRV_TIMER_IOCTL_STOP _IO('T', 0xa1) -#define SNDRV_TIMER_IOCTL_CONTINUE _IO('T', 0xa2) -#define SNDRV_TIMER_IOCTL_PAUSE _IO('T', 0xa3) -#define SNDRV_TIMER_IOCTL_TREAD64 _IOW('T', 0xa4, int) - -#if __BITS_PER_LONG == 64 -#define SNDRV_TIMER_IOCTL_TREAD SNDRV_TIMER_IOCTL_TREAD_OLD -#else -#define SNDRV_TIMER_IOCTL_TREAD ((sizeof(__kernel_long_t) >= sizeof(time_t)) ? \ - SNDRV_TIMER_IOCTL_TREAD_OLD : \ - SNDRV_TIMER_IOCTL_TREAD64) -#endif - -struct snd_timer_read { - unsigned int resolution; - unsigned int ticks; -}; - -enum { - SNDRV_TIMER_EVENT_RESOLUTION = 0, /* val = resolution in ns */ - SNDRV_TIMER_EVENT_TICK, /* val = ticks */ - SNDRV_TIMER_EVENT_START, /* val = resolution in ns */ - SNDRV_TIMER_EVENT_STOP, /* val = 0 */ - SNDRV_TIMER_EVENT_CONTINUE, /* val = resolution in ns */ - SNDRV_TIMER_EVENT_PAUSE, /* val = 0 */ - SNDRV_TIMER_EVENT_EARLY, /* val = 0, early event */ - SNDRV_TIMER_EVENT_SUSPEND, /* val = 0 */ - SNDRV_TIMER_EVENT_RESUME, /* val = resolution in ns */ - /* master timer events for slave timer instances */ - SNDRV_TIMER_EVENT_MSTART = SNDRV_TIMER_EVENT_START + 10, - SNDRV_TIMER_EVENT_MSTOP = SNDRV_TIMER_EVENT_STOP + 10, - SNDRV_TIMER_EVENT_MCONTINUE = SNDRV_TIMER_EVENT_CONTINUE + 10, - SNDRV_TIMER_EVENT_MPAUSE = SNDRV_TIMER_EVENT_PAUSE + 10, - SNDRV_TIMER_EVENT_MSUSPEND = SNDRV_TIMER_EVENT_SUSPEND + 10, - SNDRV_TIMER_EVENT_MRESUME = SNDRV_TIMER_EVENT_RESUME + 10, -}; - -#ifndef __KERNEL__ -struct snd_timer_tread { - int event; - __time_pad pad1; - struct timespec tstamp; - unsigned int val; - __time_pad pad2; -}; -#endif - -/**************************************************************************** - * * - * Section for driver control interface - /dev/snd/control? * - * * - ****************************************************************************/ - -#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 9) - -struct snd_ctl_card_info { - int card; /* card number */ - int pad; /* reserved for future (was type) */ - unsigned char id[16]; /* ID of card (user selectable) */ - unsigned char driver[16]; /* Driver name */ - unsigned char name[32]; /* Short name of soundcard */ - unsigned char longname[80]; /* name + info text about soundcard */ - unsigned char reserved_[16]; /* reserved for future (was ID of mixer) */ - unsigned char mixername[80]; /* visual mixer identification */ - unsigned char components[128]; /* card components / fine identification, delimited with one space (AC97 etc..) */ -}; - -typedef int __bitwise snd_ctl_elem_type_t; -#define SNDRV_CTL_ELEM_TYPE_NONE ((__force snd_ctl_elem_type_t) 0) /* invalid */ -#define SNDRV_CTL_ELEM_TYPE_BOOLEAN ((__force snd_ctl_elem_type_t) 1) /* boolean type */ -#define SNDRV_CTL_ELEM_TYPE_INTEGER ((__force snd_ctl_elem_type_t) 2) /* integer type */ -#define SNDRV_CTL_ELEM_TYPE_ENUMERATED ((__force snd_ctl_elem_type_t) 3) /* enumerated type */ -#define SNDRV_CTL_ELEM_TYPE_BYTES ((__force snd_ctl_elem_type_t) 4) /* byte array */ -#define SNDRV_CTL_ELEM_TYPE_IEC958 ((__force snd_ctl_elem_type_t) 5) /* IEC958 (S/PDIF) setup */ -#define SNDRV_CTL_ELEM_TYPE_INTEGER64 ((__force snd_ctl_elem_type_t) 6) /* 64-bit integer type */ -#define SNDRV_CTL_ELEM_TYPE_LAST SNDRV_CTL_ELEM_TYPE_INTEGER64 - -typedef int __bitwise snd_ctl_elem_iface_t; -#define SNDRV_CTL_ELEM_IFACE_CARD ((__force snd_ctl_elem_iface_t) 0) /* global control */ -#define SNDRV_CTL_ELEM_IFACE_HWDEP ((__force snd_ctl_elem_iface_t) 1) /* hardware dependent device */ -#define SNDRV_CTL_ELEM_IFACE_MIXER ((__force snd_ctl_elem_iface_t) 2) /* virtual mixer device */ -#define SNDRV_CTL_ELEM_IFACE_PCM ((__force snd_ctl_elem_iface_t) 3) /* PCM device */ -#define SNDRV_CTL_ELEM_IFACE_RAWMIDI ((__force snd_ctl_elem_iface_t) 4) /* RawMidi device */ -#define SNDRV_CTL_ELEM_IFACE_TIMER ((__force snd_ctl_elem_iface_t) 5) /* timer device */ -#define SNDRV_CTL_ELEM_IFACE_SEQUENCER ((__force snd_ctl_elem_iface_t) 6) /* sequencer client */ -#define SNDRV_CTL_ELEM_IFACE_LAST SNDRV_CTL_ELEM_IFACE_SEQUENCER - -#define SNDRV_CTL_ELEM_ACCESS_READ (1<<0) -#define SNDRV_CTL_ELEM_ACCESS_WRITE (1<<1) -#define SNDRV_CTL_ELEM_ACCESS_READWRITE (SNDRV_CTL_ELEM_ACCESS_READ|SNDRV_CTL_ELEM_ACCESS_WRITE) -#define SNDRV_CTL_ELEM_ACCESS_VOLATILE (1<<2) /* control value may be changed without a notification */ -/* (1 << 3) is unused. */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_READ (1<<4) /* TLV read is possible */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_WRITE (1<<5) /* TLV write is possible */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE (SNDRV_CTL_ELEM_ACCESS_TLV_READ|SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) -#define SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND (1<<6) /* TLV command is possible */ -#define SNDRV_CTL_ELEM_ACCESS_INACTIVE (1<<8) /* control does actually nothing, but may be updated */ -#define SNDRV_CTL_ELEM_ACCESS_LOCK (1<<9) /* write lock */ -#define SNDRV_CTL_ELEM_ACCESS_OWNER (1<<10) /* write lock owner */ -#define SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK (1<<28) /* kernel use a TLV callback */ -#define SNDRV_CTL_ELEM_ACCESS_USER (1<<29) /* user space element */ -/* bits 30 and 31 are obsoleted (for indirect access) */ - -/* for further details see the ACPI and PCI power management specification */ -#define SNDRV_CTL_POWER_D0 0x0000 /* full On */ -#define SNDRV_CTL_POWER_D1 0x0100 /* partial On */ -#define SNDRV_CTL_POWER_D2 0x0200 /* partial On */ -#define SNDRV_CTL_POWER_D3 0x0300 /* Off */ -#define SNDRV_CTL_POWER_D3hot (SNDRV_CTL_POWER_D3|0x0000) /* Off, with power */ -#define SNDRV_CTL_POWER_D3cold (SNDRV_CTL_POWER_D3|0x0001) /* Off, without power */ - -#define SNDRV_CTL_ELEM_ID_NAME_MAXLEN 44 - -struct snd_ctl_elem_id { - unsigned int numid; /* numeric identifier, zero = invalid */ - snd_ctl_elem_iface_t iface; /* interface identifier */ - unsigned int device; /* device/client number */ - unsigned int subdevice; /* subdevice (substream) number */ - unsigned char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; /* ASCII name of item */ - unsigned int index; /* index of item */ -}; - -struct snd_ctl_elem_list { - unsigned int offset; /* W: first element ID to get */ - unsigned int space; /* W: count of element IDs to get */ - unsigned int used; /* R: count of element IDs set */ - unsigned int count; /* R: count of all elements */ - struct snd_ctl_elem_id __user *pids; /* R: IDs */ - unsigned char reserved[50]; -}; - -struct snd_ctl_elem_info { - struct snd_ctl_elem_id id; /* W: element ID */ - snd_ctl_elem_type_t type; /* R: value type - SNDRV_CTL_ELEM_TYPE_* */ - unsigned int access; /* R: value access (bitmask) - SNDRV_CTL_ELEM_ACCESS_* */ - unsigned int count; /* count of values */ - __kernel_pid_t owner; /* owner's PID of this control */ - union { - struct { - long min; /* R: minimum value */ - long max; /* R: maximum value */ - long step; /* R: step (0 variable) */ - } integer; - struct { - long long min; /* R: minimum value */ - long long max; /* R: maximum value */ - long long step; /* R: step (0 variable) */ - } integer64; - struct { - unsigned int items; /* R: number of items */ - unsigned int item; /* W: item number */ - char name[64]; /* R: value name */ - __u64 names_ptr; /* W: names list (ELEM_ADD only) */ - unsigned int names_length; - } enumerated; - unsigned char reserved[128]; - } value; - unsigned char reserved[64]; -}; - -struct snd_ctl_elem_value { - struct snd_ctl_elem_id id; /* W: element ID */ - unsigned int indirect: 1; /* W: indirect access - obsoleted */ - union { - union { - long value[128]; - long *value_ptr; /* obsoleted */ - } integer; - union { - long long value[64]; - long long *value_ptr; /* obsoleted */ - } integer64; - union { - unsigned int item[128]; - unsigned int *item_ptr; /* obsoleted */ - } enumerated; - union { - unsigned char data[512]; - unsigned char *data_ptr; /* obsoleted */ - } bytes; - struct snd_aes_iec958 iec958; - } value; /* RO */ - unsigned char reserved[128]; -}; - -struct snd_ctl_tlv { - unsigned int numid; /* control element numeric identification */ - unsigned int length; /* in bytes aligned to 4 */ - unsigned int tlv[]; /* first TLV */ -}; - -#define SNDRV_CTL_IOCTL_PVERSION _IOR('U', 0x00, int) -#define SNDRV_CTL_IOCTL_CARD_INFO _IOR('U', 0x01, struct snd_ctl_card_info) -#define SNDRV_CTL_IOCTL_ELEM_LIST _IOWR('U', 0x10, struct snd_ctl_elem_list) -#define SNDRV_CTL_IOCTL_ELEM_INFO _IOWR('U', 0x11, struct snd_ctl_elem_info) -#define SNDRV_CTL_IOCTL_ELEM_READ _IOWR('U', 0x12, struct snd_ctl_elem_value) -#define SNDRV_CTL_IOCTL_ELEM_WRITE _IOWR('U', 0x13, struct snd_ctl_elem_value) -#define SNDRV_CTL_IOCTL_ELEM_LOCK _IOW('U', 0x14, struct snd_ctl_elem_id) -#define SNDRV_CTL_IOCTL_ELEM_UNLOCK _IOW('U', 0x15, struct snd_ctl_elem_id) -#define SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS _IOWR('U', 0x16, int) -#define SNDRV_CTL_IOCTL_ELEM_ADD _IOWR('U', 0x17, struct snd_ctl_elem_info) -#define SNDRV_CTL_IOCTL_ELEM_REPLACE _IOWR('U', 0x18, struct snd_ctl_elem_info) -#define SNDRV_CTL_IOCTL_ELEM_REMOVE _IOWR('U', 0x19, struct snd_ctl_elem_id) -#define SNDRV_CTL_IOCTL_TLV_READ _IOWR('U', 0x1a, struct snd_ctl_tlv) -#define SNDRV_CTL_IOCTL_TLV_WRITE _IOWR('U', 0x1b, struct snd_ctl_tlv) -#define SNDRV_CTL_IOCTL_TLV_COMMAND _IOWR('U', 0x1c, struct snd_ctl_tlv) -#define SNDRV_CTL_IOCTL_HWDEP_NEXT_DEVICE _IOWR('U', 0x20, int) -#define SNDRV_CTL_IOCTL_HWDEP_INFO _IOR('U', 0x21, struct snd_hwdep_info) -#define SNDRV_CTL_IOCTL_PCM_NEXT_DEVICE _IOR('U', 0x30, int) -#define SNDRV_CTL_IOCTL_PCM_INFO _IOWR('U', 0x31, struct snd_pcm_info) -#define SNDRV_CTL_IOCTL_PCM_PREFER_SUBDEVICE _IOW('U', 0x32, int) -#define SNDRV_CTL_IOCTL_RAWMIDI_NEXT_DEVICE _IOWR('U', 0x40, int) -#define SNDRV_CTL_IOCTL_RAWMIDI_INFO _IOWR('U', 0x41, struct snd_rawmidi_info) -#define SNDRV_CTL_IOCTL_RAWMIDI_PREFER_SUBDEVICE _IOW('U', 0x42, int) -#define SNDRV_CTL_IOCTL_UMP_NEXT_DEVICE _IOWR('U', 0x43, int) -#define SNDRV_CTL_IOCTL_UMP_ENDPOINT_INFO _IOWR('U', 0x44, struct snd_ump_endpoint_info) -#define SNDRV_CTL_IOCTL_UMP_BLOCK_INFO _IOWR('U', 0x45, struct snd_ump_block_info) -#define SNDRV_CTL_IOCTL_POWER _IOWR('U', 0xd0, int) -#define SNDRV_CTL_IOCTL_POWER_STATE _IOR('U', 0xd1, int) - -/* - * Read interface. - */ - -enum sndrv_ctl_event_type { - SNDRV_CTL_EVENT_ELEM = 0, - SNDRV_CTL_EVENT_LAST = SNDRV_CTL_EVENT_ELEM, -}; - -#define SNDRV_CTL_EVENT_MASK_VALUE (1<<0) /* element value was changed */ -#define SNDRV_CTL_EVENT_MASK_INFO (1<<1) /* element info was changed */ -#define SNDRV_CTL_EVENT_MASK_ADD (1<<2) /* element was added */ -#define SNDRV_CTL_EVENT_MASK_TLV (1<<3) /* element TLV tree was changed */ -#define SNDRV_CTL_EVENT_MASK_REMOVE (~0U) /* element was removed */ - -struct snd_ctl_event { - int type; /* event type - SNDRV_CTL_EVENT_* */ - union { - struct { - unsigned int mask; - struct snd_ctl_elem_id id; - } elem; - unsigned char data8[60]; - } data; -}; - -/* - * Control names - */ - -#define SNDRV_CTL_NAME_NONE "" -#define SNDRV_CTL_NAME_PLAYBACK "Playback " -#define SNDRV_CTL_NAME_CAPTURE "Capture " - -#define SNDRV_CTL_NAME_IEC958_NONE "" -#define SNDRV_CTL_NAME_IEC958_SWITCH "Switch" -#define SNDRV_CTL_NAME_IEC958_VOLUME "Volume" -#define SNDRV_CTL_NAME_IEC958_DEFAULT "Default" -#define SNDRV_CTL_NAME_IEC958_MASK "Mask" -#define SNDRV_CTL_NAME_IEC958_CON_MASK "Con Mask" -#define SNDRV_CTL_NAME_IEC958_PRO_MASK "Pro Mask" -#define SNDRV_CTL_NAME_IEC958_PCM_STREAM "PCM Stream" -#define SNDRV_CTL_NAME_IEC958(expl,direction,what) "IEC958 " expl SNDRV_CTL_NAME_##direction SNDRV_CTL_NAME_IEC958_##what - -#endif /* _UAPI__SOUND_ASOUND_H */ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 53ec3765b4b2..757777d96860 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -473,6 +473,7 @@ arm64-sysreg-defs-clean: beauty_linux_dir := $(srctree)/tools/perf/trace/beauty/include/linux/ beauty_uapi_linux_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/linux/ +beauty_uapi_sound_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/sound/ linux_uapi_dir := $(srctree)/tools/include/uapi/linux asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic arch_asm_uapi_dir := $(srctree)/tools/arch/$(SRCARCH)/include/uapi/asm/ @@ -526,15 +527,15 @@ sndrv_ctl_ioctl_array := $(beauty_ioctl_outdir)/sndrv_ctl_ioctl_array.c sndrv_ctl_hdr_dir := $(srctree)/tools/include/uapi/sound sndrv_ctl_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh -$(sndrv_ctl_ioctl_array): $(sndrv_ctl_hdr_dir)/asound.h $(sndrv_ctl_ioctl_tbl) - $(Q)$(SHELL) '$(sndrv_ctl_ioctl_tbl)' $(sndrv_ctl_hdr_dir) > $@ +$(sndrv_ctl_ioctl_array): $(beauty_uapi_sound_dir)/asound.h $(sndrv_ctl_ioctl_tbl) + $(Q)$(SHELL) '$(sndrv_ctl_ioctl_tbl)' $(beauty_uapi_sound_dir) > $@ sndrv_pcm_ioctl_array := $(beauty_ioctl_outdir)/sndrv_pcm_ioctl_array.c sndrv_pcm_hdr_dir := $(srctree)/tools/include/uapi/sound sndrv_pcm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh -$(sndrv_pcm_ioctl_array): $(sndrv_pcm_hdr_dir)/asound.h $(sndrv_pcm_ioctl_tbl) - $(Q)$(SHELL) '$(sndrv_pcm_ioctl_tbl)' $(sndrv_pcm_hdr_dir) > $@ +$(sndrv_pcm_ioctl_array): $(beauty_uapi_sound_dir)/asound.h $(sndrv_pcm_ioctl_tbl) + $(Q)$(SHELL) '$(sndrv_pcm_ioctl_tbl)' $(beauty_uapi_sound_dir) > $@ kcmp_type_array := $(beauty_outdir)/kcmp_type_array.c kcmp_hdr_dir := $(srctree)/tools/include/uapi/linux/ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 356ddb76a954..93ad7a787a19 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -22,7 +22,6 @@ FILES=( "include/uapi/linux/seccomp.h" "include/uapi/linux/stat.h" "include/uapi/linux/vhost.h" - "include/uapi/sound/asound.h" "include/linux/bits.h" "include/vdso/bits.h" "include/linux/const.h" @@ -98,6 +97,7 @@ BEAUTY_FILES=( "include/uapi/linux/fs.h" "include/uapi/linux/mount.h" "include/uapi/linux/usbdevice_fs.h" + "include/uapi/sound/asound.h" ) declare -a FAILURES diff --git a/tools/perf/trace/beauty/include/uapi/sound/asound.h b/tools/perf/trace/beauty/include/uapi/sound/asound.h new file mode 100644 index 000000000000..d5b9cfbd9cea --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/sound/asound.h @@ -0,0 +1,1252 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Advanced Linux Sound Architecture - ALSA - Driver + * Copyright (c) 1994-2003 by Jaroslav Kysela , + * Abramo Bagnara + */ + +#ifndef _UAPI__SOUND_ASOUND_H +#define _UAPI__SOUND_ASOUND_H + +#if defined(__KERNEL__) || defined(__linux__) +#include +#include +#else +#include +#include +#endif + +#ifndef __KERNEL__ +#include +#include +#endif + +/* + * protocol version + */ + +#define SNDRV_PROTOCOL_VERSION(major, minor, subminor) (((major)<<16)|((minor)<<8)|(subminor)) +#define SNDRV_PROTOCOL_MAJOR(version) (((version)>>16)&0xffff) +#define SNDRV_PROTOCOL_MINOR(version) (((version)>>8)&0xff) +#define SNDRV_PROTOCOL_MICRO(version) ((version)&0xff) +#define SNDRV_PROTOCOL_INCOMPATIBLE(kversion, uversion) \ + (SNDRV_PROTOCOL_MAJOR(kversion) != SNDRV_PROTOCOL_MAJOR(uversion) || \ + (SNDRV_PROTOCOL_MAJOR(kversion) == SNDRV_PROTOCOL_MAJOR(uversion) && \ + SNDRV_PROTOCOL_MINOR(kversion) != SNDRV_PROTOCOL_MINOR(uversion))) + +/**************************************************************************** + * * + * Digital audio interface * + * * + ****************************************************************************/ + +#define AES_IEC958_STATUS_SIZE 24 + +struct snd_aes_iec958 { + unsigned char status[AES_IEC958_STATUS_SIZE]; /* AES/IEC958 channel status bits */ + unsigned char subcode[147]; /* AES/IEC958 subcode bits */ + unsigned char pad; /* nothing */ + unsigned char dig_subframe[4]; /* AES/IEC958 subframe bits */ +}; + +/**************************************************************************** + * * + * CEA-861 Audio InfoFrame. Used in HDMI and DisplayPort * + * * + ****************************************************************************/ + +struct snd_cea_861_aud_if { + unsigned char db1_ct_cc; /* coding type and channel count */ + unsigned char db2_sf_ss; /* sample frequency and size */ + unsigned char db3; /* not used, all zeros */ + unsigned char db4_ca; /* channel allocation code */ + unsigned char db5_dminh_lsv; /* downmix inhibit & level-shit values */ +}; + +/**************************************************************************** + * * + * Section for driver hardware dependent interface - /dev/snd/hw? * + * * + ****************************************************************************/ + +#define SNDRV_HWDEP_VERSION SNDRV_PROTOCOL_VERSION(1, 0, 1) + +enum { + SNDRV_HWDEP_IFACE_OPL2 = 0, + SNDRV_HWDEP_IFACE_OPL3, + SNDRV_HWDEP_IFACE_OPL4, + SNDRV_HWDEP_IFACE_SB16CSP, /* Creative Signal Processor */ + SNDRV_HWDEP_IFACE_EMU10K1, /* FX8010 processor in EMU10K1 chip */ + SNDRV_HWDEP_IFACE_YSS225, /* Yamaha FX processor */ + SNDRV_HWDEP_IFACE_ICS2115, /* Wavetable synth */ + SNDRV_HWDEP_IFACE_SSCAPE, /* Ensoniq SoundScape ISA card (MC68EC000) */ + SNDRV_HWDEP_IFACE_VX, /* Digigram VX cards */ + SNDRV_HWDEP_IFACE_MIXART, /* Digigram miXart cards */ + SNDRV_HWDEP_IFACE_USX2Y, /* Tascam US122, US224 & US428 usb */ + SNDRV_HWDEP_IFACE_EMUX_WAVETABLE, /* EmuX wavetable */ + SNDRV_HWDEP_IFACE_BLUETOOTH, /* Bluetooth audio */ + SNDRV_HWDEP_IFACE_USX2Y_PCM, /* Tascam US122, US224 & US428 rawusb pcm */ + SNDRV_HWDEP_IFACE_PCXHR, /* Digigram PCXHR */ + SNDRV_HWDEP_IFACE_SB_RC, /* SB Extigy/Audigy2NX remote control */ + SNDRV_HWDEP_IFACE_HDA, /* HD-audio */ + SNDRV_HWDEP_IFACE_USB_STREAM, /* direct access to usb stream */ + SNDRV_HWDEP_IFACE_FW_DICE, /* TC DICE FireWire device */ + SNDRV_HWDEP_IFACE_FW_FIREWORKS, /* Echo Audio Fireworks based device */ + SNDRV_HWDEP_IFACE_FW_BEBOB, /* BridgeCo BeBoB based device */ + SNDRV_HWDEP_IFACE_FW_OXFW, /* Oxford OXFW970/971 based device */ + SNDRV_HWDEP_IFACE_FW_DIGI00X, /* Digidesign Digi 002/003 family */ + SNDRV_HWDEP_IFACE_FW_TASCAM, /* TASCAM FireWire series */ + SNDRV_HWDEP_IFACE_LINE6, /* Line6 USB processors */ + SNDRV_HWDEP_IFACE_FW_MOTU, /* MOTU FireWire series */ + SNDRV_HWDEP_IFACE_FW_FIREFACE, /* RME Fireface series */ + + /* Don't forget to change the following: */ + SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_FIREFACE +}; + +struct snd_hwdep_info { + unsigned int device; /* WR: device number */ + int card; /* R: card number */ + unsigned char id[64]; /* ID (user selectable) */ + unsigned char name[80]; /* hwdep name */ + int iface; /* hwdep interface */ + unsigned char reserved[64]; /* reserved for future */ +}; + +/* generic DSP loader */ +struct snd_hwdep_dsp_status { + unsigned int version; /* R: driver-specific version */ + unsigned char id[32]; /* R: driver-specific ID string */ + unsigned int num_dsps; /* R: number of DSP images to transfer */ + unsigned int dsp_loaded; /* R: bit flags indicating the loaded DSPs */ + unsigned int chip_ready; /* R: 1 = initialization finished */ + unsigned char reserved[16]; /* reserved for future use */ +}; + +struct snd_hwdep_dsp_image { + unsigned int index; /* W: DSP index */ + unsigned char name[64]; /* W: ID (e.g. file name) */ + unsigned char __user *image; /* W: binary image */ + size_t length; /* W: size of image in bytes */ + unsigned long driver_data; /* W: driver-specific data */ +}; + +#define SNDRV_HWDEP_IOCTL_PVERSION _IOR ('H', 0x00, int) +#define SNDRV_HWDEP_IOCTL_INFO _IOR ('H', 0x01, struct snd_hwdep_info) +#define SNDRV_HWDEP_IOCTL_DSP_STATUS _IOR('H', 0x02, struct snd_hwdep_dsp_status) +#define SNDRV_HWDEP_IOCTL_DSP_LOAD _IOW('H', 0x03, struct snd_hwdep_dsp_image) + +/***************************************************************************** + * * + * Digital Audio (PCM) interface - /dev/snd/pcm?? * + * * + *****************************************************************************/ + +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 16) + +typedef unsigned long snd_pcm_uframes_t; +typedef signed long snd_pcm_sframes_t; + +enum { + SNDRV_PCM_CLASS_GENERIC = 0, /* standard mono or stereo device */ + SNDRV_PCM_CLASS_MULTI, /* multichannel device */ + SNDRV_PCM_CLASS_MODEM, /* software modem class */ + SNDRV_PCM_CLASS_DIGITIZER, /* digitizer class */ + /* Don't forget to change the following: */ + SNDRV_PCM_CLASS_LAST = SNDRV_PCM_CLASS_DIGITIZER, +}; + +enum { + SNDRV_PCM_SUBCLASS_GENERIC_MIX = 0, /* mono or stereo subdevices are mixed together */ + SNDRV_PCM_SUBCLASS_MULTI_MIX, /* multichannel subdevices are mixed together */ + /* Don't forget to change the following: */ + SNDRV_PCM_SUBCLASS_LAST = SNDRV_PCM_SUBCLASS_MULTI_MIX, +}; + +enum { + SNDRV_PCM_STREAM_PLAYBACK = 0, + SNDRV_PCM_STREAM_CAPTURE, + SNDRV_PCM_STREAM_LAST = SNDRV_PCM_STREAM_CAPTURE, +}; + +typedef int __bitwise snd_pcm_access_t; +#define SNDRV_PCM_ACCESS_MMAP_INTERLEAVED ((__force snd_pcm_access_t) 0) /* interleaved mmap */ +#define SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED ((__force snd_pcm_access_t) 1) /* noninterleaved mmap */ +#define SNDRV_PCM_ACCESS_MMAP_COMPLEX ((__force snd_pcm_access_t) 2) /* complex mmap */ +#define SNDRV_PCM_ACCESS_RW_INTERLEAVED ((__force snd_pcm_access_t) 3) /* readi/writei */ +#define SNDRV_PCM_ACCESS_RW_NONINTERLEAVED ((__force snd_pcm_access_t) 4) /* readn/writen */ +#define SNDRV_PCM_ACCESS_LAST SNDRV_PCM_ACCESS_RW_NONINTERLEAVED + +typedef int __bitwise snd_pcm_format_t; +#define SNDRV_PCM_FORMAT_S8 ((__force snd_pcm_format_t) 0) +#define SNDRV_PCM_FORMAT_U8 ((__force snd_pcm_format_t) 1) +#define SNDRV_PCM_FORMAT_S16_LE ((__force snd_pcm_format_t) 2) +#define SNDRV_PCM_FORMAT_S16_BE ((__force snd_pcm_format_t) 3) +#define SNDRV_PCM_FORMAT_U16_LE ((__force snd_pcm_format_t) 4) +#define SNDRV_PCM_FORMAT_U16_BE ((__force snd_pcm_format_t) 5) +#define SNDRV_PCM_FORMAT_S24_LE ((__force snd_pcm_format_t) 6) /* low three bytes */ +#define SNDRV_PCM_FORMAT_S24_BE ((__force snd_pcm_format_t) 7) /* low three bytes */ +#define SNDRV_PCM_FORMAT_U24_LE ((__force snd_pcm_format_t) 8) /* low three bytes */ +#define SNDRV_PCM_FORMAT_U24_BE ((__force snd_pcm_format_t) 9) /* low three bytes */ +/* + * For S32/U32 formats, 'msbits' hardware parameter is often used to deliver information about the + * available bit count in most significant bit. It's for the case of so-called 'left-justified' or + * `right-padding` sample which has less width than 32 bit. + */ +#define SNDRV_PCM_FORMAT_S32_LE ((__force snd_pcm_format_t) 10) +#define SNDRV_PCM_FORMAT_S32_BE ((__force snd_pcm_format_t) 11) +#define SNDRV_PCM_FORMAT_U32_LE ((__force snd_pcm_format_t) 12) +#define SNDRV_PCM_FORMAT_U32_BE ((__force snd_pcm_format_t) 13) +#define SNDRV_PCM_FORMAT_FLOAT_LE ((__force snd_pcm_format_t) 14) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_FLOAT_BE ((__force snd_pcm_format_t) 15) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_FLOAT64_LE ((__force snd_pcm_format_t) 16) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_FLOAT64_BE ((__force snd_pcm_format_t) 17) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */ +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE ((__force snd_pcm_format_t) 18) /* IEC-958 subframe, Little Endian */ +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE ((__force snd_pcm_format_t) 19) /* IEC-958 subframe, Big Endian */ +#define SNDRV_PCM_FORMAT_MU_LAW ((__force snd_pcm_format_t) 20) +#define SNDRV_PCM_FORMAT_A_LAW ((__force snd_pcm_format_t) 21) +#define SNDRV_PCM_FORMAT_IMA_ADPCM ((__force snd_pcm_format_t) 22) +#define SNDRV_PCM_FORMAT_MPEG ((__force snd_pcm_format_t) 23) +#define SNDRV_PCM_FORMAT_GSM ((__force snd_pcm_format_t) 24) +#define SNDRV_PCM_FORMAT_S20_LE ((__force snd_pcm_format_t) 25) /* in four bytes, LSB justified */ +#define SNDRV_PCM_FORMAT_S20_BE ((__force snd_pcm_format_t) 26) /* in four bytes, LSB justified */ +#define SNDRV_PCM_FORMAT_U20_LE ((__force snd_pcm_format_t) 27) /* in four bytes, LSB justified */ +#define SNDRV_PCM_FORMAT_U20_BE ((__force snd_pcm_format_t) 28) /* in four bytes, LSB justified */ +/* gap in the numbering for a future standard linear format */ +#define SNDRV_PCM_FORMAT_SPECIAL ((__force snd_pcm_format_t) 31) +#define SNDRV_PCM_FORMAT_S24_3LE ((__force snd_pcm_format_t) 32) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S24_3BE ((__force snd_pcm_format_t) 33) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U24_3LE ((__force snd_pcm_format_t) 34) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U24_3BE ((__force snd_pcm_format_t) 35) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S20_3LE ((__force snd_pcm_format_t) 36) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S20_3BE ((__force snd_pcm_format_t) 37) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U20_3LE ((__force snd_pcm_format_t) 38) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U20_3BE ((__force snd_pcm_format_t) 39) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S18_3LE ((__force snd_pcm_format_t) 40) /* in three bytes */ +#define SNDRV_PCM_FORMAT_S18_3BE ((__force snd_pcm_format_t) 41) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U18_3LE ((__force snd_pcm_format_t) 42) /* in three bytes */ +#define SNDRV_PCM_FORMAT_U18_3BE ((__force snd_pcm_format_t) 43) /* in three bytes */ +#define SNDRV_PCM_FORMAT_G723_24 ((__force snd_pcm_format_t) 44) /* 8 samples in 3 bytes */ +#define SNDRV_PCM_FORMAT_G723_24_1B ((__force snd_pcm_format_t) 45) /* 1 sample in 1 byte */ +#define SNDRV_PCM_FORMAT_G723_40 ((__force snd_pcm_format_t) 46) /* 8 Samples in 5 bytes */ +#define SNDRV_PCM_FORMAT_G723_40_1B ((__force snd_pcm_format_t) 47) /* 1 sample in 1 byte */ +#define SNDRV_PCM_FORMAT_DSD_U8 ((__force snd_pcm_format_t) 48) /* DSD, 1-byte samples DSD (x8) */ +#define SNDRV_PCM_FORMAT_DSD_U16_LE ((__force snd_pcm_format_t) 49) /* DSD, 2-byte samples DSD (x16), little endian */ +#define SNDRV_PCM_FORMAT_DSD_U32_LE ((__force snd_pcm_format_t) 50) /* DSD, 4-byte samples DSD (x32), little endian */ +#define SNDRV_PCM_FORMAT_DSD_U16_BE ((__force snd_pcm_format_t) 51) /* DSD, 2-byte samples DSD (x16), big endian */ +#define SNDRV_PCM_FORMAT_DSD_U32_BE ((__force snd_pcm_format_t) 52) /* DSD, 4-byte samples DSD (x32), big endian */ +#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U32_BE +#define SNDRV_PCM_FORMAT_FIRST SNDRV_PCM_FORMAT_S8 + +#ifdef SNDRV_LITTLE_ENDIAN +#define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_LE +#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_LE +#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_LE +#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_LE +#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_LE +#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_LE +#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_LE +#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_LE +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE +#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_LE +#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_LE +#endif +#ifdef SNDRV_BIG_ENDIAN +#define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_BE +#define SNDRV_PCM_FORMAT_U16 SNDRV_PCM_FORMAT_U16_BE +#define SNDRV_PCM_FORMAT_S24 SNDRV_PCM_FORMAT_S24_BE +#define SNDRV_PCM_FORMAT_U24 SNDRV_PCM_FORMAT_U24_BE +#define SNDRV_PCM_FORMAT_S32 SNDRV_PCM_FORMAT_S32_BE +#define SNDRV_PCM_FORMAT_U32 SNDRV_PCM_FORMAT_U32_BE +#define SNDRV_PCM_FORMAT_FLOAT SNDRV_PCM_FORMAT_FLOAT_BE +#define SNDRV_PCM_FORMAT_FLOAT64 SNDRV_PCM_FORMAT_FLOAT64_BE +#define SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE +#define SNDRV_PCM_FORMAT_S20 SNDRV_PCM_FORMAT_S20_BE +#define SNDRV_PCM_FORMAT_U20 SNDRV_PCM_FORMAT_U20_BE +#endif + +typedef int __bitwise snd_pcm_subformat_t; +#define SNDRV_PCM_SUBFORMAT_STD ((__force snd_pcm_subformat_t) 0) +#define SNDRV_PCM_SUBFORMAT_MSBITS_MAX ((__force snd_pcm_subformat_t) 1) +#define SNDRV_PCM_SUBFORMAT_MSBITS_20 ((__force snd_pcm_subformat_t) 2) +#define SNDRV_PCM_SUBFORMAT_MSBITS_24 ((__force snd_pcm_subformat_t) 3) +#define SNDRV_PCM_SUBFORMAT_LAST SNDRV_PCM_SUBFORMAT_MSBITS_24 + +#define SNDRV_PCM_INFO_MMAP 0x00000001 /* hardware supports mmap */ +#define SNDRV_PCM_INFO_MMAP_VALID 0x00000002 /* period data are valid during transfer */ +#define SNDRV_PCM_INFO_DOUBLE 0x00000004 /* Double buffering needed for PCM start/stop */ +#define SNDRV_PCM_INFO_BATCH 0x00000010 /* double buffering */ +#define SNDRV_PCM_INFO_SYNC_APPLPTR 0x00000020 /* need the explicit sync of appl_ptr update */ +#define SNDRV_PCM_INFO_PERFECT_DRAIN 0x00000040 /* silencing at the end of stream is not required */ +#define SNDRV_PCM_INFO_INTERLEAVED 0x00000100 /* channels are interleaved */ +#define SNDRV_PCM_INFO_NONINTERLEAVED 0x00000200 /* channels are not interleaved */ +#define SNDRV_PCM_INFO_COMPLEX 0x00000400 /* complex frame organization (mmap only) */ +#define SNDRV_PCM_INFO_BLOCK_TRANSFER 0x00010000 /* hardware transfer block of samples */ +#define SNDRV_PCM_INFO_OVERRANGE 0x00020000 /* hardware supports ADC (capture) overrange detection */ +#define SNDRV_PCM_INFO_RESUME 0x00040000 /* hardware supports stream resume after suspend */ +#define SNDRV_PCM_INFO_PAUSE 0x00080000 /* pause ioctl is supported */ +#define SNDRV_PCM_INFO_HALF_DUPLEX 0x00100000 /* only half duplex */ +#define SNDRV_PCM_INFO_JOINT_DUPLEX 0x00200000 /* playback and capture stream are somewhat correlated */ +#define SNDRV_PCM_INFO_SYNC_START 0x00400000 /* pcm support some kind of sync go */ +#define SNDRV_PCM_INFO_NO_PERIOD_WAKEUP 0x00800000 /* period wakeup can be disabled */ +#define SNDRV_PCM_INFO_HAS_WALL_CLOCK 0x01000000 /* (Deprecated)has audio wall clock for audio/system time sync */ +#define SNDRV_PCM_INFO_HAS_LINK_ATIME 0x01000000 /* report hardware link audio time, reset on startup */ +#define SNDRV_PCM_INFO_HAS_LINK_ABSOLUTE_ATIME 0x02000000 /* report absolute hardware link audio time, not reset on startup */ +#define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME 0x04000000 /* report estimated link audio time */ +#define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000 /* report synchronized audio/system time */ +#define SNDRV_PCM_INFO_EXPLICIT_SYNC 0x10000000 /* needs explicit sync of pointers and data */ +#define SNDRV_PCM_INFO_NO_REWINDS 0x20000000 /* hardware can only support monotonic changes of appl_ptr */ +#define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ +#define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ + +#if (__BITS_PER_LONG == 32 && defined(__USE_TIME_BITS64)) || defined __KERNEL__ +#define __SND_STRUCT_TIME64 +#endif + +typedef int __bitwise snd_pcm_state_t; +#define SNDRV_PCM_STATE_OPEN ((__force snd_pcm_state_t) 0) /* stream is open */ +#define SNDRV_PCM_STATE_SETUP ((__force snd_pcm_state_t) 1) /* stream has a setup */ +#define SNDRV_PCM_STATE_PREPARED ((__force snd_pcm_state_t) 2) /* stream is ready to start */ +#define SNDRV_PCM_STATE_RUNNING ((__force snd_pcm_state_t) 3) /* stream is running */ +#define SNDRV_PCM_STATE_XRUN ((__force snd_pcm_state_t) 4) /* stream reached an xrun */ +#define SNDRV_PCM_STATE_DRAINING ((__force snd_pcm_state_t) 5) /* stream is draining */ +#define SNDRV_PCM_STATE_PAUSED ((__force snd_pcm_state_t) 6) /* stream is paused */ +#define SNDRV_PCM_STATE_SUSPENDED ((__force snd_pcm_state_t) 7) /* hardware is suspended */ +#define SNDRV_PCM_STATE_DISCONNECTED ((__force snd_pcm_state_t) 8) /* hardware is disconnected */ +#define SNDRV_PCM_STATE_LAST SNDRV_PCM_STATE_DISCONNECTED + +enum { + SNDRV_PCM_MMAP_OFFSET_DATA = 0x00000000, + SNDRV_PCM_MMAP_OFFSET_STATUS_OLD = 0x80000000, + SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD = 0x81000000, + SNDRV_PCM_MMAP_OFFSET_STATUS_NEW = 0x82000000, + SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW = 0x83000000, +#ifdef __SND_STRUCT_TIME64 + SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_NEW, + SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW, +#else + SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_OLD, + SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD, +#endif +}; + +union snd_pcm_sync_id { + unsigned char id[16]; + unsigned short id16[8]; + unsigned int id32[4]; +}; + +struct snd_pcm_info { + unsigned int device; /* RO/WR (control): device number */ + unsigned int subdevice; /* RO/WR (control): subdevice number */ + int stream; /* RO/WR (control): stream direction */ + int card; /* R: card number */ + unsigned char id[64]; /* ID (user selectable) */ + unsigned char name[80]; /* name of this device */ + unsigned char subname[32]; /* subdevice name */ + int dev_class; /* SNDRV_PCM_CLASS_* */ + int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */ + unsigned int subdevices_count; + unsigned int subdevices_avail; + union snd_pcm_sync_id sync; /* hardware synchronization ID */ + unsigned char reserved[64]; /* reserved for future... */ +}; + +typedef int snd_pcm_hw_param_t; +#define SNDRV_PCM_HW_PARAM_ACCESS 0 /* Access type */ +#define SNDRV_PCM_HW_PARAM_FORMAT 1 /* Format */ +#define SNDRV_PCM_HW_PARAM_SUBFORMAT 2 /* Subformat */ +#define SNDRV_PCM_HW_PARAM_FIRST_MASK SNDRV_PCM_HW_PARAM_ACCESS +#define SNDRV_PCM_HW_PARAM_LAST_MASK SNDRV_PCM_HW_PARAM_SUBFORMAT + +#define SNDRV_PCM_HW_PARAM_SAMPLE_BITS 8 /* Bits per sample */ +#define SNDRV_PCM_HW_PARAM_FRAME_BITS 9 /* Bits per frame */ +#define SNDRV_PCM_HW_PARAM_CHANNELS 10 /* Channels */ +#define SNDRV_PCM_HW_PARAM_RATE 11 /* Approx rate */ +#define SNDRV_PCM_HW_PARAM_PERIOD_TIME 12 /* Approx distance between + * interrupts in us + */ +#define SNDRV_PCM_HW_PARAM_PERIOD_SIZE 13 /* Approx frames between + * interrupts + */ +#define SNDRV_PCM_HW_PARAM_PERIOD_BYTES 14 /* Approx bytes between + * interrupts + */ +#define SNDRV_PCM_HW_PARAM_PERIODS 15 /* Approx interrupts per + * buffer + */ +#define SNDRV_PCM_HW_PARAM_BUFFER_TIME 16 /* Approx duration of buffer + * in us + */ +#define SNDRV_PCM_HW_PARAM_BUFFER_SIZE 17 /* Size of buffer in frames */ +#define SNDRV_PCM_HW_PARAM_BUFFER_BYTES 18 /* Size of buffer in bytes */ +#define SNDRV_PCM_HW_PARAM_TICK_TIME 19 /* Approx tick duration in us */ +#define SNDRV_PCM_HW_PARAM_FIRST_INTERVAL SNDRV_PCM_HW_PARAM_SAMPLE_BITS +#define SNDRV_PCM_HW_PARAM_LAST_INTERVAL SNDRV_PCM_HW_PARAM_TICK_TIME + +#define SNDRV_PCM_HW_PARAMS_NORESAMPLE (1<<0) /* avoid rate resampling */ +#define SNDRV_PCM_HW_PARAMS_EXPORT_BUFFER (1<<1) /* export buffer */ +#define SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP (1<<2) /* disable period wakeups */ +#define SNDRV_PCM_HW_PARAMS_NO_DRAIN_SILENCE (1<<3) /* suppress drain with the filling + * of the silence samples + */ + +struct snd_interval { + unsigned int min, max; + unsigned int openmin:1, + openmax:1, + integer:1, + empty:1; +}; + +#define SNDRV_MASK_MAX 256 + +struct snd_mask { + __u32 bits[(SNDRV_MASK_MAX+31)/32]; +}; + +struct snd_pcm_hw_params { + unsigned int flags; + struct snd_mask masks[SNDRV_PCM_HW_PARAM_LAST_MASK - + SNDRV_PCM_HW_PARAM_FIRST_MASK + 1]; + struct snd_mask mres[5]; /* reserved masks */ + struct snd_interval intervals[SNDRV_PCM_HW_PARAM_LAST_INTERVAL - + SNDRV_PCM_HW_PARAM_FIRST_INTERVAL + 1]; + struct snd_interval ires[9]; /* reserved intervals */ + unsigned int rmask; /* W: requested masks */ + unsigned int cmask; /* R: changed masks */ + unsigned int info; /* R: Info flags for returned setup */ + unsigned int msbits; /* R: used most significant bits */ + unsigned int rate_num; /* R: rate numerator */ + unsigned int rate_den; /* R: rate denominator */ + snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ + unsigned char reserved[64]; /* reserved for future */ +}; + +enum { + SNDRV_PCM_TSTAMP_NONE = 0, + SNDRV_PCM_TSTAMP_ENABLE, + SNDRV_PCM_TSTAMP_LAST = SNDRV_PCM_TSTAMP_ENABLE, +}; + +struct snd_pcm_sw_params { + int tstamp_mode; /* timestamp mode */ + unsigned int period_step; + unsigned int sleep_min; /* min ticks to sleep */ + snd_pcm_uframes_t avail_min; /* min avail frames for wakeup */ + snd_pcm_uframes_t xfer_align; /* obsolete: xfer size need to be a multiple */ + snd_pcm_uframes_t start_threshold; /* min hw_avail frames for automatic start */ + /* + * The following two thresholds alleviate playback buffer underruns; when + * hw_avail drops below the threshold, the respective action is triggered: + */ + snd_pcm_uframes_t stop_threshold; /* - stop playback */ + snd_pcm_uframes_t silence_threshold; /* - pre-fill buffer with silence */ + snd_pcm_uframes_t silence_size; /* max size of silence pre-fill; when >= boundary, + * fill played area with silence immediately */ + snd_pcm_uframes_t boundary; /* pointers wrap point */ + unsigned int proto; /* protocol version */ + unsigned int tstamp_type; /* timestamp type (req. proto >= 2.0.12) */ + unsigned char reserved[56]; /* reserved for future */ +}; + +struct snd_pcm_channel_info { + unsigned int channel; + __kernel_off_t offset; /* mmap offset */ + unsigned int first; /* offset to first sample in bits */ + unsigned int step; /* samples distance in bits */ +}; + +enum { + /* + * first definition for backwards compatibility only, + * maps to wallclock/link time for HDAudio playback and DEFAULT/DMA time for everything else + */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT = 0, + + /* timestamp definitions */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT = 1, /* DMA time, reported as per hw_ptr */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK = 2, /* link time reported by sample or wallclock counter, reset on startup */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ABSOLUTE = 3, /* link time reported by sample or wallclock counter, not reset on startup */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ESTIMATED = 4, /* link time estimated indirectly */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED = 5, /* link time synchronized with system time */ + SNDRV_PCM_AUDIO_TSTAMP_TYPE_LAST = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED +}; + +#ifndef __KERNEL__ +/* explicit padding avoids incompatibility between i386 and x86-64 */ +typedef struct { unsigned char pad[sizeof(time_t) - sizeof(int)]; } __time_pad; + +struct snd_pcm_status { + snd_pcm_state_t state; /* stream state */ + __time_pad pad1; /* align to timespec */ + struct timespec trigger_tstamp; /* time when stream was started/stopped/paused */ + struct timespec tstamp; /* reference timestamp */ + snd_pcm_uframes_t appl_ptr; /* appl ptr */ + snd_pcm_uframes_t hw_ptr; /* hw ptr */ + snd_pcm_sframes_t delay; /* current delay in frames */ + snd_pcm_uframes_t avail; /* number of frames available */ + snd_pcm_uframes_t avail_max; /* max frames available on hw since last status */ + snd_pcm_uframes_t overrange; /* count of ADC (capture) overrange detections from last status */ + snd_pcm_state_t suspended_state; /* suspended stream state */ + __u32 audio_tstamp_data; /* needed for 64-bit alignment, used for configs/report to/from userspace */ + struct timespec audio_tstamp; /* sample counter, wall clock, PHC or on-demand sync'ed */ + struct timespec driver_tstamp; /* useful in case reference system tstamp is reported with delay */ + __u32 audio_tstamp_accuracy; /* in ns units, only valid if indicated in audio_tstamp_data */ + unsigned char reserved[52-2*sizeof(struct timespec)]; /* must be filled with zero */ +}; +#endif + +/* + * For mmap operations, we need the 64-bit layout, both for compat mode, + * and for y2038 compatibility. For 64-bit applications, the two definitions + * are identical, so we keep the traditional version. + */ +#ifdef __SND_STRUCT_TIME64 +#define __snd_pcm_mmap_status64 snd_pcm_mmap_status +#define __snd_pcm_mmap_control64 snd_pcm_mmap_control +#define __snd_pcm_sync_ptr64 snd_pcm_sync_ptr +#ifdef __KERNEL__ +#define __snd_timespec64 __kernel_timespec +#else +#define __snd_timespec64 timespec +#endif +struct __snd_timespec { + __s32 tv_sec; + __s32 tv_nsec; +}; +#else +#define __snd_pcm_mmap_status snd_pcm_mmap_status +#define __snd_pcm_mmap_control snd_pcm_mmap_control +#define __snd_pcm_sync_ptr snd_pcm_sync_ptr +#define __snd_timespec timespec +struct __snd_timespec64 { + __s64 tv_sec; + __s64 tv_nsec; +}; + +#endif + +struct __snd_pcm_mmap_status { + snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ + int pad1; /* Needed for 64 bit alignment */ + snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ + struct __snd_timespec tstamp; /* Timestamp */ + snd_pcm_state_t suspended_state; /* RO: suspended stream state */ + struct __snd_timespec audio_tstamp; /* from sample counter or wall clock */ +}; + +struct __snd_pcm_mmap_control { + snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ + snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ +}; + +#define SNDRV_PCM_SYNC_PTR_HWSYNC (1<<0) /* execute hwsync */ +#define SNDRV_PCM_SYNC_PTR_APPL (1<<1) /* get appl_ptr from driver (r/w op) */ +#define SNDRV_PCM_SYNC_PTR_AVAIL_MIN (1<<2) /* get avail_min from driver */ + +struct __snd_pcm_sync_ptr { + unsigned int flags; + union { + struct __snd_pcm_mmap_status status; + unsigned char reserved[64]; + } s; + union { + struct __snd_pcm_mmap_control control; + unsigned char reserved[64]; + } c; +}; + +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) +typedef char __pad_before_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; +typedef char __pad_after_uframe[0]; +#endif + +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) +typedef char __pad_before_uframe[0]; +typedef char __pad_after_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; +#endif + +struct __snd_pcm_mmap_status64 { + snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ + __u32 pad1; /* Needed for 64 bit alignment */ + __pad_before_uframe __pad1; + snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ + __pad_after_uframe __pad2; + struct __snd_timespec64 tstamp; /* Timestamp */ + snd_pcm_state_t suspended_state;/* RO: suspended stream state */ + __u32 pad3; /* Needed for 64 bit alignment */ + struct __snd_timespec64 audio_tstamp; /* sample counter or wall clock */ +}; + +struct __snd_pcm_mmap_control64 { + __pad_before_uframe __pad1; + snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ + __pad_before_uframe __pad2; // This should be __pad_after_uframe, but binary + // backwards compatibility constraints prevent a fix. + + __pad_before_uframe __pad3; + snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ + __pad_after_uframe __pad4; +}; + +struct __snd_pcm_sync_ptr64 { + __u32 flags; + __u32 pad1; + union { + struct __snd_pcm_mmap_status64 status; + unsigned char reserved[64]; + } s; + union { + struct __snd_pcm_mmap_control64 control; + unsigned char reserved[64]; + } c; +}; + +struct snd_xferi { + snd_pcm_sframes_t result; + void __user *buf; + snd_pcm_uframes_t frames; +}; + +struct snd_xfern { + snd_pcm_sframes_t result; + void __user * __user *bufs; + snd_pcm_uframes_t frames; +}; + +enum { + SNDRV_PCM_TSTAMP_TYPE_GETTIMEOFDAY = 0, /* gettimeofday equivalent */ + SNDRV_PCM_TSTAMP_TYPE_MONOTONIC, /* posix_clock_monotonic equivalent */ + SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, /* monotonic_raw (no NTP) */ + SNDRV_PCM_TSTAMP_TYPE_LAST = SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW, +}; + +/* channel positions */ +enum { + SNDRV_CHMAP_UNKNOWN = 0, + SNDRV_CHMAP_NA, /* N/A, silent */ + SNDRV_CHMAP_MONO, /* mono stream */ + /* this follows the alsa-lib mixer channel value + 3 */ + SNDRV_CHMAP_FL, /* front left */ + SNDRV_CHMAP_FR, /* front right */ + SNDRV_CHMAP_RL, /* rear left */ + SNDRV_CHMAP_RR, /* rear right */ + SNDRV_CHMAP_FC, /* front center */ + SNDRV_CHMAP_LFE, /* LFE */ + SNDRV_CHMAP_SL, /* side left */ + SNDRV_CHMAP_SR, /* side right */ + SNDRV_CHMAP_RC, /* rear center */ + /* new definitions */ + SNDRV_CHMAP_FLC, /* front left center */ + SNDRV_CHMAP_FRC, /* front right center */ + SNDRV_CHMAP_RLC, /* rear left center */ + SNDRV_CHMAP_RRC, /* rear right center */ + SNDRV_CHMAP_FLW, /* front left wide */ + SNDRV_CHMAP_FRW, /* front right wide */ + SNDRV_CHMAP_FLH, /* front left high */ + SNDRV_CHMAP_FCH, /* front center high */ + SNDRV_CHMAP_FRH, /* front right high */ + SNDRV_CHMAP_TC, /* top center */ + SNDRV_CHMAP_TFL, /* top front left */ + SNDRV_CHMAP_TFR, /* top front right */ + SNDRV_CHMAP_TFC, /* top front center */ + SNDRV_CHMAP_TRL, /* top rear left */ + SNDRV_CHMAP_TRR, /* top rear right */ + SNDRV_CHMAP_TRC, /* top rear center */ + /* new definitions for UAC2 */ + SNDRV_CHMAP_TFLC, /* top front left center */ + SNDRV_CHMAP_TFRC, /* top front right center */ + SNDRV_CHMAP_TSL, /* top side left */ + SNDRV_CHMAP_TSR, /* top side right */ + SNDRV_CHMAP_LLFE, /* left LFE */ + SNDRV_CHMAP_RLFE, /* right LFE */ + SNDRV_CHMAP_BC, /* bottom center */ + SNDRV_CHMAP_BLC, /* bottom left center */ + SNDRV_CHMAP_BRC, /* bottom right center */ + SNDRV_CHMAP_LAST = SNDRV_CHMAP_BRC, +}; + +#define SNDRV_CHMAP_POSITION_MASK 0xffff +#define SNDRV_CHMAP_PHASE_INVERSE (0x01 << 16) +#define SNDRV_CHMAP_DRIVER_SPEC (0x02 << 16) + +#define SNDRV_PCM_IOCTL_PVERSION _IOR('A', 0x00, int) +#define SNDRV_PCM_IOCTL_INFO _IOR('A', 0x01, struct snd_pcm_info) +#define SNDRV_PCM_IOCTL_TSTAMP _IOW('A', 0x02, int) +#define SNDRV_PCM_IOCTL_TTSTAMP _IOW('A', 0x03, int) +#define SNDRV_PCM_IOCTL_USER_PVERSION _IOW('A', 0x04, int) +#define SNDRV_PCM_IOCTL_HW_REFINE _IOWR('A', 0x10, struct snd_pcm_hw_params) +#define SNDRV_PCM_IOCTL_HW_PARAMS _IOWR('A', 0x11, struct snd_pcm_hw_params) +#define SNDRV_PCM_IOCTL_HW_FREE _IO('A', 0x12) +#define SNDRV_PCM_IOCTL_SW_PARAMS _IOWR('A', 0x13, struct snd_pcm_sw_params) +#define SNDRV_PCM_IOCTL_STATUS _IOR('A', 0x20, struct snd_pcm_status) +#define SNDRV_PCM_IOCTL_DELAY _IOR('A', 0x21, snd_pcm_sframes_t) +#define SNDRV_PCM_IOCTL_HWSYNC _IO('A', 0x22) +#define __SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct __snd_pcm_sync_ptr) +#define __SNDRV_PCM_IOCTL_SYNC_PTR64 _IOWR('A', 0x23, struct __snd_pcm_sync_ptr64) +#define SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct snd_pcm_sync_ptr) +#define SNDRV_PCM_IOCTL_STATUS_EXT _IOWR('A', 0x24, struct snd_pcm_status) +#define SNDRV_PCM_IOCTL_CHANNEL_INFO _IOR('A', 0x32, struct snd_pcm_channel_info) +#define SNDRV_PCM_IOCTL_PREPARE _IO('A', 0x40) +#define SNDRV_PCM_IOCTL_RESET _IO('A', 0x41) +#define SNDRV_PCM_IOCTL_START _IO('A', 0x42) +#define SNDRV_PCM_IOCTL_DROP _IO('A', 0x43) +#define SNDRV_PCM_IOCTL_DRAIN _IO('A', 0x44) +#define SNDRV_PCM_IOCTL_PAUSE _IOW('A', 0x45, int) +#define SNDRV_PCM_IOCTL_REWIND _IOW('A', 0x46, snd_pcm_uframes_t) +#define SNDRV_PCM_IOCTL_RESUME _IO('A', 0x47) +#define SNDRV_PCM_IOCTL_XRUN _IO('A', 0x48) +#define SNDRV_PCM_IOCTL_FORWARD _IOW('A', 0x49, snd_pcm_uframes_t) +#define SNDRV_PCM_IOCTL_WRITEI_FRAMES _IOW('A', 0x50, struct snd_xferi) +#define SNDRV_PCM_IOCTL_READI_FRAMES _IOR('A', 0x51, struct snd_xferi) +#define SNDRV_PCM_IOCTL_WRITEN_FRAMES _IOW('A', 0x52, struct snd_xfern) +#define SNDRV_PCM_IOCTL_READN_FRAMES _IOR('A', 0x53, struct snd_xfern) +#define SNDRV_PCM_IOCTL_LINK _IOW('A', 0x60, int) +#define SNDRV_PCM_IOCTL_UNLINK _IO('A', 0x61) + +/***************************************************************************** + * * + * MIDI v1.0 interface * + * * + *****************************************************************************/ + +/* + * Raw MIDI section - /dev/snd/midi?? + */ + +#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 4) + +enum { + SNDRV_RAWMIDI_STREAM_OUTPUT = 0, + SNDRV_RAWMIDI_STREAM_INPUT, + SNDRV_RAWMIDI_STREAM_LAST = SNDRV_RAWMIDI_STREAM_INPUT, +}; + +#define SNDRV_RAWMIDI_INFO_OUTPUT 0x00000001 +#define SNDRV_RAWMIDI_INFO_INPUT 0x00000002 +#define SNDRV_RAWMIDI_INFO_DUPLEX 0x00000004 +#define SNDRV_RAWMIDI_INFO_UMP 0x00000008 + +struct snd_rawmidi_info { + unsigned int device; /* RO/WR (control): device number */ + unsigned int subdevice; /* RO/WR (control): subdevice number */ + int stream; /* WR: stream */ + int card; /* R: card number */ + unsigned int flags; /* SNDRV_RAWMIDI_INFO_XXXX */ + unsigned char id[64]; /* ID (user selectable) */ + unsigned char name[80]; /* name of device */ + unsigned char subname[32]; /* name of active or selected subdevice */ + unsigned int subdevices_count; + unsigned int subdevices_avail; + unsigned char reserved[64]; /* reserved for future use */ +}; + +#define SNDRV_RAWMIDI_MODE_FRAMING_MASK (7<<0) +#define SNDRV_RAWMIDI_MODE_FRAMING_SHIFT 0 +#define SNDRV_RAWMIDI_MODE_FRAMING_NONE (0<<0) +#define SNDRV_RAWMIDI_MODE_FRAMING_TSTAMP (1<<0) +#define SNDRV_RAWMIDI_MODE_CLOCK_MASK (7<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_SHIFT 3 +#define SNDRV_RAWMIDI_MODE_CLOCK_NONE (0<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_REALTIME (1<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC (2<<3) +#define SNDRV_RAWMIDI_MODE_CLOCK_MONOTONIC_RAW (3<<3) + +#define SNDRV_RAWMIDI_FRAMING_DATA_LENGTH 16 + +struct snd_rawmidi_framing_tstamp { + /* For now, frame_type is always 0. Midi 2.0 is expected to add new + * types here. Applications are expected to skip unknown frame types. + */ + __u8 frame_type; + __u8 length; /* number of valid bytes in data field */ + __u8 reserved[2]; + __u32 tv_nsec; /* nanoseconds */ + __u64 tv_sec; /* seconds */ + __u8 data[SNDRV_RAWMIDI_FRAMING_DATA_LENGTH]; +} __packed; + +struct snd_rawmidi_params { + int stream; + size_t buffer_size; /* queue size in bytes */ + size_t avail_min; /* minimum avail bytes for wakeup */ + unsigned int no_active_sensing: 1; /* do not send active sensing byte in close() */ + unsigned int mode; /* For input data only, frame incoming data */ + unsigned char reserved[12]; /* reserved for future use */ +}; + +#ifndef __KERNEL__ +struct snd_rawmidi_status { + int stream; + __time_pad pad1; + struct timespec tstamp; /* Timestamp */ + size_t avail; /* available bytes */ + size_t xruns; /* count of overruns since last status (in bytes) */ + unsigned char reserved[16]; /* reserved for future use */ +}; +#endif + +/* UMP EP info flags */ +#define SNDRV_UMP_EP_INFO_STATIC_BLOCKS 0x01 + +/* UMP EP Protocol / JRTS capability bits */ +#define SNDRV_UMP_EP_INFO_PROTO_MIDI_MASK 0x0300 +#define SNDRV_UMP_EP_INFO_PROTO_MIDI1 0x0100 /* MIDI 1.0 */ +#define SNDRV_UMP_EP_INFO_PROTO_MIDI2 0x0200 /* MIDI 2.0 */ +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_MASK 0x0003 +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_TX 0x0001 /* JRTS Transmit */ +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_RX 0x0002 /* JRTS Receive */ + +/* UMP Endpoint information */ +struct snd_ump_endpoint_info { + int card; /* card number */ + int device; /* device number */ + unsigned int flags; /* additional info */ + unsigned int protocol_caps; /* protocol capabilities */ + unsigned int protocol; /* current protocol */ + unsigned int num_blocks; /* # of function blocks */ + unsigned short version; /* UMP major/minor version */ + unsigned short family_id; /* MIDI device family ID */ + unsigned short model_id; /* MIDI family model ID */ + unsigned int manufacturer_id; /* MIDI manufacturer ID */ + unsigned char sw_revision[4]; /* software revision */ + unsigned short padding; + unsigned char name[128]; /* endpoint name string */ + unsigned char product_id[128]; /* unique product id string */ + unsigned char reserved[32]; +} __packed; + +/* UMP direction */ +#define SNDRV_UMP_DIR_INPUT 0x01 +#define SNDRV_UMP_DIR_OUTPUT 0x02 +#define SNDRV_UMP_DIR_BIDIRECTION 0x03 + +/* UMP block info flags */ +#define SNDRV_UMP_BLOCK_IS_MIDI1 (1U << 0) /* MIDI 1.0 port w/o restrict */ +#define SNDRV_UMP_BLOCK_IS_LOWSPEED (1U << 1) /* 31.25Kbps B/W MIDI1 port */ + +/* UMP block user-interface hint */ +#define SNDRV_UMP_BLOCK_UI_HINT_UNKNOWN 0x00 +#define SNDRV_UMP_BLOCK_UI_HINT_RECEIVER 0x01 +#define SNDRV_UMP_BLOCK_UI_HINT_SENDER 0x02 +#define SNDRV_UMP_BLOCK_UI_HINT_BOTH 0x03 + +/* UMP groups and blocks */ +#define SNDRV_UMP_MAX_GROUPS 16 +#define SNDRV_UMP_MAX_BLOCKS 32 + +/* UMP Block information */ +struct snd_ump_block_info { + int card; /* card number */ + int device; /* device number */ + unsigned char block_id; /* block ID (R/W) */ + unsigned char direction; /* UMP direction */ + unsigned char active; /* Activeness */ + unsigned char first_group; /* first group ID */ + unsigned char num_groups; /* number of groups */ + unsigned char midi_ci_version; /* MIDI-CI support version */ + unsigned char sysex8_streams; /* max number of sysex8 streams */ + unsigned char ui_hint; /* user interface hint */ + unsigned int flags; /* various info flags */ + unsigned char name[128]; /* block name string */ + unsigned char reserved[32]; +} __packed; + +#define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) +#define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) +#define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) +#define SNDRV_RAWMIDI_IOCTL_PARAMS _IOWR('W', 0x10, struct snd_rawmidi_params) +#define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) +#define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) +#define SNDRV_RAWMIDI_IOCTL_DRAIN _IOW('W', 0x31, int) +/* Additional ioctls for UMP rawmidi devices */ +#define SNDRV_UMP_IOCTL_ENDPOINT_INFO _IOR('W', 0x40, struct snd_ump_endpoint_info) +#define SNDRV_UMP_IOCTL_BLOCK_INFO _IOR('W', 0x41, struct snd_ump_block_info) + +/* + * Timer section - /dev/snd/timer + */ + +#define SNDRV_TIMER_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 7) + +enum { + SNDRV_TIMER_CLASS_NONE = -1, + SNDRV_TIMER_CLASS_SLAVE = 0, + SNDRV_TIMER_CLASS_GLOBAL, + SNDRV_TIMER_CLASS_CARD, + SNDRV_TIMER_CLASS_PCM, + SNDRV_TIMER_CLASS_LAST = SNDRV_TIMER_CLASS_PCM, +}; + +/* slave timer classes */ +enum { + SNDRV_TIMER_SCLASS_NONE = 0, + SNDRV_TIMER_SCLASS_APPLICATION, + SNDRV_TIMER_SCLASS_SEQUENCER, /* alias */ + SNDRV_TIMER_SCLASS_OSS_SEQUENCER, /* alias */ + SNDRV_TIMER_SCLASS_LAST = SNDRV_TIMER_SCLASS_OSS_SEQUENCER, +}; + +/* global timers (device member) */ +#define SNDRV_TIMER_GLOBAL_SYSTEM 0 +#define SNDRV_TIMER_GLOBAL_RTC 1 /* unused */ +#define SNDRV_TIMER_GLOBAL_HPET 2 +#define SNDRV_TIMER_GLOBAL_HRTIMER 3 + +/* info flags */ +#define SNDRV_TIMER_FLG_SLAVE (1<<0) /* cannot be controlled */ + +struct snd_timer_id { + int dev_class; + int dev_sclass; + int card; + int device; + int subdevice; +}; + +struct snd_timer_ginfo { + struct snd_timer_id tid; /* requested timer ID */ + unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ + int card; /* card number */ + unsigned char id[64]; /* timer identification */ + unsigned char name[80]; /* timer name */ + unsigned long reserved0; /* reserved for future use */ + unsigned long resolution; /* average period resolution in ns */ + unsigned long resolution_min; /* minimal period resolution in ns */ + unsigned long resolution_max; /* maximal period resolution in ns */ + unsigned int clients; /* active timer clients */ + unsigned char reserved[32]; +}; + +struct snd_timer_gparams { + struct snd_timer_id tid; /* requested timer ID */ + unsigned long period_num; /* requested precise period duration (in seconds) - numerator */ + unsigned long period_den; /* requested precise period duration (in seconds) - denominator */ + unsigned char reserved[32]; +}; + +struct snd_timer_gstatus { + struct snd_timer_id tid; /* requested timer ID */ + unsigned long resolution; /* current period resolution in ns */ + unsigned long resolution_num; /* precise current period resolution (in seconds) - numerator */ + unsigned long resolution_den; /* precise current period resolution (in seconds) - denominator */ + unsigned char reserved[32]; +}; + +struct snd_timer_select { + struct snd_timer_id id; /* bind to timer ID */ + unsigned char reserved[32]; /* reserved */ +}; + +struct snd_timer_info { + unsigned int flags; /* timer flags - SNDRV_TIMER_FLG_* */ + int card; /* card number */ + unsigned char id[64]; /* timer identificator */ + unsigned char name[80]; /* timer name */ + unsigned long reserved0; /* reserved for future use */ + unsigned long resolution; /* average period resolution in ns */ + unsigned char reserved[64]; /* reserved */ +}; + +#define SNDRV_TIMER_PSFLG_AUTO (1<<0) /* auto start, otherwise one-shot */ +#define SNDRV_TIMER_PSFLG_EXCLUSIVE (1<<1) /* exclusive use, precise start/stop/pause/continue */ +#define SNDRV_TIMER_PSFLG_EARLY_EVENT (1<<2) /* write early event to the poll queue */ + +struct snd_timer_params { + unsigned int flags; /* flags - SNDRV_TIMER_PSFLG_* */ + unsigned int ticks; /* requested resolution in ticks */ + unsigned int queue_size; /* total size of queue (32-1024) */ + unsigned int reserved0; /* reserved, was: failure locations */ + unsigned int filter; /* event filter (bitmask of SNDRV_TIMER_EVENT_*) */ + unsigned char reserved[60]; /* reserved */ +}; + +#ifndef __KERNEL__ +struct snd_timer_status { + struct timespec tstamp; /* Timestamp - last update */ + unsigned int resolution; /* current period resolution in ns */ + unsigned int lost; /* counter of master tick lost */ + unsigned int overrun; /* count of read queue overruns */ + unsigned int queue; /* used queue size */ + unsigned char reserved[64]; /* reserved */ +}; +#endif + +#define SNDRV_TIMER_IOCTL_PVERSION _IOR('T', 0x00, int) +#define SNDRV_TIMER_IOCTL_NEXT_DEVICE _IOWR('T', 0x01, struct snd_timer_id) +#define SNDRV_TIMER_IOCTL_TREAD_OLD _IOW('T', 0x02, int) +#define SNDRV_TIMER_IOCTL_GINFO _IOWR('T', 0x03, struct snd_timer_ginfo) +#define SNDRV_TIMER_IOCTL_GPARAMS _IOW('T', 0x04, struct snd_timer_gparams) +#define SNDRV_TIMER_IOCTL_GSTATUS _IOWR('T', 0x05, struct snd_timer_gstatus) +#define SNDRV_TIMER_IOCTL_SELECT _IOW('T', 0x10, struct snd_timer_select) +#define SNDRV_TIMER_IOCTL_INFO _IOR('T', 0x11, struct snd_timer_info) +#define SNDRV_TIMER_IOCTL_PARAMS _IOW('T', 0x12, struct snd_timer_params) +#define SNDRV_TIMER_IOCTL_STATUS _IOR('T', 0x14, struct snd_timer_status) +/* The following four ioctls are changed since 1.0.9 due to confliction */ +#define SNDRV_TIMER_IOCTL_START _IO('T', 0xa0) +#define SNDRV_TIMER_IOCTL_STOP _IO('T', 0xa1) +#define SNDRV_TIMER_IOCTL_CONTINUE _IO('T', 0xa2) +#define SNDRV_TIMER_IOCTL_PAUSE _IO('T', 0xa3) +#define SNDRV_TIMER_IOCTL_TREAD64 _IOW('T', 0xa4, int) + +#if __BITS_PER_LONG == 64 +#define SNDRV_TIMER_IOCTL_TREAD SNDRV_TIMER_IOCTL_TREAD_OLD +#else +#define SNDRV_TIMER_IOCTL_TREAD ((sizeof(__kernel_long_t) >= sizeof(time_t)) ? \ + SNDRV_TIMER_IOCTL_TREAD_OLD : \ + SNDRV_TIMER_IOCTL_TREAD64) +#endif + +struct snd_timer_read { + unsigned int resolution; + unsigned int ticks; +}; + +enum { + SNDRV_TIMER_EVENT_RESOLUTION = 0, /* val = resolution in ns */ + SNDRV_TIMER_EVENT_TICK, /* val = ticks */ + SNDRV_TIMER_EVENT_START, /* val = resolution in ns */ + SNDRV_TIMER_EVENT_STOP, /* val = 0 */ + SNDRV_TIMER_EVENT_CONTINUE, /* val = resolution in ns */ + SNDRV_TIMER_EVENT_PAUSE, /* val = 0 */ + SNDRV_TIMER_EVENT_EARLY, /* val = 0, early event */ + SNDRV_TIMER_EVENT_SUSPEND, /* val = 0 */ + SNDRV_TIMER_EVENT_RESUME, /* val = resolution in ns */ + /* master timer events for slave timer instances */ + SNDRV_TIMER_EVENT_MSTART = SNDRV_TIMER_EVENT_START + 10, + SNDRV_TIMER_EVENT_MSTOP = SNDRV_TIMER_EVENT_STOP + 10, + SNDRV_TIMER_EVENT_MCONTINUE = SNDRV_TIMER_EVENT_CONTINUE + 10, + SNDRV_TIMER_EVENT_MPAUSE = SNDRV_TIMER_EVENT_PAUSE + 10, + SNDRV_TIMER_EVENT_MSUSPEND = SNDRV_TIMER_EVENT_SUSPEND + 10, + SNDRV_TIMER_EVENT_MRESUME = SNDRV_TIMER_EVENT_RESUME + 10, +}; + +#ifndef __KERNEL__ +struct snd_timer_tread { + int event; + __time_pad pad1; + struct timespec tstamp; + unsigned int val; + __time_pad pad2; +}; +#endif + +/**************************************************************************** + * * + * Section for driver control interface - /dev/snd/control? * + * * + ****************************************************************************/ + +#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 9) + +struct snd_ctl_card_info { + int card; /* card number */ + int pad; /* reserved for future (was type) */ + unsigned char id[16]; /* ID of card (user selectable) */ + unsigned char driver[16]; /* Driver name */ + unsigned char name[32]; /* Short name of soundcard */ + unsigned char longname[80]; /* name + info text about soundcard */ + unsigned char reserved_[16]; /* reserved for future (was ID of mixer) */ + unsigned char mixername[80]; /* visual mixer identification */ + unsigned char components[128]; /* card components / fine identification, delimited with one space (AC97 etc..) */ +}; + +typedef int __bitwise snd_ctl_elem_type_t; +#define SNDRV_CTL_ELEM_TYPE_NONE ((__force snd_ctl_elem_type_t) 0) /* invalid */ +#define SNDRV_CTL_ELEM_TYPE_BOOLEAN ((__force snd_ctl_elem_type_t) 1) /* boolean type */ +#define SNDRV_CTL_ELEM_TYPE_INTEGER ((__force snd_ctl_elem_type_t) 2) /* integer type */ +#define SNDRV_CTL_ELEM_TYPE_ENUMERATED ((__force snd_ctl_elem_type_t) 3) /* enumerated type */ +#define SNDRV_CTL_ELEM_TYPE_BYTES ((__force snd_ctl_elem_type_t) 4) /* byte array */ +#define SNDRV_CTL_ELEM_TYPE_IEC958 ((__force snd_ctl_elem_type_t) 5) /* IEC958 (S/PDIF) setup */ +#define SNDRV_CTL_ELEM_TYPE_INTEGER64 ((__force snd_ctl_elem_type_t) 6) /* 64-bit integer type */ +#define SNDRV_CTL_ELEM_TYPE_LAST SNDRV_CTL_ELEM_TYPE_INTEGER64 + +typedef int __bitwise snd_ctl_elem_iface_t; +#define SNDRV_CTL_ELEM_IFACE_CARD ((__force snd_ctl_elem_iface_t) 0) /* global control */ +#define SNDRV_CTL_ELEM_IFACE_HWDEP ((__force snd_ctl_elem_iface_t) 1) /* hardware dependent device */ +#define SNDRV_CTL_ELEM_IFACE_MIXER ((__force snd_ctl_elem_iface_t) 2) /* virtual mixer device */ +#define SNDRV_CTL_ELEM_IFACE_PCM ((__force snd_ctl_elem_iface_t) 3) /* PCM device */ +#define SNDRV_CTL_ELEM_IFACE_RAWMIDI ((__force snd_ctl_elem_iface_t) 4) /* RawMidi device */ +#define SNDRV_CTL_ELEM_IFACE_TIMER ((__force snd_ctl_elem_iface_t) 5) /* timer device */ +#define SNDRV_CTL_ELEM_IFACE_SEQUENCER ((__force snd_ctl_elem_iface_t) 6) /* sequencer client */ +#define SNDRV_CTL_ELEM_IFACE_LAST SNDRV_CTL_ELEM_IFACE_SEQUENCER + +#define SNDRV_CTL_ELEM_ACCESS_READ (1<<0) +#define SNDRV_CTL_ELEM_ACCESS_WRITE (1<<1) +#define SNDRV_CTL_ELEM_ACCESS_READWRITE (SNDRV_CTL_ELEM_ACCESS_READ|SNDRV_CTL_ELEM_ACCESS_WRITE) +#define SNDRV_CTL_ELEM_ACCESS_VOLATILE (1<<2) /* control value may be changed without a notification */ +/* (1 << 3) is unused. */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_READ (1<<4) /* TLV read is possible */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_WRITE (1<<5) /* TLV write is possible */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE (SNDRV_CTL_ELEM_ACCESS_TLV_READ|SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) +#define SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND (1<<6) /* TLV command is possible */ +#define SNDRV_CTL_ELEM_ACCESS_INACTIVE (1<<8) /* control does actually nothing, but may be updated */ +#define SNDRV_CTL_ELEM_ACCESS_LOCK (1<<9) /* write lock */ +#define SNDRV_CTL_ELEM_ACCESS_OWNER (1<<10) /* write lock owner */ +#define SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK (1<<28) /* kernel use a TLV callback */ +#define SNDRV_CTL_ELEM_ACCESS_USER (1<<29) /* user space element */ +/* bits 30 and 31 are obsoleted (for indirect access) */ + +/* for further details see the ACPI and PCI power management specification */ +#define SNDRV_CTL_POWER_D0 0x0000 /* full On */ +#define SNDRV_CTL_POWER_D1 0x0100 /* partial On */ +#define SNDRV_CTL_POWER_D2 0x0200 /* partial On */ +#define SNDRV_CTL_POWER_D3 0x0300 /* Off */ +#define SNDRV_CTL_POWER_D3hot (SNDRV_CTL_POWER_D3|0x0000) /* Off, with power */ +#define SNDRV_CTL_POWER_D3cold (SNDRV_CTL_POWER_D3|0x0001) /* Off, without power */ + +#define SNDRV_CTL_ELEM_ID_NAME_MAXLEN 44 + +struct snd_ctl_elem_id { + unsigned int numid; /* numeric identifier, zero = invalid */ + snd_ctl_elem_iface_t iface; /* interface identifier */ + unsigned int device; /* device/client number */ + unsigned int subdevice; /* subdevice (substream) number */ + unsigned char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; /* ASCII name of item */ + unsigned int index; /* index of item */ +}; + +struct snd_ctl_elem_list { + unsigned int offset; /* W: first element ID to get */ + unsigned int space; /* W: count of element IDs to get */ + unsigned int used; /* R: count of element IDs set */ + unsigned int count; /* R: count of all elements */ + struct snd_ctl_elem_id __user *pids; /* R: IDs */ + unsigned char reserved[50]; +}; + +struct snd_ctl_elem_info { + struct snd_ctl_elem_id id; /* W: element ID */ + snd_ctl_elem_type_t type; /* R: value type - SNDRV_CTL_ELEM_TYPE_* */ + unsigned int access; /* R: value access (bitmask) - SNDRV_CTL_ELEM_ACCESS_* */ + unsigned int count; /* count of values */ + __kernel_pid_t owner; /* owner's PID of this control */ + union { + struct { + long min; /* R: minimum value */ + long max; /* R: maximum value */ + long step; /* R: step (0 variable) */ + } integer; + struct { + long long min; /* R: minimum value */ + long long max; /* R: maximum value */ + long long step; /* R: step (0 variable) */ + } integer64; + struct { + unsigned int items; /* R: number of items */ + unsigned int item; /* W: item number */ + char name[64]; /* R: value name */ + __u64 names_ptr; /* W: names list (ELEM_ADD only) */ + unsigned int names_length; + } enumerated; + unsigned char reserved[128]; + } value; + unsigned char reserved[64]; +}; + +struct snd_ctl_elem_value { + struct snd_ctl_elem_id id; /* W: element ID */ + unsigned int indirect: 1; /* W: indirect access - obsoleted */ + union { + union { + long value[128]; + long *value_ptr; /* obsoleted */ + } integer; + union { + long long value[64]; + long long *value_ptr; /* obsoleted */ + } integer64; + union { + unsigned int item[128]; + unsigned int *item_ptr; /* obsoleted */ + } enumerated; + union { + unsigned char data[512]; + unsigned char *data_ptr; /* obsoleted */ + } bytes; + struct snd_aes_iec958 iec958; + } value; /* RO */ + unsigned char reserved[128]; +}; + +struct snd_ctl_tlv { + unsigned int numid; /* control element numeric identification */ + unsigned int length; /* in bytes aligned to 4 */ + unsigned int tlv[]; /* first TLV */ +}; + +#define SNDRV_CTL_IOCTL_PVERSION _IOR('U', 0x00, int) +#define SNDRV_CTL_IOCTL_CARD_INFO _IOR('U', 0x01, struct snd_ctl_card_info) +#define SNDRV_CTL_IOCTL_ELEM_LIST _IOWR('U', 0x10, struct snd_ctl_elem_list) +#define SNDRV_CTL_IOCTL_ELEM_INFO _IOWR('U', 0x11, struct snd_ctl_elem_info) +#define SNDRV_CTL_IOCTL_ELEM_READ _IOWR('U', 0x12, struct snd_ctl_elem_value) +#define SNDRV_CTL_IOCTL_ELEM_WRITE _IOWR('U', 0x13, struct snd_ctl_elem_value) +#define SNDRV_CTL_IOCTL_ELEM_LOCK _IOW('U', 0x14, struct snd_ctl_elem_id) +#define SNDRV_CTL_IOCTL_ELEM_UNLOCK _IOW('U', 0x15, struct snd_ctl_elem_id) +#define SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS _IOWR('U', 0x16, int) +#define SNDRV_CTL_IOCTL_ELEM_ADD _IOWR('U', 0x17, struct snd_ctl_elem_info) +#define SNDRV_CTL_IOCTL_ELEM_REPLACE _IOWR('U', 0x18, struct snd_ctl_elem_info) +#define SNDRV_CTL_IOCTL_ELEM_REMOVE _IOWR('U', 0x19, struct snd_ctl_elem_id) +#define SNDRV_CTL_IOCTL_TLV_READ _IOWR('U', 0x1a, struct snd_ctl_tlv) +#define SNDRV_CTL_IOCTL_TLV_WRITE _IOWR('U', 0x1b, struct snd_ctl_tlv) +#define SNDRV_CTL_IOCTL_TLV_COMMAND _IOWR('U', 0x1c, struct snd_ctl_tlv) +#define SNDRV_CTL_IOCTL_HWDEP_NEXT_DEVICE _IOWR('U', 0x20, int) +#define SNDRV_CTL_IOCTL_HWDEP_INFO _IOR('U', 0x21, struct snd_hwdep_info) +#define SNDRV_CTL_IOCTL_PCM_NEXT_DEVICE _IOR('U', 0x30, int) +#define SNDRV_CTL_IOCTL_PCM_INFO _IOWR('U', 0x31, struct snd_pcm_info) +#define SNDRV_CTL_IOCTL_PCM_PREFER_SUBDEVICE _IOW('U', 0x32, int) +#define SNDRV_CTL_IOCTL_RAWMIDI_NEXT_DEVICE _IOWR('U', 0x40, int) +#define SNDRV_CTL_IOCTL_RAWMIDI_INFO _IOWR('U', 0x41, struct snd_rawmidi_info) +#define SNDRV_CTL_IOCTL_RAWMIDI_PREFER_SUBDEVICE _IOW('U', 0x42, int) +#define SNDRV_CTL_IOCTL_UMP_NEXT_DEVICE _IOWR('U', 0x43, int) +#define SNDRV_CTL_IOCTL_UMP_ENDPOINT_INFO _IOWR('U', 0x44, struct snd_ump_endpoint_info) +#define SNDRV_CTL_IOCTL_UMP_BLOCK_INFO _IOWR('U', 0x45, struct snd_ump_block_info) +#define SNDRV_CTL_IOCTL_POWER _IOWR('U', 0xd0, int) +#define SNDRV_CTL_IOCTL_POWER_STATE _IOR('U', 0xd1, int) + +/* + * Read interface. + */ + +enum sndrv_ctl_event_type { + SNDRV_CTL_EVENT_ELEM = 0, + SNDRV_CTL_EVENT_LAST = SNDRV_CTL_EVENT_ELEM, +}; + +#define SNDRV_CTL_EVENT_MASK_VALUE (1<<0) /* element value was changed */ +#define SNDRV_CTL_EVENT_MASK_INFO (1<<1) /* element info was changed */ +#define SNDRV_CTL_EVENT_MASK_ADD (1<<2) /* element was added */ +#define SNDRV_CTL_EVENT_MASK_TLV (1<<3) /* element TLV tree was changed */ +#define SNDRV_CTL_EVENT_MASK_REMOVE (~0U) /* element was removed */ + +struct snd_ctl_event { + int type; /* event type - SNDRV_CTL_EVENT_* */ + union { + struct { + unsigned int mask; + struct snd_ctl_elem_id id; + } elem; + unsigned char data8[60]; + } data; +}; + +/* + * Control names + */ + +#define SNDRV_CTL_NAME_NONE "" +#define SNDRV_CTL_NAME_PLAYBACK "Playback " +#define SNDRV_CTL_NAME_CAPTURE "Capture " + +#define SNDRV_CTL_NAME_IEC958_NONE "" +#define SNDRV_CTL_NAME_IEC958_SWITCH "Switch" +#define SNDRV_CTL_NAME_IEC958_VOLUME "Volume" +#define SNDRV_CTL_NAME_IEC958_DEFAULT "Default" +#define SNDRV_CTL_NAME_IEC958_MASK "Mask" +#define SNDRV_CTL_NAME_IEC958_CON_MASK "Con Mask" +#define SNDRV_CTL_NAME_IEC958_PRO_MASK "Pro Mask" +#define SNDRV_CTL_NAME_IEC958_PCM_STREAM "PCM Stream" +#define SNDRV_CTL_NAME_IEC958(expl,direction,what) "IEC958 " expl SNDRV_CTL_NAME_##direction SNDRV_CTL_NAME_IEC958_##what + +#endif /* _UAPI__SOUND_ASOUND_H */ diff --git a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh index e0803b957593..572939a12884 100755 --- a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh @@ -1,9 +1,9 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ +[ $# -eq 1 ] && beauty_uapi_sound_dir=$1 || beauty_uapi_sound_dir=tools/perf/trace/beauty/include/uapi/sound/ printf "static const char *sndrv_ctl_ioctl_cmds[] = {\n" -grep "^#define[\t ]\+SNDRV_CTL_IOCTL_" $header_dir/asound.h | \ +grep "^#define[\t ]\+SNDRV_CTL_IOCTL_" $beauty_uapi_sound_dir/asound.h | \ sed -r 's/^#define +SNDRV_CTL_IOCTL_([A-Z0-9_]+)[\t ]+_IO[RW]*\( *.U., *(0x[[:xdigit:]]+),?.*/\t[\2] = \"\1\",/g' printf "};\n" diff --git a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh index 7a464a7bf913..33afae9a1c07 100755 --- a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh +++ b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh @@ -1,9 +1,9 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/sound/ +[ $# -eq 1 ] && beauty_uapi_sound_dir=$1 || beauty_uapi_sound_dir=tools/perf/trace/beauty/include/uapi/sound/ printf "static const char *sndrv_pcm_ioctl_cmds[] = {\n" -grep "^#define[\t ]\+SNDRV_PCM_IOCTL_" $header_dir/asound.h | \ +grep "^#define[\t ]\+SNDRV_PCM_IOCTL_" $beauty_uapi_sound_dir/asound.h | \ sed -r 's/^#define +SNDRV_PCM_IOCTL_([A-Z0-9_]+)[\t ]+_IO[RW]*\( *.A., *(0x[[:xdigit:]]+),?.*/\t[\2] = \"\1\",/g' printf "};\n" -- cgit v1.2.3 From c8bfe3fad4f86a029da7157bae9699c816f0c309 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:07:33 -0300 Subject: perf beauty: Move arch/x86/include/asm/irq_vectors.h copy out of the directory used to build perf It is used only to generate string tables, not to build perf, so move it to the tools/perf/trace/beauty/include/ hierarchy, that is used just for scraping. This is a something that should've have happened, as happened with the linux/socket.h scrapper, do it now as Ian suggested while doing an audit/refactor session in the headers used by perf. No other tools/ living code uses it. Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fWZVrpRufO4w-S4EcSi9STXcTAN2ERLwTSN7yrSSA-otQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/irq_vectors.h | 142 --------------------- tools/perf/Makefile.perf | 6 +- tools/perf/check-headers.sh | 2 +- .../beauty/arch/x86/include/asm/irq_vectors.h | 142 +++++++++++++++++++++ .../trace/beauty/tracepoints/x86_irq_vectors.sh | 6 +- 5 files changed, 150 insertions(+), 148 deletions(-) delete mode 100644 tools/arch/x86/include/asm/irq_vectors.h create mode 100644 tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h (limited to 'tools') diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h deleted file mode 100644 index 3f73ac3ed3a0..000000000000 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_IRQ_VECTORS_H -#define _ASM_X86_IRQ_VECTORS_H - -#include -/* - * Linux IRQ vector layout. - * - * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can - * be defined by Linux. They are used as a jump table by the CPU when a - * given vector is triggered - by a CPU-external, CPU-internal or - * software-triggered event. - * - * Linux sets the kernel code address each entry jumps to early during - * bootup, and never changes them. This is the general layout of the - * IDT entries: - * - * Vectors 0 ... 31 : system traps and exceptions - hardcoded events - * Vectors 32 ... 127 : device interrupts - * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... LOCAL_TIMER_VECTOR-1 - * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts - * - * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. - * - * This file enumerates the exact layout of them: - */ - -/* This is used as an interrupt vector when programming the APIC. */ -#define NMI_VECTOR 0x02 - -/* - * IDT vectors usable for external interrupt sources start at 0x20. - * (0x80 is the syscall vector, 0x30-0x3f are for ISA) - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define IA32_SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x30-0x3f are used for ISA interrupts. - * round up to the next 16-vector boundary - */ -#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - */ - -#define SPURIOUS_APIC_VECTOR 0xff -/* - * Sanity check - */ -#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) -# error SPURIOUS_APIC_VECTOR definition error -#endif - -#define ERROR_APIC_VECTOR 0xfe -#define RESCHEDULE_VECTOR 0xfd -#define CALL_FUNCTION_VECTOR 0xfc -#define CALL_FUNCTION_SINGLE_VECTOR 0xfb -#define THERMAL_APIC_VECTOR 0xfa -#define THRESHOLD_APIC_VECTOR 0xf9 -#define REBOOT_VECTOR 0xf8 - -/* - * Generic system vector for platform specific use - */ -#define X86_PLATFORM_IPI_VECTOR 0xf7 - -/* - * IRQ work vector: - */ -#define IRQ_WORK_VECTOR 0xf6 - -/* 0xf5 - unused, was UV_BAU_MESSAGE */ -#define DEFERRED_ERROR_VECTOR 0xf4 - -/* Vector on which hypervisor callbacks will be delivered */ -#define HYPERVISOR_CALLBACK_VECTOR 0xf3 - -/* Vector for KVM to deliver posted interrupt IPI */ -#if IS_ENABLED(CONFIG_KVM) -#define POSTED_INTR_VECTOR 0xf2 -#define POSTED_INTR_WAKEUP_VECTOR 0xf1 -#define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif - -#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef - -#if IS_ENABLED(CONFIG_HYPERV) -#define HYPERV_REENLIGHTENMENT_VECTOR 0xee -#define HYPERV_STIMER0_VECTOR 0xed -#endif - -#define LOCAL_TIMER_VECTOR 0xec - -#define NR_VECTORS 256 - -#ifdef CONFIG_X86_LOCAL_APIC -#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR -#else -#define FIRST_SYSTEM_VECTOR NR_VECTORS -#endif - -#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) -#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR) - -/* - * Size the maximum number of interrupts. - * - * If the irq_desc[] array has a sparse layout, we can size things - * generously - it scales up linearly with the maximum number of CPUs, - * and the maximum number of IO-APICs, whichever is higher. - * - * In other cases we size more conservatively, to not create too large - * static arrays. - */ - -#define NR_IRQS_LEGACY 16 - -#define CPU_VECTOR_LIMIT (64 * NR_CPUS) -#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) - -#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) -#define NR_IRQS \ - (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ - (NR_VECTORS + CPU_VECTOR_LIMIT) : \ - (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -#elif defined(CONFIG_X86_IO_APIC) -#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) -#elif defined(CONFIG_PCI_MSI) -#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) -#else -#define NR_IRQS NR_IRQS_LEGACY -#endif - -#endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 757777d96860..c75342b21089 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -474,6 +474,8 @@ arm64-sysreg-defs-clean: beauty_linux_dir := $(srctree)/tools/perf/trace/beauty/include/linux/ beauty_uapi_linux_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/linux/ beauty_uapi_sound_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/sound/ +beauty_arch_asm_dir := $(srctree)/tools/perf/trace/beauty/arch/x86/include/asm/ + linux_uapi_dir := $(srctree)/tools/include/uapi/linux asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic arch_asm_uapi_dir := $(srctree)/tools/arch/$(SRCARCH)/include/uapi/asm/ @@ -636,8 +638,8 @@ $(x86_arch_prctl_code_array): $(x86_arch_asm_uapi_dir)/prctl.h $(x86_arch_prctl_ x86_arch_irq_vectors_array := $(beauty_outdir)/x86_arch_irq_vectors_array.c x86_arch_irq_vectors_tbl := $(srctree)/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh -$(x86_arch_irq_vectors_array): $(x86_arch_asm_dir)/irq_vectors.h $(x86_arch_irq_vectors_tbl) - $(Q)$(SHELL) '$(x86_arch_irq_vectors_tbl)' $(x86_arch_asm_dir) > $@ +$(x86_arch_irq_vectors_array): $(beauty_arch_asm_dir)/irq_vectors.h $(x86_arch_irq_vectors_tbl) + $(Q)$(SHELL) '$(x86_arch_irq_vectors_tbl)' $(beauty_arch_asm_dir) > $@ x86_arch_MSRs_array := $(beauty_outdir)/x86_arch_MSRs_array.c x86_arch_MSRs_tbl := $(srctree)/tools/perf/trace/beauty/tracepoints/x86_msr.sh diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 93ad7a787a19..b35eba5e99c8 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -34,7 +34,6 @@ FILES=( "arch/x86/include/asm/cpufeatures.h" "arch/x86/include/asm/inat_types.h" "arch/x86/include/asm/emulate_prefix.h" - "arch/x86/include/asm/irq_vectors.h" "arch/x86/include/asm/msr-index.h" "arch/x86/include/uapi/asm/prctl.h" "arch/x86/lib/x86-opcode-map.txt" @@ -93,6 +92,7 @@ SYNC_CHECK_FILES=( declare -a BEAUTY_FILES BEAUTY_FILES=( + "arch/x86/include/asm/irq_vectors.h" "include/linux/socket.h" "include/uapi/linux/fs.h" "include/uapi/linux/mount.h" diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h new file mode 100644 index 000000000000..3f73ac3ed3a0 --- /dev/null +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H + +#include +/* + * Linux IRQ vector layout. + * + * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can + * be defined by Linux. They are used as a jump table by the CPU when a + * given vector is triggered - by a CPU-external, CPU-internal or + * software-triggered event. + * + * Linux sets the kernel code address each entry jumps to early during + * bootup, and never changes them. This is the general layout of the + * IDT entries: + * + * Vectors 0 ... 31 : system traps and exceptions - hardcoded events + * Vectors 32 ... 127 : device interrupts + * Vector 128 : legacy int80 syscall interface + * Vectors 129 ... LOCAL_TIMER_VECTOR-1 + * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts + * + * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. + * + * This file enumerates the exact layout of them: + */ + +/* This is used as an interrupt vector when programming the APIC. */ +#define NMI_VECTOR 0x02 + +/* + * IDT vectors usable for external interrupt sources start at 0x20. + * (0x80 is the syscall vector, 0x30-0x3f are for ISA) + */ +#define FIRST_EXTERNAL_VECTOR 0x20 + +#define IA32_SYSCALL_VECTOR 0x80 + +/* + * Vectors 0x30-0x3f are used for ISA interrupts. + * round up to the next 16-vector boundary + */ +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) + +/* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff + * + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. + */ + +#define SPURIOUS_APIC_VECTOR 0xff +/* + * Sanity check + */ +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) +# error SPURIOUS_APIC_VECTOR definition error +#endif + +#define ERROR_APIC_VECTOR 0xfe +#define RESCHEDULE_VECTOR 0xfd +#define CALL_FUNCTION_VECTOR 0xfc +#define CALL_FUNCTION_SINGLE_VECTOR 0xfb +#define THERMAL_APIC_VECTOR 0xfa +#define THRESHOLD_APIC_VECTOR 0xf9 +#define REBOOT_VECTOR 0xf8 + +/* + * Generic system vector for platform specific use + */ +#define X86_PLATFORM_IPI_VECTOR 0xf7 + +/* + * IRQ work vector: + */ +#define IRQ_WORK_VECTOR 0xf6 + +/* 0xf5 - unused, was UV_BAU_MESSAGE */ +#define DEFERRED_ERROR_VECTOR 0xf4 + +/* Vector on which hypervisor callbacks will be delivered */ +#define HYPERVISOR_CALLBACK_VECTOR 0xf3 + +/* Vector for KVM to deliver posted interrupt IPI */ +#if IS_ENABLED(CONFIG_KVM) +#define POSTED_INTR_VECTOR 0xf2 +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 +#define POSTED_INTR_NESTED_VECTOR 0xf0 +#endif + +#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef + +#if IS_ENABLED(CONFIG_HYPERV) +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee +#define HYPERV_STIMER0_VECTOR 0xed +#endif + +#define LOCAL_TIMER_VECTOR 0xec + +#define NR_VECTORS 256 + +#ifdef CONFIG_X86_LOCAL_APIC +#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR +#else +#define FIRST_SYSTEM_VECTOR NR_VECTORS +#endif + +#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) +#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR) + +/* + * Size the maximum number of interrupts. + * + * If the irq_desc[] array has a sparse layout, we can size things + * generously - it scales up linearly with the maximum number of CPUs, + * and the maximum number of IO-APICs, whichever is higher. + * + * In other cases we size more conservatively, to not create too large + * static arrays. + */ + +#define NR_IRQS_LEGACY 16 + +#define CPU_VECTOR_LIMIT (64 * NR_CPUS) +#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) +#define NR_IRQS \ + (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) +#elif defined(CONFIG_X86_IO_APIC) +#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +#elif defined(CONFIG_PCI_MSI) +#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else +#define NR_IRQS NR_IRQS_LEGACY +#endif + +#endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh b/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh index 87dc68c7de0c..d8e927dd2bb7 100755 --- a/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh +++ b/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh @@ -3,12 +3,12 @@ # (C) 2019, Arnaldo Carvalho de Melo if [ $# -ne 1 ] ; then - arch_x86_header_dir=tools/arch/x86/include/asm/ + beauty_arch_asm_dir=tools/perf/trace/beauty/arch/x86/include/asm/ else - arch_x86_header_dir=$1 + beauty_arch_asm_dir=$1 fi -x86_irq_vectors=${arch_x86_header_dir}/irq_vectors.h +x86_irq_vectors=${beauty_arch_asm_dir}/irq_vectors.h # FIRST_EXTERNAL_VECTOR is not that useful, find what is its number # and then replace whatever is using it and that is useful, which at -- cgit v1.2.3 From f324b73c2c05832b7c3c1071b87f588b28e45d28 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Mar 2024 17:39:29 -0300 Subject: perf beauty: Stop using the copy of uapi/linux/prctl.h Use the system one, nothing used in that file isn't available in the supported, active distros. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim To: Ian Rogers Link: https://lore.kernel.org/lkml/20240315204835.748716-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/prctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/prctl.c b/tools/perf/trace/beauty/prctl.c index 6fe5ad5f5d3a..7d1aa9fd03da 100644 --- a/tools/perf/trace/beauty/prctl.c +++ b/tools/perf/trace/beauty/prctl.c @@ -7,7 +7,7 @@ #include "trace/beauty/beauty.h" #include -#include +#include #include "trace/beauty/generated/prctl_option_array.c" -- cgit v1.2.3 From eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:07:33 -0300 Subject: perf beauty: Move prctl.h files (uapi/linux and x86's) copy out of the directory used to build perf It is used only to generate string tables, not to build perf, so move it to the tools/perf/trace/beauty/{include,arch}/ hierarchies, that is used just for scraping. This is a something that should've have happened, as happened with the linux/socket.h scrapper, do it now as Ian suggested while doing an audit/refactor session in the headers used by perf. No other tools/ living code uses it, just coming from either 'make install_headers' or from the system /usr/include/ directory. Suggested-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240315204835.748716-3-acme@kernel.org Link: https://lore.kernel.org/lkml/CAP-5=fWZVrpRufO4w-S4EcSi9STXcTAN2ERLwTSN7yrSSA-otQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/prctl.h | 43 --- tools/include/uapi/linux/prctl.h | 309 --------------------- tools/perf/Makefile.perf | 11 +- tools/perf/check-headers.sh | 4 +- .../trace/beauty/arch/x86/include/uapi/asm/prctl.h | 43 +++ tools/perf/trace/beauty/include/uapi/linux/prctl.h | 309 +++++++++++++++++++++ tools/perf/trace/beauty/prctl_option.sh | 6 +- tools/perf/trace/beauty/x86_arch_prctl.sh | 4 +- 8 files changed, 364 insertions(+), 365 deletions(-) delete mode 100644 tools/arch/x86/include/uapi/asm/prctl.h delete mode 100644 tools/include/uapi/linux/prctl.h create mode 100644 tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/prctl.h (limited to 'tools') diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h deleted file mode 100644 index 384e2cc6ac19..000000000000 --- a/tools/arch/x86/include/uapi/asm/prctl.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_X86_PRCTL_H -#define _ASM_X86_PRCTL_H - -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 - -#define ARCH_GET_CPUID 0x1011 -#define ARCH_SET_CPUID 0x1012 - -#define ARCH_GET_XCOMP_SUPP 0x1021 -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 -#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 -#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 - -#define ARCH_XCOMP_TILECFG 17 -#define ARCH_XCOMP_TILEDATA 18 - -#define ARCH_MAP_VDSO_X32 0x2001 -#define ARCH_MAP_VDSO_32 0x2002 -#define ARCH_MAP_VDSO_64 0x2003 - -/* Don't use 0x3001-0x3004 because of old glibcs */ - -#define ARCH_GET_UNTAG_MASK 0x4001 -#define ARCH_ENABLE_TAGGED_ADDR 0x4002 -#define ARCH_GET_MAX_TAG_BITS 0x4003 -#define ARCH_FORCE_TAGGED_SVA 0x4004 - -#define ARCH_SHSTK_ENABLE 0x5001 -#define ARCH_SHSTK_DISABLE 0x5002 -#define ARCH_SHSTK_LOCK 0x5003 -#define ARCH_SHSTK_UNLOCK 0x5004 -#define ARCH_SHSTK_STATUS 0x5005 - -/* ARCH_SHSTK_ features bits */ -#define ARCH_SHSTK_SHSTK (1ULL << 0) -#define ARCH_SHSTK_WRSS (1ULL << 1) - -#endif /* _ASM_X86_PRCTL_H */ diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h deleted file mode 100644 index 370ed14b1ae0..000000000000 --- a/tools/include/uapi/linux/prctl.h +++ /dev/null @@ -1,309 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_PRCTL_H -#define _LINUX_PRCTL_H - -#include - -/* Values to pass as first argument to prctl() */ - -#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ -#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ - -/* Get/set current->mm->dumpable */ -#define PR_GET_DUMPABLE 3 -#define PR_SET_DUMPABLE 4 - -/* Get/set unaligned access control bits (if meaningful) */ -#define PR_GET_UNALIGN 5 -#define PR_SET_UNALIGN 6 -# define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */ -# define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */ - -/* Get/set whether or not to drop capabilities on setuid() away from - * uid 0 (as per security/commoncap.c) */ -#define PR_GET_KEEPCAPS 7 -#define PR_SET_KEEPCAPS 8 - -/* Get/set floating-point emulation control bits (if meaningful) */ -#define PR_GET_FPEMU 9 -#define PR_SET_FPEMU 10 -# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ -# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ - -/* Get/set floating-point exception mode (if meaningful) */ -#define PR_GET_FPEXC 11 -#define PR_SET_FPEXC 12 -# define PR_FP_EXC_SW_ENABLE 0x80 /* Use FPEXC for FP exception enables */ -# define PR_FP_EXC_DIV 0x010000 /* floating point divide by zero */ -# define PR_FP_EXC_OVF 0x020000 /* floating point overflow */ -# define PR_FP_EXC_UND 0x040000 /* floating point underflow */ -# define PR_FP_EXC_RES 0x080000 /* floating point inexact result */ -# define PR_FP_EXC_INV 0x100000 /* floating point invalid operation */ -# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */ -# define PR_FP_EXC_NONRECOV 1 /* async non-recoverable exc. mode */ -# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */ -# define PR_FP_EXC_PRECISE 3 /* precise exception mode */ - -/* Get/set whether we use statistical process timing or accurate timestamp - * based process timing */ -#define PR_GET_TIMING 13 -#define PR_SET_TIMING 14 -# define PR_TIMING_STATISTICAL 0 /* Normal, traditional, - statistical process timing */ -# define PR_TIMING_TIMESTAMP 1 /* Accurate timestamp based - process timing */ - -#define PR_SET_NAME 15 /* Set process name */ -#define PR_GET_NAME 16 /* Get process name */ - -/* Get/set process endian */ -#define PR_GET_ENDIAN 19 -#define PR_SET_ENDIAN 20 -# define PR_ENDIAN_BIG 0 -# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ -# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ - -/* Get/set process seccomp mode */ -#define PR_GET_SECCOMP 21 -#define PR_SET_SECCOMP 22 - -/* Get/set the capability bounding set (as per security/commoncap.c) */ -#define PR_CAPBSET_READ 23 -#define PR_CAPBSET_DROP 24 - -/* Get/set the process' ability to use the timestamp counter instruction */ -#define PR_GET_TSC 25 -#define PR_SET_TSC 26 -# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ -# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ - -/* Get/set securebits (as per security/commoncap.c) */ -#define PR_GET_SECUREBITS 27 -#define PR_SET_SECUREBITS 28 - -/* - * Get/set the timerslack as used by poll/select/nanosleep - * A value of 0 means "use default" - */ -#define PR_SET_TIMERSLACK 29 -#define PR_GET_TIMERSLACK 30 - -#define PR_TASK_PERF_EVENTS_DISABLE 31 -#define PR_TASK_PERF_EVENTS_ENABLE 32 - -/* - * Set early/late kill mode for hwpoison memory corruption. - * This influences when the process gets killed on a memory corruption. - */ -#define PR_MCE_KILL 33 -# define PR_MCE_KILL_CLEAR 0 -# define PR_MCE_KILL_SET 1 - -# define PR_MCE_KILL_LATE 0 -# define PR_MCE_KILL_EARLY 1 -# define PR_MCE_KILL_DEFAULT 2 - -#define PR_MCE_KILL_GET 34 - -/* - * Tune up process memory map specifics. - */ -#define PR_SET_MM 35 -# define PR_SET_MM_START_CODE 1 -# define PR_SET_MM_END_CODE 2 -# define PR_SET_MM_START_DATA 3 -# define PR_SET_MM_END_DATA 4 -# define PR_SET_MM_START_STACK 5 -# define PR_SET_MM_START_BRK 6 -# define PR_SET_MM_BRK 7 -# define PR_SET_MM_ARG_START 8 -# define PR_SET_MM_ARG_END 9 -# define PR_SET_MM_ENV_START 10 -# define PR_SET_MM_ENV_END 11 -# define PR_SET_MM_AUXV 12 -# define PR_SET_MM_EXE_FILE 13 -# define PR_SET_MM_MAP 14 -# define PR_SET_MM_MAP_SIZE 15 - -/* - * This structure provides new memory descriptor - * map which mostly modifies /proc/pid/stat[m] - * output for a task. This mostly done in a - * sake of checkpoint/restore functionality. - */ -struct prctl_mm_map { - __u64 start_code; /* code section bounds */ - __u64 end_code; - __u64 start_data; /* data section bounds */ - __u64 end_data; - __u64 start_brk; /* heap for brk() syscall */ - __u64 brk; - __u64 start_stack; /* stack starts at */ - __u64 arg_start; /* command line arguments bounds */ - __u64 arg_end; - __u64 env_start; /* environment variables bounds */ - __u64 env_end; - __u64 *auxv; /* auxiliary vector */ - __u32 auxv_size; /* vector size */ - __u32 exe_fd; /* /proc/$pid/exe link file */ -}; - -/* - * Set specific pid that is allowed to ptrace the current task. - * A value of 0 mean "no process". - */ -#define PR_SET_PTRACER 0x59616d61 -# define PR_SET_PTRACER_ANY ((unsigned long)-1) - -#define PR_SET_CHILD_SUBREAPER 36 -#define PR_GET_CHILD_SUBREAPER 37 - -/* - * If no_new_privs is set, then operations that grant new privileges (i.e. - * execve) will either fail or not grant them. This affects suid/sgid, - * file capabilities, and LSMs. - * - * Operations that merely manipulate or drop existing privileges (setresuid, - * capset, etc.) will still work. Drop those privileges if you want them gone. - * - * Changing LSM security domain is considered a new privilege. So, for example, - * asking selinux for a specific new context (e.g. with runcon) will result - * in execve returning -EPERM. - * - * See Documentation/userspace-api/no_new_privs.rst for more details. - */ -#define PR_SET_NO_NEW_PRIVS 38 -#define PR_GET_NO_NEW_PRIVS 39 - -#define PR_GET_TID_ADDRESS 40 - -#define PR_SET_THP_DISABLE 41 -#define PR_GET_THP_DISABLE 42 - -/* - * No longer implemented, but left here to ensure the numbers stay reserved: - */ -#define PR_MPX_ENABLE_MANAGEMENT 43 -#define PR_MPX_DISABLE_MANAGEMENT 44 - -#define PR_SET_FP_MODE 45 -#define PR_GET_FP_MODE 46 -# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ -# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ - -/* Control the ambient capability set */ -#define PR_CAP_AMBIENT 47 -# define PR_CAP_AMBIENT_IS_SET 1 -# define PR_CAP_AMBIENT_RAISE 2 -# define PR_CAP_AMBIENT_LOWER 3 -# define PR_CAP_AMBIENT_CLEAR_ALL 4 - -/* arm64 Scalable Vector Extension controls */ -/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */ -#define PR_SVE_SET_VL 50 /* set task vector length */ -# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ -#define PR_SVE_GET_VL 51 /* get task vector length */ -/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */ -# define PR_SVE_VL_LEN_MASK 0xffff -# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ - -/* Per task speculation control */ -#define PR_GET_SPECULATION_CTRL 52 -#define PR_SET_SPECULATION_CTRL 53 -/* Speculation control variants */ -# define PR_SPEC_STORE_BYPASS 0 -# define PR_SPEC_INDIRECT_BRANCH 1 -# define PR_SPEC_L1D_FLUSH 2 -/* Return and control values for PR_SET/GET_SPECULATION_CTRL */ -# define PR_SPEC_NOT_AFFECTED 0 -# define PR_SPEC_PRCTL (1UL << 0) -# define PR_SPEC_ENABLE (1UL << 1) -# define PR_SPEC_DISABLE (1UL << 2) -# define PR_SPEC_FORCE_DISABLE (1UL << 3) -# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) - -/* Reset arm64 pointer authentication keys */ -#define PR_PAC_RESET_KEYS 54 -# define PR_PAC_APIAKEY (1UL << 0) -# define PR_PAC_APIBKEY (1UL << 1) -# define PR_PAC_APDAKEY (1UL << 2) -# define PR_PAC_APDBKEY (1UL << 3) -# define PR_PAC_APGAKEY (1UL << 4) - -/* Tagged user address controls for arm64 */ -#define PR_SET_TAGGED_ADDR_CTRL 55 -#define PR_GET_TAGGED_ADDR_CTRL 56 -# define PR_TAGGED_ADDR_ENABLE (1UL << 0) -/* MTE tag check fault modes */ -# define PR_MTE_TCF_NONE 0UL -# define PR_MTE_TCF_SYNC (1UL << 1) -# define PR_MTE_TCF_ASYNC (1UL << 2) -# define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) -/* MTE tag inclusion mask */ -# define PR_MTE_TAG_SHIFT 3 -# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) -/* Unused; kept only for source compatibility */ -# define PR_MTE_TCF_SHIFT 1 - -/* Control reclaim behavior when allocating memory */ -#define PR_SET_IO_FLUSHER 57 -#define PR_GET_IO_FLUSHER 58 - -/* Dispatch syscalls to a userspace handler */ -#define PR_SET_SYSCALL_USER_DISPATCH 59 -# define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 -/* The control values for the user space selector when dispatch is enabled */ -# define SYSCALL_DISPATCH_FILTER_ALLOW 0 -# define SYSCALL_DISPATCH_FILTER_BLOCK 1 - -/* Set/get enabled arm64 pointer authentication keys */ -#define PR_PAC_SET_ENABLED_KEYS 60 -#define PR_PAC_GET_ENABLED_KEYS 61 - -/* Request the scheduler to share a core */ -#define PR_SCHED_CORE 62 -# define PR_SCHED_CORE_GET 0 -# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ -# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ -# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ -# define PR_SCHED_CORE_MAX 4 -# define PR_SCHED_CORE_SCOPE_THREAD 0 -# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 -# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 - -/* arm64 Scalable Matrix Extension controls */ -/* Flag values must be in sync with SVE versions */ -#define PR_SME_SET_VL 63 /* set task vector length */ -# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ -#define PR_SME_GET_VL 64 /* get task vector length */ -/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ -# define PR_SME_VL_LEN_MASK 0xffff -# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ - -/* Memory deny write / execute */ -#define PR_SET_MDWE 65 -# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) -# define PR_MDWE_NO_INHERIT (1UL << 1) - -#define PR_GET_MDWE 66 - -#define PR_SET_VMA 0x53564d41 -# define PR_SET_VMA_ANON_NAME 0 - -#define PR_GET_AUXV 0x41555856 - -#define PR_SET_MEMORY_MERGE 67 -#define PR_GET_MEMORY_MERGE 68 - -#define PR_RISCV_V_SET_CONTROL 69 -#define PR_RISCV_V_GET_CONTROL 70 -# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0 -# define PR_RISCV_V_VSTATE_CTRL_OFF 1 -# define PR_RISCV_V_VSTATE_CTRL_ON 2 -# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4) -# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3 -# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc -# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f - -#endif /* _LINUX_PRCTL_H */ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index c75342b21089..b36c0c68c346 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -475,11 +475,11 @@ beauty_linux_dir := $(srctree)/tools/perf/trace/beauty/include/linux/ beauty_uapi_linux_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/linux/ beauty_uapi_sound_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/sound/ beauty_arch_asm_dir := $(srctree)/tools/perf/trace/beauty/arch/x86/include/asm/ +beauty_x86_arch_asm_uapi_dir := $(srctree)/tools/perf/trace/beauty/arch/x86/include/uapi/asm/ linux_uapi_dir := $(srctree)/tools/include/uapi/linux asm_generic_uapi_dir := $(srctree)/tools/include/uapi/asm-generic arch_asm_uapi_dir := $(srctree)/tools/arch/$(SRCARCH)/include/uapi/asm/ -x86_arch_asm_uapi_dir := $(srctree)/tools/arch/x86/include/uapi/asm/ x86_arch_asm_dir := $(srctree)/tools/arch/x86/include/asm/ beauty_outdir := $(OUTPUT)trace/beauty/generated @@ -617,11 +617,10 @@ $(mmap_prot_array): $(asm_generic_uapi_dir)/mman.h $(asm_generic_uapi_dir)/mman- $(Q)$(SHELL) '$(mmap_prot_tbl)' $(asm_generic_uapi_dir) $(arch_asm_uapi_dir) > $@ prctl_option_array := $(beauty_outdir)/prctl_option_array.c -prctl_hdr_dir := $(srctree)/tools/include/uapi/linux/ prctl_option_tbl := $(srctree)/tools/perf/trace/beauty/prctl_option.sh -$(prctl_option_array): $(prctl_hdr_dir)/prctl.h $(prctl_option_tbl) - $(Q)$(SHELL) '$(prctl_option_tbl)' $(prctl_hdr_dir) > $@ +$(prctl_option_array): $(beauty_uapi_linux_dir)/prctl.h $(prctl_option_tbl) + $(Q)$(SHELL) '$(prctl_option_tbl)' $(beauty_uapi_linux_dir) > $@ usbdevfs_ioctl_array := $(beauty_ioctl_outdir)/usbdevfs_ioctl_array.c usbdevfs_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/usbdevfs_ioctl.sh @@ -632,8 +631,8 @@ $(usbdevfs_ioctl_array): $(beauty_uapi_linux_dir)/usbdevice_fs.h $(usbdevfs_ioct x86_arch_prctl_code_array := $(beauty_outdir)/x86_arch_prctl_code_array.c x86_arch_prctl_code_tbl := $(srctree)/tools/perf/trace/beauty/x86_arch_prctl.sh -$(x86_arch_prctl_code_array): $(x86_arch_asm_uapi_dir)/prctl.h $(x86_arch_prctl_code_tbl) - $(Q)$(SHELL) '$(x86_arch_prctl_code_tbl)' $(x86_arch_asm_uapi_dir) > $@ +$(x86_arch_prctl_code_array): $(beauty_x86_arch_asm_uapi_dir)/prctl.h $(x86_arch_prctl_code_tbl) + $(Q)$(SHELL) '$(x86_arch_prctl_code_tbl)' $(beauty_x86_arch_asm_uapi_dir) > $@ x86_arch_irq_vectors_array := $(beauty_outdir)/x86_arch_irq_vectors_array.c x86_arch_irq_vectors_tbl := $(srctree)/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index b35eba5e99c8..03d00110a48f 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -17,7 +17,6 @@ FILES=( "include/uapi/linux/in.h" "include/uapi/linux/openat2.h" "include/uapi/linux/perf_event.h" - "include/uapi/linux/prctl.h" "include/uapi/linux/sched.h" "include/uapi/linux/seccomp.h" "include/uapi/linux/stat.h" @@ -35,7 +34,6 @@ FILES=( "arch/x86/include/asm/inat_types.h" "arch/x86/include/asm/emulate_prefix.h" "arch/x86/include/asm/msr-index.h" - "arch/x86/include/uapi/asm/prctl.h" "arch/x86/lib/x86-opcode-map.txt" "arch/x86/tools/gen-insn-attr-x86.awk" "arch/arm/include/uapi/asm/perf_regs.h" @@ -93,9 +91,11 @@ SYNC_CHECK_FILES=( declare -a BEAUTY_FILES BEAUTY_FILES=( "arch/x86/include/asm/irq_vectors.h" + "arch/x86/include/uapi/asm/prctl.h" "include/linux/socket.h" "include/uapi/linux/fs.h" "include/uapi/linux/mount.h" + "include/uapi/linux/prctl.h" "include/uapi/linux/usbdevice_fs.h" "include/uapi/sound/asound.h" ) diff --git a/tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h b/tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h new file mode 100644 index 000000000000..384e2cc6ac19 --- /dev/null +++ b/tools/perf/trace/beauty/arch/x86/include/uapi/asm/prctl.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_X86_PRCTL_H +#define _ASM_X86_PRCTL_H + +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 + +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 + +#define ARCH_XCOMP_TILECFG 17 +#define ARCH_XCOMP_TILEDATA 18 + +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 + +/* Don't use 0x3001-0x3004 because of old glibcs */ + +#define ARCH_GET_UNTAG_MASK 0x4001 +#define ARCH_ENABLE_TAGGED_ADDR 0x4002 +#define ARCH_GET_MAX_TAG_BITS 0x4003 +#define ARCH_FORCE_TAGGED_SVA 0x4004 + +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +/* ARCH_SHSTK_ features bits */ +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#endif /* _ASM_X86_PRCTL_H */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h new file mode 100644 index 000000000000..370ed14b1ae0 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_PRCTL_H +#define _LINUX_PRCTL_H + +#include + +/* Values to pass as first argument to prctl() */ + +#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ +#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ + +/* Get/set current->mm->dumpable */ +#define PR_GET_DUMPABLE 3 +#define PR_SET_DUMPABLE 4 + +/* Get/set unaligned access control bits (if meaningful) */ +#define PR_GET_UNALIGN 5 +#define PR_SET_UNALIGN 6 +# define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */ +# define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */ + +/* Get/set whether or not to drop capabilities on setuid() away from + * uid 0 (as per security/commoncap.c) */ +#define PR_GET_KEEPCAPS 7 +#define PR_SET_KEEPCAPS 8 + +/* Get/set floating-point emulation control bits (if meaningful) */ +#define PR_GET_FPEMU 9 +#define PR_SET_FPEMU 10 +# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ +# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ + +/* Get/set floating-point exception mode (if meaningful) */ +#define PR_GET_FPEXC 11 +#define PR_SET_FPEXC 12 +# define PR_FP_EXC_SW_ENABLE 0x80 /* Use FPEXC for FP exception enables */ +# define PR_FP_EXC_DIV 0x010000 /* floating point divide by zero */ +# define PR_FP_EXC_OVF 0x020000 /* floating point overflow */ +# define PR_FP_EXC_UND 0x040000 /* floating point underflow */ +# define PR_FP_EXC_RES 0x080000 /* floating point inexact result */ +# define PR_FP_EXC_INV 0x100000 /* floating point invalid operation */ +# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */ +# define PR_FP_EXC_NONRECOV 1 /* async non-recoverable exc. mode */ +# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */ +# define PR_FP_EXC_PRECISE 3 /* precise exception mode */ + +/* Get/set whether we use statistical process timing or accurate timestamp + * based process timing */ +#define PR_GET_TIMING 13 +#define PR_SET_TIMING 14 +# define PR_TIMING_STATISTICAL 0 /* Normal, traditional, + statistical process timing */ +# define PR_TIMING_TIMESTAMP 1 /* Accurate timestamp based + process timing */ + +#define PR_SET_NAME 15 /* Set process name */ +#define PR_GET_NAME 16 /* Get process name */ + +/* Get/set process endian */ +#define PR_GET_ENDIAN 19 +#define PR_SET_ENDIAN 20 +# define PR_ENDIAN_BIG 0 +# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ +# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ + +/* Get/set process seccomp mode */ +#define PR_GET_SECCOMP 21 +#define PR_SET_SECCOMP 22 + +/* Get/set the capability bounding set (as per security/commoncap.c) */ +#define PR_CAPBSET_READ 23 +#define PR_CAPBSET_DROP 24 + +/* Get/set the process' ability to use the timestamp counter instruction */ +#define PR_GET_TSC 25 +#define PR_SET_TSC 26 +# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ +# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ + +/* Get/set securebits (as per security/commoncap.c) */ +#define PR_GET_SECUREBITS 27 +#define PR_SET_SECUREBITS 28 + +/* + * Get/set the timerslack as used by poll/select/nanosleep + * A value of 0 means "use default" + */ +#define PR_SET_TIMERSLACK 29 +#define PR_GET_TIMERSLACK 30 + +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 + +/* + * Set early/late kill mode for hwpoison memory corruption. + * This influences when the process gets killed on a memory corruption. + */ +#define PR_MCE_KILL 33 +# define PR_MCE_KILL_CLEAR 0 +# define PR_MCE_KILL_SET 1 + +# define PR_MCE_KILL_LATE 0 +# define PR_MCE_KILL_EARLY 1 +# define PR_MCE_KILL_DEFAULT 2 + +#define PR_MCE_KILL_GET 34 + +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 +# define PR_SET_MM_ARG_START 8 +# define PR_SET_MM_ARG_END 9 +# define PR_SET_MM_ENV_START 10 +# define PR_SET_MM_ENV_END 11 +# define PR_SET_MM_AUXV 12 +# define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; + +/* + * Set specific pid that is allowed to ptrace the current task. + * A value of 0 mean "no process". + */ +#define PR_SET_PTRACER 0x59616d61 +# define PR_SET_PTRACER_ANY ((unsigned long)-1) + +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 + +/* + * If no_new_privs is set, then operations that grant new privileges (i.e. + * execve) will either fail or not grant them. This affects suid/sgid, + * file capabilities, and LSMs. + * + * Operations that merely manipulate or drop existing privileges (setresuid, + * capset, etc.) will still work. Drop those privileges if you want them gone. + * + * Changing LSM security domain is considered a new privilege. So, for example, + * asking selinux for a specific new context (e.g. with runcon) will result + * in execve returning -EPERM. + * + * See Documentation/userspace-api/no_new_privs.rst for more details. + */ +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 + +#define PR_SET_THP_DISABLE 41 +#define PR_GET_THP_DISABLE 42 + +/* + * No longer implemented, but left here to ensure the numbers stay reserved: + */ +#define PR_MPX_ENABLE_MANAGEMENT 43 +#define PR_MPX_DISABLE_MANAGEMENT 44 + +#define PR_SET_FP_MODE 45 +#define PR_GET_FP_MODE 46 +# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ +# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ + +/* Control the ambient capability set */ +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 + +/* arm64 Scalable Vector Extension controls */ +/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */ +#define PR_SVE_SET_VL 50 /* set task vector length */ +# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SVE_GET_VL 51 /* get task vector length */ +/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */ +# define PR_SVE_VL_LEN_MASK 0xffff +# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */ + +/* Per task speculation control */ +#define PR_GET_SPECULATION_CTRL 52 +#define PR_SET_SPECULATION_CTRL 53 +/* Speculation control variants */ +# define PR_SPEC_STORE_BYPASS 0 +# define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 +/* Return and control values for PR_SET/GET_SPECULATION_CTRL */ +# define PR_SPEC_NOT_AFFECTED 0 +# define PR_SPEC_PRCTL (1UL << 0) +# define PR_SPEC_ENABLE (1UL << 1) +# define PR_SPEC_DISABLE (1UL << 2) +# define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) + +/* Reset arm64 pointer authentication keys */ +#define PR_PAC_RESET_KEYS 54 +# define PR_PAC_APIAKEY (1UL << 0) +# define PR_PAC_APIBKEY (1UL << 1) +# define PR_PAC_APDAKEY (1UL << 2) +# define PR_PAC_APDBKEY (1UL << 3) +# define PR_PAC_APGAKEY (1UL << 4) + +/* Tagged user address controls for arm64 */ +#define PR_SET_TAGGED_ADDR_CTRL 55 +#define PR_GET_TAGGED_ADDR_CTRL 56 +# define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* MTE tag check fault modes */ +# define PR_MTE_TCF_NONE 0UL +# define PR_MTE_TCF_SYNC (1UL << 1) +# define PR_MTE_TCF_ASYNC (1UL << 2) +# define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC) +/* MTE tag inclusion mask */ +# define PR_MTE_TAG_SHIFT 3 +# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) +/* Unused; kept only for source compatibility */ +# define PR_MTE_TCF_SHIFT 1 + +/* Control reclaim behavior when allocating memory */ +#define PR_SET_IO_FLUSHER 57 +#define PR_GET_IO_FLUSHER 58 + +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +/* The control values for the user space selector when dispatch is enabled */ +# define SYSCALL_DISPATCH_FILTER_ALLOW 0 +# define SYSCALL_DISPATCH_FILTER_BLOCK 1 + +/* Set/get enabled arm64 pointer authentication keys */ +#define PR_PAC_SET_ENABLED_KEYS 60 +#define PR_PAC_GET_ENABLED_KEYS 61 + +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 +# define PR_SCHED_CORE_SCOPE_THREAD 0 +# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 +# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 + +/* arm64 Scalable Matrix Extension controls */ +/* Flag values must be in sync with SVE versions */ +#define PR_SME_SET_VL 63 /* set task vector length */ +# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SME_GET_VL 64 /* get task vector length */ +/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ +# define PR_SME_VL_LEN_MASK 0xffff +# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ + +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) + +#define PR_GET_MDWE 66 + +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + +#define PR_GET_AUXV 0x41555856 + +#define PR_SET_MEMORY_MERGE 67 +#define PR_GET_MEMORY_MERGE 68 + +#define PR_RISCV_V_SET_CONTROL 69 +#define PR_RISCV_V_GET_CONTROL 70 +# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0 +# define PR_RISCV_V_VSTATE_CTRL_OFF 1 +# define PR_RISCV_V_VSTATE_CTRL_ON 2 +# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4) +# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3 +# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc +# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f + +#endif /* _LINUX_PRCTL_H */ diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh index 9455d9672f14..e049f5e9c011 100755 --- a/tools/perf/trace/beauty/prctl_option.sh +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -1,18 +1,18 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ printf "static const char *prctl_options[] = {\n" regex='^#define[[:space:]]{1}PR_(\w+)[[:space:]]*([[:xdigit:]]+)([[:space:]]*/.*)?$' -grep -E $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ +grep -E $regex ${beauty_uapi_linux_dir}/prctl.h | grep -v PR_SET_PTRACER | \ sed -E "s%$regex%\2 \1%g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" printf "static const char *prctl_set_mm_options[] = {\n" regex='^#[[:space:]]+define[[:space:]]+PR_SET_MM_(\w+)[[:space:]]*([[:digit:]]+).*' -grep -E $regex ${header_dir}/prctl.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/prctl.h | \ sed -r "s/$regex/\2 \1/g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" diff --git a/tools/perf/trace/beauty/x86_arch_prctl.sh b/tools/perf/trace/beauty/x86_arch_prctl.sh index b1596df251f0..b714ffa3cb7a 100755 --- a/tools/perf/trace/beauty/x86_arch_prctl.sh +++ b/tools/perf/trace/beauty/x86_arch_prctl.sh @@ -2,9 +2,9 @@ # Copyright (C) 2018, Red Hat Inc, Arnaldo Carvalho de Melo # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && x86_header_dir=$1 || x86_header_dir=tools/arch/x86/include/uapi/asm/ +[ $# -eq 1 ] && beauty_x86_arch_asm_uapi_dir=$1 || beauty_x86_arch_asm_uapi_dir=tools/perf/trace/beauty/arch/x86/include/uapi/asm/ -prctl_arch_header=${x86_header_dir}/prctl.h +prctl_arch_header=${beauty_x86_arch_asm_uapi_dir}/prctl.h print_range () { idx=$1 -- cgit v1.2.3 From 6652830c87be8446a0e9b47f83af8990fee2f274 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Mar 2024 16:06:01 -0300 Subject: perf beauty: Use the system linux/fcntl.h instead of a copy from the kernel Builds ok all the way back to these older distros: 1 almalinux:8 : Ok gcc (GCC) 8.5.0 20210514 (Red Hat 8.5.0-20) , clang version 16.0.6 (Red Hat 16.0.6-2.module_el8.9.0+3621+df7f7146) flex 2.6.1 3 alpine:3.15 : Ok gcc (Alpine 10.3.1_git20211027) 10.3.1 20211027 , Alpine clang version 12.0.1 flex 2.6.4 15 debian:10 : Ok gcc (Debian 8.3.0-6) 8.3.0 , Debian clang version 11.0.1-2~deb10u1 flex 2.6.4 32 opensuse:15.4 : Ok gcc (SUSE Linux) 7.5.0 , clang version 15.0.7 flex 2.6.4 23 fedora:35 : Ok gcc (GCC) 11.3.1 20220421 (Red Hat 11.3.1-3) , clang version 13.0.1 (Fedora 13.0.1-1.fc35) flex 2.6.4 38 ubuntu:18.04 : Ok gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 flex 2.6.4 Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240315204835.748716-4-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/fcntl.c | 2 +- tools/perf/trace/beauty/flock.c | 2 +- tools/perf/trace/beauty/statx.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c index 56ef83b3d130..d075904dccce 100644 --- a/tools/perf/trace/beauty/fcntl.c +++ b/tools/perf/trace/beauty/fcntl.c @@ -7,7 +7,7 @@ #include "trace/beauty/beauty.h" #include -#include +#include static size_t fcntl__scnprintf_getfd(unsigned long val, char *bf, size_t size, bool show_prefix) { diff --git a/tools/perf/trace/beauty/flock.c b/tools/perf/trace/beauty/flock.c index c14274edd6d9..a6514a6f07cf 100644 --- a/tools/perf/trace/beauty/flock.c +++ b/tools/perf/trace/beauty/flock.c @@ -2,7 +2,7 @@ #include "trace/beauty/beauty.h" #include -#include +#include #ifndef LOCK_MAND #define LOCK_MAND 32 diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c index dc5943a6352d..d1ccc93316bd 100644 --- a/tools/perf/trace/beauty/statx.c +++ b/tools/perf/trace/beauty/statx.c @@ -8,7 +8,7 @@ #include "trace/beauty/beauty.h" #include #include -#include +#include #include size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg) -- cgit v1.2.3 From 8a1ad4413519b10fbe5e73300b198498a0b28806 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Mar 2024 16:18:14 -0300 Subject: tools headers: Remove now unused copies of uapi/{fcntl,openat2}.h and asm/fcntl.h These were used to build perf to provide defines not available in older distros, but this was back in 2017, nowadays all the distros that are supported and I have build containers for work using just the system headers, so ditch them. Some of these older distros may not have things that are used in 'perf trace', but then they also don't have libtraceevent packages, so don't build 'perf trace'. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240315204835.748716-5-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/fcntl.h | 221 --------------------------------- tools/include/uapi/linux/fcntl.h | 123 ------------------ tools/include/uapi/linux/openat2.h | 43 ------- tools/perf/check-headers.sh | 2 - 4 files changed, 389 deletions(-) delete mode 100644 tools/include/uapi/asm-generic/fcntl.h delete mode 100644 tools/include/uapi/linux/fcntl.h delete mode 100644 tools/include/uapi/linux/openat2.h (limited to 'tools') diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h deleted file mode 100644 index 1c7a0f6632c0..000000000000 --- a/tools/include/uapi/asm-generic/fcntl.h +++ /dev/null @@ -1,221 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_GENERIC_FCNTL_H -#define _ASM_GENERIC_FCNTL_H - -#include - -/* - * FMODE_EXEC is 0x20 - * FMODE_NONOTIFY is 0x4000000 - * These cannot be used by userspace O_* until internal and external open - * flags are split. - * -Eric Paris - */ - -/* - * When introducing new O_* bits, please check its uniqueness in fcntl_init(). - */ - -#define O_ACCMODE 00000003 -#define O_RDONLY 00000000 -#define O_WRONLY 00000001 -#define O_RDWR 00000002 -#ifndef O_CREAT -#define O_CREAT 00000100 /* not fcntl */ -#endif -#ifndef O_EXCL -#define O_EXCL 00000200 /* not fcntl */ -#endif -#ifndef O_NOCTTY -#define O_NOCTTY 00000400 /* not fcntl */ -#endif -#ifndef O_TRUNC -#define O_TRUNC 00001000 /* not fcntl */ -#endif -#ifndef O_APPEND -#define O_APPEND 00002000 -#endif -#ifndef O_NONBLOCK -#define O_NONBLOCK 00004000 -#endif -#ifndef O_DSYNC -#define O_DSYNC 00010000 /* used to be O_SYNC, see below */ -#endif -#ifndef FASYNC -#define FASYNC 00020000 /* fcntl, for BSD compatibility */ -#endif -#ifndef O_DIRECT -#define O_DIRECT 00040000 /* direct disk access hint */ -#endif -#ifndef O_LARGEFILE -#define O_LARGEFILE 00100000 -#endif -#ifndef O_DIRECTORY -#define O_DIRECTORY 00200000 /* must be a directory */ -#endif -#ifndef O_NOFOLLOW -#define O_NOFOLLOW 00400000 /* don't follow links */ -#endif -#ifndef O_NOATIME -#define O_NOATIME 01000000 -#endif -#ifndef O_CLOEXEC -#define O_CLOEXEC 02000000 /* set close_on_exec */ -#endif - -/* - * Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using - * the O_SYNC flag. We continue to use the existing numerical value - * for O_DSYNC semantics now, but using the correct symbolic name for it. - * This new value is used to request true Posix O_SYNC semantics. It is - * defined in this strange way to make sure applications compiled against - * new headers get at least O_DSYNC semantics on older kernels. - * - * This has the nice side-effect that we can simply test for O_DSYNC - * wherever we do not care if O_DSYNC or O_SYNC is used. - * - * Note: __O_SYNC must never be used directly. - */ -#ifndef O_SYNC -#define __O_SYNC 04000000 -#define O_SYNC (__O_SYNC|O_DSYNC) -#endif - -#ifndef O_PATH -#define O_PATH 010000000 -#endif - -#ifndef __O_TMPFILE -#define __O_TMPFILE 020000000 -#endif - -/* a horrid kludge trying to make sure that this will fail on old kernels */ -#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) - -#ifndef O_NDELAY -#define O_NDELAY O_NONBLOCK -#endif - -#define F_DUPFD 0 /* dup */ -#define F_GETFD 1 /* get close_on_exec */ -#define F_SETFD 2 /* set/clear close_on_exec */ -#define F_GETFL 3 /* get file->f_flags */ -#define F_SETFL 4 /* set file->f_flags */ -#ifndef F_GETLK -#define F_GETLK 5 -#define F_SETLK 6 -#define F_SETLKW 7 -#endif -#ifndef F_SETOWN -#define F_SETOWN 8 /* for sockets. */ -#define F_GETOWN 9 /* for sockets. */ -#endif -#ifndef F_SETSIG -#define F_SETSIG 10 /* for sockets. */ -#define F_GETSIG 11 /* for sockets. */ -#endif - -#if __BITS_PER_LONG == 32 || defined(__KERNEL__) -#ifndef F_GETLK64 -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 -#endif -#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ - -#ifndef F_SETOWN_EX -#define F_SETOWN_EX 15 -#define F_GETOWN_EX 16 -#endif - -#ifndef F_GETOWNER_UIDS -#define F_GETOWNER_UIDS 17 -#endif - -/* - * Open File Description Locks - * - * Usually record locks held by a process are released on *any* close and are - * not inherited across a fork(). - * - * These cmd values will set locks that conflict with process-associated - * record locks, but are "owned" by the open file description, not the - * process. This means that they are inherited across fork() like BSD (flock) - * locks, and they are only released automatically when the last reference to - * the open file against which they were acquired is put. - */ -#define F_OFD_GETLK 36 -#define F_OFD_SETLK 37 -#define F_OFD_SETLKW 38 - -#define F_OWNER_TID 0 -#define F_OWNER_PID 1 -#define F_OWNER_PGRP 2 - -struct f_owner_ex { - int type; - __kernel_pid_t pid; -}; - -/* for F_[GET|SET]FL */ -#define FD_CLOEXEC 1 /* actually anything with low bit set goes */ - -/* for posix fcntl() and lockf() */ -#ifndef F_RDLCK -#define F_RDLCK 0 -#define F_WRLCK 1 -#define F_UNLCK 2 -#endif - -/* for old implementation of bsd flock () */ -#ifndef F_EXLCK -#define F_EXLCK 4 /* or 3 */ -#define F_SHLCK 8 /* or 4 */ -#endif - -/* operations for bsd flock(), also used by the kernel implementation */ -#define LOCK_SH 1 /* shared lock */ -#define LOCK_EX 2 /* exclusive lock */ -#define LOCK_NB 4 /* or'd with one of the above to prevent - blocking */ -#define LOCK_UN 8 /* remove lock */ - -/* - * LOCK_MAND support has been removed from the kernel. We leave the symbols - * here to not break legacy builds, but these should not be used in new code. - */ -#define LOCK_MAND 32 /* This is a mandatory flock ... */ -#define LOCK_READ 64 /* which allows concurrent read operations */ -#define LOCK_WRITE 128 /* which allows concurrent write operations */ -#define LOCK_RW 192 /* which allows concurrent read & write ops */ - -#define F_LINUX_SPECIFIC_BASE 1024 - -#ifndef HAVE_ARCH_STRUCT_FLOCK -struct flock { - short l_type; - short l_whence; - __kernel_off_t l_start; - __kernel_off_t l_len; - __kernel_pid_t l_pid; -#ifdef __ARCH_FLOCK_EXTRA_SYSID - __ARCH_FLOCK_EXTRA_SYSID -#endif -#ifdef __ARCH_FLOCK_PAD - __ARCH_FLOCK_PAD -#endif -}; - -struct flock64 { - short l_type; - short l_whence; - __kernel_loff_t l_start; - __kernel_loff_t l_len; - __kernel_pid_t l_pid; -#ifdef __ARCH_FLOCK64_PAD - __ARCH_FLOCK64_PAD -#endif -}; -#endif /* HAVE_ARCH_STRUCT_FLOCK */ - -#endif /* _ASM_GENERIC_FCNTL_H */ diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h deleted file mode 100644 index 282e90aeb163..000000000000 --- a/tools/include/uapi/linux/fcntl.h +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_FCNTL_H -#define _UAPI_LINUX_FCNTL_H - -#include -#include - -#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) -#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) - -/* - * Cancel a blocking posix lock; internal use only until we expose an - * asynchronous lock api to userspace: - */ -#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5) - -/* Create a file descriptor with FD_CLOEXEC set. */ -#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6) - -/* - * Request nofications on a directory. - * See below for events that may be notified. - */ -#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) - -/* - * Set and get of pipe page size array - */ -#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) -#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) - -/* - * Set/Get seals - */ -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -/* - * Types of seals - */ -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ -#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ -#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ -/* (1U << 31) is reserved for signed error codes */ - -/* - * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the - * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on - * the specific file. - */ -#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) -#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) -#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) -#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) - -/* - * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be - * used to clear any hints previously set. - */ -#define RWH_WRITE_LIFE_NOT_SET 0 -#define RWH_WRITE_LIFE_NONE 1 -#define RWH_WRITE_LIFE_SHORT 2 -#define RWH_WRITE_LIFE_MEDIUM 3 -#define RWH_WRITE_LIFE_LONG 4 -#define RWH_WRITE_LIFE_EXTREME 5 - -/* - * The originally introduced spelling is remained from the first - * versions of the patch set that introduced the feature, see commit - * v4.13-rc1~212^2~51. - */ -#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET - -/* - * Types of directory notifications that may be requested. - */ -#define DN_ACCESS 0x00000001 /* File accessed */ -#define DN_MODIFY 0x00000002 /* File modified */ -#define DN_CREATE 0x00000004 /* File created */ -#define DN_DELETE 0x00000008 /* File removed */ -#define DN_RENAME 0x00000010 /* File renamed */ -#define DN_ATTRIB 0x00000020 /* File changed attibutes */ -#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ - -/* - * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is - * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to - * unlinkat. The two functions do completely different things and therefore, - * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to - * faccessat would be undefined behavior and thus treating it equivalent to - * AT_EACCESS is valid undefined behavior. - */ -#define AT_FDCWD -100 /* Special value used to indicate - openat should use the current - working directory. */ -#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ -#define AT_EACCESS 0x200 /* Test access permitted for - effective IDs, not real IDs. */ -#define AT_REMOVEDIR 0x200 /* Remove directory instead of - unlinking file. */ -#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ -#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ -#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ - -#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ -#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ -#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ -#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ - -#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ - -/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ -#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to - compare object identity and may not - be usable to open_by_handle_at(2) */ -#if defined(__KERNEL__) -#define AT_GETATTR_NOSEC 0x80000000 -#endif - -#endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/tools/include/uapi/linux/openat2.h b/tools/include/uapi/linux/openat2.h deleted file mode 100644 index a5feb7604948..000000000000 --- a/tools/include/uapi/linux/openat2.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_OPENAT2_H -#define _UAPI_LINUX_OPENAT2_H - -#include - -/* - * Arguments for how openat2(2) should open the target path. If only @flags and - * @mode are non-zero, then openat2(2) operates very similarly to openat(2). - * - * However, unlike openat(2), unknown or invalid bits in @flags result in - * -EINVAL rather than being silently ignored. @mode must be zero unless one of - * {O_CREAT, O_TMPFILE} are set. - * - * @flags: O_* flags. - * @mode: O_CREAT/O_TMPFILE file mode. - * @resolve: RESOLVE_* flags. - */ -struct open_how { - __u64 flags; - __u64 mode; - __u64 resolve; -}; - -/* how->resolve flags for openat2(2). */ -#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings - (includes bind-mounts). */ -#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style - "magic-links". */ -#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks - (implies OEXT_NO_MAGICLINKS) */ -#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like - "..", symlinks, and absolute - paths which escape the dirfd. */ -#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." - be scoped inside the dirfd - (similar to chroot(2)). */ -#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be - completed through cached lookup. May - return -EAGAIN if that's not - possible. */ - -#endif /* _UAPI_LINUX_OPENAT2_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 03d00110a48f..4ea674df11f8 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -10,12 +10,10 @@ FILES=( "include/uapi/drm/drm.h" "include/uapi/drm/i915_drm.h" "include/uapi/linux/fadvise.h" - "include/uapi/linux/fcntl.h" "include/uapi/linux/fscrypt.h" "include/uapi/linux/kcmp.h" "include/uapi/linux/kvm.h" "include/uapi/linux/in.h" - "include/uapi/linux/openat2.h" "include/uapi/linux/perf_event.h" "include/uapi/linux/sched.h" "include/uapi/linux/seccomp.h" -- cgit v1.2.3 From a672af9139a843eb7a48fd7846bb6df8f81b5f86 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Mar 2024 16:18:14 -0300 Subject: tools headers: Remove almost unused copy of uapi/stat.h, add few conditional defines These were used to build perf to provide defines not available in older distros, but this was back in 2017, nowadays most the distros that are supported and I have build containers for work using just the system headers, so ditch them. For the few that don't have STATX_MNT_ID{_UNIQUE}, or STATX_MNT_DIOALIGN add them conditionally. Some of these older distros may not have things that are used in 'perf trace', but then they also don't have libtraceevent packages, so don't build 'perf trace'. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240315204835.748716-6-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/stat.h | 195 ---------------------------------------- tools/perf/check-headers.sh | 1 - tools/perf/trace/beauty/statx.c | 12 ++- 3 files changed, 11 insertions(+), 197 deletions(-) delete mode 100644 tools/include/uapi/linux/stat.h (limited to 'tools') diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h deleted file mode 100644 index 2f2ee82d5517..000000000000 --- a/tools/include/uapi/linux/stat.h +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_STAT_H -#define _UAPI_LINUX_STAT_H - -#include - -#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) - -#define S_IFMT 00170000 -#define S_IFSOCK 0140000 -#define S_IFLNK 0120000 -#define S_IFREG 0100000 -#define S_IFBLK 0060000 -#define S_IFDIR 0040000 -#define S_IFCHR 0020000 -#define S_IFIFO 0010000 -#define S_ISUID 0004000 -#define S_ISGID 0002000 -#define S_ISVTX 0001000 - -#define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) -#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) -#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) -#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) -#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK) -#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO) -#define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK) - -#define S_IRWXU 00700 -#define S_IRUSR 00400 -#define S_IWUSR 00200 -#define S_IXUSR 00100 - -#define S_IRWXG 00070 -#define S_IRGRP 00040 -#define S_IWGRP 00020 -#define S_IXGRP 00010 - -#define S_IRWXO 00007 -#define S_IROTH 00004 -#define S_IWOTH 00002 -#define S_IXOTH 00001 - -#endif - -/* - * Timestamp structure for the timestamps in struct statx. - * - * tv_sec holds the number of seconds before (negative) or after (positive) - * 00:00:00 1st January 1970 UTC. - * - * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. - * - * __reserved is held in case we need a yet finer resolution. - */ -struct statx_timestamp { - __s64 tv_sec; - __u32 tv_nsec; - __s32 __reserved; -}; - -/* - * Structures for the extended file attribute retrieval system call - * (statx()). - * - * The caller passes a mask of what they're specifically interested in as a - * parameter to statx(). What statx() actually got will be indicated in - * st_mask upon return. - * - * For each bit in the mask argument: - * - * - if the datum is not supported: - * - * - the bit will be cleared, and - * - * - the datum will be set to an appropriate fabricated value if one is - * available (eg. CIFS can take a default uid and gid), otherwise - * - * - the field will be cleared; - * - * - otherwise, if explicitly requested: - * - * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is - * set or if the datum is considered out of date, and - * - * - the field will be filled in and the bit will be set; - * - * - otherwise, if not requested, but available in approximate form without any - * effort, it will be filled in anyway, and the bit will be set upon return - * (it might not be up to date, however, and no attempt will be made to - * synchronise the internal state first); - * - * - otherwise the field and the bit will be cleared before returning. - * - * Items in STATX_BASIC_STATS may be marked unavailable on return, but they - * will have values installed for compatibility purposes so that stat() and - * co. can be emulated in userspace. - */ -struct statx { - /* 0x00 */ - __u32 stx_mask; /* What results were written [uncond] */ - __u32 stx_blksize; /* Preferred general I/O size [uncond] */ - __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ - /* 0x10 */ - __u32 stx_nlink; /* Number of hard links */ - __u32 stx_uid; /* User ID of owner */ - __u32 stx_gid; /* Group ID of owner */ - __u16 stx_mode; /* File mode */ - __u16 __spare0[1]; - /* 0x20 */ - __u64 stx_ino; /* Inode number */ - __u64 stx_size; /* File size */ - __u64 stx_blocks; /* Number of 512-byte blocks allocated */ - __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ - /* 0x40 */ - struct statx_timestamp stx_atime; /* Last access time */ - struct statx_timestamp stx_btime; /* File creation time */ - struct statx_timestamp stx_ctime; /* Last attribute change time */ - struct statx_timestamp stx_mtime; /* Last data modification time */ - /* 0x80 */ - __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ - __u32 stx_rdev_minor; - __u32 stx_dev_major; /* ID of device containing file [uncond] */ - __u32 stx_dev_minor; - /* 0x90 */ - __u64 stx_mnt_id; - __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ - __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ - /* 0xa0 */ - __u64 __spare3[12]; /* Spare space for future expansion */ - /* 0x100 */ -}; - -/* - * Flags to be stx_mask - * - * Query request/result mask for statx() and struct statx::stx_mask. - * - * These bits should be set in the mask argument of statx() to request - * particular items when calling statx(). - */ -#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ -#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ -#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ -#define STATX_UID 0x00000008U /* Want/got stx_uid */ -#define STATX_GID 0x00000010U /* Want/got stx_gid */ -#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ -#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ -#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ -#define STATX_INO 0x00000100U /* Want/got stx_ino */ -#define STATX_SIZE 0x00000200U /* Want/got stx_size */ -#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ -#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ -#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ -#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ -#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ - -#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ - -#ifndef __KERNEL__ -/* - * This is deprecated, and shall remain the same value in the future. To avoid - * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) - * instead. - */ -#define STATX_ALL 0x00000fffU -#endif - -/* - * Attributes to be found in stx_attributes and masked in stx_attributes_mask. - * - * These give information about the features or the state of a file that might - * be of use to ordinary userspace programs such as GUIs or ls rather than - * specialised tools. - * - * Note that the flags marked [I] correspond to the FS_IOC_SETFLAGS flags - * semantically. Where possible, the numerical value is picked to correspond - * also. Note that the DAX attribute indicates that the file is in the CPU - * direct access state. It does not correspond to the per-inode flag that - * some filesystems support. - * - */ -#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ -#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ -#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ -#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ -#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ -#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ -#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ -#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ -#define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ - - -#endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 4ea674df11f8..859cd6f35b0a 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -17,7 +17,6 @@ FILES=( "include/uapi/linux/perf_event.h" "include/uapi/linux/sched.h" "include/uapi/linux/seccomp.h" - "include/uapi/linux/stat.h" "include/uapi/linux/vhost.h" "include/linux/bits.h" "include/vdso/bits.h" diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c index d1ccc93316bd..1f7e34ed4e02 100644 --- a/tools/perf/trace/beauty/statx.c +++ b/tools/perf/trace/beauty/statx.c @@ -9,7 +9,17 @@ #include #include #include -#include +#include + +#ifndef STATX_MNT_ID +#define STATX_MNT_ID 0x00001000U +#endif +#ifndef STATX_DIOALIGN +#define STATX_DIOALIGN 0x00002000U +#endif +#ifndef STATX_MNT_ID_UNIQUE +#define STATX_MNT_ID_UNIQUE 0x00004000U +#endif size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg) { -- cgit v1.2.3 From 36f65f9b7a454be6e2a329ca959c70c2f2529e9e Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 12 Mar 2024 13:25:07 +0000 Subject: perf docs arm_spe: Clarify more SPE requirements related to KPTI The question of exactly when KPTI needs to be disabled comes up a lot because it doesn't always need to be done. Add the relevant kernel function and some examples that describe the behavior. Also describe the interrupt requirement and that no error message will be printed if this isn't met. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240312132508.423320-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-arm-spe.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-arm-spe.txt b/tools/perf/Documentation/perf-arm-spe.txt index bf03222e9a68..0a3eda482307 100644 --- a/tools/perf/Documentation/perf-arm-spe.txt +++ b/tools/perf/Documentation/perf-arm-spe.txt @@ -116,6 +116,15 @@ Depending on CPU model, the kernel may need to be booted with page table isolati (kpti=off). If KPTI needs to be disabled, this will fail with a console message "profiling buffer inaccessible. Try passing 'kpti=off' on the kernel command line". +For the full criteria that determine whether KPTI needs to be forced off or not, see function +unmap_kernel_at_el0() in the kernel sources. Common cases where it's not required +are on the CPUs in kpti_safe_list, or on Arm v8.5+ where FEAT_E0PD is mandatory. + +The SPE interrupt must also be described by the firmware. If the module is loaded and KPTI is +disabled (or isn't required to be disabled) but the SPE PMU still doesn't show in +/sys/bus/event_source/devices/, then it's possible that the SPE interrupt isn't described by +ACPI or DT. In this case no warning will be printed by the driver. + Capturing SPE with perf command-line tools ------------------------------------------ @@ -199,7 +208,8 @@ Common errors - "Cannot find PMU `arm_spe'. Missing kernel support?" - Module not built or loaded, KPTI not disabled (see above), or running on a VM + Module not built or loaded, KPTI not disabled, interrupt not described by firmware, + or running on a VM. See 'Kernel Requirements' above. - "Arm SPE CONTEXT packets not found in the traces." -- cgit v1.2.3 From d4a98b45fbe6d06f4b79ed90d0bb05ced8674c23 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 15 Mar 2024 09:13:33 +0200 Subject: perf script: Show also errors for --insn-trace option The trace could be misleading if trace errors are not taken into account, so display them also by adding the itrace "e" option. Note --call-trace and --call-ret-trace already add the itrace "e" option. Fixes: b585ebdb5912cf14 ("perf script: Add --insn-trace for instruction decoding") Reviewed-by: Andi Kleen Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240315071334.3478-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 37088cc0ff1b..2e7148d667bd 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3806,7 +3806,7 @@ static int parse_insn_trace(const struct option *opt __maybe_unused, if (ret < 0) return ret; - itrace_parse_synth_opts(opt, "i0ns", 0); + itrace_parse_synth_opts(opt, "i0nse", 0); symbol_conf.nanosecs = true; return 0; } -- cgit v1.2.3 From bb69c912c4e8005cf1ee6c63782d2fc28838dee2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 15 Mar 2024 09:13:34 +0200 Subject: perf auxtrace: Fix multiple use of --itrace option If the --itrace option is used more than once, the options are combined, but "i" and "y" (sub-)options can be corrupted because itrace_do_parse_synth_opts() incorrectly overwrites the period type and period with default values. For example, with: --itrace=i0ns --itrace=e The processing of "--itrace=e", resets the "i" period from 0 nanoseconds to the default 100 microseconds. Fix by performing the default setting of period type and period only if "i" or "y" are present in the currently processed --itrace value. Fixes: f6986c95af84ff2a ("perf session: Add instruction tracing options") Signed-off-by: Adrian Hunter Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240315071334.3478-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 3684e6009b63..ef314a5797e3 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1466,6 +1466,7 @@ int itrace_do_parse_synth_opts(struct itrace_synth_opts *synth_opts, char *endptr; bool period_type_set = false; bool period_set = false; + bool iy = false; synth_opts->set = true; @@ -1484,6 +1485,7 @@ int itrace_do_parse_synth_opts(struct itrace_synth_opts *synth_opts, switch (*p++) { case 'i': case 'y': + iy = true; if (p[-1] == 'y') synth_opts->cycles = true; else @@ -1649,7 +1651,7 @@ int itrace_do_parse_synth_opts(struct itrace_synth_opts *synth_opts, } } out: - if (synth_opts->instructions || synth_opts->cycles) { + if (iy) { if (!period_type_set) synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE; -- cgit v1.2.3 From efae55bb78cf8722c7df01cd974197dfd13ece39 Mon Sep 17 00:00:00 2001 From: Ethan Adams Date: Thu, 14 Mar 2024 15:20:12 -0700 Subject: perf build: Fix out of tree build related to installation of sysreg-defs It seems that a previous modification to sysreg-defs, which corrected emitting the header to the specified output directory, exposed missing subdir, prefix variables. This breaks out of tree builds of perf as the file is now built into the output directory, but still tries to descend into output directory as a subdir. Fixes: a29ee6aea7030786 ("perf build: Ensure sysreg-defs Makefile respects output dir") Reviewed-by: Oliver Upton Reviewed-by: Tycho Andersen Signed-off-by: Ethan Adams Tested-by: Tycho Andersen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240314222012.47193-1-j.ethan.adams@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index b36c0c68c346..f5654d06e313 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -458,18 +458,19 @@ SHELL = $(SHELL_PATH) arm64_gen_sysreg_dir := $(srctree)/tools/arch/arm64/tools ifneq ($(OUTPUT),) - arm64_gen_sysreg_outdir := $(OUTPUT) + arm64_gen_sysreg_outdir := $(abspath $(OUTPUT)) else arm64_gen_sysreg_outdir := $(CURDIR) endif arm64-sysreg-defs: FORCE - $(Q)$(MAKE) -C $(arm64_gen_sysreg_dir) O=$(arm64_gen_sysreg_outdir) + $(Q)$(MAKE) -C $(arm64_gen_sysreg_dir) O=$(arm64_gen_sysreg_outdir) \ + prefix= subdir= arm64-sysreg-defs-clean: $(call QUIET_CLEAN,arm64-sysreg-defs) $(Q)$(MAKE) -C $(arm64_gen_sysreg_dir) O=$(arm64_gen_sysreg_outdir) \ - clean > /dev/null + prefix= subdir= clean > /dev/null beauty_linux_dir := $(srctree)/tools/perf/trace/beauty/include/linux/ beauty_uapi_linux_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/linux/ -- cgit v1.2.3 From b6b4a62d8525c3093c3273dc6b2e6831adbfc4b2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:50 -0800 Subject: libperf cpumap: Add any, empty and min helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additional helpers to better replace perf_cpu_map__has_any_cpu_or_is_empty(). Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20240202234057.2085863-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 27 +++++++++++++++++++++++++++ tools/lib/perf/include/perf/cpumap.h | 16 ++++++++++++++++ tools/lib/perf/libperf.map | 4 ++++ 3 files changed, 47 insertions(+) (limited to 'tools') diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 4adcd7920d03..ba49552952c5 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -316,6 +316,19 @@ bool perf_cpu_map__has_any_cpu_or_is_empty(const struct perf_cpu_map *map) return map ? __perf_cpu_map__cpu(map, 0).cpu == -1 : true; } +bool perf_cpu_map__is_any_cpu_or_is_empty(const struct perf_cpu_map *map) +{ + if (!map) + return true; + + return __perf_cpu_map__nr(map) == 1 && __perf_cpu_map__cpu(map, 0).cpu == -1; +} + +bool perf_cpu_map__is_empty(const struct perf_cpu_map *map) +{ + return map == NULL; +} + int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu) { int low, high; @@ -372,6 +385,20 @@ bool perf_cpu_map__has_any_cpu(const struct perf_cpu_map *map) return map && __perf_cpu_map__cpu(map, 0).cpu == -1; } +struct perf_cpu perf_cpu_map__min(const struct perf_cpu_map *map) +{ + struct perf_cpu cpu, result = { + .cpu = -1 + }; + int idx; + + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, map) { + result = cpu; + break; + } + return result; +} + struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map) { struct perf_cpu result = { diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index 228c6c629b0c..90457d17fb2f 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -61,6 +61,22 @@ LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); * perf_cpu_map__has_any_cpu_or_is_empty - is map either empty or has the "any CPU"/dummy value. */ LIBPERF_API bool perf_cpu_map__has_any_cpu_or_is_empty(const struct perf_cpu_map *map); +/** + * perf_cpu_map__is_any_cpu_or_is_empty - is map either empty or the "any CPU"/dummy value. + */ +LIBPERF_API bool perf_cpu_map__is_any_cpu_or_is_empty(const struct perf_cpu_map *map); +/** + * perf_cpu_map__is_empty - does the map contain no values and it doesn't + * contain the special "any CPU"/dummy value. + */ +LIBPERF_API bool perf_cpu_map__is_empty(const struct perf_cpu_map *map); +/** + * perf_cpu_map__min - the minimum CPU value or -1 if empty or just the "any CPU"/dummy value. + */ +LIBPERF_API struct perf_cpu perf_cpu_map__min(const struct perf_cpu_map *map); +/** + * perf_cpu_map__max - the maximum CPU value or -1 if empty or just the "any CPU"/dummy value. + */ LIBPERF_API struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map); LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_cpu cpu); LIBPERF_API bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map index 10b3f3722642..2aa79b696032 100644 --- a/tools/lib/perf/libperf.map +++ b/tools/lib/perf/libperf.map @@ -10,6 +10,10 @@ LIBPERF_0.0.1 { perf_cpu_map__nr; perf_cpu_map__cpu; perf_cpu_map__has_any_cpu_or_is_empty; + perf_cpu_map__is_any_cpu_or_is_empty; + perf_cpu_map__is_empty; + perf_cpu_map__has_any_cpu; + perf_cpu_map__min; perf_cpu_map__max; perf_cpu_map__has; perf_thread_map__new_array; -- cgit v1.2.3 From dcd45b376d0ab07b987fa968a01e7a5b0c4bf837 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:51 -0800 Subject: libperf cpumap: Ensure empty cpumap is NULL from alloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Potential corner cases could cause a cpumap to be allocated with size 0, but an empty cpumap should be represented as NULL. Add a path in perf_cpu_map__alloc() to ensure this. Suggested-by: James Clark Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Closes: https://lore.kernel.org/lkml/2cd09e7c-eb88-6726-6169-647dcd0a8101@arm.com/ Link: https://lore.kernel.org/r/20240202234057.2085863-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index ba49552952c5..cae799ad44e1 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -18,9 +18,13 @@ void perf_cpu_map__set_nr(struct perf_cpu_map *map, int nr_cpus) struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus) { - RC_STRUCT(perf_cpu_map) *cpus = malloc(sizeof(*cpus) + sizeof(struct perf_cpu) * nr_cpus); + RC_STRUCT(perf_cpu_map) *cpus; struct perf_cpu_map *result; + if (nr_cpus == 0) + return NULL; + + cpus = malloc(sizeof(*cpus) + sizeof(struct perf_cpu) * nr_cpus); if (ADD_RC_CHK(result, cpus)) { cpus->nr = nr_cpus; refcount_set(&cpus->refcnt, 1); -- cgit v1.2.3 From e28ee1239daa147a30c287ff0ad34ccadcc01de0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:52 -0800 Subject: perf arm-spe/cs-etm: Directly iterate CPU maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than iterate all CPUs and see if they are in CPU maps, directly iterate the CPU map. Similarly make use of the intersect function taking care for when "any" CPU is specified. Switch perf_cpu_map__has_any_cpu_or_is_empty() to more appropriate alternatives. Reviewed-by: James Clark Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20240202234057.2085863-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 114 +++++++++++++++-------------------- tools/perf/arch/arm64/util/arm-spe.c | 4 +- 2 files changed, 51 insertions(+), 67 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 77e6663c1703..07be32d99805 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -197,38 +197,37 @@ static int cs_etm_validate_timestamp(struct auxtrace_record *itr, static int cs_etm_validate_config(struct auxtrace_record *itr, struct evsel *evsel) { - int i, err = -EINVAL; + int idx, err = 0; struct perf_cpu_map *event_cpus = evsel->evlist->core.user_requested_cpus; - struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); - - /* Set option of each CPU we have */ - for (i = 0; i < cpu__max_cpu().cpu; i++) { - struct perf_cpu cpu = { .cpu = i, }; + struct perf_cpu_map *intersect_cpus; + struct perf_cpu cpu; - /* - * In per-cpu case, do the validation for CPUs to work with. - * In per-thread case, the CPU map is empty. Since the traced - * program can run on any CPUs in this case, thus don't skip - * validation. - */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(event_cpus) && - !perf_cpu_map__has(event_cpus, cpu)) - continue; + /* + * Set option of each CPU we have. In per-cpu case, do the validation + * for CPUs to work with. In per-thread case, the CPU map has the "any" + * CPU value. Since the traced program can run on any CPUs in this case, + * thus don't skip validation. + */ + if (!perf_cpu_map__has_any_cpu(event_cpus)) { + struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); - if (!perf_cpu_map__has(online_cpus, cpu)) - continue; + intersect_cpus = perf_cpu_map__intersect(event_cpus, online_cpus); + perf_cpu_map__put(online_cpus); + } else { + intersect_cpus = perf_cpu_map__new_online_cpus(); + } - err = cs_etm_validate_context_id(itr, evsel, i); + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { + err = cs_etm_validate_context_id(itr, evsel, cpu.cpu); if (err) - goto out; - err = cs_etm_validate_timestamp(itr, evsel, i); + break; + + err = cs_etm_validate_timestamp(itr, evsel, cpu.cpu); if (err) - goto out; + break; } - err = 0; -out: - perf_cpu_map__put(online_cpus); + perf_cpu_map__put(intersect_cpus); return err; } @@ -435,7 +434,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, * Also the case of per-cpu mmaps, need the contextID in order to be notified * when a context switch happened. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, "timestamp", 1); evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, @@ -461,7 +460,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, evsel->core.attr.sample_period = 1; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(evsel, TIME); err = cs_etm_validate_config(itr, cs_etm_evsel); @@ -533,45 +532,31 @@ static size_t cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, struct evlist *evlist __maybe_unused) { - int i; + int idx; int etmv3 = 0, etmv4 = 0, ete = 0; struct perf_cpu_map *event_cpus = evlist->core.user_requested_cpus; - struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); - - /* cpu map is not empty, we have specific CPUs to work with */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(event_cpus)) { - for (i = 0; i < cpu__max_cpu().cpu; i++) { - struct perf_cpu cpu = { .cpu = i, }; + struct perf_cpu_map *intersect_cpus; + struct perf_cpu cpu; - if (!perf_cpu_map__has(event_cpus, cpu) || - !perf_cpu_map__has(online_cpus, cpu)) - continue; + if (!perf_cpu_map__has_any_cpu(event_cpus)) { + /* cpu map is not "any" CPU , we have specific CPUs to work with */ + struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); - if (cs_etm_is_ete(itr, i)) - ete++; - else if (cs_etm_is_etmv4(itr, i)) - etmv4++; - else - etmv3++; - } + intersect_cpus = perf_cpu_map__intersect(event_cpus, online_cpus); + perf_cpu_map__put(online_cpus); } else { - /* get configuration for all CPUs in the system */ - for (i = 0; i < cpu__max_cpu().cpu; i++) { - struct perf_cpu cpu = { .cpu = i, }; - - if (!perf_cpu_map__has(online_cpus, cpu)) - continue; - - if (cs_etm_is_ete(itr, i)) - ete++; - else if (cs_etm_is_etmv4(itr, i)) - etmv4++; - else - etmv3++; - } + /* Event can be "any" CPU so count all online CPUs. */ + intersect_cpus = perf_cpu_map__new_online_cpus(); } - - perf_cpu_map__put(online_cpus); + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { + if (cs_etm_is_ete(itr, cpu.cpu)) + ete++; + else if (cs_etm_is_etmv4(itr, cpu.cpu)) + etmv4++; + else + etmv3++; + } + perf_cpu_map__put(intersect_cpus); return (CS_ETM_HEADER_SIZE + (ete * CS_ETE_PRIV_SIZE) + @@ -813,16 +798,15 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, if (!session->evlist->core.nr_mmaps) return -EINVAL; - /* If the cpu_map is empty all online CPUs are involved */ - if (perf_cpu_map__has_any_cpu_or_is_empty(event_cpus)) { + /* If the cpu_map has the "any" CPU all online CPUs are involved */ + if (perf_cpu_map__has_any_cpu(event_cpus)) { cpu_map = online_cpus; } else { /* Make sure all specified CPUs are online */ - for (i = 0; i < perf_cpu_map__nr(event_cpus); i++) { - struct perf_cpu cpu = { .cpu = i, }; + struct perf_cpu cpu; - if (perf_cpu_map__has(event_cpus, cpu) && - !perf_cpu_map__has(online_cpus, cpu)) + perf_cpu_map__for_each_cpu(cpu, i, event_cpus) { + if (!perf_cpu_map__has(online_cpus, cpu)) return -EINVAL; } diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 51ccbfd3d246..0b52e67edb3b 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -232,7 +232,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, sample CPU for AUX event; * also enable the timestamp tracing for samples correlation. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(arm_spe_evsel, CPU); evsel__set_config_if_unset(arm_spe_pmu, arm_spe_evsel, "ts_enable", 1); @@ -265,7 +265,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, tracking_evsel->core.attr.sample_period = 1; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(tracking_evsel, TIME); evsel__set_sample_bit(tracking_evsel, CPU); -- cgit v1.2.3 From 291dcd774b644875c9eb2a56b272919fe35c5c6f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:53 -0800 Subject: perf intel-pt/intel-bts: Switch perf_cpu_map__has_any_cpu_or_is_empty use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch perf_cpu_map__has_any_cpu_or_is_empty() to perf_cpu_map__is_any_cpu_or_is_empty() as a CPU map may contain CPUs as well as the dummy event and perf_cpu_map__is_any_cpu_or_is_empty() is a more correct alternative. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20240202234057.2085863-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/intel-bts.c | 4 ++-- tools/perf/arch/x86/util/intel-pt.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index af8ae4647585..34696f3d3d5d 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -143,7 +143,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, if (!opts->full_auxtrace) return 0; - if (opts->full_auxtrace && !perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (opts->full_auxtrace && !perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { pr_err(INTEL_BTS_PMU_NAME " does not support per-cpu recording\n"); return -EINVAL; } @@ -224,7 +224,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, we need the CPU on the * AUX event. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(intel_bts_evsel, CPU); } diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index d199619df3ab..6de7e2d21075 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -369,7 +369,7 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, ui__warning("Intel Processor Trace: TSC not available\n"); } - per_cpu_mmaps = !perf_cpu_map__has_any_cpu_or_is_empty(session->evlist->core.user_requested_cpus); + per_cpu_mmaps = !perf_cpu_map__is_any_cpu_or_is_empty(session->evlist->core.user_requested_cpus); auxtrace_info->type = PERF_AUXTRACE_INTEL_PT; auxtrace_info->priv[INTEL_PT_PMU_TYPE] = intel_pt_pmu->type; @@ -774,7 +774,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * Per-cpu recording needs sched_switch events to distinguish different * threads. */ - if (have_timing_info && !perf_cpu_map__has_any_cpu_or_is_empty(cpus) && + if (have_timing_info && !perf_cpu_map__is_any_cpu_or_is_empty(cpus) && !record_opts__no_switch_events(opts)) { if (perf_can_record_switch_events()) { bool cpu_wide = !target__none(&opts->target) && @@ -832,7 +832,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, we need the CPU on the * AUX event. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(intel_pt_evsel, CPU); } @@ -858,7 +858,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, tracking_evsel->immediate = true; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(tracking_evsel, TIME); /* And the CPU for switch events */ evsel__set_sample_bit(tracking_evsel, CPU); @@ -870,7 +870,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * Warn the user when we do not have enough information to decode i.e. * per-cpu with no sched_switch (except workload-only). */ - if (!ptr->have_sched_switch && !perf_cpu_map__has_any_cpu_or_is_empty(cpus) && + if (!ptr->have_sched_switch && !perf_cpu_map__is_any_cpu_or_is_empty(cpus) && !target__none(&opts->target) && !intel_pt_evsel->core.attr.exclude_user) ui__warning("Intel Processor Trace decoding will not be possible except for kernel tracing!\n"); -- cgit v1.2.3 From 3e5deb708c8f3f7d645f567b2ba38d8045fa11ba Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:54 -0800 Subject: perf cpumap: Clean up use of perf_cpu_map__has_any_cpu_or_is_empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most uses of what was perf_cpu_map__empty but is now perf_cpu_map__has_any_cpu_or_is_empty want to do something with the CPU map if it contains CPUs. Replace uses of perf_cpu_map__has_any_cpu_or_is_empty with other helpers so that CPUs within the map can be handled. Reviewed-by: James Clark Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20240202234057.2085863-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 6 +----- tools/perf/builtin-stat.c | 9 ++++----- tools/perf/util/auxtrace.c | 4 ++-- tools/perf/util/record.c | 2 +- tools/perf/util/stat.c | 2 +- 5 files changed, 9 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 14f228c7c869..1f1d17df9b9a 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2319,11 +2319,7 @@ static int setup_nodes(struct perf_session *session) nodes[node] = set; - /* empty node, skip */ - if (perf_cpu_map__has_any_cpu_or_is_empty(map)) - continue; - - perf_cpu_map__for_each_cpu(cpu, idx, map) { + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, map) { __set_bit(cpu.cpu, set); if (WARN_ONCE(cpu2node[cpu.cpu] != -1, "node/cpu topology bug")) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6bba1a89d030..a47ced077fa6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1319,10 +1319,9 @@ static int cpu__get_cache_id_from_map(struct perf_cpu cpu, char *map) * be the first online CPU in the cache domain else use the * first online CPU of the cache domain as the ID. */ - if (perf_cpu_map__has_any_cpu_or_is_empty(cpu_map)) + id = perf_cpu_map__min(cpu_map).cpu; + if (id == -1) id = cpu.cpu; - else - id = perf_cpu_map__cpu(cpu_map, 0).cpu; /* Free the perf_cpu_map used to find the cache ID */ perf_cpu_map__put(cpu_map); @@ -1642,7 +1641,7 @@ static int perf_stat_init_aggr_mode(void) * taking the highest cpu number to be the size of * the aggregation translate cpumap. */ - if (!perf_cpu_map__has_any_cpu_or_is_empty(evsel_list->core.user_requested_cpus)) + if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel_list->core.user_requested_cpus)) nr = perf_cpu_map__max(evsel_list->core.user_requested_cpus).cpu; else nr = 0; @@ -2334,7 +2333,7 @@ int process_stat_config_event(struct perf_session *session, perf_event__read_stat_config(&stat_config, &event->stat_config); - if (perf_cpu_map__has_any_cpu_or_is_empty(st->cpus)) { + if (perf_cpu_map__is_empty(st->cpus)) { if (st->aggr_mode != AGGR_UNSET) pr_warning("warning: processing task data, aggregation mode not set\n"); } else if (st->aggr_mode != AGGR_UNSET) { diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index ef314a5797e3..cfa2153d4611 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -174,7 +174,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, struct evlist *evlist, struct evsel *evsel, int idx) { - bool per_cpu = !perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus); + bool per_cpu = !perf_cpu_map__has_any_cpu(evlist->core.user_requested_cpus); mp->mmap_needed = evsel->needs_auxtrace_mmap; @@ -648,7 +648,7 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr, static int evlist__enable_event_idx(struct evlist *evlist, struct evsel *evsel, int idx) { - bool per_cpu_mmaps = !perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus); + bool per_cpu_mmaps = !perf_cpu_map__has_any_cpu(evlist->core.user_requested_cpus); if (per_cpu_mmaps) { struct perf_cpu evlist_cpu = perf_cpu_map__cpu(evlist->core.all_cpus, idx); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 87e817b3cf7e..e867de8ddaaa 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -237,7 +237,7 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str) evsel = evlist__last(temp_evlist); - if (!evlist || perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus)) { + if (!evlist || perf_cpu_map__is_any_cpu_or_is_empty(evlist->core.user_requested_cpus)) { struct perf_cpu_map *cpus = perf_cpu_map__new_online_cpus(); if (cpus) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index b0bcf92f0f9c..0bd5467389e4 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -315,7 +315,7 @@ static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals, if (!counter->per_pkg) return 0; - if (perf_cpu_map__has_any_cpu_or_is_empty(cpus)) + if (perf_cpu_map__is_any_cpu_or_is_empty(cpus)) return 0; if (!mask) { -- cgit v1.2.3 From 4ddccd0048089eb324330fa3a0036e41c31355d3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:55 -0800 Subject: perf arm64 header: Remove unnecessary CPU map get and put MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In both cases the CPU map is known owned by either the caller or a PMU. Reviewed-by: James Clark Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20240202234057.2085863-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/header.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c index 97037499152e..a9de0b5187dd 100644 --- a/tools/perf/arch/arm64/util/header.c +++ b/tools/perf/arch/arm64/util/header.c @@ -25,8 +25,6 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) if (!sysfs || sz < MIDR_SIZE) return EINVAL; - cpus = perf_cpu_map__get(cpus); - for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { char path[PATH_MAX]; FILE *file; @@ -51,7 +49,6 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) break; } - perf_cpu_map__put(cpus); return ret; } -- cgit v1.2.3 From 954ac1b4a79a06736fd85a183f34c18c152286c6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:56 -0800 Subject: perf stat: Remove duplicate cpus_map_matched function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use libperf's perf_cpu_map__equal() that performs the same function. Reviewed-by: James Clark Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20240202234057.2085863-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a47ced077fa6..65388c57bb5d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -164,26 +164,6 @@ static struct perf_stat_config stat_config = { .iostat_run = false, }; -static bool cpus_map_matched(struct evsel *a, struct evsel *b) -{ - if (!a->core.cpus && !b->core.cpus) - return true; - - if (!a->core.cpus || !b->core.cpus) - return false; - - if (perf_cpu_map__nr(a->core.cpus) != perf_cpu_map__nr(b->core.cpus)) - return false; - - for (int i = 0; i < perf_cpu_map__nr(a->core.cpus); i++) { - if (perf_cpu_map__cpu(a->core.cpus, i).cpu != - perf_cpu_map__cpu(b->core.cpus, i).cpu) - return false; - } - - return true; -} - static void evlist__check_cpu_maps(struct evlist *evlist) { struct evsel *evsel, *warned_leader = NULL; @@ -194,7 +174,7 @@ static void evlist__check_cpu_maps(struct evlist *evlist) /* Check that leader matches cpus with each member. */ if (leader == evsel) continue; - if (cpus_map_matched(leader, evsel)) + if (perf_cpu_map__equal(leader->core.cpus, evsel->core.cpus)) continue; /* If there's mismatch disable the group and warn user. */ -- cgit v1.2.3 From 71bc3ac8e8c93f769d1a2040153fe6d6b8093fa7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 2 Feb 2024 15:40:57 -0800 Subject: perf cpumap: Use perf_cpu_map__for_each_cpu when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than manually iterating the CPU map, use perf_cpu_map__for_each_cpu(). When possible tidy local variables. Reviewed-by: James Clark Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20240202234057.2085863-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/header.c | 10 ++--- tools/perf/tests/bitmap.c | 13 +++--- tools/perf/tests/topology.c | 46 +++++++++++----------- tools/perf/util/bpf_kwork.c | 16 ++++---- tools/perf/util/bpf_kwork_top.c | 12 +++--- tools/perf/util/cpumap.c | 12 +++--- .../util/scripting-engines/trace-event-python.c | 12 +++--- tools/perf/util/session.c | 5 +-- tools/perf/util/svghelper.c | 20 +++++----- 9 files changed, 72 insertions(+), 74 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c index a9de0b5187dd..741df3614a09 100644 --- a/tools/perf/arch/arm64/util/header.c +++ b/tools/perf/arch/arm64/util/header.c @@ -4,8 +4,6 @@ #include #include #include -#include -#include #include #include #include "debug.h" @@ -19,18 +17,18 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) { const char *sysfs = sysfs__mountpoint(); - int cpu; - int ret = EINVAL; + struct perf_cpu cpu; + int idx, ret = EINVAL; if (!sysfs || sz < MIDR_SIZE) return EINVAL; - for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { + perf_cpu_map__for_each_cpu(cpu, idx, cpus) { char path[PATH_MAX]; FILE *file; scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d" MIDR, - sysfs, RC_CHK_ACCESS(cpus)->map[cpu].cpu); + sysfs, cpu.cpu); file = fopen(path, "r"); if (!file) { diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c index 0173f5402a35..98956e0e0765 100644 --- a/tools/perf/tests/bitmap.c +++ b/tools/perf/tests/bitmap.c @@ -11,18 +11,19 @@ static unsigned long *get_bitmap(const char *str, int nbits) { struct perf_cpu_map *map = perf_cpu_map__new(str); - unsigned long *bm = NULL; - int i; + unsigned long *bm; bm = bitmap_zalloc(nbits); if (map && bm) { - for (i = 0; i < perf_cpu_map__nr(map); i++) - __set_bit(perf_cpu_map__cpu(map, i).cpu, bm); + int i; + struct perf_cpu cpu; + + perf_cpu_map__for_each_cpu(cpu, i, map) + __set_bit(cpu.cpu, bm); } - if (map) - perf_cpu_map__put(map); + perf_cpu_map__put(map); return bm; } diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 2a842f53fbb5..a8cb5ba898ab 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -68,6 +68,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) }; int i; struct aggr_cpu_id id; + struct perf_cpu cpu; session = perf_session__new(&data, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); @@ -113,8 +114,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Session header CPU map not set", session->header.env.cpu); for (i = 0; i < session->header.env.nr_cpus_avail; i++) { - struct perf_cpu cpu = { .cpu = i }; - + cpu.cpu = i; if (!perf_cpu_map__has(map, cpu)) continue; pr_debug("CPU %d, core %d, socket %d\n", i, @@ -123,48 +123,48 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that CPU ID contains socket, die, core and CPU - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__cpu(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__cpu(cpu, NULL); TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", - perf_cpu_map__cpu(map, i).cpu == id.cpu.cpu); + cpu.cpu == id.cpu.cpu); TEST_ASSERT_VAL("Cpu map - Core ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); + session->header.env.cpu[cpu.cpu].core_id == id.core); TEST_ASSERT_VAL("Cpu map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Cpu map - Die ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); + session->header.env.cpu[cpu.cpu].die_id == id.die); TEST_ASSERT_VAL("Cpu map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Cpu map - Thread IDX is set", id.thread_idx == -1); } // Test that core ID contains socket, die and core - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__core(cpu, NULL); TEST_ASSERT_VAL("Core map - Core ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); + session->header.env.cpu[cpu.cpu].core_id == id.core); TEST_ASSERT_VAL("Core map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Core map - Die ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); + session->header.env.cpu[cpu.cpu].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Core map - Thread IDX is set", id.thread_idx == -1); } // Test that die ID contains socket and die - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__die(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__die(cpu, NULL); TEST_ASSERT_VAL("Die map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Die map - Die ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); + session->header.env.cpu[cpu.cpu].die_id == id.die); TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Die map - Core is set", id.core == -1); @@ -173,10 +173,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that socket ID contains only socket - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__socket(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__socket(cpu, NULL); TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", - session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + session->header.env.cpu[cpu.cpu].socket_id == id.socket); TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); @@ -187,10 +187,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that node ID contains only node - for (i = 0; i < perf_cpu_map__nr(map); i++) { - id = aggr_cpu_id__node(perf_cpu_map__cpu(map, i), NULL); + perf_cpu_map__for_each_cpu(cpu, i, map) { + id = aggr_cpu_id__node(cpu, NULL); TEST_ASSERT_VAL("Node map - Node ID doesn't match", - cpu__get_node(perf_cpu_map__cpu(map, i)) == id.node); + cpu__get_node(cpu) == id.node); TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Node map - Core is set", id.core == -1); diff --git a/tools/perf/util/bpf_kwork.c b/tools/perf/util/bpf_kwork.c index 6eb2c78fd7f4..44f0f708a15d 100644 --- a/tools/perf/util/bpf_kwork.c +++ b/tools/perf/util/bpf_kwork.c @@ -147,12 +147,12 @@ static bool valid_kwork_class_type(enum kwork_class_type type) static int setup_filters(struct perf_kwork *kwork) { - u8 val = 1; - int i, nr_cpus, key, fd; - struct perf_cpu_map *map; - if (kwork->cpu_list != NULL) { - fd = bpf_map__fd(skel->maps.perf_kwork_cpu_filter); + int idx, nr_cpus; + struct perf_cpu_map *map; + struct perf_cpu cpu; + int fd = bpf_map__fd(skel->maps.perf_kwork_cpu_filter); + if (fd < 0) { pr_debug("Invalid cpu filter fd\n"); return -1; @@ -165,8 +165,8 @@ static int setup_filters(struct perf_kwork *kwork) } nr_cpus = libbpf_num_possible_cpus(); - for (i = 0; i < perf_cpu_map__nr(map); i++) { - struct perf_cpu cpu = perf_cpu_map__cpu(map, i); + perf_cpu_map__for_each_cpu(cpu, idx, map) { + u8 val = 1; if (cpu.cpu >= nr_cpus) { perf_cpu_map__put(map); @@ -181,6 +181,8 @@ static int setup_filters(struct perf_kwork *kwork) } if (kwork->profile_name != NULL) { + int key, fd; + if (strlen(kwork->profile_name) >= MAX_KWORKNAME) { pr_err("Requested name filter %s too large, limit to %d\n", kwork->profile_name, MAX_KWORKNAME - 1); diff --git a/tools/perf/util/bpf_kwork_top.c b/tools/perf/util/bpf_kwork_top.c index 035e02272790..22a3b00a1e23 100644 --- a/tools/perf/util/bpf_kwork_top.c +++ b/tools/perf/util/bpf_kwork_top.c @@ -122,11 +122,11 @@ static bool valid_kwork_class_type(enum kwork_class_type type) static int setup_filters(struct perf_kwork *kwork) { - u8 val = 1; - int i, nr_cpus, fd; - struct perf_cpu_map *map; - if (kwork->cpu_list) { + int idx, nr_cpus, fd; + struct perf_cpu_map *map; + struct perf_cpu cpu; + fd = bpf_map__fd(skel->maps.kwork_top_cpu_filter); if (fd < 0) { pr_debug("Invalid cpu filter fd\n"); @@ -140,8 +140,8 @@ static int setup_filters(struct perf_kwork *kwork) } nr_cpus = libbpf_num_possible_cpus(); - for (i = 0; i < perf_cpu_map__nr(map); i++) { - struct perf_cpu cpu = perf_cpu_map__cpu(map, i); + perf_cpu_map__for_each_cpu(cpu, idx, map) { + u8 val = 1; if (cpu.cpu >= nr_cpus) { perf_cpu_map__put(map); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 356e30c42cd8..6a270d640acb 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -655,10 +655,10 @@ static char hex_char(unsigned char val) size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size) { - int i, cpu; + int idx; char *ptr = buf; unsigned char *bitmap; - struct perf_cpu last_cpu = perf_cpu_map__cpu(map, perf_cpu_map__nr(map) - 1); + struct perf_cpu c, last_cpu = perf_cpu_map__max(map); if (buf == NULL) return 0; @@ -669,12 +669,10 @@ size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size) return 0; } - for (i = 0; i < perf_cpu_map__nr(map); i++) { - cpu = perf_cpu_map__cpu(map, i).cpu; - bitmap[cpu / 8] |= 1 << (cpu % 8); - } + perf_cpu_map__for_each_cpu(c, idx, map) + bitmap[c.cpu / 8] |= 1 << (c.cpu % 8); - for (cpu = last_cpu.cpu / 4 * 4; cpu >= 0; cpu -= 4) { + for (int cpu = last_cpu.cpu / 4 * 4; cpu >= 0; cpu -= 4) { unsigned char bits = bitmap[cpu / 8]; if (cpu % 8) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index b4f0f60e60a6..8aa301948de5 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1699,13 +1699,15 @@ static void python_process_stat(struct perf_stat_config *config, { struct perf_thread_map *threads = counter->core.threads; struct perf_cpu_map *cpus = counter->core.cpus; - int cpu, thread; - for (thread = 0; thread < perf_thread_map__nr(threads); thread++) { - for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { - process_stat(counter, perf_cpu_map__cpu(cpus, cpu), + for (int thread = 0; thread < perf_thread_map__nr(threads); thread++) { + int idx; + struct perf_cpu cpu; + + perf_cpu_map__for_each_cpu(cpu, idx, cpus) { + process_stat(counter, cpu, perf_thread_map__pid(threads, thread), tstamp, - perf_counts(counter->counts, cpu, thread)); + perf_counts(counter->counts, idx, thread)); } } } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 06d0bd7fb459..02a932a83c51 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2749,6 +2749,7 @@ int perf_session__cpu_bitmap(struct perf_session *session, int i, err = -1; struct perf_cpu_map *map; int nr_cpus = min(session->header.env.nr_cpus_avail, MAX_NR_CPUS); + struct perf_cpu cpu; for (i = 0; i < PERF_TYPE_MAX; ++i) { struct evsel *evsel; @@ -2770,9 +2771,7 @@ int perf_session__cpu_bitmap(struct perf_session *session, return -1; } - for (i = 0; i < perf_cpu_map__nr(map); i++) { - struct perf_cpu cpu = perf_cpu_map__cpu(map, i); - + perf_cpu_map__for_each_cpu(cpu, i, map) { if (cpu.cpu >= nr_cpus) { pr_err("Requested CPU %d too large. " "Consider raising MAX_NR_CPUS\n", cpu.cpu); diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 1892e9b6aa7f..2b04f47f4db0 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -725,26 +725,24 @@ static void scan_core_topology(int *map, struct topology *t, int nr_cpus) static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus) { - int i; - int ret = 0; - struct perf_cpu_map *m; - struct perf_cpu c; + int idx, ret = 0; + struct perf_cpu_map *map; + struct perf_cpu cpu; - m = perf_cpu_map__new(s); - if (!m) + map = perf_cpu_map__new(s); + if (!map) return -1; - for (i = 0; i < perf_cpu_map__nr(m); i++) { - c = perf_cpu_map__cpu(m, i); - if (c.cpu >= nr_cpus) { + perf_cpu_map__for_each_cpu(cpu, idx, map) { + if (cpu.cpu >= nr_cpus) { ret = -1; break; } - __set_bit(c.cpu, cpumask_bits(b)); + __set_bit(cpu.cpu, cpumask_bits(b)); } - perf_cpu_map__put(m); + perf_cpu_map__put(map); return ret; } -- cgit v1.2.3 From b508965d35321534e84daf8946b2cc5f64517db9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:50:53 -0700 Subject: perf dwarf-aux: Remove unused pc argument It's not used, let's get rid of it. Signed-off-by: Namhyung Kim Reviewed-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 4 ++-- tools/perf/util/dwarf-aux.c | 7 ++----- tools/perf/util/dwarf-aux.h | 6 ++---- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 30c4d19fcf11..59ce5f4f4a40 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -263,7 +263,7 @@ static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr, offset = loc->offset; if (reg == DWARF_REG_PC) { - if (die_find_variable_by_addr(&cu_die, pc, addr, &var_die, &offset)) { + if (die_find_variable_by_addr(&cu_die, addr, &var_die, &offset)) { ret = check_variable(&var_die, type_die, offset, /*is_pointer=*/false); loc->offset = offset; @@ -312,7 +312,7 @@ retry: /* Search from the inner-most scope to the outer */ for (i = nr_scopes - 1; i >= 0; i--) { if (reg == DWARF_REG_PC) { - if (!die_find_variable_by_addr(&scopes[i], pc, addr, + if (!die_find_variable_by_addr(&scopes[i], addr, &var_die, &offset)) continue; } else { diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 2791126069b4..e84d0d6a7750 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1456,7 +1456,6 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) /** * die_find_variable_by_addr - Find variable located at given address * @sc_die: a scope DIE - * @pc: the program address to find * @addr: the data address to find * @die_mem: a buffer to save the resulting DIE * @offset: the offset in the resulting type @@ -1464,12 +1463,10 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) * Find the variable DIE located at the given address (in PC-relative mode). * This is usually for global variables. */ -Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr pc, - Dwarf_Addr addr, Dwarf_Die *die_mem, - int *offset) +Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr, + Dwarf_Die *die_mem, int *offset) { struct find_var_data data = { - .pc = pc, .addr = addr, }; Dwarf_Die *result; diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 85dd527ae1f7..9973801a20c1 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -146,9 +146,8 @@ Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg, Dwarf_Die *die_mem); /* Find a (global) variable located in the 'addr' */ -Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr pc, - Dwarf_Addr addr, Dwarf_Die *die_mem, - int *offset); +Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr, + Dwarf_Die *die_mem, int *offset); #else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ @@ -170,7 +169,6 @@ static inline Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die __maybe_unus } static inline Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die __maybe_unused, - Dwarf_Addr pc __maybe_unused, Dwarf_Addr addr __maybe_unused, Dwarf_Die *die_mem __maybe_unused, int *offset __maybe_unused) -- cgit v1.2.3 From 932dcc2c39aedf54ef291bc0b4129a54f5fe1e84 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:50:54 -0700 Subject: perf dwarf-aux: Add die_collect_vars() The die_collect_vars() is to find all variable information in the scope including function parameters. The struct die_var_type is to save the type of the variable with the location (reg and offset) as well as where it's defined in the code (addr). Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 118 +++++++++++++++++++++++++++++++++----------- tools/perf/util/dwarf-aux.h | 17 +++++++ 2 files changed, 107 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index e84d0d6a7750..785aa7a3d725 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1136,6 +1136,40 @@ int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf) return ret < 0 ? ret : strbuf_addf(buf, "\t%s", dwarf_diename(vr_die)); } +#if defined(HAVE_DWARF_GETLOCATIONS_SUPPORT) || defined(HAVE_DWARF_CFI_SUPPORT) +static int reg_from_dwarf_op(Dwarf_Op *op) +{ + switch (op->atom) { + case DW_OP_reg0 ... DW_OP_reg31: + return op->atom - DW_OP_reg0; + case DW_OP_breg0 ... DW_OP_breg31: + return op->atom - DW_OP_breg0; + case DW_OP_regx: + case DW_OP_bregx: + return op->number; + default: + break; + } + return -1; +} + +static int offset_from_dwarf_op(Dwarf_Op *op) +{ + switch (op->atom) { + case DW_OP_reg0 ... DW_OP_reg31: + case DW_OP_regx: + return 0; + case DW_OP_breg0 ... DW_OP_breg31: + return op->number; + case DW_OP_bregx: + return op->number2; + default: + break; + } + return -1; +} +#endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT || HAVE_DWARF_CFI_SUPPORT */ + #ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT /** * die_get_var_innermost_scope - Get innermost scope range of given variable DIE @@ -1476,41 +1510,69 @@ Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr, *offset = data.offset; return result; } -#endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ -#ifdef HAVE_DWARF_CFI_SUPPORT -static int reg_from_dwarf_op(Dwarf_Op *op) +static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg) { - switch (op->atom) { - case DW_OP_reg0 ... DW_OP_reg31: - return op->atom - DW_OP_reg0; - case DW_OP_breg0 ... DW_OP_breg31: - return op->atom - DW_OP_breg0; - case DW_OP_regx: - case DW_OP_bregx: - return op->number; - default: - break; - } - return -1; + struct die_var_type **var_types = arg; + Dwarf_Die type_die; + int tag = dwarf_tag(die_mem); + Dwarf_Attribute attr; + Dwarf_Addr base, start, end; + Dwarf_Op *ops; + size_t nops; + struct die_var_type *vt; + + if (tag != DW_TAG_variable && tag != DW_TAG_formal_parameter) + return DIE_FIND_CB_SIBLING; + + if (dwarf_attr(die_mem, DW_AT_location, &attr) == NULL) + return DIE_FIND_CB_SIBLING; + + /* + * Only collect the first location as it can reconstruct the + * remaining state by following the instructions. + * start = 0 means it covers the whole range. + */ + if (dwarf_getlocations(&attr, 0, &base, &start, &end, &ops, &nops) <= 0) + return DIE_FIND_CB_SIBLING; + + if (die_get_real_type(die_mem, &type_die) == NULL) + return DIE_FIND_CB_SIBLING; + + vt = malloc(sizeof(*vt)); + if (vt == NULL) + return DIE_FIND_CB_END; + + vt->die_off = dwarf_dieoffset(&type_die); + vt->addr = start; + vt->reg = reg_from_dwarf_op(ops); + vt->offset = offset_from_dwarf_op(ops); + vt->next = *var_types; + *var_types = vt; + + return DIE_FIND_CB_SIBLING; } -static int offset_from_dwarf_op(Dwarf_Op *op) +/** + * die_collect_vars - Save all variables and parameters + * @sc_die: a scope DIE + * @var_types: a pointer to save the resulting list + * + * Save all variables and parameters in the @sc_die and save them to @var_types. + * The @var_types is a singly-linked list containing type and location info. + * Actual type can be retrieved using dwarf_offdie() with 'die_off' later. + * + * Callers should free @var_types. + */ +void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types) { - switch (op->atom) { - case DW_OP_reg0 ... DW_OP_reg31: - case DW_OP_regx: - return 0; - case DW_OP_breg0 ... DW_OP_breg31: - return op->number; - case DW_OP_bregx: - return op->number2; - default: - break; - } - return -1; + Dwarf_Die die_mem; + + die_find_child(sc_die, __die_collect_vars_cb, (void *)var_types, &die_mem); } +#endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ +#ifdef HAVE_DWARF_CFI_SUPPORT /** * die_get_cfa - Get frame base information * @dwarf: a Dwarf info diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 9973801a20c1..cd171b06fd4c 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -135,6 +135,15 @@ void die_skip_prologue(Dwarf_Die *sp_die, Dwarf_Die *cu_die, /* Get the list of including scopes */ int die_get_scopes(Dwarf_Die *cu_die, Dwarf_Addr pc, Dwarf_Die **scopes); +/* Variable type information */ +struct die_var_type { + struct die_var_type *next; + u64 die_off; + u64 addr; + int reg; + int offset; +}; + #ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT /* Get byte offset range of given variable DIE */ @@ -149,6 +158,9 @@ Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg, Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr, Dwarf_Die *die_mem, int *offset); +/* Save all variables and parameters in this scope */ +void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types); + #else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, @@ -176,6 +188,11 @@ static inline Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die __maybe_unu return NULL; } +static inline void die_collect_vars(Dwarf_Die *sc_die __maybe_unused, + struct die_var_type **var_types __maybe_unused) +{ +} + #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ #ifdef HAVE_DWARF_CFI_SUPPORT -- cgit v1.2.3 From 437683a9941891c17059d99561eb3ce85e0f51fa Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:50:55 -0700 Subject: perf dwarf-aux: Handle type transfer for memory access We want to track type states as instructions are executed. Each instruction can access compound types like struct or union and load/ store its members to a different location. The die_deref_ptr_type() is to find a type of memory access with a pointer variable. If it points to a compound type like struct, the target memory is a member in the struct. The access will happen with an offset indicating which member it refers. Let's follow the DWARF info to figure out the type of the pointer target. For example, say we have the following code. struct foo { int a; int b; }; struct foo *p = malloc(sizeof(*p)); p->b = 0; The last pointer access should produce x86 asm like below: mov 0x0, 4(%rbx) And we know %rbx register has a pointer to struct foo. Then offset 4 should return the debug info of member 'b'. Also variables of compound types can be accessed directly without a pointer. The die_get_member_type() is to handle a such case. Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-4-namhyung@kernel.org [ Check if die_get_real_type() returned NULL ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 113 ++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/dwarf-aux.h | 6 +++ 2 files changed, 119 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 785aa7a3d725..09fd6f1f0ed8 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1838,3 +1838,116 @@ int die_get_scopes(Dwarf_Die *cu_die, Dwarf_Addr pc, Dwarf_Die **scopes) *scopes = data.scopes; return data.nr; } + +static int __die_find_member_offset_cb(Dwarf_Die *die_mem, void *arg) +{ + Dwarf_Die type_die; + Dwarf_Word size, loc; + Dwarf_Word offset = (long)arg; + int tag = dwarf_tag(die_mem); + + if (tag != DW_TAG_member) + return DIE_FIND_CB_SIBLING; + + /* Unions might not have location */ + if (die_get_data_member_location(die_mem, &loc) < 0) + loc = 0; + + if (offset == loc) + return DIE_FIND_CB_END; + + if (die_get_real_type(die_mem, &type_die) == NULL) { + // TODO: add a pr_debug_dtp() later for this unlikely failure + return DIE_FIND_CB_SIBLING; + } + + if (dwarf_aggregate_size(&type_die, &size) < 0) + size = 0; + + if (loc < offset && offset < (loc + size)) + return DIE_FIND_CB_END; + + return DIE_FIND_CB_SIBLING; +} + +/** + * die_get_member_type - Return type info of struct member + * @type_die: a type DIE + * @offset: offset in the type + * @die_mem: a buffer to save the resulting DIE + * + * This function returns a type of a member in @type_die where it's located at + * @offset if it's a struct. For now, it just returns the first matching + * member in a union. For other types, it'd return the given type directly + * if it's within the size of the type or NULL otherwise. + */ +Dwarf_Die *die_get_member_type(Dwarf_Die *type_die, int offset, + Dwarf_Die *die_mem) +{ + Dwarf_Die *member; + Dwarf_Die mb_type; + int tag; + + tag = dwarf_tag(type_die); + /* If it's not a compound type, return the type directly */ + if (tag != DW_TAG_structure_type && tag != DW_TAG_union_type) { + Dwarf_Word size; + + if (dwarf_aggregate_size(type_die, &size) < 0) + size = 0; + + if ((unsigned)offset >= size) + return NULL; + + *die_mem = *type_die; + return die_mem; + } + + mb_type = *type_die; + /* TODO: Handle union types better? */ + while (tag == DW_TAG_structure_type || tag == DW_TAG_union_type) { + member = die_find_child(&mb_type, __die_find_member_offset_cb, + (void *)(long)offset, die_mem); + if (member == NULL) + return NULL; + + if (die_get_real_type(member, &mb_type) == NULL) + return NULL; + + tag = dwarf_tag(&mb_type); + + if (tag == DW_TAG_structure_type || tag == DW_TAG_union_type) { + Dwarf_Word loc; + + /* Update offset for the start of the member struct */ + if (die_get_data_member_location(member, &loc) == 0) + offset -= loc; + } + } + *die_mem = mb_type; + return die_mem; +} + +/** + * die_deref_ptr_type - Return type info for pointer access + * @ptr_die: a pointer type DIE + * @offset: access offset for the pointer + * @die_mem: a buffer to save the resulting DIE + * + * This function follows the pointer in @ptr_die with given @offset + * and saves the resulting type in @die_mem. If the pointer points + * a struct type, actual member at the offset would be returned. + */ +Dwarf_Die *die_deref_ptr_type(Dwarf_Die *ptr_die, int offset, + Dwarf_Die *die_mem) +{ + Dwarf_Die type_die; + + if (dwarf_tag(ptr_die) != DW_TAG_pointer_type) + return NULL; + + if (die_get_real_type(ptr_die, &type_die) == NULL) + return NULL; + + return die_get_member_type(&type_die, offset, die_mem); +} diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index cd171b06fd4c..16c916311bc0 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -144,6 +144,12 @@ struct die_var_type { int offset; }; +/* Return type info of a member at offset */ +Dwarf_Die *die_get_member_type(Dwarf_Die *type_die, int offset, Dwarf_Die *die_mem); + +/* Return type info where the pointer and offset point to */ +Dwarf_Die *die_deref_ptr_type(Dwarf_Die *ptr_die, int offset, Dwarf_Die *die_mem); + #ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT /* Get byte offset range of given variable DIE */ -- cgit v1.2.3 From 7a838c2fd2ac81e10bedcd912153a74ca662b309 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:50:56 -0700 Subject: perf dwarf-aux: Add die_find_func_rettype() The die_find_func_rettype() is to find a debug entry for the given function name and sets the type information of the return value. By convention, it'd return the pointer to the type die (should be the same as the given mem_die argument) if found, or NULL otherwise. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 43 +++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/dwarf-aux.h | 4 ++++ 2 files changed, 47 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 09fd6f1f0ed8..d4ec239c5adb 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -696,6 +696,49 @@ Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, return die_mem; } +static int __die_find_func_rettype_cb(Dwarf_Die *die_mem, void *data) +{ + const char *func_name; + + if (dwarf_tag(die_mem) != DW_TAG_subprogram) + return DIE_FIND_CB_SIBLING; + + func_name = dwarf_diename(die_mem); + if (func_name && !strcmp(func_name, data)) + return DIE_FIND_CB_END; + + return DIE_FIND_CB_SIBLING; +} + +/** + * die_find_func_rettype - Search a return type of function + * @cu_die: a CU DIE + * @name: target function name + * @die_mem: a buffer for result DIE + * + * Search a non-inlined function which matches to @name and stores the + * return type of the function to @die_mem and returns it if found. + * Returns NULL if failed. Note that it doesn't needs to find a + * definition of the function, so it doesn't match with address. + * Most likely, it can find a declaration at the top level. Thus the + * callback function continues to sibling entries only. + */ +Dwarf_Die *die_find_func_rettype(Dwarf_Die *cu_die, const char *name, + Dwarf_Die *die_mem) +{ + Dwarf_Die tmp_die; + + cu_die = die_find_child(cu_die, __die_find_func_rettype_cb, + (void *)name, &tmp_die); + if (!cu_die) + return NULL; + + if (die_get_real_type(&tmp_die, die_mem) == NULL) + return NULL; + + return die_mem; +} + struct __instance_walk_param { void *addr; int (*callback)(Dwarf_Die *, void *); diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 16c916311bc0..b0f25fbf9668 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -94,6 +94,10 @@ Dwarf_Die *die_find_top_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, Dwarf_Die *die_mem); +/* Search a non-inlined function by name and returns its return type */ +Dwarf_Die *die_find_func_rettype(Dwarf_Die *sp_die, const char *name, + Dwarf_Die *die_mem); + /* Walk on the instances of given DIE */ int die_walk_instances(Dwarf_Die *in_die, int (*callback)(Dwarf_Die *, void *), void *data); -- cgit v1.2.3 From 52a09bc24c6a4deeeb8030476a7663b6696d73f0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:50:57 -0700 Subject: perf map: Add map__objdump_2rip() Sometimes we want to convert an address in objdump output to map-relative address to match with a sample data. Let's add map__objdump_2rip() for that. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 17 +++++++++++++++++ tools/perf/util/map.h | 3 +++ 2 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 14a5ea70d81e..a5d57c201a30 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -587,6 +587,23 @@ u64 map__objdump_2mem(struct map *map, u64 ip) return ip + map__reloc(map); } +/* convert objdump address to relative address. (To be removed) */ +u64 map__objdump_2rip(struct map *map, u64 ip) +{ + const struct dso *dso = map__dso(map); + + if (!dso->adjust_symbols) + return ip; + + if (dso->rel) + return ip + map__pgoff(map); + + if (dso->kernel == DSO_SPACE__USER) + return ip - dso->text_offset; + + return map__map_ip(map, ip + map__reloc(map)); +} + bool map__contains_symbol(const struct map *map, const struct symbol *sym) { u64 ip = map__unmap_ip(map, sym->start); diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 49756716cb13..65e2609fa1b1 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -132,6 +132,9 @@ u64 map__rip_2objdump(struct map *map, u64 rip); /* objdump address -> memory address */ u64 map__objdump_2mem(struct map *map, u64 ip); +/* objdump address -> rip */ +u64 map__objdump_2rip(struct map *map, u64 ip); + struct symbol; struct thread; -- cgit v1.2.3 From a3f4d5b57dd8fd2a749be261d7253616f15671b5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:50:58 -0700 Subject: perf annotate-data: Introduce 'struct data_loc_info' The find_data_type() needs many information to describe the location of the data. Add the new 'struct data_loc_info' to pass those information at once. No functional changes intended. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 83 ++++++++++++++++++++++------------------- tools/perf/util/annotate-data.h | 38 ++++++++++++++++--- tools/perf/util/annotate.c | 30 +++++++-------- 3 files changed, 91 insertions(+), 60 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 59ce5f4f4a40..ff81d164aa57 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -239,21 +239,28 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, } /* The result will be saved in @type_die */ -static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr, - const char *var_name, struct annotated_op_loc *loc, - Dwarf_Die *type_die) +static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) { + struct annotated_op_loc *loc = dloc->op; Dwarf_Die cu_die, var_die; Dwarf_Die *scopes = NULL; int reg, offset; int ret = -1; int i, nr_scopes; int fbreg = -1; - bool is_fbreg = false; int fb_offset = 0; + bool is_fbreg = false; + u64 pc; + + /* + * IP is a relative instruction address from the start of the map, as + * it can be randomized/relocated, it needs to translate to PC which is + * a file address for DWARF processing. + */ + pc = map__rip_2objdump(dloc->ms->map, dloc->ip); /* Get a compile_unit for this address */ - if (!find_cu_die(di, pc, &cu_die)) { + if (!find_cu_die(dloc->di, pc, &cu_die)) { pr_debug("cannot find CU for address %" PRIx64 "\n", pc); ann_data_stat.no_cuinfo++; return -1; @@ -263,18 +270,19 @@ static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr, offset = loc->offset; if (reg == DWARF_REG_PC) { - if (die_find_variable_by_addr(&cu_die, addr, &var_die, &offset)) { + if (die_find_variable_by_addr(&cu_die, dloc->var_addr, &var_die, + &offset)) { ret = check_variable(&var_die, type_die, offset, /*is_pointer=*/false); - loc->offset = offset; + dloc->type_offset = offset; goto out; } - if (var_name && die_find_variable_at(&cu_die, var_name, pc, - &var_die)) { - ret = check_variable(&var_die, type_die, 0, + if (dloc->var_name && + die_find_variable_at(&cu_die, dloc->var_name, pc, &var_die)) { + ret = check_variable(&var_die, type_die, dloc->type_offset, /*is_pointer=*/false); - /* loc->offset will be updated by the caller */ + /* dloc->type_offset was updated by the caller */ goto out; } } @@ -291,10 +299,11 @@ static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr, dwarf_formblock(&attr, &block) == 0 && block.length == 1) { switch (*block.data) { case DW_OP_reg0 ... DW_OP_reg31: - fbreg = *block.data - DW_OP_reg0; + fbreg = dloc->fbreg = *block.data - DW_OP_reg0; break; case DW_OP_call_frame_cfa: - if (die_get_cfa(di->dbg, pc, &fbreg, + dloc->fb_cfa = true; + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fb_offset) < 0) fbreg = -1; break; @@ -312,7 +321,7 @@ retry: /* Search from the inner-most scope to the outer */ for (i = nr_scopes - 1; i >= 0; i--) { if (reg == DWARF_REG_PC) { - if (!die_find_variable_by_addr(&scopes[i], addr, + if (!die_find_variable_by_addr(&scopes[i], dloc->var_addr, &var_die, &offset)) continue; } else { @@ -325,7 +334,7 @@ retry: /* Found a variable, see if it's correct */ ret = check_variable(&var_die, type_die, offset, reg != DWARF_REG_PC && !is_fbreg); - loc->offset = offset; + dloc->type_offset = offset; goto out; } @@ -344,50 +353,46 @@ out: /** * find_data_type - Return a data type at the location - * @ms: map and symbol at the location - * @ip: instruction address of the memory access - * @loc: instruction operand location - * @addr: data address of the memory access - * @var_name: global variable name + * @dloc: data location * * This functions searches the debug information of the binary to get the data - * type it accesses. The exact location is expressed by (@ip, reg, offset) - * for pointer variables or (@ip, @addr) for global variables. Note that global - * variables might update the @loc->offset after finding the start of the variable. - * If it cannot find a global variable by address, it tried to fine a declaration - * of the variable using @var_name. In that case, @loc->offset won't be updated. + * type it accesses. The exact location is expressed by (ip, reg, offset) + * for pointer variables or (ip, addr) for global variables. Note that global + * variables might update the @dloc->type_offset after finding the start of the + * variable. If it cannot find a global variable by address, it tried to find + * a declaration of the variable using var_name. In that case, @dloc->offset + * won't be updated. * * It return %NULL if not found. */ -struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, - struct annotated_op_loc *loc, u64 addr, - const char *var_name) +struct annotated_data_type *find_data_type(struct data_loc_info *dloc) { struct annotated_data_type *result = NULL; - struct dso *dso = map__dso(ms->map); - struct debuginfo *di; + struct dso *dso = map__dso(dloc->ms->map); Dwarf_Die type_die; - u64 pc; - di = debuginfo__new(dso->long_name); - if (di == NULL) { + dloc->di = debuginfo__new(dso->long_name); + if (dloc->di == NULL) { pr_debug("cannot get the debug info\n"); return NULL; } /* - * IP is a relative instruction address from the start of the map, as - * it can be randomized/relocated, it needs to translate to PC which is - * a file address for DWARF processing. + * The type offset is the same as instruction offset by default. + * But when finding a global variable, the offset won't be valid. */ - pc = map__rip_2objdump(ms->map, ip); - if (find_data_type_die(di, pc, addr, var_name, loc, &type_die) < 0) + if (dloc->var_name == NULL) + dloc->type_offset = dloc->op->offset; + + dloc->fbreg = -1; + + if (find_data_type_die(dloc, &type_die) < 0) goto out; result = dso__findnew_data_type(dso, &type_die); out: - debuginfo__delete(di); + debuginfo__delete(dloc->di); return result; } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 1b0db8e8c40e..ad6493ea2c8e 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -8,6 +8,7 @@ #include struct annotated_op_loc; +struct debuginfo; struct evsel; struct map_symbol; @@ -72,6 +73,35 @@ struct annotated_data_type { extern struct annotated_data_type unknown_type; extern struct annotated_data_type stackop_type; +/** + * struct data_loc_info - Data location information + * @ms: Map and Symbol info + * @ip: Instruction address + * @var_addr: Data address (for global variables) + * @var_name: Variable name (for global variables) + * @op: Instruction operand location (regs and offset) + * @di: Debug info + * @fbreg: Frame base register + * @fb_cfa: Whether the frame needs to check CFA + * @type_offset: Final offset in the type + */ +struct data_loc_info { + /* These are input field, should be filled by caller */ + struct map_symbol *ms; + u64 ip; + u64 var_addr; + const char *var_name; + struct annotated_op_loc *op; + + /* These are used internally */ + struct debuginfo *di; + int fbreg; + bool fb_cfa; + + /* This is for the result */ + int type_offset; +}; + /** * struct annotated_data_stat - Debug statistics * @total: Total number of entry @@ -106,9 +136,7 @@ extern struct annotated_data_stat ann_data_stat; #ifdef HAVE_DWARF_SUPPORT /* Returns data type at the location (ip, reg, offset) */ -struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, - struct annotated_op_loc *loc, u64 addr, - const char *var_name); +struct annotated_data_type *find_data_type(struct data_loc_info *dloc); /* Update type access histogram at the given offset */ int annotated_data_type__update_samples(struct annotated_data_type *adt, @@ -121,9 +149,7 @@ void annotated_data_type__tree_delete(struct rb_root *root); #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * -find_data_type(struct map_symbol *ms __maybe_unused, u64 ip __maybe_unused, - struct annotated_op_loc *loc __maybe_unused, - u64 addr __maybe_unused, const char *var_name __maybe_unused) +find_data_type(struct data_loc_info *dloc __maybe_unused) { return NULL; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ac002d907d81..a15ff6758210 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3816,9 +3816,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) struct annotated_op_loc *op_loc; struct annotated_data_type *mem_type; struct annotated_item_stat *istat; - u64 ip = he->ip, addr = 0; - const char *var_name = NULL; - int var_offset; + u64 ip = he->ip; int i; ann_data_stat.total++; @@ -3871,51 +3869,53 @@ retry: } for_each_insn_op_loc(&loc, i, op_loc) { + struct data_loc_info dloc = { + .ms = ms, + /* Recalculate IP for LOCK prefix or insn fusion */ + .ip = ms->sym->start + dl->al.offset, + .op = op_loc, + }; + if (!op_loc->mem_ref) continue; /* Recalculate IP because of LOCK prefix or insn fusion */ ip = ms->sym->start + dl->al.offset; - var_offset = op_loc->offset; - /* PC-relative addressing */ if (op_loc->reg1 == DWARF_REG_PC) { struct addr_location al; struct symbol *var; u64 map_addr; - addr = annotate_calc_pcrel(ms, ip, op_loc->offset, dl); + dloc.var_addr = annotate_calc_pcrel(ms, ip, op_loc->offset, dl); /* Kernel symbols might be relocated */ - map_addr = addr + map__reloc(ms->map); + map_addr = dloc.var_addr + map__reloc(ms->map); addr_location__init(&al); var = thread__find_symbol_fb(he->thread, he->cpumode, map_addr, &al); if (var) { - var_name = var->name; + dloc.var_name = var->name; /* Calculate type offset from the start of variable */ - var_offset = map_addr - map__unmap_ip(al.map, var->start); + dloc.type_offset = map_addr - map__unmap_ip(al.map, var->start); } addr_location__exit(&al); } - mem_type = find_data_type(ms, ip, op_loc, addr, var_name); + mem_type = find_data_type(&dloc); if (mem_type) istat->good++; else istat->bad++; - if (mem_type && var_name) - op_loc->offset = var_offset; - if (symbol_conf.annotate_data_sample) { annotated_data_type__update_samples(mem_type, evsel, - op_loc->offset, + dloc.type_offset, he->stat.nr_events, he->stat.period); } - he->mem_type_off = op_loc->offset; + he->mem_type_off = dloc.type_offset; return mem_type; } -- cgit v1.2.3 From 5cdd3fd7995a7a07b1654ea37e0fcc3c64f0cc44 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:50:59 -0700 Subject: perf annotate: Add annotate_get_basic_blocks() The annotate_get_basic_blocks() is to find a list of basic blocks from the source instruction to the destination instruction in a function. It'll be used to find variables in a scope. Use BFS (Breadth First Search) to find a shortest path to carry the variable/register state minimally. Also change find_disasm_line() to be used in annotate_get_basic_blocks() and add 'allow_update' argument to control if it can update the IP. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 222 ++++++++++++++++++++++++++++++++++++++++++++- tools/perf/util/annotate.h | 16 ++++ 2 files changed, 235 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index a15ff6758210..aa005c13ff67 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3714,7 +3714,8 @@ static void symbol__ensure_annotate(struct map_symbol *ms, struct evsel *evsel) } } -static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip) +static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip, + bool allow_update) { struct disasm_line *dl; struct annotation *notes; @@ -3727,7 +3728,8 @@ static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip) * llvm-objdump places "lock" in a separate line and * in that case, we want to get the next line. */ - if (!strcmp(dl->ins.name, "lock") && *dl->ops.raw == '\0') { + if (!strcmp(dl->ins.name, "lock") && + *dl->ops.raw == '\0' && allow_update) { ip++; continue; } @@ -3843,7 +3845,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) * Get a disasm to extract the location from the insn. * This is too slow... */ - dl = find_disasm_line(ms->sym, ip); + dl = find_disasm_line(ms->sym, ip, /*allow_update=*/true); if (dl == NULL) { ann_data_stat.no_insn++; return NULL; @@ -3937,3 +3939,217 @@ retry: istat->bad++; return NULL; } + +/* Basic block traversal (BFS) data structure */ +struct basic_block_data { + struct list_head queue; + struct list_head visited; +}; + +/* + * During the traversal, it needs to know the parent block where the current + * block block started from. Note that single basic block can be parent of + * two child basic blocks (in case of condition jump). + */ +struct basic_block_link { + struct list_head node; + struct basic_block_link *parent; + struct annotated_basic_block *bb; +}; + +/* Check any of basic block in the list already has the offset */ +static bool basic_block_has_offset(struct list_head *head, s64 offset) +{ + struct basic_block_link *link; + + list_for_each_entry(link, head, node) { + s64 begin_offset = link->bb->begin->al.offset; + s64 end_offset = link->bb->end->al.offset; + + if (begin_offset <= offset && offset <= end_offset) + return true; + } + return false; +} + +static bool is_new_basic_block(struct basic_block_data *bb_data, + struct disasm_line *dl) +{ + s64 offset = dl->al.offset; + + if (basic_block_has_offset(&bb_data->visited, offset)) + return false; + if (basic_block_has_offset(&bb_data->queue, offset)) + return false; + return true; +} + +/* Add a basic block starting from dl and link it to the parent */ +static int add_basic_block(struct basic_block_data *bb_data, + struct basic_block_link *parent, + struct disasm_line *dl) +{ + struct annotated_basic_block *bb; + struct basic_block_link *link; + + if (dl == NULL) + return -1; + + if (!is_new_basic_block(bb_data, dl)) + return 0; + + bb = zalloc(sizeof(*bb)); + if (bb == NULL) + return -1; + + bb->begin = dl; + bb->end = dl; + INIT_LIST_HEAD(&bb->list); + + link = malloc(sizeof(*link)); + if (link == NULL) { + free(bb); + return -1; + } + + link->bb = bb; + link->parent = parent; + list_add_tail(&link->node, &bb_data->queue); + return 0; +} + +/* Returns true when it finds the target in the current basic block */ +static bool process_basic_block(struct basic_block_data *bb_data, + struct basic_block_link *link, + struct symbol *sym, u64 target) +{ + struct disasm_line *dl, *next_dl, *last_dl; + struct annotation *notes = symbol__annotation(sym); + bool found = false; + + dl = link->bb->begin; + /* Check if it's already visited */ + if (basic_block_has_offset(&bb_data->visited, dl->al.offset)) + return false; + + last_dl = list_last_entry(¬es->src->source, + struct disasm_line, al.node); + + list_for_each_entry_from(dl, ¬es->src->source, al.node) { + /* Found the target instruction */ + if (sym->start + dl->al.offset == target) { + found = true; + break; + } + /* End of the function, finish the block */ + if (dl == last_dl) + break; + /* 'return' instruction finishes the block */ + if (dl->ins.ops == &ret_ops) + break; + /* normal instructions are part of the basic block */ + if (dl->ins.ops != &jump_ops) + continue; + /* jump to a different function, tail call or return */ + if (dl->ops.target.outside) + break; + /* jump instruction creates new basic block(s) */ + next_dl = find_disasm_line(sym, sym->start + dl->ops.target.offset, + /*allow_update=*/false); + add_basic_block(bb_data, link, next_dl); + + /* + * FIXME: determine conditional jumps properly. + * Conditional jumps create another basic block with the + * next disasm line. + */ + if (!strstr(dl->ins.name, "jmp")) { + next_dl = list_next_entry(dl, al.node); + add_basic_block(bb_data, link, next_dl); + } + break; + + } + link->bb->end = dl; + return found; +} + +/* + * It founds a target basic block, build a proper linked list of basic blocks + * by following the link recursively. + */ +static void link_found_basic_blocks(struct basic_block_link *link, + struct list_head *head) +{ + while (link) { + struct basic_block_link *parent = link->parent; + + list_move(&link->bb->list, head); + list_del(&link->node); + free(link); + + link = parent; + } +} + +static void delete_basic_blocks(struct basic_block_data *bb_data) +{ + struct basic_block_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &bb_data->queue, node) { + list_del(&link->node); + free(link->bb); + free(link); + } + + list_for_each_entry_safe(link, tmp, &bb_data->visited, node) { + list_del(&link->node); + free(link->bb); + free(link); + } +} + +/** + * annotate_get_basic_blocks - Get basic blocks for given address range + * @sym: symbol to annotate + * @src: source address + * @dst: destination address + * @head: list head to save basic blocks + * + * This function traverses disasm_lines from @src to @dst and save them in a + * list of annotated_basic_block to @head. It uses BFS to find the shortest + * path between two. The basic_block_link is to maintain parent links so + * that it can build a list of blocks from the start. + */ +int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, + struct list_head *head) +{ + struct basic_block_data bb_data = { + .queue = LIST_HEAD_INIT(bb_data.queue), + .visited = LIST_HEAD_INIT(bb_data.visited), + }; + struct basic_block_link *link; + struct disasm_line *dl; + int ret = -1; + + dl = find_disasm_line(sym, src, /*allow_update=*/false); + if (dl == NULL) + return -1; + + if (add_basic_block(&bb_data, /*parent=*/NULL, dl) < 0) + return -1; + + /* Find shortest path from src to dst using BFS */ + while (!list_empty(&bb_data.queue)) { + link = list_first_entry(&bb_data.queue, struct basic_block_link, node); + + if (process_basic_block(&bb_data, link, sym, dst)) { + link_found_basic_blocks(link, head); + ret = 0; + break; + } + list_move(&link->node, &bb_data.visited); + } + delete_basic_blocks(&bb_data); + return ret; +} diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 13cc659e508c..0928663fddee 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -561,4 +561,20 @@ extern struct list_head ann_insn_stat; u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, struct disasm_line *dl); +/** + * struct annotated_basic_block - Basic block of instructions + * @list: List node + * @begin: start instruction in the block + * @end: end instruction in the block + */ +struct annotated_basic_block { + struct list_head list; + struct disasm_line *begin; + struct disasm_line *end; +}; + +/* Get a list of basic blocks from src to dst addresses */ +int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, + struct list_head *head); + #endif /* __PERF_ANNOTATE_H */ -- cgit v1.2.3 From 90429524f3e7342e98fd3a14deaa46f1dd3f6ffe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:00 -0700 Subject: perf annotate-data: Add debug messages Add a new debug option "type-profile" to enable the detailed info during the type analysis especially for instruction tracking. You can use this before the command name like 'report' or 'annotate'. $ perf --debug type-profile annotate --data-type Committer testing: First get some memory events: $ perf mem record ls Then, without data-type profiling debug: $ perf annotate --data-type | head Annotate type: 'struct rtld_global' in /usr/lib64/ld-linux-x86-64.so.2 (1 samples): ============================================================================ samples offset size field 1 0 4336 struct rtld_global { 0 0 0 struct link_namespaces* _dl_ns; 0 2560 8 size_t _dl_nns; 0 2568 40 __rtld_lock_recursive_t _dl_load_lock { 0 2568 40 pthread_mutex_t mutex { 0 2568 40 struct __pthread_mutex_s __data { 0 2568 4 int __lock; $ And with only data-type profiling: $ perf --debug type-profile annotate --data-type | head ----------------------------------------------------------- find_data_type_die [1e67] for reg13873052 (PC) offset=0x150e2 in dl_main CU die offset: 0x29cd3 found PC-rel by addr=0x34020 offset=0x20 ----------------------------------------------------------- find_data_type_die [2e] for reg12 offset=0 in __GI___readdir64 CU die offset: 0x137a45 frame base: cfa=1 fbreg=-1 found "__futex" in scope=2/2 (die: 0x137ad5) 0(reg12) type=int (die:2a) ----------------------------------------------------------- find_data_type_die [52] for reg5 offset=0 in __memmove_avx_unaligned_erms CU die offset: 0x1124ed no variable found Annotate type: 'struct rtld_global' in /usr/lib64/ld-linux-x86-64.so.2 (1 samples): ============================================================================ samples offset size field 1 0 4336 struct rtld_global { 0 0 0 struct link_namespaces* _dl_ns; 0 2560 8 size_t _dl_nns; 0 2568 40 __rtld_lock_recursive_t _dl_load_lock { 0 2568 40 pthread_mutex_t mutex { 0 2568 40 struct __pthread_mutex_s __data { 0 2568 4 int __lock; $ Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 74 +++++++++++++++++++++++++++++++++++++---- tools/perf/util/debug.c | 3 ++ tools/perf/util/debug.h | 1 + 3 files changed, 71 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index ff81d164aa57..f482ccfdaa91 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -23,6 +23,29 @@ #include "symbol.h" #include "symbol_conf.h" +#define pr_debug_dtp(fmt, ...) \ +do { \ + if (debug_type_profile) \ + pr_info(fmt, ##__VA_ARGS__); \ + else \ + pr_debug3(fmt, ##__VA_ARGS__); \ +} while (0) + +static void pr_debug_type_name(Dwarf_Die *die) +{ + struct strbuf sb; + char *str; + + if (!debug_type_profile && verbose < 3) + return; + + strbuf_init(&sb, 32); + die_get_typename_from_type(die, &sb); + str = strbuf_detach(&sb, NULL); + pr_info(" type=%s (die:%lx)\n", str, (long)dwarf_dieoffset(die)); + free(str); +} + /* * Compare type name and size to maintain them in a tree. * I'm not sure if DWARF would have information of a single type in many @@ -201,7 +224,7 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, /* Get the type of the variable */ if (die_get_real_type(var_die, type_die) == NULL) { - pr_debug("variable has no type\n"); + pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; return -1; } @@ -215,7 +238,7 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, if ((dwarf_tag(type_die) != DW_TAG_pointer_type && dwarf_tag(type_die) != DW_TAG_array_type) || die_get_real_type(type_die, type_die) == NULL) { - pr_debug("no pointer or no type\n"); + pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; return -1; } @@ -223,14 +246,15 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, /* Get the size of the actual type */ if (dwarf_aggregate_size(type_die, &size) < 0) { - pr_debug("type size is unknown\n"); + pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; return -1; } /* Minimal sanity check */ if ((unsigned)offset >= size) { - pr_debug("offset: %d is bigger than size: %" PRIu64 "\n", offset, size); + pr_debug_dtp("offset: %d is bigger than size: %"PRIu64"\n", + offset, size); ann_data_stat.bad_offset++; return -1; } @@ -251,6 +275,19 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) int fb_offset = 0; bool is_fbreg = false; u64 pc; + char buf[64]; + + if (dloc->op->multi_regs) + snprintf(buf, sizeof(buf), " or reg%d", dloc->op->reg2); + else if (dloc->op->reg1 == DWARF_REG_PC) + snprintf(buf, sizeof(buf), " (PC)"); + else + buf[0] = '\0'; + + pr_debug_dtp("-----------------------------------------------------------\n"); + pr_debug_dtp("%s [%"PRIx64"] for reg%d%s offset=%#x in %s\n", + __func__, dloc->ip - dloc->ms->sym->start, + dloc->op->reg1, buf, dloc->op->offset, dloc->ms->sym->name); /* * IP is a relative instruction address from the start of the map, as @@ -261,7 +298,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) /* Get a compile_unit for this address */ if (!find_cu_die(dloc->di, pc, &cu_die)) { - pr_debug("cannot find CU for address %" PRIx64 "\n", pc); + pr_debug_dtp("cannot find CU for address %"PRIx64"\n", pc); ann_data_stat.no_cuinfo++; return -1; } @@ -269,12 +306,17 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) reg = loc->reg1; offset = loc->offset; + pr_debug_dtp("CU die offset: %#lx\n", (long)dwarf_dieoffset(&cu_die)); + if (reg == DWARF_REG_PC) { if (die_find_variable_by_addr(&cu_die, dloc->var_addr, &var_die, &offset)) { ret = check_variable(&var_die, type_die, offset, /*is_pointer=*/false); dloc->type_offset = offset; + + pr_debug_dtp("found PC-rel by addr=%#"PRIx64" offset=%#x\n", + dloc->var_addr, offset); goto out; } @@ -310,6 +352,9 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) default: break; } + + pr_debug_dtp("frame base: cfa=%d fbreg=%d\n", + dloc->fb_cfa, fbreg); } } @@ -334,6 +379,19 @@ retry: /* Found a variable, see if it's correct */ ret = check_variable(&var_die, type_die, offset, reg != DWARF_REG_PC && !is_fbreg); + if (ret == 0) { + pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", + dwarf_diename(&var_die), i+1, nr_scopes, + (long)dwarf_dieoffset(&scopes[i])); + if (reg == DWARF_REG_PC) + pr_debug_dtp("%#x(PC) offset=%#x", loc->offset, offset); + else if (reg == DWARF_REG_FB || is_fbreg) + pr_debug_dtp("%#x(reg%d) stack fb_offset=%#x offset=%#x", + loc->offset, reg, fb_offset, offset); + else + pr_debug_dtp("%#x(reg%d)", loc->offset, reg); + pr_debug_type_name(type_die); + } dloc->type_offset = offset; goto out; } @@ -343,8 +401,10 @@ retry: goto retry; } - if (ret < 0) + if (ret < 0) { + pr_debug_dtp("no variable found\n"); ann_data_stat.no_var++; + } out: free(scopes); @@ -373,7 +433,7 @@ struct annotated_data_type *find_data_type(struct data_loc_info *dloc) dloc->di = debuginfo__new(dso->long_name); if (dloc->di == NULL) { - pr_debug("cannot get the debug info\n"); + pr_debug_dtp("cannot get the debug info\n"); return NULL; } diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index c39ee0fcb8cf..d633d15329fa 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -41,6 +41,7 @@ static int redirect_to_stderr; int debug_data_convert; static FILE *_debug_file; bool debug_display_time; +int debug_type_profile; FILE *debug_file(void) { @@ -231,6 +232,7 @@ static struct sublevel_option debug_opts[] = { { .name = "data-convert", .value_ptr = &debug_data_convert }, { .name = "perf-event-open", .value_ptr = &debug_peo_args }, { .name = "kmaps", .value_ptr = &debug_kmaps }, + { .name = "type-profile", .value_ptr = &debug_type_profile }, { .name = NULL, } }; @@ -270,6 +272,7 @@ int perf_quiet_option(void) redirect_to_stderr = 0; debug_peo_args = 0; debug_kmaps = 0; + debug_type_profile = 0; return 0; } diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 35a7a5ae762e..a4026d1fd6a3 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -14,6 +14,7 @@ extern int debug_peo_args; extern bool quiet, dump_trace; extern int debug_ordered_events; extern int debug_data_convert; +extern int debug_type_profile; #ifndef pr_fmt #define pr_fmt(fmt) fmt -- cgit v1.2.3 From 06b2ce75386df04b7aea53818ed3c42ee50ec426 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:01 -0700 Subject: perf annotate-data: Maintain variable type info As it collected basic block and variable information in each scope, it now can build a state table to find matching variable at the location. The struct type_state is to keep the type info saved in each register and stack slot. The update_var_state() updates the table when it finds variables in the current address. It expects die_collect_vars() filled a list of variables with type info and starting address. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-10-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 173 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/dwarf-aux.c | 4 + 2 files changed, 177 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index f482ccfdaa91..8eaa06f1cee5 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -46,6 +46,62 @@ static void pr_debug_type_name(Dwarf_Die *die) free(str); } +/* Type information in a register, valid when ok is true */ +struct type_state_reg { + Dwarf_Die type; + bool ok; +}; + +/* Type information in a stack location, dynamically allocated */ +struct type_state_stack { + struct list_head list; + Dwarf_Die type; + int offset; + int size; + bool compound; +}; + +/* FIXME: This should be arch-dependent */ +#define TYPE_STATE_MAX_REGS 16 + +/* + * State table to maintain type info in each register and stack location. + * It'll be updated when new variable is allocated or type info is moved + * to a new location (register or stack). As it'd be used with the + * shortest path of basic blocks, it only maintains a single table. + */ +struct type_state { + struct type_state_reg regs[TYPE_STATE_MAX_REGS]; + struct list_head stack_vars; +}; + +static bool has_reg_type(struct type_state *state, int reg) +{ + return (unsigned)reg < ARRAY_SIZE(state->regs); +} + +/* These declarations will be remove once they are changed to static */ +void init_type_state(struct type_state *state, struct arch *arch __maybe_unused); +void exit_type_state(struct type_state *state); +void update_var_state(struct type_state *state, struct data_loc_info *dloc, + u64 addr, u64 insn_offset, struct die_var_type *var_types); + +void init_type_state(struct type_state *state, struct arch *arch __maybe_unused) +{ + memset(state, 0, sizeof(*state)); + INIT_LIST_HEAD(&state->stack_vars); +} + +void exit_type_state(struct type_state *state) +{ + struct type_state_stack *stack, *tmp; + + list_for_each_entry_safe(stack, tmp, &state->stack_vars, list) { + list_del(&stack->list); + free(stack); + } +} + /* * Compare type name and size to maintain them in a tree. * I'm not sure if DWARF would have information of a single type in many @@ -262,6 +318,123 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, return 0; } +static struct type_state_stack *find_stack_state(struct type_state *state, + int offset) +{ + struct type_state_stack *stack; + + list_for_each_entry(stack, &state->stack_vars, list) { + if (offset == stack->offset) + return stack; + + if (stack->compound && stack->offset < offset && + offset < stack->offset + stack->size) + return stack; + } + return NULL; +} + +static void set_stack_state(struct type_state_stack *stack, int offset, + Dwarf_Die *type_die) +{ + int tag; + Dwarf_Word size; + + if (dwarf_aggregate_size(type_die, &size) < 0) + size = 0; + + tag = dwarf_tag(type_die); + + stack->type = *type_die; + stack->size = size; + stack->offset = offset; + + switch (tag) { + case DW_TAG_structure_type: + case DW_TAG_union_type: + stack->compound = true; + break; + default: + stack->compound = false; + break; + } +} + +static struct type_state_stack *findnew_stack_state(struct type_state *state, + int offset, Dwarf_Die *type_die) +{ + struct type_state_stack *stack = find_stack_state(state, offset); + + if (stack) { + set_stack_state(stack, offset, type_die); + return stack; + } + + stack = malloc(sizeof(*stack)); + if (stack) { + set_stack_state(stack, offset, type_die); + list_add(&stack->list, &state->stack_vars); + } + return stack; +} + +/** + * update_var_state - Update type state using given variables + * @state: type state table + * @dloc: data location info + * @addr: instruction address to match with variable + * @insn_offset: instruction offset (for debug) + * @var_types: list of variables with type info + * + * This function fills the @state table using @var_types info. Each variable + * is used only at the given location and updates an entry in the table. + */ +void update_var_state(struct type_state *state, struct data_loc_info *dloc, + u64 addr, u64 insn_offset, struct die_var_type *var_types) +{ + Dwarf_Die mem_die; + struct die_var_type *var; + int fbreg = dloc->fbreg; + int fb_offset = 0; + + if (dloc->fb_cfa) { + if (die_get_cfa(dloc->di->dbg, addr, &fbreg, &fb_offset) < 0) + fbreg = -1; + } + + for (var = var_types; var != NULL; var = var->next) { + if (var->addr != addr) + continue; + /* Get the type DIE using the offset */ + if (!dwarf_offdie(dloc->di->dbg, var->die_off, &mem_die)) + continue; + + if (var->reg == DWARF_REG_FB) { + findnew_stack_state(state, var->offset, &mem_die); + + pr_debug_dtp("var [%"PRIx64"] -%#x(stack) type=", + insn_offset, -var->offset); + pr_debug_type_name(&mem_die); + } else if (var->reg == fbreg) { + findnew_stack_state(state, var->offset - fb_offset, &mem_die); + + pr_debug_dtp("var [%"PRIx64"] -%#x(stack) type=", + insn_offset, -var->offset + fb_offset); + pr_debug_type_name(&mem_die); + } else if (has_reg_type(state, var->reg) && var->offset == 0) { + struct type_state_reg *reg; + + reg = &state->regs[var->reg]; + reg->type = mem_die; + reg->ok = true; + + pr_debug_dtp("var [%"PRIx64"] reg%d type=", + insn_offset, var->reg); + pr_debug_type_name(&mem_die); + } + } +} + /* The result will be saved in @type_die */ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) { diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index d4ec239c5adb..7dad99ee3ff3 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -9,6 +9,7 @@ #include #include "debug.h" #include "dwarf-aux.h" +#include "dwarf-regs.h" #include "strbuf.h" #include "string2.h" @@ -1190,6 +1191,8 @@ static int reg_from_dwarf_op(Dwarf_Op *op) case DW_OP_regx: case DW_OP_bregx: return op->number; + case DW_OP_fbreg: + return DWARF_REG_FB; default: break; } @@ -1203,6 +1206,7 @@ static int offset_from_dwarf_op(Dwarf_Op *op) case DW_OP_regx: return 0; case DW_OP_breg0 ... DW_OP_breg31: + case DW_OP_fbreg: return op->number; case DW_OP_bregx: return op->number2; -- cgit v1.2.3 From 4f903455befa257c50422a4570c4dca0020a1fc8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:02 -0700 Subject: perf annotate-data: Add update_insn_state() The update_insn_state() function is to update the type state table after processing each instruction. For now, it handles MOV (on x86) insn to transfer type info from the source location to the target. The location can be a register or a stack slot. Check carefully when memory reference happens and fetch the type correctly. It basically ignores write to a memory since it doesn't change the type info. One exception is writes to (new) stack slots for register spilling. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-11-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 161 +++++++++++++++++++++++++++++++++++++++- tools/perf/util/annotate-data.h | 2 + tools/perf/util/annotate.c | 1 + 3 files changed, 161 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 8eaa06f1cee5..592437b6c097 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -71,7 +71,9 @@ struct type_state_stack { * shortest path of basic blocks, it only maintains a single table. */ struct type_state { + /* state of general purpose registers */ struct type_state_reg regs[TYPE_STATE_MAX_REGS]; + /* state of stack location */ struct list_head stack_vars; }; @@ -85,6 +87,8 @@ void init_type_state(struct type_state *state, struct arch *arch __maybe_unused) void exit_type_state(struct type_state *state); void update_var_state(struct type_state *state, struct data_loc_info *dloc, u64 addr, u64 insn_offset, struct die_var_type *var_types); +void update_insn_state(struct type_state *state, struct data_loc_info *dloc, + struct disasm_line *dl); void init_type_state(struct type_state *state, struct arch *arch __maybe_unused) { @@ -412,13 +416,13 @@ void update_var_state(struct type_state *state, struct data_loc_info *dloc, if (var->reg == DWARF_REG_FB) { findnew_stack_state(state, var->offset, &mem_die); - pr_debug_dtp("var [%"PRIx64"] -%#x(stack) type=", + pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", insn_offset, -var->offset); pr_debug_type_name(&mem_die); } else if (var->reg == fbreg) { findnew_stack_state(state, var->offset - fb_offset, &mem_die); - pr_debug_dtp("var [%"PRIx64"] -%#x(stack) type=", + pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", insn_offset, -var->offset + fb_offset); pr_debug_type_name(&mem_die); } else if (has_reg_type(state, var->reg) && var->offset == 0) { @@ -428,13 +432,164 @@ void update_var_state(struct type_state *state, struct data_loc_info *dloc, reg->type = mem_die; reg->ok = true; - pr_debug_dtp("var [%"PRIx64"] reg%d type=", + pr_debug_dtp("var [%"PRIx64"] reg%d", insn_offset, var->reg); pr_debug_type_name(&mem_die); } } } +static void update_insn_state_x86(struct type_state *state, + struct data_loc_info *dloc, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + Dwarf_Die type_die; + u32 insn_offset = dl->al.offset; + int fbreg = dloc->fbreg; + int fboff = 0; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + if (strncmp(dl->ins.name, "mov", 3)) + return; + + if (dloc->fb_cfa) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 pc = map__rip_2objdump(dloc->ms->map, ip); + + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) + fbreg = -1; + } + + /* Case 1. register to register transfers */ + if (!src->mem_ref && !dst->mem_ref) { + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->ok = true; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type); + } + /* Case 2. memory to register transers */ + if (src->mem_ref && !dst->mem_ref) { + int sreg = src->reg1; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + +retry: + /* Check stack variables with offset */ + if (sreg == fbreg) { + struct type_state_stack *stack; + int offset = src->offset - fboff; + + stack = find_stack_state(state, offset); + if (stack == NULL) { + tsr->ok = false; + return; + } else if (!stack->compound) { + tsr->type = stack->type; + tsr->ok = true; + } else if (die_get_member_type(&stack->type, + offset - stack->offset, + &type_die)) { + tsr->type = type_die; + tsr->ok = true; + } else { + tsr->ok = false; + return; + } + + pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", + insn_offset, -offset, dst->reg1); + pr_debug_type_name(&tsr->type); + } + /* And then dereference the pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + die_deref_ptr_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->ok = true; + + pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type); + } + /* Or try another register if any */ + else if (src->multi_regs && sreg == src->reg1 && + src->reg1 != src->reg2) { + sreg = src->reg2; + goto retry; + } + /* It failed to get a type info, mark it as invalid */ + else { + tsr->ok = false; + } + } + /* Case 3. register to memory transfers */ + if (!src->mem_ref && dst->mem_ref) { + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) + return; + + /* Check stack variables with offset */ + if (dst->reg1 == fbreg) { + struct type_state_stack *stack; + int offset = dst->offset - fboff; + + stack = find_stack_state(state, offset); + if (stack) { + /* + * The source register is likely to hold a type + * of member if it's a compound type. Do not + * update the stack variable type since we can + * get the member type later by using the + * die_get_member_type(). + */ + if (!stack->compound) + set_stack_state(stack, offset, + &state->regs[src->reg1].type); + } else { + findnew_stack_state(state, offset, + &state->regs[src->reg1].type); + } + + pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", + insn_offset, src->reg1, -offset); + pr_debug_type_name(&state->regs[src->reg1].type); + } + /* + * Ignore other transfers since it'd set a value in a struct + * and won't change the type. + */ + } + /* Case 4. memory to memory transfers (not handled for now) */ +} + +void update_insn_state(struct type_state *state, struct data_loc_info *dloc, + struct disasm_line *dl) +{ + if (arch__is(dloc->arch, "x86")) + update_insn_state_x86(state, dloc, dl); +} + /* The result will be saved in @type_die */ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) { diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index ad6493ea2c8e..7324cafe2c7b 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -75,6 +75,7 @@ extern struct annotated_data_type stackop_type; /** * struct data_loc_info - Data location information + * @arch: CPU architecture info * @ms: Map and Symbol info * @ip: Instruction address * @var_addr: Data address (for global variables) @@ -87,6 +88,7 @@ extern struct annotated_data_type stackop_type; */ struct data_loc_info { /* These are input field, should be filled by caller */ + struct arch *arch; struct map_symbol *ms; u64 ip; u64 var_addr; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index aa005c13ff67..9777df5dc2e3 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3872,6 +3872,7 @@ retry: for_each_insn_op_loc(&loc, i, op_loc) { struct data_loc_info dloc = { + .arch = arch, .ms = ms, /* Recalculate IP for LOCK prefix or insn fusion */ .ip = ms->sym->start + dl->al.offset, -- cgit v1.2.3 From 1ebb5e17ef21b492ee60654d4e22cbfb3763661f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:03 -0700 Subject: perf annotate-data: Add get_global_var_type() Accessing global variable is common when it tracks execution later. Factor out the common code into a function for later use. It adds thread and cpumode to struct data_loc_info to find (global) symbols if needed. Also remove var_name as it's retrieved in the helper function. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-12-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 62 +++++++++++++++++++++++++++++++---------- tools/perf/util/annotate-data.h | 7 +++-- tools/perf/util/annotate.c | 21 +++----------- 3 files changed, 57 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 592437b6c097..3b661e693410 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -22,6 +22,7 @@ #include "strbuf.h" #include "symbol.h" #include "symbol_conf.h" +#include "thread.h" #define pr_debug_dtp(fmt, ...) \ do { \ @@ -382,6 +383,50 @@ static struct type_state_stack *findnew_stack_state(struct type_state *state, return stack; } +static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, + u64 ip, u64 var_addr, int *var_offset, + Dwarf_Die *type_die) +{ + u64 pc, mem_addr; + int offset; + bool is_pointer = false; + const char *var_name = NULL; + Dwarf_Die var_die; + struct addr_location al; + struct symbol *sym; + + /* Try to get the variable by address first */ + if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) && + check_variable(&var_die, type_die, offset, is_pointer) == 0) { + *var_offset = offset; + return true; + } + + /* Kernel symbols might be relocated */ + mem_addr = var_addr + map__reloc(dloc->ms->map); + + addr_location__init(&al); + sym = thread__find_symbol_fb(dloc->thread, dloc->cpumode, + mem_addr, &al); + if (sym) { + var_name = sym->name; + /* Calculate type offset from the start of variable */ + *var_offset = mem_addr - map__unmap_ip(al.map, sym->start); + } + addr_location__exit(&al); + if (var_name == NULL) + return false; + + pc = map__rip_2objdump(dloc->ms->map, ip); + + /* Try to get the name of global variable */ + if (die_find_variable_at(cu_die, var_name, pc, &var_die) && + check_variable(&var_die, type_die, *var_offset, is_pointer) == 0) + return true; + + return false; +} + /** * update_var_state - Update type state using given variables * @state: type state table @@ -637,24 +682,14 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) pr_debug_dtp("CU die offset: %#lx\n", (long)dwarf_dieoffset(&cu_die)); if (reg == DWARF_REG_PC) { - if (die_find_variable_by_addr(&cu_die, dloc->var_addr, &var_die, - &offset)) { - ret = check_variable(&var_die, type_die, offset, - /*is_pointer=*/false); + if (get_global_var_type(&cu_die, dloc, dloc->ip, dloc->var_addr, + &offset, type_die)) { dloc->type_offset = offset; pr_debug_dtp("found PC-rel by addr=%#"PRIx64" offset=%#x\n", dloc->var_addr, offset); goto out; } - - if (dloc->var_name && - die_find_variable_at(&cu_die, dloc->var_name, pc, &var_die)) { - ret = check_variable(&var_die, type_die, dloc->type_offset, - /*is_pointer=*/false); - /* dloc->type_offset was updated by the caller */ - goto out; - } } /* Get a list of nested scopes - i.e. (inlined) functions and blocks. */ @@ -769,8 +804,7 @@ struct annotated_data_type *find_data_type(struct data_loc_info *dloc) * The type offset is the same as instruction offset by default. * But when finding a global variable, the offset won't be valid. */ - if (dloc->var_name == NULL) - dloc->type_offset = dloc->op->offset; + dloc->type_offset = dloc->op->offset; dloc->fbreg = -1; diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 7324cafe2c7b..acfbd1748d02 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -11,6 +11,7 @@ struct annotated_op_loc; struct debuginfo; struct evsel; struct map_symbol; +struct thread; /** * struct annotated_member - Type of member field @@ -76,10 +77,11 @@ extern struct annotated_data_type stackop_type; /** * struct data_loc_info - Data location information * @arch: CPU architecture info + * @thread: Thread info * @ms: Map and Symbol info * @ip: Instruction address * @var_addr: Data address (for global variables) - * @var_name: Variable name (for global variables) + * @cpumode: CPU execution mode * @op: Instruction operand location (regs and offset) * @di: Debug info * @fbreg: Frame base register @@ -89,10 +91,11 @@ extern struct annotated_data_type stackop_type; struct data_loc_info { /* These are input field, should be filled by caller */ struct arch *arch; + struct thread *thread; struct map_symbol *ms; u64 ip; u64 var_addr; - const char *var_name; + u8 cpumode; struct annotated_op_loc *op; /* These are used internally */ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 9777df5dc2e3..abb641aa8ec0 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3873,9 +3873,11 @@ retry: for_each_insn_op_loc(&loc, i, op_loc) { struct data_loc_info dloc = { .arch = arch, + .thread = he->thread, .ms = ms, /* Recalculate IP for LOCK prefix or insn fusion */ .ip = ms->sym->start + dl->al.offset, + .cpumode = he->cpumode, .op = op_loc, }; @@ -3887,23 +3889,8 @@ retry: /* PC-relative addressing */ if (op_loc->reg1 == DWARF_REG_PC) { - struct addr_location al; - struct symbol *var; - u64 map_addr; - - dloc.var_addr = annotate_calc_pcrel(ms, ip, op_loc->offset, dl); - /* Kernel symbols might be relocated */ - map_addr = dloc.var_addr + map__reloc(ms->map); - - addr_location__init(&al); - var = thread__find_symbol_fb(he->thread, he->cpumode, - map_addr, &al); - if (var) { - dloc.var_name = var->name; - /* Calculate type offset from the start of variable */ - dloc.type_offset = map_addr - map__unmap_ip(al.map, var->start); - } - addr_location__exit(&al); + dloc.var_addr = annotate_calc_pcrel(ms, dloc.ip, + op_loc->offset, dl); } mem_type = find_data_type(&dloc); -- cgit v1.2.3 From 0a41e5d6849b4f705c377d34799485b546764970 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:04 -0700 Subject: perf annotate-data: Handle global variable access When updating the instruction states, it also needs to handle global variable accesses. Same as it does for PC-relative addressing, it can look up the type by address (if it's defined in the same file), or by name after finding the symbol by address (for declarations). Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-13-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 46 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 3b661e693410..2cc9f56e3eea 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -89,7 +89,7 @@ void exit_type_state(struct type_state *state); void update_var_state(struct type_state *state, struct data_loc_info *dloc, u64 addr, u64 insn_offset, struct die_var_type *var_types); void update_insn_state(struct type_state *state, struct data_loc_info *dloc, - struct disasm_line *dl); + Dwarf_Die *cu_die, struct disasm_line *dl); void init_type_state(struct type_state *state, struct arch *arch __maybe_unused) { @@ -485,7 +485,7 @@ void update_var_state(struct type_state *state, struct data_loc_info *dloc, } static void update_insn_state_x86(struct type_state *state, - struct data_loc_info *dloc, + struct data_loc_info *dloc, Dwarf_Die *cu_die, struct disasm_line *dl) { struct annotated_insn_loc loc; @@ -577,6 +577,29 @@ retry: insn_offset, src->offset, sreg, dst->reg1); pr_debug_type_name(&tsr->type); } + /* Or check if it's a global variable */ + else if (sreg == DWARF_REG_PC) { + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + u64 addr; + int offset; + + addr = annotate_calc_pcrel(ms, ip, src->offset, dl); + + if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, + &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->ok = true; + + pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", + insn_offset, addr, dst->reg1); + pr_debug_type_name(&type_die); + } /* Or try another register if any */ else if (src->multi_regs && sreg == src->reg1 && src->reg1 != src->reg2) { @@ -628,11 +651,26 @@ retry: /* Case 4. memory to memory transfers (not handled for now) */ } +/** + * update_insn_state - Update type state for an instruction + * @state: type state table + * @dloc: data location info + * @cu_die: compile unit debug entry + * @dl: disasm line for the instruction + * + * This function updates the @state table for the target operand of the + * instruction at @dl if it transfers the type like MOV on x86. Since it + * tracks the type, it won't care about the values like in arithmetic + * instructions like ADD/SUB/MUL/DIV and INC/DEC. + * + * Note that ops->reg2 is only available when both mem_ref and multi_regs + * are true. + */ void update_insn_state(struct type_state *state, struct data_loc_info *dloc, - struct disasm_line *dl) + Dwarf_Die *cu_die, struct disasm_line *dl) { if (arch__is(dloc->arch, "x86")) - update_insn_state_x86(state, dloc, dl); + update_insn_state_x86(state, dloc, cu_die, dl); } /* The result will be saved in @type_die */ -- cgit v1.2.3 From cffb7910afbdf73ca5b2fdf8a617584a7dcecf14 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:05 -0700 Subject: perf annotate-data: Handle call instructions When updating instruction states, the call instruction should play a role since it changes the register states. For simplicity, mark some registers as caller-saved registers (should be arch-dependent), and invalidate them all after a function call. If the function returns something, the designated register (ret_reg) will have the type info. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-14-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 54 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 2cc9f56e3eea..6bcf22e523cb 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -47,10 +47,14 @@ static void pr_debug_type_name(Dwarf_Die *die) free(str); } -/* Type information in a register, valid when ok is true */ +/* + * Type information in a register, valid when @ok is true. + * The @caller_saved registers are invalidated after a function call. + */ struct type_state_reg { Dwarf_Die type; bool ok; + bool caller_saved; }; /* Type information in a stack location, dynamically allocated */ @@ -76,6 +80,8 @@ struct type_state { struct type_state_reg regs[TYPE_STATE_MAX_REGS]; /* state of stack location */ struct list_head stack_vars; + /* return value register */ + int ret_reg; }; static bool has_reg_type(struct type_state *state, int reg) @@ -91,10 +97,23 @@ void update_var_state(struct type_state *state, struct data_loc_info *dloc, void update_insn_state(struct type_state *state, struct data_loc_info *dloc, Dwarf_Die *cu_die, struct disasm_line *dl); -void init_type_state(struct type_state *state, struct arch *arch __maybe_unused) +void init_type_state(struct type_state *state, struct arch *arch) { memset(state, 0, sizeof(*state)); INIT_LIST_HEAD(&state->stack_vars); + + if (arch__is(arch, "x86")) { + state->regs[0].caller_saved = true; + state->regs[1].caller_saved = true; + state->regs[2].caller_saved = true; + state->regs[4].caller_saved = true; + state->regs[5].caller_saved = true; + state->regs[8].caller_saved = true; + state->regs[9].caller_saved = true; + state->regs[10].caller_saved = true; + state->regs[11].caller_saved = true; + state->ret_reg = 0; + } } void exit_type_state(struct type_state *state) @@ -500,6 +519,37 @@ static void update_insn_state_x86(struct type_state *state, if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) return; + if (ins__is_call(&dl->ins)) { + struct symbol *func = dl->ops.target.sym; + + if (func == NULL) + return; + + /* __fentry__ will preserve all registers */ + if (!strcmp(func->name, "__fentry__")) + return; + + pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); + + /* Otherwise invalidate caller-saved registers after call */ + for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { + if (state->regs[i].caller_saved) + state->regs[i].ok = false; + } + + /* Update register with the return type (if any) */ + if (die_find_func_rettype(cu_die, func->name, &type_die)) { + tsr = &state->regs[state->ret_reg]; + tsr->type = type_die; + tsr->ok = true; + + pr_debug_dtp("call [%x] return -> reg%d", + insn_offset, state->ret_reg); + pr_debug_type_name(&type_die); + } + return; + } + if (strncmp(dl->ins.name, "mov", 3)) return; -- cgit v1.2.3 From eb8a55e01de9a671e130b4c0e5b483997f4f491f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:06 -0700 Subject: perf annotate-data: Implement instruction tracking If it failed to find a variable for the location directly, it might be due to a missing variable in the source code. For example, accessing pointer variables in a chain can result in the case like below: struct foo *foo = ...; int i = foo->bar->baz; The DWARF debug information is created for each variable so it'd have one for 'foo'. But there's no variable for 'foo->bar' and then it cannot know the type of 'bar' and 'baz'. The above source code can be compiled to the follow x86 instructions: mov 0x8(%rax), %rcx mov 0x4(%rcx), %rdx <=== PMU sample mov %rdx, -4(%rbp) Let's say 'foo' is located in the %rax and it has a pointer to struct foo. But perf sample is captured in the second instruction and there is no variable or type info for the %rcx. It'd be great if compiler could generate debug info for %rcx, but we should handle it on our side. So this patch implements the logic to iterate instructions and update the type table for each location. As it already collected a list of scopes including the target instruction, we can use it to construct the type table smartly. +---------------- scope[0] subprogram | | +-------------- scope[1] lexical_block | | | | +------------ scope[2] inlined_subroutine | | | | | | +---------- scope[3] inlined_subroutine | | | | | | | | +-------- scope[4] lexical_block | | | | | | | | | | *** target instruction ... Image the target instruction has 5 scopes, each scope will have its own variables and parameters. Then it can start with the innermost scope (4). So it'd search the shortest path from the start of scope[4] to the target address and build a list of basic blocks. Then it iterates the basic blocks with the variables in the scope and update the table. If it finds a type at the target instruction, then returns it. Otherwise, it moves to the upper scope[3]. Now it'd search the shortest path from the start of scope[3] to the start of scope[4]. Then connect it to the existing basic block list. Then it'd iterate the blocks with variables for both scopes. It can repeat this until it finds a type at the target instruction or reaches to the top scope[0]. As the basic blocks contain the shortest path, it won't worry about branches and can update the table simply. The final check will be done by find_matching_type() in the next patch. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-15-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 1 + tools/perf/util/annotate-data.c | 223 +++++++++++++++++++++++++++++++++++++--- tools/perf/util/annotate-data.h | 1 + 3 files changed, 211 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 6c1cc797692d..f677671409b1 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -430,6 +430,7 @@ static void print_annotate_data_stat(struct annotated_data_stat *s) PRINT_STAT(no_typeinfo); PRINT_STAT(invalid_size); PRINT_STAT(bad_offset); + PRINT_STAT(insn_track); printf("\n"); #undef PRINT_STAT diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 6bcf22e523cb..13ba65693367 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -89,15 +89,7 @@ static bool has_reg_type(struct type_state *state, int reg) return (unsigned)reg < ARRAY_SIZE(state->regs); } -/* These declarations will be remove once they are changed to static */ -void init_type_state(struct type_state *state, struct arch *arch __maybe_unused); -void exit_type_state(struct type_state *state); -void update_var_state(struct type_state *state, struct data_loc_info *dloc, - u64 addr, u64 insn_offset, struct die_var_type *var_types); -void update_insn_state(struct type_state *state, struct data_loc_info *dloc, - Dwarf_Die *cu_die, struct disasm_line *dl); - -void init_type_state(struct type_state *state, struct arch *arch) +static void init_type_state(struct type_state *state, struct arch *arch) { memset(state, 0, sizeof(*state)); INIT_LIST_HEAD(&state->stack_vars); @@ -116,7 +108,7 @@ void init_type_state(struct type_state *state, struct arch *arch) } } -void exit_type_state(struct type_state *state) +static void exit_type_state(struct type_state *state) { struct type_state_stack *stack, *tmp; @@ -457,8 +449,8 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, * This function fills the @state table using @var_types info. Each variable * is used only at the given location and updates an entry in the table. */ -void update_var_state(struct type_state *state, struct data_loc_info *dloc, - u64 addr, u64 insn_offset, struct die_var_type *var_types) +static void update_var_state(struct type_state *state, struct data_loc_info *dloc, + u64 addr, u64 insn_offset, struct die_var_type *var_types) { Dwarf_Die mem_die; struct die_var_type *var; @@ -716,13 +708,207 @@ retry: * Note that ops->reg2 is only available when both mem_ref and multi_regs * are true. */ -void update_insn_state(struct type_state *state, struct data_loc_info *dloc, - Dwarf_Die *cu_die, struct disasm_line *dl) +static void update_insn_state(struct type_state *state, struct data_loc_info *dloc, + Dwarf_Die *cu_die, struct disasm_line *dl) { if (arch__is(dloc->arch, "x86")) update_insn_state_x86(state, dloc, cu_die, dl); } +/* + * Prepend this_blocks (from the outer scope) to full_blocks, removing + * duplicate disasm line. + */ +static void prepend_basic_blocks(struct list_head *this_blocks, + struct list_head *full_blocks) +{ + struct annotated_basic_block *first_bb, *last_bb; + + last_bb = list_last_entry(this_blocks, typeof(*last_bb), list); + first_bb = list_first_entry(full_blocks, typeof(*first_bb), list); + + if (list_empty(full_blocks)) + goto out; + + /* Last insn in this_blocks should be same as first insn in full_blocks */ + if (last_bb->end != first_bb->begin) { + pr_debug("prepend basic blocks: mismatched disasm line %"PRIx64" -> %"PRIx64"\n", + last_bb->end->al.offset, first_bb->begin->al.offset); + goto out; + } + + /* Is the basic block have only one disasm_line? */ + if (last_bb->begin == last_bb->end) { + list_del(&last_bb->list); + free(last_bb); + goto out; + } + + /* Point to the insn before the last when adding this block to full_blocks */ + last_bb->end = list_prev_entry(last_bb->end, al.node); + +out: + list_splice(this_blocks, full_blocks); +} + +static void delete_basic_blocks(struct list_head *basic_blocks) +{ + struct annotated_basic_block *bb, *tmp; + + list_for_each_entry_safe(bb, tmp, basic_blocks, list) { + list_del(&bb->list); + free(bb); + } +} + +/* Make sure all variables have a valid start address */ +static void fixup_var_address(struct die_var_type *var_types, u64 addr) +{ + while (var_types) { + /* + * Some variables have no address range meaning it's always + * available in the whole scope. Let's adjust the start + * address to the start of the scope. + */ + if (var_types->addr == 0) + var_types->addr = addr; + + var_types = var_types->next; + } +} + +static void delete_var_types(struct die_var_type *var_types) +{ + while (var_types) { + struct die_var_type *next = var_types->next; + + free(var_types); + var_types = next; + } +} + +/* It's at the target address, check if it has a matching type */ +static bool find_matching_type(struct type_state *state __maybe_unused, + struct data_loc_info *dloc __maybe_unused, + int reg __maybe_unused, + Dwarf_Die *type_die __maybe_unused) +{ + /* TODO */ + return false; +} + +/* Iterate instructions in basic blocks and update type table */ +static bool find_data_type_insn(struct data_loc_info *dloc, int reg, + struct list_head *basic_blocks, + struct die_var_type *var_types, + Dwarf_Die *cu_die, Dwarf_Die *type_die) +{ + struct type_state state; + struct symbol *sym = dloc->ms->sym; + struct annotation *notes = symbol__annotation(sym); + struct annotated_basic_block *bb; + bool found = false; + + init_type_state(&state, dloc->arch); + + list_for_each_entry(bb, basic_blocks, list) { + struct disasm_line *dl = bb->begin; + + pr_debug_dtp("bb: [%"PRIx64" - %"PRIx64"]\n", + bb->begin->al.offset, bb->end->al.offset); + + list_for_each_entry_from(dl, ¬es->src->source, al.node) { + u64 this_ip = sym->start + dl->al.offset; + u64 addr = map__rip_2objdump(dloc->ms->map, this_ip); + + /* Update variable type at this address */ + update_var_state(&state, dloc, addr, dl->al.offset, var_types); + + if (this_ip == dloc->ip) { + found = find_matching_type(&state, dloc, reg, + type_die); + goto out; + } + + /* Update type table after processing the instruction */ + update_insn_state(&state, dloc, cu_die, dl); + if (dl == bb->end) + break; + } + } + +out: + exit_type_state(&state); + return found; +} + +/* + * Construct a list of basic blocks for each scope with variables and try to find + * the data type by updating a type state table through instructions. + */ +static int find_data_type_block(struct data_loc_info *dloc, int reg, + Dwarf_Die *cu_die, Dwarf_Die *scopes, + int nr_scopes, Dwarf_Die *type_die) +{ + LIST_HEAD(basic_blocks); + struct die_var_type *var_types = NULL; + u64 src_ip, dst_ip, prev_dst_ip; + int ret = -1; + + /* TODO: other architecture support */ + if (!arch__is(dloc->arch, "x86")) + return -1; + + prev_dst_ip = dst_ip = dloc->ip; + for (int i = nr_scopes - 1; i >= 0; i--) { + Dwarf_Addr base, start, end; + LIST_HEAD(this_blocks); + + if (dwarf_ranges(&scopes[i], 0, &base, &start, &end) < 0) + break; + + pr_debug_dtp("scope: [%d/%d] (die:%lx)\n", + i + 1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); + src_ip = map__objdump_2rip(dloc->ms->map, start); + +again: + /* Get basic blocks for this scope */ + if (annotate_get_basic_blocks(dloc->ms->sym, src_ip, dst_ip, + &this_blocks) < 0) { + /* Try previous block if they are not connected */ + if (prev_dst_ip != dst_ip) { + dst_ip = prev_dst_ip; + goto again; + } + + pr_debug_dtp("cannot find a basic block from %"PRIx64" to %"PRIx64"\n", + src_ip - dloc->ms->sym->start, + dst_ip - dloc->ms->sym->start); + continue; + } + prepend_basic_blocks(&this_blocks, &basic_blocks); + + /* Get variable info for this scope and add to var_types list */ + die_collect_vars(&scopes[i], &var_types); + fixup_var_address(var_types, start); + + /* Find from start of this scope to the target instruction */ + if (find_data_type_insn(dloc, reg, &basic_blocks, var_types, + cu_die, type_die)) { + ret = 0; + break; + } + + /* Go up to the next scope and find blocks to the start */ + prev_dst_ip = dst_ip; + dst_ip = src_ip; + } + + delete_basic_blocks(&basic_blocks); + delete_var_types(var_types); + return ret; +} + /* The result will be saved in @type_die */ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) { @@ -847,6 +1033,15 @@ retry: goto out; } + if (reg != DWARF_REG_PC) { + ret = find_data_type_block(dloc, reg, &cu_die, scopes, + nr_scopes, type_die); + if (ret == 0) { + ann_data_stat.insn_track++; + goto out; + } + } + if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) { reg = loc->reg2; goto retry; diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index acfbd1748d02..ae0f87aed804 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -135,6 +135,7 @@ struct annotated_data_stat { int no_typeinfo; int invalid_size; int bad_offset; + int insn_track; }; extern struct annotated_data_stat ann_data_stat; -- cgit v1.2.3 From bdc80ace07106a62b51d1752869df29dbd65ad2c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:07 -0700 Subject: perf annotate-data: Check register state for type As instruction tracking updates the type state for each register, check the final type info for the target register at the given instruction. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-16-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 88 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 13ba65693367..f5329a78a97d 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -788,12 +788,83 @@ static void delete_var_types(struct die_var_type *var_types) } /* It's at the target address, check if it has a matching type */ -static bool find_matching_type(struct type_state *state __maybe_unused, - struct data_loc_info *dloc __maybe_unused, - int reg __maybe_unused, - Dwarf_Die *type_die __maybe_unused) +static bool check_matching_type(struct type_state *state, + struct data_loc_info *dloc, int reg, + Dwarf_Die *type_die) { - /* TODO */ + Dwarf_Word size; + u32 insn_offset = dloc->ip - dloc->ms->sym->start; + + pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d", + insn_offset, reg, dloc->op->offset, state->regs[reg].ok); + + if (state->regs[reg].ok) { + int tag = dwarf_tag(&state->regs[reg].type); + + pr_debug_dtp("\n"); + + /* + * Normal registers should hold a pointer (or array) to + * dereference a memory location. + */ + if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) + return false; + + /* Remove the pointer and get the target type */ + if (die_get_real_type(&state->regs[reg].type, type_die) == NULL) + return false; + + dloc->type_offset = dloc->op->offset; + + /* Get the size of the actual type */ + if (dwarf_aggregate_size(type_die, &size) < 0 || + (unsigned)dloc->type_offset >= size) + return false; + + return true; + } + + if (reg == dloc->fbreg) { + struct type_state_stack *stack; + + pr_debug_dtp(" fbreg\n"); + + stack = find_stack_state(state, dloc->type_offset); + if (stack == NULL) + return false; + + *type_die = stack->type; + /* Update the type offset from the start of slot */ + dloc->type_offset -= stack->offset; + + return true; + } + + if (dloc->fb_cfa) { + struct type_state_stack *stack; + u64 pc = map__rip_2objdump(dloc->ms->map, dloc->ip); + int fbreg, fboff; + + pr_debug_dtp(" cfa\n"); + + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) + fbreg = -1; + + if (reg != fbreg) + return false; + + stack = find_stack_state(state, dloc->type_offset - fboff); + if (stack == NULL) + return false; + + *type_die = stack->type; + /* Update the type offset from the start of slot */ + dloc->type_offset -= fboff + stack->offset; + + return true; + } + + pr_debug_dtp("\n"); return false; } @@ -825,8 +896,8 @@ static bool find_data_type_insn(struct data_loc_info *dloc, int reg, update_var_state(&state, dloc, addr, dl->al.offset, var_types); if (this_ip == dloc->ip) { - found = find_matching_type(&state, dloc, reg, - type_die); + found = check_matching_type(&state, dloc, reg, + type_die); goto out; } @@ -896,6 +967,9 @@ again: if (find_data_type_insn(dloc, reg, &basic_blocks, var_types, cu_die, type_die)) { ret = 0; + pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x", + dloc->op->offset, reg, dloc->type_offset); + pr_debug_type_name(type_die); break; } -- cgit v1.2.3 From cbaf89a8c5b467f99dd930f24a698f3fa338b012 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:08 -0700 Subject: perf annotate: Parse x86 segment register location Add a segment field in the struct annotated_insn_loc and save it for the segment based addressing like %gs:0x28. For simplicity it now handles %gs register only. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-17-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 36 ++++++++++++++++++++++++++++++++---- tools/perf/util/annotate.h | 15 +++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index abb641aa8ec0..3aa3a3b987ad 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -94,6 +94,7 @@ struct arch { char skip_functions_char; char register_char; char memory_ref_char; + char imm_char; } objdump; }; @@ -211,6 +212,7 @@ static struct arch architectures[] = { .comment_char = '#', .register_char = '%', .memory_ref_char = '(', + .imm_char = '$', }, }, { @@ -3585,6 +3587,12 @@ static int extract_reg_offset(struct arch *arch, const char *str, * %gs:0x18(%rbx). In that case it should skip the part. */ if (*str == arch->objdump.register_char) { + if (arch__is(arch, "x86")) { + /* FIXME: Handle other segment registers */ + if (!strncmp(str, "%gs:", 4)) + op_loc->segment = INSN_SEG_X86_GS; + } + while (*str && !isdigit(*str) && *str != arch->objdump.memory_ref_char) str++; @@ -3681,12 +3689,32 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, op_loc->multi_regs = multi_regs; extract_reg_offset(arch, insn_str, op_loc); } else { - char *s = strdup(insn_str); + char *s, *p = NULL; + + if (arch__is(arch, "x86")) { + /* FIXME: Handle other segment registers */ + if (!strncmp(insn_str, "%gs:", 4)) { + op_loc->segment = INSN_SEG_X86_GS; + op_loc->offset = strtol(insn_str + 4, + &p, 0); + if (p && p != insn_str + 4) + op_loc->imm = true; + continue; + } + } + + s = strdup(insn_str); + if (s == NULL) + return -1; - if (s) { + if (*s == arch->objdump.register_char) op_loc->reg1 = get_dwarf_regnum(s, 0); - free(s); + else if (*s == arch->objdump.imm_char) { + op_loc->offset = strtol(s + 1, &p, 0); + if (p && p != s + 1) + op_loc->imm = true; } + free(s); } } @@ -3881,7 +3909,7 @@ retry: .op = op_loc, }; - if (!op_loc->mem_ref) + if (!op_loc->mem_ref && op_loc->segment == INSN_SEG_NONE) continue; /* Recalculate IP because of LOCK prefix or insn fusion */ diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 0928663fddee..14980b65f812 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -511,15 +511,19 @@ int annotate_check_args(void); * @reg1: First register in the operand * @reg2: Second register in the operand * @offset: Memory access offset in the operand + * @segment: Segment selector register * @mem_ref: Whether the operand accesses memory * @multi_regs: Whether the second register is used + * @imm: Whether the operand is an immediate value (in offset) */ struct annotated_op_loc { int reg1; int reg2; int offset; + u8 segment; bool mem_ref; bool multi_regs; + bool imm; }; enum annotated_insn_ops { @@ -529,6 +533,17 @@ enum annotated_insn_ops { INSN_OP_MAX, }; +enum annotated_x86_segment { + INSN_SEG_NONE = 0, + + INSN_SEG_X86_CS, + INSN_SEG_X86_DS, + INSN_SEG_X86_ES, + INSN_SEG_X86_FS, + INSN_SEG_X86_GS, + INSN_SEG_X86_SS, +}; + /** * struct annotated_insn_loc - Location info of instruction * @ops: Array of location info for source and target operands -- cgit v1.2.3 From 02e17ca917423c622da10ac6bd0924c17462962e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:09 -0700 Subject: perf annotate-data: Handle this-cpu variables in kernel On x86, the kernel gets the current task using the current macro like below: #define current get_current() static __always_inline struct task_struct *get_current(void) { return this_cpu_read_stable(pcpu_hot.current_task); } So it returns the current_task field of struct pcpu_hot which is the first member. On my build, it's located at 0x32940. $ nm vmlinux | grep pcpu_hot 0000000000032940 D pcpu_hot And the current macro generates the instructions like below: mov %gs:0x32940, %rcx So the %gs segment register points to the beginning of the per-cpu region of this cpu and it points the variable with a constant. Let's update the instruction location info to have a segment register and handle %gs in kernel to look up a global variable. Pretend it as a global variable by changing the register number to DWARF_REG_PC. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-18-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 21 +++++++++++++++++++-- tools/perf/util/annotate.c | 7 +++++++ 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index f5329a78a97d..d57622ddd5d3 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -790,7 +790,7 @@ static void delete_var_types(struct die_var_type *var_types) /* It's at the target address, check if it has a matching type */ static bool check_matching_type(struct type_state *state, struct data_loc_info *dloc, int reg, - Dwarf_Die *type_die) + Dwarf_Die *cu_die, Dwarf_Die *type_die) { Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; @@ -864,6 +864,23 @@ static bool check_matching_type(struct type_state *state, return true; } + if (map__dso(dloc->ms->map)->kernel && arch__is(dloc->arch, "x86")) { + u64 addr; + int offset; + + if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm) { + pr_debug_dtp(" this-cpu var\n"); + + addr = dloc->op->offset; + + if (get_global_var_type(cu_die, dloc, dloc->ip, addr, + &offset, type_die)) { + dloc->type_offset = offset; + return true; + } + } + } + pr_debug_dtp("\n"); return false; } @@ -897,7 +914,7 @@ static bool find_data_type_insn(struct data_loc_info *dloc, int reg, if (this_ip == dloc->ip) { found = check_matching_type(&state, dloc, reg, - type_die); + cu_die, type_die); goto out; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 3aa3a3b987ad..e4121acb4f88 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3921,6 +3921,13 @@ retry: op_loc->offset, dl); } + /* This CPU access in kernel - pretend PC-relative addressing */ + if (map__dso(ms->map)->kernel && arch__is(arch, "x86") && + op_loc->segment == INSN_SEG_X86_GS && op_loc->imm) { + dloc.var_addr = op_loc->offset; + op_loc->reg1 = DWARF_REG_PC; + } + mem_type = find_data_type(&dloc); if (mem_type) istat->good++; -- cgit v1.2.3 From ad62edbfc55b23347539c8c6fff9a70de17c4b95 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:10 -0700 Subject: perf annotate-data: Track instructions with a this-cpu variable Like global variables, this per-cpu variables should be tracked correctly. Factor our get_global_var_type() to handle both global and per-cpu (for this cpu) variables in the same manner. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-19-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index d57622ddd5d3..48fea0c716ef 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -553,12 +553,41 @@ static void update_insn_state_x86(struct type_state *state, fbreg = -1; } - /* Case 1. register to register transfers */ + /* Case 1. register to register or segment:offset to register transfers */ if (!src->mem_ref && !dst->mem_ref) { if (!has_reg_type(state, dst->reg1)) return; tsr = &state->regs[dst->reg1]; + if (map__dso(dloc->ms->map)->kernel && + src->segment == INSN_SEG_X86_GS && src->imm) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr; + int offset; + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. + */ + var_addr = src->offset; + + if (!get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->ok = true; + + pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", + insn_offset, var_addr, dst->reg1); + pr_debug_type_name(&tsr->type); + return; + } + if (!has_reg_type(state, src->reg1) || !state->regs[src->reg1].ok) { tsr->ok = false; -- cgit v1.2.3 From f5b095924d0c1c2f0698df07c54887d92c39fd3a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:11 -0700 Subject: perf annotate-data: Support general per-cpu access This is to support per-cpu variable access often without a matching DWARF entry. For some reason, I cannot find debug info of per-cpu variables sometimes. They have more complex pattern to calculate the address of per-cpu variables like below. 2b7d: mov -0x1e0(%rbp),%rax ; rax = cpu 2b84: mov -0x7da0f7e0(,%rax,8),%rcx ; rcx = __per_cpu_offset[cpu] * 2b8c: mov 0x34870(%rcx),%rax ; *(__per_cpu_offset[cpu] + 0x34870) Let's assume the rax register has a number for a CPU at 2b7d. The next instruction is to get the per-cpu offset' for that cpu. The offset -0x7da0f7e0 is 0xffffffff825f0820 in u64 which is the address of the '__per_cpu_offset' array in my system. So it'd get the actual offset of that CPU's per-cpu region and save it to the rcx register. Then, at 2b8c, accesses using rcx can be handled same as the global variable access. To handle this case, it should check if the offset of the instruction matches to the address of '__per_cpu_offset'. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-20-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 213 +++++++++++++++++++++++++++++++--------- 1 file changed, 169 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 48fea0c716ef..83b5aa00f01c 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -24,6 +24,12 @@ #include "symbol_conf.h" #include "thread.h" +enum type_state_kind { + TSR_KIND_INVALID = 0, + TSR_KIND_TYPE, + TSR_KIND_PERCPU_BASE, +}; + #define pr_debug_dtp(fmt, ...) \ do { \ if (debug_type_profile) \ @@ -32,7 +38,7 @@ do { \ pr_debug3(fmt, ##__VA_ARGS__); \ } while (0) -static void pr_debug_type_name(Dwarf_Die *die) +static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) { struct strbuf sb; char *str; @@ -40,6 +46,18 @@ static void pr_debug_type_name(Dwarf_Die *die) if (!debug_type_profile && verbose < 3) return; + switch (kind) { + case TSR_KIND_INVALID: + pr_info("\n"); + return; + case TSR_KIND_PERCPU_BASE: + pr_info(" percpu base\n"); + return; + case TSR_KIND_TYPE: + default: + break; + } + strbuf_init(&sb, 32); die_get_typename_from_type(die, &sb); str = strbuf_detach(&sb, NULL); @@ -53,8 +71,10 @@ static void pr_debug_type_name(Dwarf_Die *die) */ struct type_state_reg { Dwarf_Die type; + u32 imm_value; bool ok; bool caller_saved; + u8 kind; }; /* Type information in a stack location, dynamically allocated */ @@ -64,6 +84,7 @@ struct type_state_stack { int offset; int size; bool compound; + u8 kind; }; /* FIXME: This should be arch-dependent */ @@ -82,6 +103,8 @@ struct type_state { struct list_head stack_vars; /* return value register */ int ret_reg; + /* stack pointer register */ + int stack_reg; }; static bool has_reg_type(struct type_state *state, int reg) @@ -105,6 +128,7 @@ static void init_type_state(struct type_state *state, struct arch *arch) state->regs[10].caller_saved = true; state->regs[11].caller_saved = true; state->ret_reg = 0; + state->stack_reg = 7; } } @@ -350,7 +374,7 @@ static struct type_state_stack *find_stack_state(struct type_state *state, return NULL; } -static void set_stack_state(struct type_state_stack *stack, int offset, +static void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, Dwarf_Die *type_die) { int tag; @@ -364,6 +388,7 @@ static void set_stack_state(struct type_state_stack *stack, int offset, stack->type = *type_die; stack->size = size; stack->offset = offset; + stack->kind = kind; switch (tag) { case DW_TAG_structure_type: @@ -377,34 +402,60 @@ static void set_stack_state(struct type_state_stack *stack, int offset, } static struct type_state_stack *findnew_stack_state(struct type_state *state, - int offset, Dwarf_Die *type_die) + int offset, u8 kind, + Dwarf_Die *type_die) { struct type_state_stack *stack = find_stack_state(state, offset); if (stack) { - set_stack_state(stack, offset, type_die); + set_stack_state(stack, offset, kind, type_die); return stack; } stack = malloc(sizeof(*stack)); if (stack) { - set_stack_state(stack, offset, type_die); + set_stack_state(stack, offset, kind, type_die); list_add(&stack->list, &state->stack_vars); } return stack; } +static bool get_global_var_info(struct data_loc_info *dloc, u64 addr, + const char **var_name, int *var_offset) +{ + struct addr_location al; + struct symbol *sym; + u64 mem_addr; + + /* Kernel symbols might be relocated */ + mem_addr = addr + map__reloc(dloc->ms->map); + + addr_location__init(&al); + sym = thread__find_symbol_fb(dloc->thread, dloc->cpumode, + mem_addr, &al); + if (sym) { + *var_name = sym->name; + /* Calculate type offset from the start of variable */ + *var_offset = mem_addr - map__unmap_ip(al.map, sym->start); + } else { + *var_name = NULL; + } + addr_location__exit(&al); + if (*var_name == NULL) + return false; + + return true; +} + static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, u64 ip, u64 var_addr, int *var_offset, Dwarf_Die *type_die) { - u64 pc, mem_addr; + u64 pc; int offset; bool is_pointer = false; - const char *var_name = NULL; + const char *var_name; Dwarf_Die var_die; - struct addr_location al; - struct symbol *sym; /* Try to get the variable by address first */ if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) && @@ -413,19 +464,7 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, return true; } - /* Kernel symbols might be relocated */ - mem_addr = var_addr + map__reloc(dloc->ms->map); - - addr_location__init(&al); - sym = thread__find_symbol_fb(dloc->thread, dloc->cpumode, - mem_addr, &al); - if (sym) { - var_name = sym->name; - /* Calculate type offset from the start of variable */ - *var_offset = mem_addr - map__unmap_ip(al.map, sym->start); - } - addr_location__exit(&al); - if (var_name == NULL) + if (!get_global_var_info(dloc, var_addr, &var_name, var_offset)) return false; pc = map__rip_2objdump(dloc->ms->map, ip); @@ -470,27 +509,30 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo continue; if (var->reg == DWARF_REG_FB) { - findnew_stack_state(state, var->offset, &mem_die); + findnew_stack_state(state, var->offset, TSR_KIND_TYPE, + &mem_die); pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", insn_offset, -var->offset); - pr_debug_type_name(&mem_die); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } else if (var->reg == fbreg) { - findnew_stack_state(state, var->offset - fb_offset, &mem_die); + findnew_stack_state(state, var->offset - fb_offset, + TSR_KIND_TYPE, &mem_die); pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", insn_offset, -var->offset + fb_offset); - pr_debug_type_name(&mem_die); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } else if (has_reg_type(state, var->reg) && var->offset == 0) { struct type_state_reg *reg; reg = &state->regs[var->reg]; reg->type = mem_die; + reg->kind = TSR_KIND_TYPE; reg->ok = true; pr_debug_dtp("var [%"PRIx64"] reg%d", insn_offset, var->reg); - pr_debug_type_name(&mem_die); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } } } @@ -533,11 +575,12 @@ static void update_insn_state_x86(struct type_state *state, if (die_find_func_rettype(cu_die, func->name, &type_die)) { tsr = &state->regs[state->ret_reg]; tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; tsr->ok = true; pr_debug_dtp("call [%x] return -> reg%d", insn_offset, state->ret_reg); - pr_debug_type_name(&type_die); + pr_debug_type_name(&type_die, tsr->kind); } return; } @@ -580,11 +623,12 @@ static void update_insn_state_x86(struct type_state *state, } tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; tsr->ok = true; pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", insn_offset, var_addr, dst->reg1); - pr_debug_type_name(&tsr->type); + pr_debug_type_name(&tsr->type, tsr->kind); return; } @@ -595,11 +639,12 @@ static void update_insn_state_x86(struct type_state *state, } tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; tsr->ok = true; pr_debug_dtp("mov [%x] reg%d -> reg%d", insn_offset, src->reg1, dst->reg1); - pr_debug_type_name(&tsr->type); + pr_debug_type_name(&tsr->type, tsr->kind); } /* Case 2. memory to register transers */ if (src->mem_ref && !dst->mem_ref) { @@ -622,11 +667,13 @@ retry: return; } else if (!stack->compound) { tsr->type = stack->type; + tsr->kind = stack->kind; tsr->ok = true; } else if (die_get_member_type(&stack->type, offset - stack->offset, &type_die)) { tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; tsr->ok = true; } else { tsr->ok = false; @@ -635,18 +682,20 @@ retry: pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", insn_offset, -offset, dst->reg1); - pr_debug_type_name(&tsr->type); + pr_debug_type_name(&tsr->type, tsr->kind); } /* And then dereference the pointer if it has one */ else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_TYPE && die_deref_ptr_type(&state->regs[sreg].type, src->offset, &type_die)) { tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; tsr->ok = true; pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type); + pr_debug_type_name(&tsr->type, tsr->kind); } /* Or check if it's a global variable */ else if (sreg == DWARF_REG_PC) { @@ -665,11 +714,37 @@ retry: } tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; tsr->ok = true; pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", insn_offset, addr, dst->reg1); - pr_debug_type_name(&type_die); + pr_debug_type_name(&type_die, tsr->kind); + } + /* And check percpu access with base register */ + else if (has_reg_type(state, sreg) && + state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + int offset; + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. + */ + if (get_global_var_type(cu_die, dloc, ip, src->offset, + &offset, &type_die) && + die_get_member_type(&type_die, offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d type=", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } else { + tsr->ok = false; + } } /* Or try another register if any */ else if (src->multi_regs && sreg == src->reg1 && @@ -677,8 +752,22 @@ retry: sreg = src->reg2; goto retry; } - /* It failed to get a type info, mark it as invalid */ else { + int offset; + const char *var_name = NULL; + + /* it might be per-cpu variable (in kernel) access */ + if (src->offset < 0) { + if (get_global_var_info(dloc, (s64)src->offset, + &var_name, &offset) && + !strcmp(var_name, "__per_cpu_offset")) { + tsr->kind = TSR_KIND_PERCPU_BASE; + + pr_debug_dtp("mov [%x] percpu base reg%d\n", + insn_offset, dst->reg1); + } + } + tsr->ok = false; } } @@ -693,6 +782,8 @@ retry: struct type_state_stack *stack; int offset = dst->offset - fboff; + tsr = &state->regs[src->reg1]; + stack = find_stack_state(state, offset); if (stack) { /* @@ -703,16 +794,16 @@ retry: * die_get_member_type(). */ if (!stack->compound) - set_stack_state(stack, offset, - &state->regs[src->reg1].type); + set_stack_state(stack, offset, tsr->kind, + &tsr->type); } else { - findnew_stack_state(state, offset, - &state->regs[src->reg1].type); + findnew_stack_state(state, offset, tsr->kind, + &tsr->type); } pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", insn_offset, src->reg1, -offset); - pr_debug_type_name(&state->regs[src->reg1].type); + pr_debug_type_name(&tsr->type, tsr->kind); } /* * Ignore other transfers since it'd set a value in a struct @@ -824,10 +915,11 @@ static bool check_matching_type(struct type_state *state, Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; - pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d", - insn_offset, reg, dloc->op->offset, state->regs[reg].ok); + pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d", + insn_offset, reg, dloc->op->offset, + state->regs[reg].ok, state->regs[reg].kind); - if (state->regs[reg].ok) { + if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { int tag = dwarf_tag(&state->regs[reg].type); pr_debug_dtp("\n"); @@ -893,10 +985,25 @@ static bool check_matching_type(struct type_state *state, return true; } + if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { + u64 var_addr = dloc->op->offset; + int var_offset; + + pr_debug_dtp(" percpu var\n"); + + if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, + &var_offset, type_die)) { + dloc->type_offset = var_offset; + return true; + } + return false; + } + if (map__dso(dloc->ms->map)->kernel && arch__is(dloc->arch, "x86")) { u64 addr; int offset; + /* Direct this-cpu access like "%gs:0x34740" */ if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm) { pr_debug_dtp(" this-cpu var\n"); @@ -907,6 +1014,24 @@ static bool check_matching_type(struct type_state *state, dloc->type_offset = offset; return true; } + return false; + } + + /* Access to per-cpu base like "-0x7dcf0500(,%rdx,8)" */ + if (dloc->op->offset < 0 && reg != state->stack_reg) { + const char *var_name = NULL; + + addr = (s64) dloc->op->offset; + + if (get_global_var_info(dloc, addr, &var_name, &offset) && + !strcmp(var_name, "__per_cpu_offset") && offset == 0 && + get_global_var_type(cu_die, dloc, dloc->ip, addr, + &offset, type_die)) { + pr_debug_dtp(" percpu base\n"); + + dloc->type_offset = offset; + return true; + } } } @@ -1015,7 +1140,7 @@ again: ret = 0; pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x", dloc->op->offset, reg, dloc->type_offset); - pr_debug_type_name(type_die); + pr_debug_type_name(type_die, TSR_KIND_TYPE); break; } @@ -1147,7 +1272,7 @@ retry: loc->offset, reg, fb_offset, offset); else pr_debug_dtp("%#x(reg%d)", loc->offset, reg); - pr_debug_type_name(type_die); + pr_debug_type_name(type_die, TSR_KIND_TYPE); } dloc->type_offset = offset; goto out; -- cgit v1.2.3 From eb9190afaed6afd5ed54bb0a3269eec338663858 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:12 -0700 Subject: perf annotate-data: Handle ADD instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are different patterns for percpu variable access using a constant value added to the base.  2aeb:  mov    -0x7da0f7e0(,%rax,8),%r14 # r14 = __per_cpu_offset[cpu]  2af3:  mov    $0x34740,%rax # rax = address of runqueues * 2afa:  add    %rax,%r14 # r14 = &per_cpu(runqueues, cpu)  2bfd:  cmpl   $0x0,0x10(%r14) # cpu_rq(cpu)->has_blocked_load  2b03:  je     0x2b36 At the first instruction, r14 has the __per_cpu_offset. And then rax has an immediate value and then added to r14 to calculate the address of a per-cpu variable. So it needs to track the immediate values and ADD instructions. Similar but a little different case is to use "this_cpu_off" instead of "__per_cpu_offset" for the current CPU. This time the variable address comes with PC-rel addressing. 89: mov $0x34740,%rax # rax = address of runqueues * 90: add %gs:0x7f015f60(%rip),%rax # 19a78 98: incl 0xd8c(%rax) # cpu_rq(cpu)->sched_count Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-21-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 107 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 83b5aa00f01c..bd10a576cfbf 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -28,6 +28,8 @@ enum type_state_kind { TSR_KIND_INVALID = 0, TSR_KIND_TYPE, TSR_KIND_PERCPU_BASE, + TSR_KIND_CONST, + TSR_KIND_POINTER, }; #define pr_debug_dtp(fmt, ...) \ @@ -53,6 +55,13 @@ static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) case TSR_KIND_PERCPU_BASE: pr_info(" percpu base\n"); return; + case TSR_KIND_CONST: + pr_info(" constant\n"); + return; + case TSR_KIND_POINTER: + pr_info(" pointer"); + /* it also prints the type info */ + break; case TSR_KIND_TYPE: default: break; @@ -393,7 +402,7 @@ static void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, switch (tag) { case DW_TAG_structure_type: case DW_TAG_union_type: - stack->compound = true; + stack->compound = (kind != TSR_KIND_POINTER); break; default: stack->compound = false; @@ -585,6 +594,58 @@ static void update_insn_state_x86(struct type_state *state, return; } + if (!strncmp(dl->ins.name, "add", 3)) { + u64 imm_value = -1ULL; + int offset; + const char *var_name = NULL; + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + + if (src->imm) + imm_value = src->offset; + else if (has_reg_type(state, src->reg1) && + state->regs[src->reg1].kind == TSR_KIND_CONST) + imm_value = state->regs[src->reg1].imm_value; + else if (src->reg1 == DWARF_REG_PC) { + u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, + src->offset, dl); + + if (get_global_var_info(dloc, var_addr, + &var_name, &offset) && + !strcmp(var_name, "this_cpu_off") && + tsr->kind == TSR_KIND_CONST) { + tsr->kind = TSR_KIND_PERCPU_BASE; + imm_value = tsr->imm_value; + } + } + else + return; + + if (tsr->kind != TSR_KIND_PERCPU_BASE) + return; + + if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, + &type_die) && offset == 0) { + /* + * This is not a pointer type, but it should be treated + * as a pointer. + */ + tsr->type = type_die; + tsr->kind = TSR_KIND_POINTER; + tsr->ok = true; + + pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", + insn_offset, imm_value, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + return; + } + if (strncmp(dl->ins.name, "mov", 3)) return; @@ -632,6 +693,16 @@ static void update_insn_state_x86(struct type_state *state, return; } + if (src->imm) { + tsr->kind = TSR_KIND_CONST; + tsr->imm_value = src->offset; + tsr->ok = true; + + pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", + insn_offset, tsr->imm_value, dst->reg1); + return; + } + if (!has_reg_type(state, src->reg1) || !state->regs[src->reg1].ok) { tsr->ok = false; @@ -739,13 +810,26 @@ retry: tsr->kind = TSR_KIND_TYPE; tsr->ok = true; - pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d type=", + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", insn_offset, src->offset, sreg, dst->reg1); pr_debug_type_name(&tsr->type, tsr->kind); } else { tsr->ok = false; } } + /* And then dereference the calculated pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_POINTER && + die_get_member_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } /* Or try another register if any */ else if (src->multi_regs && sreg == src->reg1 && src->reg1 != src->reg2) { @@ -999,6 +1083,25 @@ static bool check_matching_type(struct type_state *state, return false; } + if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_POINTER) { + pr_debug_dtp(" percpu ptr\n"); + + /* + * It's actaully pointer but the address was calculated using + * some arithmetic. So it points to the actual type already. + */ + *type_die = state->regs[reg].type; + + dloc->type_offset = dloc->op->offset; + + /* Get the size of the actual type */ + if (dwarf_aggregate_size(type_die, &size) < 0 || + (unsigned)dloc->type_offset >= size) + return false; + + return true; + } + if (map__dso(dloc->ms->map)->kernel && arch__is(dloc->arch, "x86")) { u64 addr; int offset; -- cgit v1.2.3 From b3c95109c131fcc959d2473e7c384d8cc62d23d0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:13 -0700 Subject: perf annotate-data: Add stack canary type When the stack protector is enabled, compiler would generate code to check stack overflow with a special value called 'stack carary' at runtime. On x86_64, GCC hard-codes the stack canary as %gs:40. While there's a definition of fixed_percpu_data in asm/processor.h, it seems that the header is not included everywhere and many places it cannot find the type info. As it's in the well-known location (at %gs:40), let's add a pseudo stack canary type to handle it specially. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-22-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 46 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/annotate-data.h | 1 + tools/perf/util/annotate.c | 25 ++++++++++++++++++++++ 3 files changed, 72 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index bd10a576cfbf..633fe125fcd8 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -30,6 +30,7 @@ enum type_state_kind { TSR_KIND_PERCPU_BASE, TSR_KIND_CONST, TSR_KIND_POINTER, + TSR_KIND_CANARY, }; #define pr_debug_dtp(fmt, ...) \ @@ -62,6 +63,9 @@ static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) pr_info(" pointer"); /* it also prints the type info */ break; + case TSR_KIND_CANARY: + pr_info(" stack canary\n"); + return; case TSR_KIND_TYPE: default: break; @@ -676,6 +680,15 @@ static void update_insn_state_x86(struct type_state *state, */ var_addr = src->offset; + if (var_addr == 40) { + tsr->kind = TSR_KIND_CANARY; + tsr->ok = true; + + pr_debug_dtp("mov [%x] stack canary -> reg%d\n", + insn_offset, dst->reg1); + return; + } + if (!get_global_var_type(cu_die, dloc, ip, var_addr, &offset, &type_die) || !die_get_member_type(&type_die, offset, &type_die)) { @@ -991,6 +1004,16 @@ static void delete_var_types(struct die_var_type *var_types) } } +/* should match to is_stack_canary() in util/annotate.c */ +static void setup_stack_canary(struct data_loc_info *dloc) +{ + if (arch__is(dloc->arch, "x86")) { + dloc->op->segment = INSN_SEG_X86_GS; + dloc->op->imm = true; + dloc->op->offset = 40; + } +} + /* It's at the target address, check if it has a matching type */ static bool check_matching_type(struct type_state *state, struct data_loc_info *dloc, int reg, @@ -1038,6 +1061,11 @@ static bool check_matching_type(struct type_state *state, if (stack == NULL) return false; + if (stack->kind == TSR_KIND_CANARY) { + setup_stack_canary(dloc); + return false; + } + *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= stack->offset; @@ -1062,6 +1090,11 @@ static bool check_matching_type(struct type_state *state, if (stack == NULL) return false; + if (stack->kind == TSR_KIND_CANARY) { + setup_stack_canary(dloc); + return false; + } + *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= fboff + stack->offset; @@ -1102,6 +1135,19 @@ static bool check_matching_type(struct type_state *state, return true; } + if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_CANARY) { + pr_debug_dtp(" stack canary\n"); + + /* + * This is a saved value of the stack canary which will be handled + * in the outer logic when it returns failure here. Pretend it's + * from the stack canary directly. + */ + setup_stack_canary(dloc); + + return false; + } + if (map__dso(dloc->ms->map)->kernel && arch__is(dloc->arch, "x86")) { u64 addr; int offset; diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index ae0f87aed804..1b5a152163b5 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -73,6 +73,7 @@ struct annotated_data_type { extern struct annotated_data_type unknown_type; extern struct annotated_data_type stackop_type; +extern struct annotated_data_type canary_type; /** * struct data_loc_info - Data location information diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index e4121acb4f88..64e54ff1aa1d 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -118,6 +118,13 @@ struct annotated_data_type stackop_type = { }, }; +struct annotated_data_type canary_type = { + .self = { + .type_name = (char *)"(stack canary)", + .children = LIST_HEAD_INIT(canary_type.self.children), + }, +}; + static int arch__grow_instructions(struct arch *arch) { struct ins *new_instructions; @@ -3803,6 +3810,18 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) return false; } +static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) +{ + /* On x86_64, %gs:40 is used for stack canary */ + if (arch__is(arch, "x86")) { + if (loc->segment == INSN_SEG_X86_GS && loc->imm && + loc->offset == 40) + return true; + } + + return false; +} + u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, struct disasm_line *dl) { @@ -3929,6 +3948,12 @@ retry: } mem_type = find_data_type(&dloc); + + if (mem_type == NULL && is_stack_canary(arch, op_loc)) { + mem_type = &canary_type; + dloc.type_offset = 0; + } + if (mem_type) istat->good++; else -- cgit v1.2.3 From 55ee3d005d62279d3951a26d5c211a4d9aebc222 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:14 -0700 Subject: perf annotate-data: Add a cache for global variable types They are often searched by many different places. Let's add a cache for them to reduce the duplicate DWARF access. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-23-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 107 ++++++++++++++++++++++++++++++++++++++-- tools/perf/util/annotate-data.h | 7 +++ tools/perf/util/dso.c | 2 + tools/perf/util/dso.h | 6 ++- 4 files changed, 118 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 633fe125fcd8..969e2f82079c 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -433,6 +433,91 @@ static struct type_state_stack *findnew_stack_state(struct type_state *state, return stack; } +/* Maintain a cache for quick global variable lookup */ +struct global_var_entry { + struct rb_node node; + char *name; + u64 start; + u64 end; + u64 die_offset; +}; + +static int global_var_cmp(const void *_key, const struct rb_node *node) +{ + const u64 addr = (uintptr_t)_key; + struct global_var_entry *gvar; + + gvar = rb_entry(node, struct global_var_entry, node); + + if (gvar->start <= addr && addr < gvar->end) + return 0; + return gvar->start > addr ? -1 : 1; +} + +static bool global_var_less(struct rb_node *node_a, const struct rb_node *node_b) +{ + struct global_var_entry *gvar_a, *gvar_b; + + gvar_a = rb_entry(node_a, struct global_var_entry, node); + gvar_b = rb_entry(node_b, struct global_var_entry, node); + + return gvar_a->start < gvar_b->start; +} + +static struct global_var_entry *global_var__find(struct data_loc_info *dloc, u64 addr) +{ + struct dso *dso = map__dso(dloc->ms->map); + struct rb_node *node; + + node = rb_find((void *)(uintptr_t)addr, &dso->global_vars, global_var_cmp); + if (node == NULL) + return NULL; + + return rb_entry(node, struct global_var_entry, node); +} + +static bool global_var__add(struct data_loc_info *dloc, u64 addr, + const char *name, Dwarf_Die *type_die) +{ + struct dso *dso = map__dso(dloc->ms->map); + struct global_var_entry *gvar; + Dwarf_Word size; + + if (dwarf_aggregate_size(type_die, &size) < 0) + return false; + + gvar = malloc(sizeof(*gvar)); + if (gvar == NULL) + return false; + + gvar->name = strdup(name); + if (gvar->name == NULL) { + free(gvar); + return false; + } + + gvar->start = addr; + gvar->end = addr + size; + gvar->die_offset = dwarf_dieoffset(type_die); + + rb_add(&gvar->node, &dso->global_vars, global_var_less); + return true; +} + +void global_var_type__tree_delete(struct rb_root *root) +{ + struct global_var_entry *gvar; + + while (!RB_EMPTY_ROOT(root)) { + struct rb_node *node = rb_first(root); + + rb_erase(node, root); + gvar = rb_entry(node, struct global_var_entry, node); + free(gvar->name); + free(gvar); + } +} + static bool get_global_var_info(struct data_loc_info *dloc, u64 addr, const char **var_name, int *var_offset) { @@ -467,14 +552,25 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, u64 pc; int offset; bool is_pointer = false; - const char *var_name; + const char *var_name = NULL; + struct global_var_entry *gvar; Dwarf_Die var_die; + gvar = global_var__find(dloc, var_addr); + if (gvar) { + if (!dwarf_offdie(dloc->di->dbg, gvar->die_offset, type_die)) + return false; + + *var_offset = var_addr - gvar->start; + return true; + } + /* Try to get the variable by address first */ if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) && check_variable(&var_die, type_die, offset, is_pointer) == 0) { + var_name = dwarf_diename(&var_die); *var_offset = offset; - return true; + goto ok; } if (!get_global_var_info(dloc, var_addr, &var_name, var_offset)) @@ -485,9 +581,14 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, /* Try to get the name of global variable */ if (die_find_variable_at(cu_die, var_name, pc, &var_die) && check_variable(&var_die, type_die, *var_offset, is_pointer) == 0) - return true; + goto ok; return false; + +ok: + /* The address should point to the start of the variable */ + global_var__add(dloc, var_addr - *var_offset, var_name, type_die); + return true; } /** diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 1b5a152163b5..fe1e53d6e8c7 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -153,6 +153,9 @@ int annotated_data_type__update_samples(struct annotated_data_type *adt, /* Release all data type information in the tree */ void annotated_data_type__tree_delete(struct rb_root *root); +/* Release all global variable information in the tree */ +void global_var_type__tree_delete(struct rb_root *root); + #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * @@ -175,6 +178,10 @@ static inline void annotated_data_type__tree_delete(struct rb_root *root __maybe { } +static inline void global_var_type__tree_delete(struct rb_root *root __maybe_unused) +{ +} + #endif /* HAVE_DWARF_SUPPORT */ #endif /* _PERF_ANNOTATE_DATA_H */ diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 22fd5fa806ed..6e2a7198b382 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1329,6 +1329,7 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) dso->inlined_nodes = RB_ROOT_CACHED; dso->srclines = RB_ROOT_CACHED; dso->data_types = RB_ROOT; + dso->global_vars = RB_ROOT; dso->data.fd = -1; dso->data.status = DSO_DATA_STATUS_UNKNOWN; dso->symtab_type = DSO_BINARY_TYPE__NOT_FOUND; @@ -1373,6 +1374,7 @@ void dso__delete(struct dso *dso) dso->symbol_names_len = 0; zfree(&dso->symbol_names); annotated_data_type__tree_delete(&dso->data_types); + global_var_type__tree_delete(&dso->global_vars); if (dso->short_name_allocated) { zfree((char **)&dso->short_name); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index ce9f3849a773..2cdcd1e2ef8b 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -154,7 +154,8 @@ struct dso { size_t symbol_names_len; struct rb_root_cached inlined_nodes; struct rb_root_cached srclines; - struct rb_root data_types; + struct rb_root data_types; + struct rb_root global_vars; struct { u64 addr; @@ -411,4 +412,7 @@ int dso__strerror_load(struct dso *dso, char *buf, size_t buflen); void reset_fd_limit(void); +u64 dso__find_global_type(struct dso *dso, u64 addr); +u64 dso__findnew_global_type(struct dso *dso, u64 addr, u64 offset); + #endif /* __PERF_DSO */ -- cgit v1.2.3 From bd62de08084c24a4ff1b1875d53cc4cc1ea2312d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2024 22:51:15 -0700 Subject: perf annotate-data: Do not retry for invalid types In some cases, it was able to find a type or location info (for per-cpu variable) but cannot match because of invalid offset or missing global information. In those cases, it's meaningless to go to the outer scope and retry because there will be no additional information. Let's change the return type of find_matching_type() and bail out if it returns -1 for the cases. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240319055115.4063940-24-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 83 ++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 969e2f82079c..043d80791bd0 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1115,10 +1115,15 @@ static void setup_stack_canary(struct data_loc_info *dloc) } } -/* It's at the target address, check if it has a matching type */ -static bool check_matching_type(struct type_state *state, - struct data_loc_info *dloc, int reg, - Dwarf_Die *cu_die, Dwarf_Die *type_die) +/* + * It's at the target address, check if it has a matching type. + * It returns 1 if found, 0 if not or -1 if not found but no need to + * repeat the search. The last case is for per-cpu variables which + * are similar to global variables and no additional info is needed. + */ +static int check_matching_type(struct type_state *state, + struct data_loc_info *dloc, int reg, + Dwarf_Die *cu_die, Dwarf_Die *type_die) { Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; @@ -1137,20 +1142,20 @@ static bool check_matching_type(struct type_state *state, * dereference a memory location. */ if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) - return false; + return -1; /* Remove the pointer and get the target type */ if (die_get_real_type(&state->regs[reg].type, type_die) == NULL) - return false; + return -1; dloc->type_offset = dloc->op->offset; /* Get the size of the actual type */ if (dwarf_aggregate_size(type_die, &size) < 0 || (unsigned)dloc->type_offset >= size) - return false; + return -1; - return true; + return 1; } if (reg == dloc->fbreg) { @@ -1160,18 +1165,18 @@ static bool check_matching_type(struct type_state *state, stack = find_stack_state(state, dloc->type_offset); if (stack == NULL) - return false; + return 0; if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); - return false; + return -1; } *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= stack->offset; - return true; + return 1; } if (dloc->fb_cfa) { @@ -1185,22 +1190,22 @@ static bool check_matching_type(struct type_state *state, fbreg = -1; if (reg != fbreg) - return false; + return 0; stack = find_stack_state(state, dloc->type_offset - fboff); if (stack == NULL) - return false; + return 0; if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); - return false; + return -1; } *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= fboff + stack->offset; - return true; + return 1; } if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { @@ -1212,9 +1217,10 @@ static bool check_matching_type(struct type_state *state, if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, &var_offset, type_die)) { dloc->type_offset = var_offset; - return true; + return 1; } - return false; + /* No need to retry per-cpu (global) variables */ + return -1; } if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_POINTER) { @@ -1231,9 +1237,9 @@ static bool check_matching_type(struct type_state *state, /* Get the size of the actual type */ if (dwarf_aggregate_size(type_die, &size) < 0 || (unsigned)dloc->type_offset >= size) - return false; + return -1; - return true; + return 1; } if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_CANARY) { @@ -1246,7 +1252,7 @@ static bool check_matching_type(struct type_state *state, */ setup_stack_canary(dloc); - return false; + return -1; } if (map__dso(dloc->ms->map)->kernel && arch__is(dloc->arch, "x86")) { @@ -1262,9 +1268,9 @@ static bool check_matching_type(struct type_state *state, if (get_global_var_type(cu_die, dloc, dloc->ip, addr, &offset, type_die)) { dloc->type_offset = offset; - return true; + return 1; } - return false; + return -1; } /* Access to per-cpu base like "-0x7dcf0500(,%rdx,8)" */ @@ -1280,26 +1286,28 @@ static bool check_matching_type(struct type_state *state, pr_debug_dtp(" percpu base\n"); dloc->type_offset = offset; - return true; + return 1; } + pr_debug_dtp(" negative offset\n"); + return -1; } } pr_debug_dtp("\n"); - return false; + return 0; } /* Iterate instructions in basic blocks and update type table */ -static bool find_data_type_insn(struct data_loc_info *dloc, int reg, - struct list_head *basic_blocks, - struct die_var_type *var_types, - Dwarf_Die *cu_die, Dwarf_Die *type_die) +static int find_data_type_insn(struct data_loc_info *dloc, int reg, + struct list_head *basic_blocks, + struct die_var_type *var_types, + Dwarf_Die *cu_die, Dwarf_Die *type_die) { struct type_state state; struct symbol *sym = dloc->ms->sym; struct annotation *notes = symbol__annotation(sym); struct annotated_basic_block *bb; - bool found = false; + int ret = 0; init_type_state(&state, dloc->arch); @@ -1317,8 +1325,8 @@ static bool find_data_type_insn(struct data_loc_info *dloc, int reg, update_var_state(&state, dloc, addr, dl->al.offset, var_types); if (this_ip == dloc->ip) { - found = check_matching_type(&state, dloc, reg, - cu_die, type_die); + ret = check_matching_type(&state, dloc, reg, + cu_die, type_die); goto out; } @@ -1331,7 +1339,7 @@ static bool find_data_type_insn(struct data_loc_info *dloc, int reg, out: exit_type_state(&state); - return found; + return ret; } /* @@ -1355,6 +1363,7 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg, for (int i = nr_scopes - 1; i >= 0; i--) { Dwarf_Addr base, start, end; LIST_HEAD(this_blocks); + int found; if (dwarf_ranges(&scopes[i], 0, &base, &start, &end) < 0) break; @@ -1385,15 +1394,19 @@ again: fixup_var_address(var_types, start); /* Find from start of this scope to the target instruction */ - if (find_data_type_insn(dloc, reg, &basic_blocks, var_types, - cu_die, type_die)) { - ret = 0; + found = find_data_type_insn(dloc, reg, &basic_blocks, var_types, + cu_die, type_die); + if (found > 0) { pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x", dloc->op->offset, reg, dloc->type_offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; break; } + if (found < 0) + break; + /* Go up to the next scope and find blocks to the start */ prev_dst_ip = dst_ip; dst_ip = src_ip; -- cgit v1.2.3 From 2316ef589181b5b2fd6cbced24439ba4ac096bd8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Mar 2024 14:49:32 -0300 Subject: perf beauty: Introduce scrape script for 'clone' syscall 'flags' argument It was using the first variation on producing a string representation for a binary flag, one that used the copy of uapi/linux/sched.h with preprocessor tricks that had to be updated everytime a new flag was introduced. Use the more recent scrape script + strarray + strarray__scnprintf_flags() combo. $ tools/perf/trace/beauty/clone.sh | head -5 static const char *clone_flags[] = { [ilog2(0x00000100) + 1] = "VM", [ilog2(0x00000200) + 1] = "FS", [ilog2(0x00000400) + 1] = "FILES", [ilog2(0x00000800) + 1] = "SIGHAND", $ Now we can move uapi/linux/sched.h from tools/include/, that is used for building perf to the scrape only directory tools/perf/trace/beauty/include. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZfnULIn3XKDq0bpc@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/sched.h | 148 --------------------- tools/perf/Makefile.perf | 14 +- tools/perf/check-headers.sh | 2 +- tools/perf/trace/beauty/clone.c | 46 +------ tools/perf/trace/beauty/clone.sh | 17 +++ tools/perf/trace/beauty/include/uapi/linux/sched.h | 148 +++++++++++++++++++++ 6 files changed, 182 insertions(+), 193 deletions(-) delete mode 100644 tools/include/uapi/linux/sched.h create mode 100755 tools/perf/trace/beauty/clone.sh create mode 100644 tools/perf/trace/beauty/include/uapi/linux/sched.h (limited to 'tools') diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h deleted file mode 100644 index 3bac0a8ceab2..000000000000 --- a/tools/include/uapi/linux/sched.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_SCHED_H -#define _UAPI_LINUX_SCHED_H - -#include - -/* - * cloning flags: - */ -#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ -#define CLONE_VM 0x00000100 /* set if VM shared between processes */ -#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ -#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ -#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ -#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */ -#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ -#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ -#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ -#define CLONE_THREAD 0x00010000 /* Same thread group? */ -#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ -#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ -#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ -#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ -#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ -#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ -#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ -#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ -#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ -#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ -#define CLONE_NEWUSER 0x10000000 /* New user namespace */ -#define CLONE_NEWPID 0x20000000 /* New pid namespace */ -#define CLONE_NEWNET 0x40000000 /* New network namespace */ -#define CLONE_IO 0x80000000 /* Clone io context */ - -/* Flags for the clone3() syscall. */ -#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ -#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ - -/* - * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 - * syscalls only: - */ -#define CLONE_NEWTIME 0x00000080 /* New time namespace */ - -#ifndef __ASSEMBLY__ -/** - * struct clone_args - arguments for the clone3 syscall - * @flags: Flags for the new process as listed above. - * All flags are valid except for CSIGNAL and - * CLONE_DETACHED. - * @pidfd: If CLONE_PIDFD is set, a pidfd will be - * returned in this argument. - * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the - * child process will be returned in the child's - * memory. - * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of - * the child process will be returned in the - * parent's memory. - * @exit_signal: The exit_signal the parent process will be - * sent when the child exits. - * @stack: Specify the location of the stack for the - * child process. - * Note, @stack is expected to point to the - * lowest address. The stack direction will be - * determined by the kernel and set up - * appropriately based on @stack_size. - * @stack_size: The size of the stack for the child process. - * @tls: If CLONE_SETTLS is set, the tls descriptor - * is set to tls. - * @set_tid: Pointer to an array of type *pid_t. The size - * of the array is defined using @set_tid_size. - * This array is used to select PIDs/TIDs for - * newly created processes. The first element in - * this defines the PID in the most nested PID - * namespace. Each additional element in the array - * defines the PID in the parent PID namespace of - * the original PID namespace. If the array has - * less entries than the number of currently - * nested PID namespaces only the PIDs in the - * corresponding namespaces are set. - * @set_tid_size: This defines the size of the array referenced - * in @set_tid. This cannot be larger than the - * kernel's limit of nested PID namespaces. - * @cgroup: If CLONE_INTO_CGROUP is specified set this to - * a file descriptor for the cgroup. - * - * The structure is versioned by size and thus extensible. - * New struct members must go at the end of the struct and - * must be properly 64bit aligned. - */ -struct clone_args { - __aligned_u64 flags; - __aligned_u64 pidfd; - __aligned_u64 child_tid; - __aligned_u64 parent_tid; - __aligned_u64 exit_signal; - __aligned_u64 stack; - __aligned_u64 stack_size; - __aligned_u64 tls; - __aligned_u64 set_tid; - __aligned_u64 set_tid_size; - __aligned_u64 cgroup; -}; -#endif - -#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ - -/* - * Scheduling policies - */ -#define SCHED_NORMAL 0 -#define SCHED_FIFO 1 -#define SCHED_RR 2 -#define SCHED_BATCH 3 -/* SCHED_ISO: reserved but not implemented yet */ -#define SCHED_IDLE 5 -#define SCHED_DEADLINE 6 - -/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ -#define SCHED_RESET_ON_FORK 0x40000000 - -/* - * For the sched_{set,get}attr() calls - */ -#define SCHED_FLAG_RESET_ON_FORK 0x01 -#define SCHED_FLAG_RECLAIM 0x02 -#define SCHED_FLAG_DL_OVERRUN 0x04 -#define SCHED_FLAG_KEEP_POLICY 0x08 -#define SCHED_FLAG_KEEP_PARAMS 0x10 -#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 -#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 - -#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ - SCHED_FLAG_KEEP_PARAMS) - -#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ - SCHED_FLAG_UTIL_CLAMP_MAX) - -#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ - SCHED_FLAG_RECLAIM | \ - SCHED_FLAG_DL_OVERRUN | \ - SCHED_FLAG_KEEP_ALL | \ - SCHED_FLAG_UTIL_CLAMP) - -#endif /* _UAPI_LINUX_SCHED_H */ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index f5654d06e313..ccd2dcbc64f7 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -485,13 +485,20 @@ x86_arch_asm_dir := $(srctree)/tools/arch/x86/include/asm/ beauty_outdir := $(OUTPUT)trace/beauty/generated beauty_ioctl_outdir := $(beauty_outdir)/ioctl -drm_ioctl_array := $(beauty_ioctl_outdir)/drm_ioctl_array.c -drm_hdr_dir := $(srctree)/tools/include/uapi/drm -drm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/drm_ioctl.sh # Create output directory if not already present $(shell [ -d '$(beauty_ioctl_outdir)' ] || mkdir -p '$(beauty_ioctl_outdir)') +clone_flags_array := $(beauty_outdir)/clone_flags_array.c +clone_flags_tbl := $(srctree)/tools/perf/trace/beauty/clone.sh + +$(clone_flags_array): $(beauty_uapi_linux_dir)/sched.h $(clone_flags_tbl) + $(Q)$(SHELL) '$(clone_flags_tbl)' $(beauty_uapi_linux_dir) > $@ + +drm_ioctl_array := $(beauty_ioctl_outdir)/drm_ioctl_array.c +drm_hdr_dir := $(srctree)/tools/include/uapi/drm +drm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/drm_ioctl.sh + $(drm_ioctl_array): $(drm_hdr_dir)/drm.h $(drm_hdr_dir)/i915_drm.h $(drm_ioctl_tbl) $(Q)$(SHELL) '$(drm_ioctl_tbl)' $(drm_hdr_dir) > $@ @@ -765,6 +772,7 @@ build-dir = $(or $(__build-dir),.) prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \ arm64-sysreg-defs \ + $(clone_flags_array) \ $(drm_ioctl_array) \ $(fadvise_advice_array) \ $(fsconfig_arrays) \ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 859cd6f35b0a..413c9b747216 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -15,7 +15,6 @@ FILES=( "include/uapi/linux/kvm.h" "include/uapi/linux/in.h" "include/uapi/linux/perf_event.h" - "include/uapi/linux/sched.h" "include/uapi/linux/seccomp.h" "include/uapi/linux/vhost.h" "include/linux/bits.h" @@ -93,6 +92,7 @@ BEAUTY_FILES=( "include/uapi/linux/fs.h" "include/uapi/linux/mount.h" "include/uapi/linux/prctl.h" + "include/uapi/linux/sched.h" "include/uapi/linux/usbdevice_fs.h" "include/uapi/sound/asound.h" ) diff --git a/tools/perf/trace/beauty/clone.c b/tools/perf/trace/beauty/clone.c index f4db894e0af6..c9fa8f7e82b9 100644 --- a/tools/perf/trace/beauty/clone.c +++ b/tools/perf/trace/beauty/clone.c @@ -7,52 +7,16 @@ #include "trace/beauty/beauty.h" #include +#include #include -#include +#include static size_t clone__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) { - const char *prefix = "CLONE_"; - int printed = 0; +#include "trace/beauty/generated/clone_flags_array.c" + static DEFINE_STRARRAY(clone_flags, "CLONE_"); -#define P_FLAG(n) \ - if (flags & CLONE_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \ - flags &= ~CLONE_##n; \ - } - - P_FLAG(VM); - P_FLAG(FS); - P_FLAG(FILES); - P_FLAG(SIGHAND); - P_FLAG(PIDFD); - P_FLAG(PTRACE); - P_FLAG(VFORK); - P_FLAG(PARENT); - P_FLAG(THREAD); - P_FLAG(NEWNS); - P_FLAG(SYSVSEM); - P_FLAG(SETTLS); - P_FLAG(PARENT_SETTID); - P_FLAG(CHILD_CLEARTID); - P_FLAG(DETACHED); - P_FLAG(UNTRACED); - P_FLAG(CHILD_SETTID); - P_FLAG(NEWCGROUP); - P_FLAG(NEWUTS); - P_FLAG(NEWIPC); - P_FLAG(NEWUSER); - P_FLAG(NEWPID); - P_FLAG(NEWNET); - P_FLAG(IO); - P_FLAG(CLEAR_SIGHAND); - P_FLAG(INTO_CGROUP); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; + return strarray__scnprintf_flags(&strarray__clone_flags, bf, size, show_prefix, flags); } size_t syscall_arg__scnprintf_clone_flags(char *bf, size_t size, struct syscall_arg *arg) diff --git a/tools/perf/trace/beauty/clone.sh b/tools/perf/trace/beauty/clone.sh new file mode 100755 index 000000000000..18b6c0d75693 --- /dev/null +++ b/tools/perf/trace/beauty/clone.sh @@ -0,0 +1,17 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 1 ] ; then + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ +else + beauty_uapi_linux_dir=$1 +fi + +linux_sched=${beauty_uapi_linux_dir}/sched.h + +printf "static const char *clone_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+CLONE_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +grep -E $regex ${linux_sched} | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n" +printf "};\n" diff --git a/tools/perf/trace/beauty/include/uapi/linux/sched.h b/tools/perf/trace/beauty/include/uapi/linux/sched.h new file mode 100644 index 000000000000..3bac0a8ceab2 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/sched.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_SCHED_H +#define _UAPI_LINUX_SCHED_H + +#include + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ +#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#define CLONE_NEWNET 0x40000000 /* New network namespace */ +#define CLONE_IO 0x80000000 /* Clone io context */ + +/* Flags for the clone3() syscall. */ +#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ + +/* + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 + * syscalls only: + */ +#define CLONE_NEWTIME 0x00000080 /* New time namespace */ + +#ifndef __ASSEMBLY__ +/** + * struct clone_args - arguments for the clone3 syscall + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * Note, @stack is expected to point to the + * lowest address. The stack direction will be + * determined by the kernel and set up + * appropriately based on @stack_size. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * @set_tid: Pointer to an array of type *pid_t. The size + * of the array is defined using @set_tid_size. + * This array is used to select PIDs/TIDs for + * newly created processes. The first element in + * this defines the PID in the most nested PID + * namespace. Each additional element in the array + * defines the PID in the parent PID namespace of + * the original PID namespace. If the array has + * less entries than the number of currently + * nested PID namespaces only the PIDs in the + * corresponding namespaces are set. + * @set_tid_size: This defines the size of the array referenced + * in @set_tid. This cannot be larger than the + * kernel's limit of nested PID namespaces. + * @cgroup: If CLONE_INTO_CGROUP is specified set this to + * a file descriptor for the cgroup. + * + * The structure is versioned by size and thus extensible. + * New struct members must go at the end of the struct and + * must be properly 64bit aligned. + */ +struct clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; + __aligned_u64 cgroup; +}; +#endif + +#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ + +/* + * Scheduling policies + */ +#define SCHED_NORMAL 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 +#define SCHED_BATCH 3 +/* SCHED_ISO: reserved but not implemented yet */ +#define SCHED_IDLE 5 +#define SCHED_DEADLINE 6 + +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 + +/* + * For the sched_{set,get}attr() calls + */ +#define SCHED_FLAG_RESET_ON_FORK 0x01 +#define SCHED_FLAG_RECLAIM 0x02 +#define SCHED_FLAG_DL_OVERRUN 0x04 +#define SCHED_FLAG_KEEP_POLICY 0x08 +#define SCHED_FLAG_KEEP_PARAMS 0x10 +#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 +#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 + +#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) + +#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ + SCHED_FLAG_UTIL_CLAMP_MAX) + +#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ + SCHED_FLAG_UTIL_CLAMP) + +#endif /* _UAPI_LINUX_SCHED_H */ -- cgit v1.2.3 From 525615ef6df476d5fe277d311c645b4c179d56db Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Mar 2024 16:19:10 -0800 Subject: perf list: Add tracepoint encoding to detailed output The tracepoint id holds the config value and is probed in determining what an event is. Add reading of the id so that we can display the event encoding as: $ perf list --details ... alarmtimer:alarmtimer_cancel [Tracepoint event] tracepoint/config=0x18c/ ... Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20240308001915.4060155-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print-events.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 7b54e9385442..e0d2b49bab66 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -92,34 +93,48 @@ void print_tracepoint_events(const struct print_callbacks *print_cb __maybe_unus evt_items = scandirat(events_fd, sys_dirent->d_name, &evt_namelist, NULL, alphasort); for (int j = 0; j < evt_items; j++) { + /* + * Buffer sized at twice the max filename length + 1 + * separator + 1 \0 terminator. + */ + char buf[NAME_MAX * 2 + 2]; + /* 16 possible hex digits and 22 other characters and \0. */ + char encoding[16 + 22]; struct dirent *evt_dirent = evt_namelist[j]; - char evt_path[MAXPATHLEN]; - int evt_fd; + struct io id; + __u64 config; if (evt_dirent->d_type != DT_DIR || !strcmp(evt_dirent->d_name, ".") || !strcmp(evt_dirent->d_name, "..")) goto next_evt; - snprintf(evt_path, sizeof(evt_path), "%s/id", evt_dirent->d_name); - evt_fd = openat(dir_fd, evt_path, O_RDONLY); - if (evt_fd < 0) + snprintf(buf, sizeof(buf), "%s/id", evt_dirent->d_name); + io__init(&id, openat(dir_fd, buf, O_RDONLY), buf, sizeof(buf)); + + if (id.fd < 0) + goto next_evt; + + if (io__get_dec(&id, &config) < 0) { + close(id.fd); goto next_evt; - close(evt_fd); + } + close(id.fd); - snprintf(evt_path, MAXPATHLEN, "%s:%s", + snprintf(buf, sizeof(buf), "%s:%s", sys_dirent->d_name, evt_dirent->d_name); + snprintf(encoding, sizeof(encoding), "tracepoint/config=0x%llx/", config); print_cb->print_event(print_state, /*topic=*/NULL, - /*pmu_name=*/NULL, - evt_path, + /*pmu_name=*/NULL, /* really "tracepoint" */ + /*event_name=*/buf, /*event_alias=*/NULL, /*scale_unit=*/NULL, /*deprecated=*/false, "Tracepoint event", /*desc=*/NULL, /*long_desc=*/NULL, - /*encoding_desc=*/NULL); + encoding); next_evt: free(evt_namelist[j]); } -- cgit v1.2.3 From 39aa4ff61fd310704d04063fc3f2842011e03b4d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Mar 2024 16:19:11 -0800 Subject: perf pmu: Drop "default_core" from alias names "default_core" is used by jevents.py for json events' PMU name when none is specified. On x86 the "default_core" is typically the PMU "cpu". When creating an alias see if the event's PMU name is "default_core" in which case don't record it. This means in places like "perf list" the PMU's name will be used in its place. Before: $ perf list --details ... cache: l1d.replacement [Counts the number of cache lines replaced in L1 data cache] default_core/event=0x51,period=0x186a3,umask=0x1/ ... After: $ perf list --details ... cache: l1d.replacement [Counts the number of cache lines replaced in L1 data cache. Unit: cpu] cpu/event=0x51,period=0x186a3,umask=0x1/ ... Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20240308001915.4060155-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index f39cbbc1a7ec..24be587e3537 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -518,7 +518,8 @@ static int perf_pmu__new_alias(struct perf_pmu *pmu, const char *name, unit = pe->unit; perpkg = pe->perpkg; deprecated = pe->deprecated; - pmu_name = pe->pmu; + if (pe->pmu && strcmp(pe->pmu, "default_core")) + pmu_name = pe->pmu; } alias = zalloc(sizeof(*alias)); -- cgit v1.2.3 From aa1f4ad287a70b07b441f608aed5a5ea066f18b6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Mar 2024 16:19:12 -0800 Subject: perf list: Allow wordwrap to wrap on commas A raw event encoding may be a block with terms separated by commas. If wrapping such a string it would be useful to break at the commas, so add this ability to wordwrap. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20240308001915.4060155-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 02bf608d585e..e0fe3d178d63 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -76,26 +77,38 @@ static void default_print_start(void *ps) static void default_print_end(void *print_state __maybe_unused) {} +static const char *skip_spaces_or_commas(const char *str) +{ + while (isspace(*str) || *str == ',') + ++str; + return str; +} + static void wordwrap(FILE *fp, const char *s, int start, int max, int corr) { int column = start; int n; bool saw_newline = false; + bool comma = false; while (*s) { - int wlen = strcspn(s, " \t\n"); + int wlen = strcspn(s, " ,\t\n"); + const char *sep = comma ? "," : " "; if ((column + wlen >= max && column > start) || saw_newline) { - fprintf(fp, "\n%*s", start, ""); + fprintf(fp, comma ? ",\n%*s" : "\n%*s", start, ""); column = start + corr; } - n = fprintf(fp, "%s%.*s", column > start ? " " : "", wlen, s); + if (column <= start) + sep = ""; + n = fprintf(fp, "%s%.*s", sep, wlen, s); if (n <= 0) break; saw_newline = s[wlen] == '\n'; s += wlen; + comma = s[0] == ','; column += n; - s = skip_spaces(s); + s = skip_spaces_or_commas(s); } } -- cgit v1.2.3 From 4ccf3bb703ed01a872de7d39db53f477d44ade0b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Mar 2024 16:19:13 -0800 Subject: perf list: Give more details about raw event encodings List all the PMUs, not just the first core one, and list real format specifiers with value ranges. Before: $ perf list ... rNNN [Raw hardware event descriptor] cpu/t1=v1[,t2=v2,t3 ...]/modifier [Raw hardware event descriptor] [(see 'man perf-list' on how to encode it)] mem:[/len][:access] [Hardware breakpoint] ... After: $ perf list ... rNNN [Raw event descriptor] cpu/event=0..255,pc,edge,.../modifier [Raw event descriptor] [(see 'man perf-list' or 'man perf-record' on how to encode it)] breakpoint//modifier [Raw event descriptor] cstate_core/event=0..0xffffffffffffffff/modifier [Raw event descriptor] cstate_pkg/event=0..0xffffffffffffffff/modifier [Raw event descriptor] i915/i915_eventid=0..0x1fffff/modifier [Raw event descriptor] intel_bts//modifier [Raw event descriptor] intel_pt/ptw,event,cyc_thresh=0..15,.../modifier [Raw event descriptor] kprobe/retprobe/modifier [Raw event descriptor] msr/event=0..0xffffffffffffffff/modifier [Raw event descriptor] power/event=0..255/modifier [Raw event descriptor] software//modifier [Raw event descriptor] tracepoint//modifier [Raw event descriptor] uncore_arb/event=0..255,edge,inv,.../modifier [Raw event descriptor] uncore_cbox/event=0..255,edge,inv,.../modifier [Raw event descriptor] uncore_clock/event=0..255/modifier [Raw event descriptor] uncore_imc_free_running/event=0..255,umask=0..255/modifier[Raw event descriptor] uprobe/ref_ctr_offset=0..0xffffffff,retprobe/modifier[Raw event descriptor] mem:[/len][:access] [Hardware breakpoint] ... With '--details' provide more details on the formats encoding: cpu/event=0..255,pc,edge,.../modifier [Raw event descriptor] [(see 'man perf-list' or 'man perf-record' on how to encode it)] cpu/event=0..255,pc,edge,offcore_rsp=0..0xffffffffffffffff,ldlat=0..0xffff,inv, umask=0..255,frontend=0..0xffffff,cmask=0..255,config=0..0xffffffffffffffff, config1=0..0xffffffffffffffff,config2=0..0xffffffffffffffff,config3=0..0xffffffffffffffff, name=string,period=number,freq=number,branch_type=(u|k|hv|any|...),time, call-graph=(fp|dwarf|lbr),stack-size=number,max-stack=number,nr=number,inherit,no-inherit, overwrite,no-overwrite,percore,aux-output,aux-sample-size=number/modifier breakpoint//modifier [Raw event descriptor] breakpoint//modifier cstate_core/event=0..0xffffffffffffffff/modifier [Raw event descriptor] cstate_core/event=0..0xffffffffffffffff/modifier cstate_pkg/event=0..0xffffffffffffffff/modifier [Raw event descriptor] cstate_pkg/event=0..0xffffffffffffffff/modifier i915/i915_eventid=0..0x1fffff/modifier [Raw event descriptor] i915/i915_eventid=0..0x1fffff/modifier intel_bts//modifier [Raw event descriptor] intel_bts//modifier intel_pt/ptw,event,cyc_thresh=0..15,.../modifier [Raw event descriptor] intel_pt/ptw,event,cyc_thresh=0..15,pt,notnt,branch,tsc,pwr_evt,fup_on_ptw,cyc,noretcomp, mtc,psb_period=0..15,mtc_period=0..15/modifier kprobe/retprobe/modifier [Raw event descriptor] kprobe/retprobe/modifier msr/event=0..0xffffffffffffffff/modifier [Raw event descriptor] msr/event=0..0xffffffffffffffff/modifier power/event=0..255/modifier [Raw event descriptor] power/event=0..255/modifier software//modifier [Raw event descriptor] software//modifier tracepoint//modifier [Raw event descriptor] tracepoint//modifier uncore_arb/event=0..255,edge,inv,.../modifier [Raw event descriptor] uncore_arb/event=0..255,edge,inv,umask=0..255,cmask=0..31/modifier uncore_cbox/event=0..255,edge,inv,.../modifier [Raw event descriptor] uncore_cbox/event=0..255,edge,inv,umask=0..255,cmask=0..31/modifier uncore_clock/event=0..255/modifier [Raw event descriptor] uncore_clock/event=0..255/modifier uncore_imc_free_running/event=0..255,umask=0..255/modifier[Raw event descriptor] uncore_imc_free_running/event=0..255,umask=0..255/modifier uprobe/ref_ctr_offset=0..0xffffffff,retprobe/modifier[Raw event descriptor] uprobe/ref_ctr_offset=0..0xffffffff,retprobe/modifier Committer notes: Address this build error in various distros: 55 58.44 ubuntu:24.04 : FAIL gcc version 13.2.0 (Ubuntu 13.2.0-17ubuntu2) util/pmu.c:1638:70: error: '_Static_assert' with no message is a C2x extension [-Werror,-Wc2x-extensions] 1638 | _Static_assert(ARRAY_SIZE(terms) == __PARSE_EVENTS__TERM_TYPE_NR - 6); | ^ | , "" 1 error generated. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20240308001915.4060155-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 62 +++++++++++++++++++++++++++- tools/perf/util/pmu.h | 3 ++ tools/perf/util/pmus.c | 94 ++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/pmus.h | 1 + tools/perf/util/print-events.c | 20 +-------- 5 files changed, 161 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 24be587e3537..81952c6cd3c6 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1603,6 +1603,62 @@ bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name) return false; } +int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb) +{ + static const char *const terms[] = { + "config=0..0xffffffffffffffff", + "config1=0..0xffffffffffffffff", + "config2=0..0xffffffffffffffff", + "config3=0..0xffffffffffffffff", + "name=string", + "period=number", + "freq=number", + "branch_type=(u|k|hv|any|...)", + "time", + "call-graph=(fp|dwarf|lbr)", + "stack-size=number", + "max-stack=number", + "nr=number", + "inherit", + "no-inherit", + "overwrite", + "no-overwrite", + "percore", + "aux-output", + "aux-sample-size=number", + }; + struct perf_pmu_format *format; + int ret; + + /* + * max-events and driver-config are missing above as are the internal + * types user, metric-id, raw, legacy cache and hardware. Assert against + * the enum parse_events__term_type so they are kept in sync. + */ + _Static_assert(ARRAY_SIZE(terms) == __PARSE_EVENTS__TERM_TYPE_NR - 6, + "perf_pmu__for_each_format()'s terms must be kept in sync with enum parse_events__term_type"); + list_for_each_entry(format, &pmu->format, list) { + perf_pmu_format__load(pmu, format); + ret = cb(state, format->name, (int)format->value, format->bits); + if (ret) + return ret; + } + if (!pmu->is_core) + return 0; + + for (size_t i = 0; i < ARRAY_SIZE(terms); i++) { + int config = PERF_PMU_FORMAT_VALUE_CONFIG; + + if (i < PERF_PMU_FORMAT_VALUE_CONFIG_END) + config = i; + + ret = cb(state, terms[i], config, /*bits=*/NULL); + if (ret) + return ret; + } + return 0; +} + bool is_pmu_core(const char *name) { return !strcmp(name, "cpu") || !strcmp(name, "cpum_cf") || is_sysfs_pmu_core(name); @@ -1697,8 +1753,12 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus, pmu_add_cpu_aliases(pmu); list_for_each_entry(event, &pmu->aliases, list) { size_t buf_used; + int pmu_name_len; info.pmu_name = event->pmu_name ?: pmu->name; + pmu_name_len = skip_duplicate_pmus + ? pmu_name_len_no_suffix(info.pmu_name, /*num=*/NULL) + : (int)strlen(info.pmu_name); info.alias = NULL; if (event->desc) { info.name = event->name; @@ -1723,7 +1783,7 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus, info.encoding_desc = buf + buf_used; parse_events_terms__to_strbuf(&event->terms, &sb); buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used, - "%s/%s/", info.pmu_name, sb.buf) + 1; + "%.*s/%s/", pmu_name_len, info.pmu_name, sb.buf) + 1; info.topic = event->topic; info.str = sb.buf; info.deprecated = event->deprecated; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index e35d985206db..9f5284b29ecf 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -196,6 +196,8 @@ struct pmu_event_info { }; typedef int (*pmu_event_callback)(void *state, struct pmu_event_info *info); +typedef int (*pmu_format_callback)(void *state, const char *name, int config, + const unsigned long *bits); void pmu_add_sys_aliases(struct perf_pmu *pmu); int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, @@ -215,6 +217,7 @@ int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, p int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load); void perf_pmu_format__set_value(void *format, int config, unsigned long *bits); bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name); +int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb); bool is_pmu_core(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 16505071d362..2fd369e45832 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -16,6 +16,7 @@ #include "pmus.h" #include "pmu.h" #include "print-events.h" +#include "strbuf.h" /* * core_pmus: A PMU belongs to core_pmus if it's name is "cpu" or it's sysfs @@ -503,6 +504,99 @@ void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *p zfree(&aliases); } +struct build_format_string_args { + struct strbuf short_string; + struct strbuf long_string; + int num_formats; +}; + +static int build_format_string(void *state, const char *name, int config, + const unsigned long *bits) +{ + struct build_format_string_args *args = state; + unsigned int num_bits; + int ret1, ret2 = 0; + + (void)config; + args->num_formats++; + if (args->num_formats > 1) { + strbuf_addch(&args->long_string, ','); + if (args->num_formats < 4) + strbuf_addch(&args->short_string, ','); + } + num_bits = bits ? bitmap_weight(bits, PERF_PMU_FORMAT_BITS) : 0; + if (num_bits <= 1) { + ret1 = strbuf_addf(&args->long_string, "%s", name); + if (args->num_formats < 4) + ret2 = strbuf_addf(&args->short_string, "%s", name); + } else if (num_bits > 8) { + ret1 = strbuf_addf(&args->long_string, "%s=0..0x%llx", name, + ULLONG_MAX >> (64 - num_bits)); + if (args->num_formats < 4) { + ret2 = strbuf_addf(&args->short_string, "%s=0..0x%llx", name, + ULLONG_MAX >> (64 - num_bits)); + } + } else { + ret1 = strbuf_addf(&args->long_string, "%s=0..%llu", name, + ULLONG_MAX >> (64 - num_bits)); + if (args->num_formats < 4) { + ret2 = strbuf_addf(&args->short_string, "%s=0..%llu", name, + ULLONG_MAX >> (64 - num_bits)); + } + } + return ret1 < 0 ? ret1 : (ret2 < 0 ? ret2 : 0); +} + +void perf_pmus__print_raw_pmu_events(const struct print_callbacks *print_cb, void *print_state) +{ + bool skip_duplicate_pmus = print_cb->skip_duplicate_pmus(print_state); + struct perf_pmu *(*scan_fn)(struct perf_pmu *); + struct perf_pmu *pmu = NULL; + + if (skip_duplicate_pmus) + scan_fn = perf_pmus__scan_skip_duplicates; + else + scan_fn = perf_pmus__scan; + + while ((pmu = scan_fn(pmu)) != NULL) { + struct build_format_string_args format_args = { + .short_string = STRBUF_INIT, + .long_string = STRBUF_INIT, + .num_formats = 0, + }; + int len = pmu_name_len_no_suffix(pmu->name, /*num=*/NULL); + const char *desc = "(see 'man perf-list' or 'man perf-record' on how to encode it)"; + + if (!pmu->is_core) + desc = NULL; + + strbuf_addf(&format_args.short_string, "%.*s/", len, pmu->name); + strbuf_addf(&format_args.long_string, "%.*s/", len, pmu->name); + perf_pmu__for_each_format(pmu, &format_args, build_format_string); + + if (format_args.num_formats > 3) + strbuf_addf(&format_args.short_string, ",.../modifier"); + else + strbuf_addf(&format_args.short_string, "/modifier"); + + strbuf_addf(&format_args.long_string, "/modifier"); + print_cb->print_event(print_state, + /*topic=*/NULL, + /*pmu_name=*/NULL, + format_args.short_string.buf, + /*event_alias=*/NULL, + /*scale_unit=*/NULL, + /*deprecated=*/false, + "Raw event descriptor", + desc, + /*long_desc=*/NULL, + format_args.long_string.buf); + + strbuf_release(&format_args.short_string); + strbuf_release(&format_args.long_string); + } +} + bool perf_pmus__have_event(const char *pname, const char *name) { struct perf_pmu *pmu = perf_pmus__find(pname); diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 94d2a08d894b..eec599d8aebd 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -18,6 +18,7 @@ struct perf_pmu *perf_pmus__scan_core(struct perf_pmu *pmu); const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str); void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state); +void perf_pmus__print_raw_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool perf_pmus__have_event(const char *pname, const char *name); int perf_pmus__num_core_pmus(void); bool perf_pmus__supports_extended_type(void); diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index e0d2b49bab66..3f38c27f0157 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -39,7 +39,7 @@ static const char * const event_type_descriptors[] = { "Software event", "Tracepoint event", "Hardware cache event", - "Raw hardware event descriptor", + "Raw event descriptor", "Hardware breakpoint", }; @@ -416,8 +416,6 @@ void print_symbol_events(const struct print_callbacks *print_cb, void *print_sta */ void print_events(const struct print_callbacks *print_cb, void *print_state) { - char *tmp; - print_symbol_events(print_cb, print_state, PERF_TYPE_HARDWARE, event_symbols_hw, PERF_COUNT_HW_MAX); print_symbol_events(print_cb, print_state, PERF_TYPE_SOFTWARE, @@ -441,21 +439,7 @@ void print_events(const struct print_callbacks *print_cb, void *print_state) /*long_desc=*/NULL, /*encoding_desc=*/NULL); - if (asprintf(&tmp, "%s/t1=v1[,t2=v2,t3 ...]/modifier", - perf_pmus__scan_core(/*pmu=*/NULL)->name) > 0) { - print_cb->print_event(print_state, - /*topic=*/NULL, - /*pmu_name=*/NULL, - tmp, - /*event_alias=*/NULL, - /*scale_unit=*/NULL, - /*deprecated=*/false, - event_type_descriptors[PERF_TYPE_RAW], - "(see 'man perf-list' on how to encode it)", - /*long_desc=*/NULL, - /*encoding_desc=*/NULL); - free(tmp); - } + perf_pmus__print_raw_pmu_events(print_cb, print_state); print_cb->print_event(print_state, /*topic=*/NULL, -- cgit v1.2.3 From 7093882067e2e2f88d3449c35c5f0f3f566c8a26 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Mar 2024 16:19:14 -0800 Subject: perf tools: Use pmus to describe type from attribute When dumping a perf_event_attr, use pmus to find the PMU and its name by the type number. This allows dynamically added PMUs to be described. Before: $ perf stat -vv -e data_read true ... perf_event_attr: type 24 size 136 config 0x20ff sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ... After: $ perf stat -vv -e data_read true ... perf_event_attr: type 24 (uncore_imc_free_running_0) size 136 config 0x20ff sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ... However, it also means that when we have a PMU name we prefer it to a hard coded name: Before: $ perf stat -vv -e faults true ... perf_event_attr: type 1 (PERF_TYPE_SOFTWARE) size 136 config 0x2 (PERF_COUNT_SW_PAGE_FAULTS) sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ... After: $ perf stat -vv -e faults true ... perf_event_attr: type 1 (software) size 136 config 0x2 (PERF_COUNT_SW_PAGE_FAULTS) sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ... It feels more consistent to do this, rather than only prefer a PMU name when a hard coded name isn't available. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20240308001915.4060155-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/perf_event_attr_fprintf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 8f04d3b7f3ec..29e66835da3a 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -7,6 +7,8 @@ #include #include #include "util/evsel_fprintf.h" +#include "util/pmu.h" +#include "util/pmus.h" #include "trace-event.h" struct bit_names { @@ -75,9 +77,12 @@ static void __p_read_format(char *buf, size_t size, u64 value) } #define ENUM_ID_TO_STR_CASE(x) case x: return (#x); -static const char *stringify_perf_type_id(u64 value) +static const char *stringify_perf_type_id(struct perf_pmu *pmu, u32 type) { - switch (value) { + if (pmu) + return pmu->name; + + switch (type) { ENUM_ID_TO_STR_CASE(PERF_TYPE_HARDWARE) ENUM_ID_TO_STR_CASE(PERF_TYPE_SOFTWARE) ENUM_ID_TO_STR_CASE(PERF_TYPE_TRACEPOINT) @@ -175,9 +180,9 @@ do { \ #define print_id_unsigned(_s) PRINT_ID(_s, "%"PRIu64) #define print_id_hex(_s) PRINT_ID(_s, "%#"PRIx64) -static void __p_type_id(char *buf, size_t size, u64 value) +static void __p_type_id(struct perf_pmu *pmu, char *buf, size_t size, u64 value) { - print_id_unsigned(stringify_perf_type_id(value)); + print_id_unsigned(stringify_perf_type_id(pmu, value)); } static void __p_config_hw_id(char *buf, size_t size, u64 value) @@ -246,7 +251,7 @@ static void __p_config_id(char *buf, size_t size, u32 type, u64 value) #define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) #define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val) #define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) -#define p_type_id(val) __p_type_id(buf, BUF_SIZE, val) +#define p_type_id(val) __p_type_id(pmu, buf, BUF_SIZE, val) #define p_config_id(val) __p_config_id(buf, BUF_SIZE, attr->type, val) #define PRINT_ATTRn(_n, _f, _p, _a) \ @@ -262,6 +267,7 @@ do { \ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, attr__fprintf_f attr__fprintf, void *priv) { + struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); char buf[BUF_SIZE]; int ret = 0; -- cgit v1.2.3 From 67ee8e71daabb8632931b7559e5c8a4b69a427f8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Mar 2024 16:19:15 -0800 Subject: perf tools: Add/use PMU reverse lookup from config to name Add perf_pmu__name_from_config that does a reverse lookup from a config number to an alias name. The lookup is expensive as the config is computed for every alias by filling in a perf_event_attr, but this is only done when verbose output is enabled. The lookup also only considers config, and not config1, config2 or config3. An example of the output: $ perf stat -vv -e data_read true ... perf_event_attr: type 24 (uncore_imc_free_running_0) size 136 config 0x20ff (data_read) sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ... Committer notes: Fix the python binding build by adding dummies for not strictly needed perf_pmu__name_from_config() and perf_pmus__find_by_type(). Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20240308001915.4060155-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/perf_event_attr_fprintf.c | 10 ++++++++-- tools/perf/util/pmu.c | 18 ++++++++++++++++++ tools/perf/util/pmu.h | 1 + tools/perf/util/python.c | 10 ++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 29e66835da3a..59fbbba79697 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -222,8 +222,14 @@ static void __p_config_tracepoint_id(char *buf, size_t size, u64 value) } #endif -static void __p_config_id(char *buf, size_t size, u32 type, u64 value) +static void __p_config_id(struct perf_pmu *pmu, char *buf, size_t size, u32 type, u64 value) { + const char *name = perf_pmu__name_from_config(pmu, value); + + if (name) { + print_id_hex(name); + return; + } switch (type) { case PERF_TYPE_HARDWARE: return __p_config_hw_id(buf, size, value); @@ -252,7 +258,7 @@ static void __p_config_id(char *buf, size_t size, u32 type, u64 value) #define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val) #define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) #define p_type_id(val) __p_type_id(pmu, buf, BUF_SIZE, val) -#define p_config_id(val) __p_config_id(buf, BUF_SIZE, attr->type, val) +#define p_config_id(val) __p_config_id(pmu, buf, BUF_SIZE, attr->type, val) #define PRINT_ATTRn(_n, _f, _p, _a) \ do { \ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 81952c6cd3c6..ab30f22eaf10 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -2146,3 +2146,21 @@ void perf_pmu__delete(struct perf_pmu *pmu) zfree(&pmu->id); free(pmu); } + +const char *perf_pmu__name_from_config(struct perf_pmu *pmu, u64 config) +{ + struct perf_pmu_alias *event; + + if (!pmu) + return NULL; + + pmu_add_cpu_aliases(pmu); + list_for_each_entry(event, &pmu->aliases, list) { + struct perf_event_attr attr = {.config = 0,}; + int ret = perf_pmu__config(pmu, &attr, &event->terms, NULL); + + if (ret == 0 && config == attr.config) + return event->name; + } + return NULL; +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 9f5284b29ecf..152700f78455 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -276,5 +276,6 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char struct perf_pmu *perf_pmu__create_placeholder_core_pmu(struct list_head *core_pmus); void perf_pmu__delete(struct perf_pmu *pmu); struct perf_pmu *perf_pmus__find_core_pmu(void); +const char *perf_pmu__name_from_config(struct perf_pmu *pmu, u64 config); #endif /* __PMU_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 075c0f79b1b9..0aeb97c11c03 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -103,6 +103,16 @@ int perf_pmu__scan_file(const struct perf_pmu *pmu, const char *name, const char return EOF; } +const char *perf_pmu__name_from_config(struct perf_pmu *pmu __maybe_unused, u64 config __maybe_unused) +{ + return NULL; +} + +struct perf_pmu *perf_pmus__find_by_type(unsigned int type __maybe_unused) +{ + return NULL; +} + int perf_pmus__num_core_pmus(void) { return 1; -- cgit v1.2.3 From 88ce0106a1f603bf360cb397e8fe293f8298fabb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Feb 2024 23:46:36 -0800 Subject: perf record: Delete session after stopping sideband thread The session has a header in it which contains a perf env with bpf_progs. The bpf_progs are accessed by the sideband thread and so the sideband thread must be stopped before the session is deleted, to avoid a use after free. This error was detected by AddressSanitizer in the following: ==2054673==ERROR: AddressSanitizer: heap-use-after-free on address 0x61d000161e00 at pc 0x55769289de54 bp 0x7f9df36d4ab0 sp 0x7f9df36d4aa8 READ of size 8 at 0x61d000161e00 thread T1 #0 0x55769289de53 in __perf_env__insert_bpf_prog_info util/env.c:42 #1 0x55769289dbb1 in perf_env__insert_bpf_prog_info util/env.c:29 #2 0x557692bbae29 in perf_env__add_bpf_info util/bpf-event.c:483 #3 0x557692bbb01a in bpf_event__sb_cb util/bpf-event.c:512 #4 0x5576928b75f4 in perf_evlist__poll_thread util/sideband_evlist.c:68 #5 0x7f9df96a63eb in start_thread nptl/pthread_create.c:444 #6 0x7f9df9726a4b in clone3 ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 0x61d000161e00 is located 384 bytes inside of 2136-byte region [0x61d000161c80,0x61d0001624d8) freed by thread T0 here: #0 0x7f9dfa6d7288 in __interceptor_free libsanitizer/asan/asan_malloc_linux.cpp:52 #1 0x557692978d50 in perf_session__delete util/session.c:319 #2 0x557692673959 in __cmd_record tools/perf/builtin-record.c:2884 #3 0x55769267a9f0 in cmd_record tools/perf/builtin-record.c:4259 #4 0x55769286710c in run_builtin tools/perf/perf.c:349 #5 0x557692867678 in handle_internal_command tools/perf/perf.c:402 #6 0x557692867a40 in run_argv tools/perf/perf.c:446 #7 0x557692867fae in main tools/perf/perf.c:562 #8 0x7f9df96456c9 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 Fixes: 657ee5531903339b ("perf evlist: Introduce side band thread") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Christian Brauner Cc: Disha Goel Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: K Prateek Nayak Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Tim Chen Cc: Yicong Yang Link: https://lore.kernel.org/r/20240301074639.2260708-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index ff7e1d6cfcd2..40d2c1c48666 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2881,10 +2881,10 @@ out_delete_session: } #endif zstd_fini(&session->zstd_data); - perf_session__delete(session); - if (!opts->no_bpf_event) evlist__stop_sb_thread(rec->sb_evlist); + + perf_session__delete(session); return status; } -- cgit v1.2.3 From f68c981be0628e141615aa30575efb687da09a2e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Feb 2024 23:46:37 -0800 Subject: perf test: Stat output per thread of just the parent process Per-thread mode requires either system-wide (-a), a pid (-p) or a tid (-t). The stat output tests were using system-wide mode but this is racy when threads are starting and exiting - something that happens a lot when running the tests in parallel (perf test -p). Avoid the race conditions by using pid mode with the pid of the parent process. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Christian Brauner Cc: Disha Goel Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Tim Chen Cc: Yicong Yang Link: https://lore.kernel.org/r/20240301074639.2260708-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/stat_output.sh | 2 +- tools/perf/tests/shell/stat+json_output.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/lib/stat_output.sh b/tools/perf/tests/shell/lib/stat_output.sh index c81d6a9f7983..9a176ceae4a3 100644 --- a/tools/perf/tests/shell/lib/stat_output.sh +++ b/tools/perf/tests/shell/lib/stat_output.sh @@ -79,7 +79,7 @@ check_per_thread() echo "[Skip] paranoid and not root" return fi - perf stat --per-thread -a $2 true + perf stat --per-thread -p $$ $2 true commachecker --per-thread echo "[Success]" } diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh index 2b9c6212dffc..6b630d33c328 100755 --- a/tools/perf/tests/shell/stat+json_output.sh +++ b/tools/perf/tests/shell/stat+json_output.sh @@ -105,7 +105,7 @@ check_per_thread() echo "[Skip] paranoia and not root" return fi - perf stat -j --per-thread -a -o "${stat_output}" true + perf stat -j --per-thread -p $$ -o "${stat_output}" true $PYTHON $pythonchecker --per-thread --file "${stat_output}" echo "[Success]" } -- cgit v1.2.3 From e120f7091a25460a19967380725558c36bca7c6c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Feb 2024 23:46:38 -0800 Subject: perf test: Use a single fd for the child process out/err Switch from dumping err then out, to a single file descriptor for both of them. This allows the err and output to be correctly interleaved in verbose output. Fixes: b482f5f8e0168f1e ("perf tests: Add option to run tests in parallel") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Christian Brauner Cc: Disha Goel Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: K Prateek Nayak Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Tim Chen Cc: Yicong Yang Link: https://lore.kernel.org/r/20240301074639.2260708-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index d13ee7683d9d..e05b370b1e2b 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -274,11 +274,8 @@ static int finish_test(struct child_test *child_test, int width) struct test_suite *t = child_test->test; int i = child_test->test_num; int subi = child_test->subtest; - int out = child_test->process.out; int err = child_test->process.err; - bool out_done = out <= 0; bool err_done = err <= 0; - struct strbuf out_output = STRBUF_INIT; struct strbuf err_output = STRBUF_INIT; int ret; @@ -290,11 +287,9 @@ static int finish_test(struct child_test *child_test, int width) pr_info("%3d: %-*s:\n", i + 1, width, test_description(t, -1)); /* - * Busy loop reading from the child's stdout and stderr that are set to - * be non-blocking until EOF. + * Busy loop reading from the child's stdout/stderr that are set to be + * non-blocking until EOF. */ - if (!out_done) - fcntl(out, F_SETFL, O_NONBLOCK); if (!err_done) fcntl(err, F_SETFL, O_NONBLOCK); if (verbose > 1) { @@ -303,11 +298,8 @@ static int finish_test(struct child_test *child_test, int width) else pr_info("%3d: %s:\n", i + 1, test_description(t, -1)); } - while (!out_done || !err_done) { - struct pollfd pfds[2] = { - { .fd = out, - .events = POLLIN | POLLERR | POLLHUP | POLLNVAL, - }, + while (!err_done) { + struct pollfd pfds[1] = { { .fd = err, .events = POLLIN | POLLERR | POLLHUP | POLLNVAL, }, @@ -317,21 +309,7 @@ static int finish_test(struct child_test *child_test, int width) /* Poll to avoid excessive spinning, timeout set for 1000ms. */ poll(pfds, ARRAY_SIZE(pfds), /*timeout=*/1000); - if (!out_done && pfds[0].revents) { - errno = 0; - len = read(out, buf, sizeof(buf) - 1); - - if (len <= 0) { - out_done = errno != EAGAIN; - } else { - buf[len] = '\0'; - if (verbose > 1) - fprintf(stdout, "%s", buf); - else - strbuf_addstr(&out_output, buf); - } - } - if (!err_done && pfds[1].revents) { + if (!err_done && pfds[0].revents) { errno = 0; len = read(err, buf, sizeof(buf) - 1); @@ -354,14 +332,10 @@ static int finish_test(struct child_test *child_test, int width) pr_info("%3d.%1d: %s:\n", i + 1, subi + 1, test_description(t, subi)); else pr_info("%3d: %s:\n", i + 1, test_description(t, -1)); - fprintf(stdout, "%s", out_output.buf); fprintf(stderr, "%s", err_output.buf); } - strbuf_release(&out_output); strbuf_release(&err_output); print_test_result(t, i, subi, ret, width); - if (out > 0) - close(out); if (err > 0) close(err); return 0; @@ -394,6 +368,7 @@ static int start_test(struct test_suite *test, int i, int subi, struct child_tes (*child)->process.no_stdout = 1; (*child)->process.no_stderr = 1; } else { + (*child)->process.stdout_to_stderr = 1; (*child)->process.out = -1; (*child)->process.err = -1; } -- cgit v1.2.3 From 5f2f051a9386aa6d62d081a9889ca321ca8c309b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Feb 2024 23:46:39 -0800 Subject: perf test: Read child test 10 times a second rather than 1 Make the perf test output smoother by timing out the poll of the child process after 100ms rather than 1s. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Christian Brauner Cc: Disha Goel Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Tim Chen Cc: Yicong Yang Link: https://lore.kernel.org/r/20240301074639.2260708-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index e05b370b1e2b..ddb2f4e38ea5 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -307,8 +307,8 @@ static int finish_test(struct child_test *child_test, int width) char buf[512]; ssize_t len; - /* Poll to avoid excessive spinning, timeout set for 1000ms. */ - poll(pfds, ARRAY_SIZE(pfds), /*timeout=*/1000); + /* Poll to avoid excessive spinning, timeout set for 100ms. */ + poll(pfds, ARRAY_SIZE(pfds), /*timeout=*/100); if (!err_done && pfds[0].revents) { errno = 0; len = read(err, buf, sizeof(buf) - 1); -- cgit v1.2.3 From f664d5159de275d3453ee5699b3376575c6aa156 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 1 Mar 2024 12:13:05 -0800 Subject: perf tools: Suggest inbuilt commands for unknown command The existing unknown command code looks for perf scripts like perf-archive.sh and perf-iostat.sh, however, inbuilt commands aren't suggested. Add the inbuilt commands so they may be suggested too. Before: $ perf reccord perf: 'reccord' is not a perf-command. See 'perf --help'. $ After: $ perf reccord perf: 'reccord' is not a perf-command. See 'perf --help'. Did you mean this? record $ Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240301201306.2680986-1-irogers@google.com [ Added some fixes from Ian to problems I noticed while testing ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin.h | 4 +++- tools/perf/perf.c | 23 ++++++++++++++----- tools/perf/util/help-unknown-cmd.c | 45 ++++++++++++++++++-------------------- 3 files changed, 41 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index f2ab5bae2150..f4375deabfa3 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -2,8 +2,10 @@ #ifndef BUILTIN_H #define BUILTIN_H +struct cmdnames; + void list_common_cmds_help(void); -const char *help_unknown_cmd(const char *cmd); +const char *help_unknown_cmd(const char *cmd, struct cmdnames *main_cmds); int cmd_annotate(int argc, const char **argv); int cmd_bench(int argc, const char **argv); diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 921bee0a6437..bd3f80b5bb46 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -18,6 +18,7 @@ #include #include "util/parse-events.h" #include +#include #include "util/debug.h" #include "util/event.h" #include "util/util.h" // usage() @@ -458,7 +459,7 @@ static int libperf_print(enum libperf_print_level level, int main(int argc, const char **argv) { - int err; + int err, done_help = 0; const char *cmd; char sbuf[STRERR_BUFSIZE]; @@ -557,22 +558,32 @@ int main(int argc, const char **argv) pthread__block_sigwinch(); while (1) { - static int done_help; - run_argv(&argc, &argv); if (errno != ENOENT) break; if (!done_help) { - cmd = argv[0] = help_unknown_cmd(cmd); + struct cmdnames main_cmds = {}; + + for (unsigned int i = 0; i < ARRAY_SIZE(commands); i++) { + add_cmdname(&main_cmds, + commands[i].cmd, + strlen(commands[i].cmd)); + } + cmd = argv[0] = help_unknown_cmd(cmd, &main_cmds); + clean_cmdnames(&main_cmds); done_help = 1; + if (!cmd) + break; } else break; } - fprintf(stderr, "Failed to run command '%s': %s\n", - cmd, str_error_r(errno, sbuf, sizeof(sbuf))); + if (cmd) { + fprintf(stderr, "Failed to run command '%s': %s\n", + cmd, str_error_r(errno, sbuf, sizeof(sbuf))); + } out: if (debug_fp) fclose(debug_fp); diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index eab99ea6ac01..2ba3369f1620 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -52,46 +52,44 @@ static int add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) return 0; } -const char *help_unknown_cmd(const char *cmd) +const char *help_unknown_cmd(const char *cmd, struct cmdnames *main_cmds) { unsigned int i, n = 0, best_similarity = 0; - struct cmdnames main_cmds, other_cmds; + struct cmdnames other_cmds; - memset(&main_cmds, 0, sizeof(main_cmds)); - memset(&other_cmds, 0, sizeof(main_cmds)); + memset(&other_cmds, 0, sizeof(other_cmds)); perf_config(perf_unknown_cmd_config, NULL); - load_command_list("perf-", &main_cmds, &other_cmds); + load_command_list("perf-", main_cmds, &other_cmds); - if (add_cmd_list(&main_cmds, &other_cmds) < 0) { + if (add_cmd_list(main_cmds, &other_cmds) < 0) { fprintf(stderr, "ERROR: Failed to allocate command list for unknown command.\n"); goto end; } - qsort(main_cmds.names, main_cmds.cnt, - sizeof(main_cmds.names), cmdname_compare); - uniq(&main_cmds); + qsort(main_cmds->names, main_cmds->cnt, + sizeof(main_cmds->names), cmdname_compare); + uniq(main_cmds); - if (main_cmds.cnt) { + if (main_cmds->cnt) { /* This reuses cmdname->len for similarity index */ - for (i = 0; i < main_cmds.cnt; ++i) - main_cmds.names[i]->len = - levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4); + for (i = 0; i < main_cmds->cnt; ++i) + main_cmds->names[i]->len = + levenshtein(cmd, main_cmds->names[i]->name, 0, 2, 1, 4); - qsort(main_cmds.names, main_cmds.cnt, - sizeof(*main_cmds.names), levenshtein_compare); + qsort(main_cmds->names, main_cmds->cnt, + sizeof(*main_cmds->names), levenshtein_compare); - best_similarity = main_cmds.names[0]->len; + best_similarity = main_cmds->names[0]->len; n = 1; - while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len) + while (n < main_cmds->cnt && best_similarity == main_cmds->names[n]->len) ++n; } if (autocorrect && n == 1) { - const char *assumed = main_cmds.names[0]->name; + const char *assumed = main_cmds->names[0]->name; - main_cmds.names[0] = NULL; - clean_cmdnames(&main_cmds); + main_cmds->names[0] = NULL; clean_cmdnames(&other_cmds); fprintf(stderr, "WARNING: You called a perf program named '%s', " "which does not exist.\n" @@ -107,15 +105,14 @@ const char *help_unknown_cmd(const char *cmd) fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd); - if (main_cmds.cnt && best_similarity < 6) { + if (main_cmds->cnt && best_similarity < 6) { fprintf(stderr, "\nDid you mean %s?\n", n < 2 ? "this": "one of these"); for (i = 0; i < n; i++) - fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); + fprintf(stderr, "\t%s\n", main_cmds->names[i]->name); } end: - clean_cmdnames(&main_cmds); clean_cmdnames(&other_cmds); - exit(1); + return NULL; } -- cgit v1.2.3 From 7aea01eaf4f301bd652a25d543bcba6c23cb4b85 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 1 Mar 2024 12:13:06 -0800 Subject: perf help: Lower levenshtein penality for deleting character The levenshtein penalty for deleting a character was far higher than subsituting or inserting a character. Lower the penalty to match that of inserting a character. Before: $ perf recccord perf: 'recccord' is not a perf-command. See 'perf --help'. $ After: $ perf recccord perf: 'recccord' is not a perf-command. See 'perf --help'. Did you mean this? record $ Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240301201306.2680986-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/help-unknown-cmd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index 2ba3369f1620..a0a46e34f8d1 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -73,10 +73,14 @@ const char *help_unknown_cmd(const char *cmd, struct cmdnames *main_cmds) if (main_cmds->cnt) { /* This reuses cmdname->len for similarity index */ - for (i = 0; i < main_cmds->cnt; ++i) + for (i = 0; i < main_cmds->cnt; ++i) { main_cmds->names[i]->len = - levenshtein(cmd, main_cmds->names[i]->name, 0, 2, 1, 4); - + levenshtein(cmd, main_cmds->names[i]->name, + /*swap_penalty=*/0, + /*substition_penality=*/2, + /*insertion_penality=*/1, + /*deletion_penalty=*/1); + } qsort(main_cmds->names, main_cmds->cnt, sizeof(*main_cmds->names), levenshtein_compare); -- cgit v1.2.3 From 4cef0e7ae76b877e67909b16b3044f0b4a504f32 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 1 Mar 2024 09:47:11 -0800 Subject: perf tests: Run tests in parallel by default Switch from running tests sequentially to running in parallel by default. Change the opt-in '-p' or '--parallel' flag to '-S' or '--sequential'. On an 8 core tigerlake an address sanitizer run time changes from: 326.54user 622.73system 6:59.91elapsed 226%CPU to: 973.02user 583.98system 3:01.17elapsed 859%CPU So over twice as fast, saving 4 minutes. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240301174711.2646944-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index ddb2f4e38ea5..73f53b02f733 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -39,8 +39,8 @@ * making them easier to debug. */ static bool dont_fork; -/* Fork the tests in parallel and then wait for their completion. */ -static bool parallel; +/* Don't fork the tests in parallel and wait for their completion. */ +static bool sequential; const char *dso_to_test; const char *test_objdump_path = "objdump"; @@ -374,7 +374,7 @@ static int start_test(struct test_suite *test, int i, int subi, struct child_tes } (*child)->process.no_exec_cmd = run_test_child; err = start_command(&(*child)->process); - if (err || parallel) + if (err || !sequential) return err; return finish_test(*child, width); } @@ -440,7 +440,7 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) int err = start_test(t, curr, -1, &child_tests[child_test_num++], width); if (err) { - /* TODO: if parallel waitpid the already forked children. */ + /* TODO: if !sequential waitpid the already forked children. */ free(child_tests); return err; } @@ -460,7 +460,7 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) } } for (i = 0; i < child_test_num; i++) { - if (parallel) { + if (!sequential) { int ret = finish_test(child_tests[i], width); if (ret) @@ -536,8 +536,8 @@ int cmd_test(int argc, const char **argv) "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('F', "dont-fork", &dont_fork, "Do not fork for testcase"), - OPT_BOOLEAN('p', "parallel", ¶llel, - "Run the tests altogether in parallel"), + OPT_BOOLEAN('S', "sequential", &sequential, + "Run the tests one after another rather than in parallel"), OPT_STRING('w', "workload", &workload, "work", "workload to run for testing"), OPT_STRING(0, "dso", &dso_to_test, "dso", "dso to test"), OPT_STRING(0, "objdump", &test_objdump_path, "path", @@ -564,6 +564,9 @@ int cmd_test(int argc, const char **argv) if (workload) return run_workload(workload, argc, argv); + if (dont_fork) + sequential = true; + symbol_conf.priv_size = sizeof(int); symbol_conf.try_vmlinux_path = true; -- cgit v1.2.3 From 3d6cfbaf279ddec9d92d434268ac7aab1a4935ca Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Mar 2024 14:49:32 -0300 Subject: perf beauty: Introduce scrape script for various fs syscalls 'flags' arguments It was using the first variation on producing a string representation for a binary flag, one that used the system's fcntl.h and preprocessor tricks that had to be updated everytime a new flag was introduced. Use the more recent scrape script + strarray + strarray__scnprintf_flags() combo. $ tools/perf/trace/beauty/fs_at_flags.sh static const char *fs_at_flags[] = { [ilog2(0x100) + 1] = "SYMLINK_NOFOLLOW", [ilog2(0x200) + 1] = "REMOVEDIR", [ilog2(0x400) + 1] = "SYMLINK_FOLLOW", [ilog2(0x800) + 1] = "NO_AUTOMOUNT", [ilog2(0x1000) + 1] = "EMPTY_PATH", [ilog2(0x0000) + 1] = "STATX_SYNC_AS_STAT", [ilog2(0x2000) + 1] = "STATX_FORCE_SYNC", [ilog2(0x4000) + 1] = "STATX_DONT_SYNC", [ilog2(0x8000) + 1] = "RECURSIVE", [ilog2(0x80000000) + 1] = "GETATTR_NOSEC", }; $ Now we need a copy of uapi/linux/fcntl.h from tools/include/ in the scrape only directory tools/perf/trace/beauty/include and will use that fs_at_flags array for other fs syscalls. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240320193115.811899-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 7 ++ tools/perf/builtin-trace.c | 2 +- tools/perf/check-headers.sh | 1 + tools/perf/trace/beauty/Build | 1 + tools/perf/trace/beauty/beauty.h | 4 +- tools/perf/trace/beauty/fs_at_flags.c | 26 +++++ tools/perf/trace/beauty/fs_at_flags.sh | 21 ++++ tools/perf/trace/beauty/include/uapi/linux/fcntl.h | 123 +++++++++++++++++++++ tools/perf/trace/beauty/statx.c | 31 ------ 9 files changed, 182 insertions(+), 34 deletions(-) create mode 100644 tools/perf/trace/beauty/fs_at_flags.c create mode 100755 tools/perf/trace/beauty/fs_at_flags.sh create mode 100644 tools/perf/trace/beauty/include/uapi/linux/fcntl.h (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index ccd2dcbc64f7..73d5603450b0 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -489,6 +489,12 @@ beauty_ioctl_outdir := $(beauty_outdir)/ioctl # Create output directory if not already present $(shell [ -d '$(beauty_ioctl_outdir)' ] || mkdir -p '$(beauty_ioctl_outdir)') +fs_at_flags_array := $(beauty_outdir)/fs_at_flags_array.c +fs_at_flags_tbl := $(srctree)/tools/perf/trace/beauty/fs_at_flags.sh + +$(fs_at_flags_array): $(beauty_uapi_linux_dir)/fcntl.h $(fs_at_flags_tbl) + $(Q)$(SHELL) '$(fs_at_flags_tbl)' $(beauty_uapi_linux_dir) > $@ + clone_flags_array := $(beauty_outdir)/clone_flags_array.c clone_flags_tbl := $(srctree)/tools/perf/trace/beauty/clone.sh @@ -772,6 +778,7 @@ build-dir = $(or $(__build-dir),.) prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \ arm64-sysreg-defs \ + $(fs_at_flags_array) \ $(clone_flags_array) \ $(drm_ioctl_array) \ $(fadvise_advice_array) \ diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8fb032caeaf5..8417387aafa8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1144,7 +1144,7 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "stat", .alias = "newstat", }, { .name = "statx", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ }, - [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } , + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } , [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, { .name = "swapoff", .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 413c9b747216..d23a84fdf3ef 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -89,6 +89,7 @@ BEAUTY_FILES=( "arch/x86/include/asm/irq_vectors.h" "arch/x86/include/uapi/asm/prctl.h" "include/linux/socket.h" + "include/uapi/linux/fcntl.h" "include/uapi/linux/fs.h" "include/uapi/linux/mount.h" "include/uapi/linux/prctl.h" diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build index d11ce256f511..d8ce1b698983 100644 --- a/tools/perf/trace/beauty/Build +++ b/tools/perf/trace/beauty/Build @@ -1,6 +1,7 @@ perf-y += clone.o perf-y += fcntl.o perf-y += flock.o +perf-y += fs_at_flags.o perf-y += fsmount.o perf-y += fspick.o ifeq ($(SRCARCH),$(filter $(SRCARCH),x86)) diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 9feb794f5c6e..c94ae8701bc6 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -234,8 +234,8 @@ size_t syscall_arg__scnprintf_socket_protocol(char *bf, size_t size, struct sysc size_t syscall_arg__scnprintf_socket_level(char *bf, size_t size, struct syscall_arg *arg); #define SCA_SK_LEVEL syscall_arg__scnprintf_socket_level -size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg); -#define SCA_STATX_FLAGS syscall_arg__scnprintf_statx_flags +size_t syscall_arg__scnprintf_fs_at_flags(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_FS_AT_FLAGS syscall_arg__scnprintf_fs_at_flags size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg); #define SCA_STATX_MASK syscall_arg__scnprintf_statx_mask diff --git a/tools/perf/trace/beauty/fs_at_flags.c b/tools/perf/trace/beauty/fs_at_flags.c new file mode 100644 index 000000000000..2a099953d993 --- /dev/null +++ b/tools/perf/trace/beauty/fs_at_flags.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * trace/beauty/fs_at_flags.c + * + * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo + */ + +#include "trace/beauty/beauty.h" +#include +#include + +#include "trace/beauty/generated/fs_at_flags_array.c" +static DEFINE_STRARRAY(fs_at_flags, "AT_"); + +static size_t fs_at__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) +{ + return strarray__scnprintf_flags(&strarray__fs_at_flags, bf, size, show_prefix, flags); +} + +size_t syscall_arg__scnprintf_fs_at_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + bool show_prefix = arg->show_string_prefix; + int flags = arg->val; + + return fs_at__scnprintf_flags(flags, bf, size, show_prefix); +} diff --git a/tools/perf/trace/beauty/fs_at_flags.sh b/tools/perf/trace/beauty/fs_at_flags.sh new file mode 100755 index 000000000000..456f59addf74 --- /dev/null +++ b/tools/perf/trace/beauty/fs_at_flags.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 1 ] ; then + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ +else + beauty_uapi_linux_dir=$1 +fi + +linux_fcntl=${beauty_uapi_linux_dir}/fcntl.h + +printf "static const char *fs_at_flags[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+AT_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +# AT_EACCESS is only meaningful to faccessat, so we will special case it there... +# AT_STATX_SYNC_TYPE is not a bit, its a mask of AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC and AT_STATX_DONT_SYNC +grep -E $regex ${linux_fcntl} | \ + grep -v AT_EACCESS | \ + grep -v AT_STATX_SYNC_TYPE | \ + sed -r "s/$regex/\2 \1/g" | \ + xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n" +printf "};\n" diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h new file mode 100644 index 000000000000..282e90aeb163 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FCNTL_H +#define _UAPI_LINUX_FCNTL_H + +#include +#include + +#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) +#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) + +/* + * Cancel a blocking posix lock; internal use only until we expose an + * asynchronous lock api to userspace: + */ +#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5) + +/* Create a file descriptor with FD_CLOEXEC set. */ +#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6) + +/* + * Request nofications on a directory. + * See below for events that may be notified. + */ +#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) + +/* + * Set and get of pipe page size array + */ +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) + +/* + * Set/Get seals + */ +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +/* + * Types of seals + */ +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ +/* (1U << 31) is reserved for signed error codes */ + +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. + */ +#define RWH_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + +/* + * Types of directory notifications that may be requested. + */ +#define DN_ACCESS 0x00000001 /* File accessed */ +#define DN_MODIFY 0x00000002 /* File modified */ +#define DN_CREATE 0x00000004 /* File created */ +#define DN_DELETE 0x00000008 /* File removed */ +#define DN_RENAME 0x00000010 /* File renamed */ +#define DN_ATTRIB 0x00000020 /* File changed attibutes */ +#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ + +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it equivalent to + * AT_EACCESS is valid undefined behavior. + */ +#define AT_FDCWD -100 /* Special value used to indicate + openat should use the current + working directory. */ +#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ +#define AT_REMOVEDIR 0x200 /* Remove directory instead of + unlinking file. */ +#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ +#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ +#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ + +#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ +#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ +#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ +#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ + +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ + +/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ +#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to + compare object identity and may not + be usable to open_by_handle_at(2) */ +#if defined(__KERNEL__) +#define AT_GETATTR_NOSEC 0x80000000 +#endif + +#endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c index 1f7e34ed4e02..4e0059fd0211 100644 --- a/tools/perf/trace/beauty/statx.c +++ b/tools/perf/trace/beauty/statx.c @@ -8,7 +8,6 @@ #include "trace/beauty/beauty.h" #include #include -#include #include #ifndef STATX_MNT_ID @@ -21,36 +20,6 @@ #define STATX_MNT_ID_UNIQUE 0x00004000U #endif -size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg) -{ - bool show_prefix = arg->show_string_prefix; - const char *prefix = "AT_"; - int printed = 0, flags = arg->val; - - if (flags == 0) - return scnprintf(bf, size, "%s%s", show_prefix ? "AT_STATX_" : "", "SYNC_AS_STAT"); -#define P_FLAG(n) \ - if (flags & AT_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \ - flags &= ~AT_##n; \ - } - - P_FLAG(SYMLINK_NOFOLLOW); - P_FLAG(REMOVEDIR); - P_FLAG(SYMLINK_FOLLOW); - P_FLAG(NO_AUTOMOUNT); - P_FLAG(EMPTY_PATH); - P_FLAG(STATX_FORCE_SYNC); - P_FLAG(STATX_DONT_SYNC); - -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg) { bool show_prefix = arg->show_string_prefix; -- cgit v1.2.3 From f122b3d6d179455ec77dc17690bea7e970433254 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Mar 2024 14:49:32 -0300 Subject: perf beauty: Introduce scrape script for the 'statx' syscall 'mask' argument It was using the first variation on producing a string representation for a binary flag, one that used the system's stat.h and preprocessor tricks that had to be updated everytime a new flag was introduced. Use the more recent scrape script + strarray + strarray__scnprintf_flags() combo. $ tools/perf/trace/beauty/statx_mask.sh static const char *statx_mask[] = { [ilog2(0x00000001) + 1] = "TYPE", [ilog2(0x00000002) + 1] = "MODE", [ilog2(0x00000004) + 1] = "NLINK", [ilog2(0x00000008) + 1] = "UID", [ilog2(0x00000010) + 1] = "GID", [ilog2(0x00000020) + 1] = "ATIME", [ilog2(0x00000040) + 1] = "MTIME", [ilog2(0x00000080) + 1] = "CTIME", [ilog2(0x00000100) + 1] = "INO", [ilog2(0x00000200) + 1] = "SIZE", [ilog2(0x00000400) + 1] = "BLOCKS", [ilog2(0x00000800) + 1] = "BTIME", [ilog2(0x00001000) + 1] = "MNT_ID", [ilog2(0x00002000) + 1] = "DIOALIGN", [ilog2(0x00004000) + 1] = "MNT_ID_UNIQUE", }; $ Now we need a copy of uapi/linux/stat.h from tools/include/ in the scrape only directory tools/perf/trace/beauty/include. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240320193115.811899-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 7 + tools/perf/check-headers.sh | 1 + tools/perf/trace/beauty/include/uapi/linux/stat.h | 195 ++++++++++++++++++++++ tools/perf/trace/beauty/statx.c | 50 +----- tools/perf/trace/beauty/statx_mask.sh | 23 +++ 5 files changed, 235 insertions(+), 41 deletions(-) create mode 100644 tools/perf/trace/beauty/include/uapi/linux/stat.h create mode 100755 tools/perf/trace/beauty/statx_mask.sh (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 73d5603450b0..0d2dbdfc44df 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -673,6 +673,12 @@ arch_errno_tbl := $(srctree)/tools/perf/trace/beauty/arch_errno_names.sh $(arch_errno_name_array): $(arch_errno_tbl) $(Q)$(SHELL) '$(arch_errno_tbl)' '$(patsubst -%,,$(CC))' $(arch_errno_hdr_dir) > $@ +statx_mask_array := $(beauty_outdir)/statx_mask_array.c +statx_mask_tbl := $(srctree)/tools/perf/trace/beauty/statx_mask.sh + +$(statx_mask_array): $(beauty_uapi_linux_dir)/stat.h $(statx_mask_tbl) + $(Q)$(SHELL) '$(statx_mask_tbl)' $(beauty_uapi_linux_dir) > $@ + sync_file_range_arrays := $(beauty_outdir)/sync_file_range_arrays.c sync_file_range_tbls := $(srctree)/tools/perf/trace/beauty/sync_file_range.sh @@ -807,6 +813,7 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \ $(x86_arch_prctl_code_array) \ $(rename_flags_array) \ $(arch_errno_name_array) \ + $(statx_mask_array) \ $(sync_file_range_arrays) \ $(LIBAPI) \ $(LIBPERF) \ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index d23a84fdf3ef..76726a5a7c78 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -94,6 +94,7 @@ BEAUTY_FILES=( "include/uapi/linux/mount.h" "include/uapi/linux/prctl.h" "include/uapi/linux/sched.h" + "include/uapi/linux/stat.h" "include/uapi/linux/usbdevice_fs.h" "include/uapi/sound/asound.h" ) diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h new file mode 100644 index 000000000000..2f2ee82d5517 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_STAT_H +#define _UAPI_LINUX_STAT_H + +#include + +#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) + +#define S_IFMT 00170000 +#define S_IFSOCK 0140000 +#define S_IFLNK 0120000 +#define S_IFREG 0100000 +#define S_IFBLK 0060000 +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFIFO 0010000 +#define S_ISUID 0004000 +#define S_ISGID 0002000 +#define S_ISVTX 0001000 + +#define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) +#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) +#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) +#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) +#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK) +#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO) +#define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK) + +#define S_IRWXU 00700 +#define S_IRUSR 00400 +#define S_IWUSR 00200 +#define S_IXUSR 00100 + +#define S_IRWXG 00070 +#define S_IRGRP 00040 +#define S_IWGRP 00020 +#define S_IXGRP 00010 + +#define S_IRWXO 00007 +#define S_IROTH 00004 +#define S_IWOTH 00002 +#define S_IXOTH 00001 + +#endif + +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 stx_dev_minor; + /* 0x90 */ + __u64 stx_mnt_id; + __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ + __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ + /* 0xa0 */ + __u64 __spare3[12]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ +#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ + +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. + * + * Note that the flags marked [I] correspond to the FS_IOC_SETFLAGS flags + * semantically. Where possible, the numerical value is picked to correspond + * also. Note that the DAX attribute indicates that the file is in the CPU + * direct access state. It does not correspond to the per-inode flag that + * some filesystems support. + * + */ +#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ +#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ +#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ +#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ +#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ +#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ +#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ +#define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ + + +#endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c index 4e0059fd0211..24843e614b93 100644 --- a/tools/perf/trace/beauty/statx.c +++ b/tools/perf/trace/beauty/statx.c @@ -6,52 +6,20 @@ */ #include "trace/beauty/beauty.h" -#include #include -#include +#include -#ifndef STATX_MNT_ID -#define STATX_MNT_ID 0x00001000U -#endif -#ifndef STATX_DIOALIGN -#define STATX_DIOALIGN 0x00002000U -#endif -#ifndef STATX_MNT_ID_UNIQUE -#define STATX_MNT_ID_UNIQUE 0x00004000U -#endif +static size_t statx__scnprintf_mask(unsigned long mask, char *bf, size_t size, bool show_prefix) +{ + #include "trace/beauty/generated/statx_mask_array.c" + static DEFINE_STRARRAY(statx_mask, "STATX_"); + return strarray__scnprintf_flags(&strarray__statx_mask, bf, size, show_prefix, mask); +} size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg) { bool show_prefix = arg->show_string_prefix; - const char *prefix = "STATX_"; - int printed = 0, flags = arg->val; - -#define P_FLAG(n) \ - if (flags & STATX_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \ - flags &= ~STATX_##n; \ - } - - P_FLAG(TYPE); - P_FLAG(MODE); - P_FLAG(NLINK); - P_FLAG(UID); - P_FLAG(GID); - P_FLAG(ATIME); - P_FLAG(MTIME); - P_FLAG(CTIME); - P_FLAG(INO); - P_FLAG(SIZE); - P_FLAG(BLOCKS); - P_FLAG(BTIME); - P_FLAG(MNT_ID); - P_FLAG(DIOALIGN); - P_FLAG(MNT_ID_UNIQUE); - -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + int mask = arg->val; - return printed; + return statx__scnprintf_mask(mask, bf, size, show_prefix); } diff --git a/tools/perf/trace/beauty/statx_mask.sh b/tools/perf/trace/beauty/statx_mask.sh new file mode 100755 index 000000000000..18c802ed0c71 --- /dev/null +++ b/tools/perf/trace/beauty/statx_mask.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# SPDX-License-Identifier: LGPL-2.1 + +if [ $# -ne 1 ] ; then + beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux/ +else + beauty_uapi_linux_dir=$1 +fi + +linux_stat=${beauty_uapi_linux_dir}/stat.h + +printf "static const char *statx_mask[] = {\n" +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+STATX_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +# STATX_BASIC_STATS its a bitmask formed by the mask in the normal stat struct +# STATX_ALL is another bitmask and deprecated +# STATX_ATTR_*: Attributes to be found in stx_attributes and masked in stx_attributes_mask +grep -E $regex ${linux_stat} | \ + grep -v STATX_ALL | \ + grep -v STATX_BASIC_STATS | \ + grep -v '\ Date: Tue, 19 Mar 2024 14:49:32 -0300 Subject: perf beauty: Introduce faccessat2 flags scnprintf routine The fsaccessat and fsaccessat2 now have beautifiers for its arguments. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240320193115.811899-4-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 9 +++++++++ tools/perf/trace/beauty/beauty.h | 3 +++ tools/perf/trace/beauty/fs_at_flags.c | 24 ++++++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8417387aafa8..58546e8af9fc 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -947,6 +947,15 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, { .name = "eventfd2", .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, + { .name = "faccessat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, + { .name = "faccessat2", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, + [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, }, { .name = "fchmodat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, { .name = "fchownat", diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index c94ae8701bc6..78d10d92d351 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -237,6 +237,9 @@ size_t syscall_arg__scnprintf_socket_level(char *bf, size_t size, struct syscall size_t syscall_arg__scnprintf_fs_at_flags(char *bf, size_t size, struct syscall_arg *arg); #define SCA_FS_AT_FLAGS syscall_arg__scnprintf_fs_at_flags +size_t syscall_arg__scnprintf_faccessat2_flags(char *bf, size_t size, struct syscall_arg *arg); +#define SCA_FACCESSAT2_FLAGS syscall_arg__scnprintf_faccessat2_flags + size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg); #define SCA_STATX_MASK syscall_arg__scnprintf_statx_mask diff --git a/tools/perf/trace/beauty/fs_at_flags.c b/tools/perf/trace/beauty/fs_at_flags.c index 2a099953d993..c1365e8f0b96 100644 --- a/tools/perf/trace/beauty/fs_at_flags.c +++ b/tools/perf/trace/beauty/fs_at_flags.c @@ -7,6 +7,7 @@ #include "trace/beauty/beauty.h" #include +#include #include #include "trace/beauty/generated/fs_at_flags_array.c" @@ -24,3 +25,26 @@ size_t syscall_arg__scnprintf_fs_at_flags(char *bf, size_t size, struct syscall_ return fs_at__scnprintf_flags(flags, bf, size, show_prefix); } + +static size_t faccessat2__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix) +{ + int printed = 0; + + // AT_EACCESS is the same as AT_REMOVEDIR, that is in fs_at_flags_array, + // special case it here. + if (flags & AT_EACCESS) { + flags &= ~AT_EACCESS; + printed += scnprintf(bf + printed, size - printed, "%sEACCESS%s", + show_prefix ? strarray__fs_at_flags.prefix : "", flags ? "|" : ""); + } + + return strarray__scnprintf_flags(&strarray__fs_at_flags, bf + printed, size - printed, show_prefix, flags); +} + +size_t syscall_arg__scnprintf_faccessat2_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + bool show_prefix = arg->show_string_prefix; + int flags = arg->val; + + return faccessat2__scnprintf_flags(flags, bf, size, show_prefix); +} -- cgit v1.2.3 From 4d92328290274c6bf9060ba384ccabbf7e72918b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Mar 2024 16:15:20 -0300 Subject: perf trace: Beautify the 'flags' arg of unlinkat Reusing the fs_at_flags array done for the 'stat' syscall. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240320193115.811899-5-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 58546e8af9fc..ef0dfffd99fd 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1171,7 +1171,9 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "utimensat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, { .name = "wait4", .errpid = true, -- cgit v1.2.3 From 0831638e8c279a08094fb6cc7aa201580ef74db2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Mar 2024 16:25:19 -0300 Subject: perf trace: Fix 'newfstatat'/'fstatat' argument pretty printing There were needless two entries, one for 'newfstatat' and another for 'fstatat', keep just one and pretty print its 'flags' argument using the fs_at_flags scnprintf that is also used by other FS syscalls such as 'stat', now: root@number:~# perf trace -e newfstatat --max-events=5 0.000 ( 0.010 ms): abrt-dump-jour/1400 newfstatat(dfd: 7, filename: "", statbuf: 0x7fff0d127000, flag: EMPTY_PATH) = 0 0.020 ( 0.003 ms): abrt-dump-jour/1400 newfstatat(dfd: 9, filename: "", statbuf: 0x55752507b0e8, flag: EMPTY_PATH) = 0 0.039 ( 0.004 ms): abrt-dump-jour/1400 newfstatat(dfd: 19, filename: "", statbuf: 0x557525061378, flag: EMPTY_PATH) = 0 0.047 ( 0.003 ms): abrt-dump-jour/1400 newfstatat(dfd: 20, filename: "", statbuf: 0x5575250b8cc8, flag: EMPTY_PATH) = 0 0.053 ( 0.003 ms): abrt-dump-jour/1400 newfstatat(dfd: 22, filename: "", statbuf: 0x5575250535d8, flag: EMPTY_PATH) = 0 root@number:~# Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240320193115.811899-6-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index ef0dfffd99fd..d3ec244e692a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -978,7 +978,6 @@ static const struct syscall_fmt syscall_fmts[] = { [1] = { .scnprintf = SCA_FILENAME, /* path */ }, [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, }, { .name = "fstat", .alias = "newfstat", }, - { .name = "fstatat", .alias = "newfstatat", }, { .name = "futex", .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, }, @@ -1060,8 +1059,10 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "nanosleep", .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, }, - { .name = "newfstatat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + { .name = "newfstatat", .alias = "fstatat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "open", .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "open_by_handle_at", -- cgit v1.2.3 From 581037151910126a7934e369e4b6ac70eda9a703 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 21 Mar 2024 11:13:30 -0300 Subject: perf probe: Add missing libgen.h header needed for using basename() This prototype is obtained indirectly, by luck, from some other header in probe-event.c in most systems, but recently exploded on alpine:edge: 8 13.39 alpine:edge : FAIL gcc version 13.2.1 20240309 (Alpine 13.2.1_git20240309) util/probe-event.c: In function 'convert_exec_to_group': util/probe-event.c:225:16: error: implicit declaration of function 'basename' [-Werror=implicit-function-declaration] 225 | ptr1 = basename(exec_copy); | ^~~~~~~~ util/probe-event.c:225:14: error: assignment to 'char *' from 'int' makes pointer from integer without a cast [-Werror=int-conversion] 225 | ptr1 = basename(exec_copy); | ^ cc1: all warnings being treated as errors make[3]: *** [/git/perf-6.8.0/tools/build/Makefile.build:158: util] Error 2 Fix it by adding the libgen.h header where basename() is prototyped. Fixes: fb7345bbf7fad9bf ("perf probe: Support basic dwarf-based operations on uprobe events") Cc: Masami Hiramatsu Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 2a0ad9ecf0a2..5c12459e9765 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 4376424acd154c455b92695c6e85504601509919 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:05 -0700 Subject: perf vendor events intel: Update cascadelakex to 1.21 Update events from 1.20 to 1.21 as released in: https://github.com/intel/perfmon/commit/fcfdba3be8f3be81ad6b509fdebf953ead92dc2c Largely fixes spelling and descriptions. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json | 10 +++++----- tools/perf/pmu-events/arch/x86/cascadelakex/memory.json | 2 +- tools/perf/pmu-events/arch/x86/cascadelakex/other.json | 2 +- tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json | 2 +- .../arch/x86/cascadelakex/uncore-interconnect.json | 14 +++++++------- .../pmu-events/arch/x86/cascadelakex/virtual-memory.json | 2 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json b/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json index 095904c77001..d6f543471b24 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "Decode Stream Buffer (DSB)-to-MITE switches", "EventCode": "0xAB", "EventName": "DSB2MITE_SWITCHES.COUNT", - "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses.\nNote: Invoking MITE requires two or three cycles delay.", + "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses. Note: Invoking MITE requires two or three cycles delay.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -267,11 +267,11 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.DSB_CYCLES_OK]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.DSB_CYCLES_OK]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", "SampleAfterValue": "2000003", "UMask": "0x18" }, @@ -321,11 +321,11 @@ "UMask": "0x18" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "SampleAfterValue": "2000003", "UMask": "0x18" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json b/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json index a00ad0aaf1ba..c69b2c33334b 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/memory.json @@ -6866,7 +6866,7 @@ "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).", "EventCode": "0xC9", "EventName": "RTM_RETIRED.ABORTED", - "PEBS": "1", + "PEBS": "2", "PublicDescription": "Number of times RTM abort was triggered.", "SampleAfterValue": "2000003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/other.json b/tools/perf/pmu-events/arch/x86/cascadelakex/other.json index 3ab5e91a4c1c..95d42ac36717 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/other.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server michroarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json b/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json index 66d686cc933e..c50ddf5b40dd 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json @@ -396,7 +396,7 @@ "Errata": "SKL091, SKL044", "EventCode": "0xC0", "EventName": "INST_RETIRED.NOP", - "PEBS": "1", + "PEBS": "2", "SampleAfterValue": "2000003", "UMask": "0x2" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json index 1a342dff1503..3fe9ce483bbe 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-interconnect.json @@ -38,7 +38,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CLFLUSH", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x80", "Unit": "IRP" }, @@ -47,7 +47,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CRD", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x2", "Unit": "IRP" }, @@ -56,7 +56,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.DRD", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x4", "Unit": "IRP" }, @@ -65,7 +65,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIDCAHINT", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x20", "Unit": "IRP" }, @@ -74,7 +74,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIRDCUR", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x1", "Unit": "IRP" }, @@ -101,7 +101,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.WBMTOI", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x40", "Unit": "IRP" }, @@ -500,7 +500,7 @@ "EventCode": "0x11", "EventName": "UNC_I_TRANSACTIONS.WRITES", "PerPkg": "1", - "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Trackes only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", + "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Tracks only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", "UMask": "0x2", "Unit": "IRP" }, diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json b/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json index f59405877ae8..73feadaf7674 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/virtual-memory.json @@ -205,7 +205,7 @@ "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake.", "EventCode": "0x85", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake michroarchitecture.", + "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake microarchitecture.", "SampleAfterValue": "100003", "UMask": "0x10" }, diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 5297d25f4e03..281419d2edeb 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -5,7 +5,7 @@ GenuineIntel-6-(1C|26|27|35|36),v5,bonnell,core GenuineIntel-6-(3D|47),v29,broadwell,core GenuineIntel-6-56,v11,broadwellde,core GenuineIntel-6-4F,v22,broadwellx,core -GenuineIntel-6-55-[56789ABCDEF],v1.20,cascadelakex,core +GenuineIntel-6-55-[56789ABCDEF],v1.21,cascadelakex,core GenuineIntel-6-9[6C],v1.04,elkhartlake,core GenuineIntel-6-CF,v1.03,emeraldrapids,core GenuineIntel-6-5[CF],v13,goldmont,core -- cgit v1.2.3 From 36f353a1ebf88280f58d1ebfe2731251d9159456 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:06 -0700 Subject: perf vendor events intel: Update emeraldrapids to 1.06 Update events from 1.03 to 1.96 as released in: https://github.com/intel/perfmon/commit/21a8be3ea7918749141db4036fb65a2343cd865d Fixes spelling and descriptions. Adds cache miss latency events UNC_CHA_TOR_(INSERTS|OCCUPANCY).IO_(PCIRDCUR|ITOM|ITOMCACHENEAR)_(LOCAL|REMOTE). Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/emeraldrapids/frontend.json | 2 +- .../pmu-events/arch/x86/emeraldrapids/memory.json | 1 + .../arch/x86/emeraldrapids/pipeline.json | 3 + .../arch/x86/emeraldrapids/uncore-cache.json | 112 ++++++++++++++++++++- .../x86/emeraldrapids/uncore-interconnect.json | 26 ++--- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 6 files changed, 129 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json index 9e53da55d0c1..93d99318a623 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json @@ -267,7 +267,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json index e8bf7c9c44e1..5420f529f491 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json @@ -264,6 +264,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json index 1f8200fb8964..e2086bedeca8 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json @@ -428,6 +428,7 @@ "BriefDescription": "INST_RETIRED.MACRO_FUSED", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", + "PEBS": "1", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -435,6 +436,7 @@ "BriefDescription": "Retired NOP instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", + "PEBS": "1", "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2" @@ -451,6 +453,7 @@ "BriefDescription": "Iterations of Repeat string retired instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", + "PEBS": "1", "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8" diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json index 86a8f3b7fe1d..141dab46682e 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json @@ -4197,6 +4197,42 @@ "UMask": "0xcd43ff04", "Unit": "CHA" }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437f04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; Misses from local IO", "EventCode": "0x35", @@ -4207,7 +4243,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; ItoM misses from local IO", + "BriefDescription": "TOR Inserts : ItoM, indicating a full cacheline write request, from IO Devices that missed the LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "PerPkg": "1", @@ -4225,7 +4261,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; RdCur and FsRdCur misses from local IO", + "BriefDescription": "TOR Inserts; RdCur and FsRdCur requests from local IO that miss LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR", "PerPkg": "1", @@ -4251,6 +4287,24 @@ "UMask": "0xc8f3ff04", "Unit": "CHA" }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; RFO from local IO", "EventCode": "0x35", @@ -5713,6 +5767,42 @@ "UMask": "0xcd43fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO", "EventCode": "0x36", @@ -5722,6 +5812,24 @@ "UMask": "0xc8f3fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RFO misses from local IO", "EventCode": "0x36", diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json index 65d088556bae..22bb490e9666 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json @@ -4888,7 +4888,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AD)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AD)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AD", "PerPkg": "1", @@ -4897,7 +4897,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AK)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AK)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AK", "PerPkg": "1", @@ -4906,7 +4906,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AKC)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AKC)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AKC", "PerPkg": "1", @@ -4915,7 +4915,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (BL)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (BL)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.BL", "PerPkg": "1", @@ -4924,7 +4924,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (IV)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (IV)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.IV", "PerPkg": "1", @@ -5291,7 +5291,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5300,7 +5300,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5309,7 +5309,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5318,7 +5318,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -5763,7 +5763,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5772,7 +5772,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5781,7 +5781,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5790,7 +5790,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 281419d2edeb..85b89b1098cf 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -7,7 +7,7 @@ GenuineIntel-6-56,v11,broadwellde,core GenuineIntel-6-4F,v22,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.21,cascadelakex,core GenuineIntel-6-9[6C],v1.04,elkhartlake,core -GenuineIntel-6-CF,v1.03,emeraldrapids,core +GenuineIntel-6-CF,v1.06,emeraldrapids,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-B6,v1.01,grandridge,core -- cgit v1.2.3 From a02dc01cef3717720edb97c4ce9bc8f86ee67a83 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:07 -0700 Subject: perf vendor events intel: Update grandridge to 1.02 Update events from 1.01 to 1.02 as released in: https://github.com/intel/perfmon/commit/b2a81e803add1ba0af68a442c975683d226d868c Fixes spelling and descriptions. Adds topdown events and uncore cache UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT, UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT, UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/grandridge/pipeline.json | 43 +++++++++++++++++++--- .../arch/x86/grandridge/uncore-cache.json | 28 +++++++++++++- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 3 files changed, 66 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json index daa0639bb1ca..90292dc03d33 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json @@ -249,10 +249,17 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", "SampleAfterValue": "1000003" }, { @@ -284,7 +291,7 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003" @@ -296,6 +303,12 @@ "SampleAfterValue": "1000003", "UMask": "0x1" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). This could be caused by RSV full or load/store buffer block.", "EventCode": "0x74", @@ -317,6 +330,13 @@ "SampleAfterValue": "1000003", "UMask": "0x20" }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to ROB full", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER", + "SampleAfterValue": "1000003", + "UMask": "0x40" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb", "EventCode": "0x74", @@ -325,11 +345,17 @@ "UMask": "0x10" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL_P]", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", "EventCode": "0x71", @@ -402,12 +428,19 @@ "UMask": "0x4" }, { - "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL_P]", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL]", + "EventCode": "0x72", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of uops issued by the front end every cycle.", "EventCode": "0x0e", diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json index 74dfd9272cef..36614429dd72 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json @@ -5,7 +5,6 @@ "EventName": "UNC_CHACMS_CLOCKTICKS", "PerPkg": "1", "PortMask": "0x000", - "PublicDescription": "UNC_CHACMS_CLOCKTICKS", "Unit": "CHACMS" }, { @@ -1216,6 +1215,15 @@ "UMask": "0xc88fff01", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores", + "UMask": "0xc827ff01", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache", "EventCode": "0x36", @@ -1252,6 +1260,15 @@ "UMask": "0xc88ffd01", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores that hit the LLC", + "UMask": "0xc827fd01", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that hit the cache", "EventCode": "0x36", @@ -1405,6 +1422,15 @@ "UMask": "0xc88efe01", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt issued by iA Cores that missed the LLC", + "UMask": "0xc827fe01", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache", "EventCode": "0x36", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 85b89b1098cf..650c28574576 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -10,7 +10,7 @@ GenuineIntel-6-9[6C],v1.04,elkhartlake,core GenuineIntel-6-CF,v1.06,emeraldrapids,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core -GenuineIntel-6-B6,v1.01,grandridge,core +GenuineIntel-6-B6,v1.02,grandridge,core GenuineIntel-6-A[DE],v1.01,graniterapids,core GenuineIntel-6-(3C|45|46),v35,haswell,core GenuineIntel-6-3F,v28,haswellx,core -- cgit v1.2.3 From 5157c2042eec305929d26acfaea1c237561d3d63 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:08 -0700 Subject: perf vendor events intel: Update icelakex to 1.24 Update events from 1.23 to 1.24 as released in: https://github.com/intel/perfmon/commit/d883888ae60882028e387b6fe1ebf683beb693fa Fixes spelling and descriptions. Adds the uncore events UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL and UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE, while removing UNC_IIO_NUM_REQ_FROM_CPU.IRP. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/icelakex/frontend.json | 2 +- .../perf/pmu-events/arch/x86/icelakex/memory.json | 1 + .../pmu-events/arch/x86/icelakex/uncore-cache.json | 22 +++++++- .../arch/x86/icelakex/uncore-interconnect.json | 64 +++++++++++----------- .../pmu-events/arch/x86/icelakex/uncore-io.json | 11 ---- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 6 files changed, 55 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/icelakex/frontend.json b/tools/perf/pmu-events/arch/x86/icelakex/frontend.json index f6edc4222f42..66669d062e68 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/frontend.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/frontend.json @@ -282,7 +282,7 @@ "CounterMask": "5", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, diff --git a/tools/perf/pmu-events/arch/x86/icelakex/memory.json b/tools/perf/pmu-events/arch/x86/icelakex/memory.json index f36ac04f8d76..875b584b8443 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/memory.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/memory.json @@ -319,6 +319,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json index b6ce14ebf844..a950ba3ddcb4 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-cache.json @@ -1580,7 +1580,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. Refer to new event UNC_CHA_LLC_LOOKUP.CODE_READ", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.CODE", @@ -1677,7 +1677,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. Refer to new event UNC_CHA_LLC_LOOKUP.DATA_READ", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_ALL", @@ -6782,6 +6782,24 @@ "UMask": "0xc8f3ff04", "Unit": "CHA" }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices and targets local memory : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices and targets remote memory : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts : RFOs issued by IO Devices", "EventCode": "0x35", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json index a066a009c511..6997e6f7d366 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json @@ -13523,7 +13523,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -13532,7 +13532,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -13541,7 +13541,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -13550,7 +13550,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -13559,7 +13559,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Request : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Request : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x8", "Unit": "UPI" }, @@ -13568,7 +13568,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Request, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Request, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x108", "Unit": "UPI" }, @@ -13577,7 +13577,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPCNFLT", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Conflict : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Conflict : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x1aa", "Unit": "UPI" }, @@ -13586,7 +13586,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPI", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Invalid : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Invalid : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x12a", "Unit": "UPI" }, @@ -13595,7 +13595,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xc", "Unit": "UPI" }, @@ -13604,7 +13604,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - Data, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10c", "Unit": "UPI" }, @@ -13613,7 +13613,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xa", "Unit": "UPI" }, @@ -13622,7 +13622,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Response - No Data, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10a", "Unit": "UPI" }, @@ -13631,7 +13631,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Snoop : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Snoop : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x9", "Unit": "UPI" }, @@ -13640,7 +13640,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Snoop, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Snoop, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x109", "Unit": "UPI" }, @@ -13649,7 +13649,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Writeback : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Writeback : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xd", "Unit": "UPI" }, @@ -13658,7 +13658,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Writeback, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Writeback, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10d", "Unit": "UPI" }, @@ -14038,7 +14038,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -14047,7 +14047,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -14056,7 +14056,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -14065,7 +14065,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -14074,7 +14074,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Request : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Request : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x8", "Unit": "UPI" }, @@ -14083,7 +14083,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Request, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Request, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x108", "Unit": "UPI" }, @@ -14092,7 +14092,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPCNFLT", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Conflict : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Conflict : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x1aa", "Unit": "UPI" }, @@ -14101,7 +14101,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPI", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Invalid : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Invalid : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x12a", "Unit": "UPI" }, @@ -14110,7 +14110,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xc", "Unit": "UPI" }, @@ -14119,7 +14119,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - Data, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10c", "Unit": "UPI" }, @@ -14128,7 +14128,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xa", "Unit": "UPI" }, @@ -14137,7 +14137,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Response - No Data, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10a", "Unit": "UPI" }, @@ -14146,7 +14146,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x9", "Unit": "UPI" }, @@ -14155,7 +14155,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Snoop, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x109", "Unit": "UPI" }, @@ -14164,7 +14164,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xd", "Unit": "UPI" }, @@ -14173,7 +14173,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Writeback, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10d", "Unit": "UPI" }, diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json index 9cef8862c428..1b8a719b81a5 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-io.json @@ -2476,17 +2476,6 @@ "UMask": "0x10", "Unit": "IIO" }, - { - "BriefDescription": "Number requests sent to PCIe from main die : From IRP", - "EventCode": "0xC2", - "EventName": "UNC_IIO_NUM_REQ_FROM_CPU.IRP", - "FCMask": "0x07", - "PerPkg": "1", - "PortMask": "0xFF", - "PublicDescription": "Number requests sent to PCIe from main die : From IRP : Captures Posted/Non-posted allocations from IRP. i.e. either non-confined P2P traffic or from the CPU", - "UMask": "0x1", - "Unit": "IIO" - }, { "BriefDescription": "Number requests sent to PCIe from main die : From ITC", "EventCode": "0xC2", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 650c28574576..a219dc3bd1ef 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -15,7 +15,7 @@ GenuineIntel-6-A[DE],v1.01,graniterapids,core GenuineIntel-6-(3C|45|46),v35,haswell,core GenuineIntel-6-3F,v28,haswellx,core GenuineIntel-6-7[DE],v1.21,icelake,core -GenuineIntel-6-6[AC],v1.23,icelakex,core +GenuineIntel-6-6[AC],v1.24,icelakex,core GenuineIntel-6-3A,v24,ivybridge,core GenuineIntel-6-3E,v24,ivytown,core GenuineIntel-6-2D,v24,jaketown,core -- cgit v1.2.3 From 3670ffbda1892bdab2328225b614c0be18885fea Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:09 -0700 Subject: perf vendor events intel: Update lunarlake to 1.01 Update events from 1.00 to 1.01 as released in: https://github.com/intel/perfmon/commit/56ab8d837ac566d51a4d8748b6b4b817a22c9b84 Various encoding and description updates. Adds the events CPU_CLK_UNHALTED.CORE, CPU_CLK_UNHALTED.CORE_P, CPU_CLK_UNHALTED.REF_TSC_P, CPU_CLK_UNHALTED.THREAD, MISC_RETIRED.LBR_INSERTS, TOPDOWN_BAD_SPECULATION.ALL_P, TOPDOWN_BE_BOUND.ALL_P, TOPDOWN_FE_BOUND.ALL_P, TOPDOWN_RETIRING.ALL_P. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/x86/lunarlake/cache.json | 24 ++--- .../pmu-events/arch/x86/lunarlake/frontend.json | 2 +- .../perf/pmu-events/arch/x86/lunarlake/memory.json | 4 +- .../perf/pmu-events/arch/x86/lunarlake/other.json | 4 +- .../pmu-events/arch/x86/lunarlake/pipeline.json | 109 ++++++++++++++++++--- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 6 files changed, 113 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json index 1823149067b5..fb48be357c4e 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json @@ -31,7 +31,7 @@ "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.REFERENCE", "PublicDescription": "Counts the number of cacheable memory requests that access the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x4f", "Unit": "cpu_atom" }, @@ -94,7 +94,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x400", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -106,7 +106,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x80", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -118,7 +118,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x10", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -130,7 +130,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x800", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -142,7 +142,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x100", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -154,7 +154,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x20", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -166,7 +166,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x4", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -178,7 +178,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x200", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -190,7 +190,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x40", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -202,7 +202,7 @@ "MSRIndex": "0x3F6", "MSRValue": "0x8", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x5", "Unit": "cpu_atom" }, @@ -212,7 +212,7 @@ "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY", "PEBS": "2", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0x6", "Unit": "cpu_atom" } diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json index 5e4ef81b43d6..3a24934e8d6e 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.\nSoftware can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json index 51d70ba00bd4..9c188d80b7b9 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json @@ -155,7 +155,7 @@ "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x3FBFC00001", + "MSRValue": "0xFE7F8000001", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -175,7 +175,7 @@ "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.L3_MISS", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x3FBFC00002", + "MSRValue": "0xFE7F8000002", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/other.json b/tools/perf/pmu-events/arch/x86/lunarlake/other.json index 69adaed5686d..377f717db6cc 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/other.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/other.json @@ -24,7 +24,7 @@ "EventCode": "0xB7", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x184000001", + "MSRValue": "0x1FBC000001", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_atom" @@ -34,7 +34,7 @@ "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", - "MSRValue": "0x184000001", + "MSRValue": "0x1E780000001", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json index 2bde664fdc0f..2c9f85ec8c4a 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json @@ -38,10 +38,18 @@ { "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles", "EventName": "CPU_CLK_UNHALTED.CORE", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_atom" }, + { + "BriefDescription": "Core cycles when the core is not in a halt state.", + "EventName": "CPU_CLK_UNHALTED.CORE", + "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the programmable counters available for other events.", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, { "BriefDescription": "Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.THREAD_P]", "EventCode": "0x3c", @@ -49,10 +57,18 @@ "SampleAfterValue": "2000003", "Unit": "cpu_atom" }, + { + "BriefDescription": "Thread cycles when thread is not in halt state [This event is alias to CPU_CLK_UNHALTED.THREAD_P]", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.CORE_P", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. [This event is alias to CPU_CLK_UNHALTED.THREAD_P]", + "SampleAfterValue": "2000003", + "Unit": "cpu_core" + }, { "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles", "EventName": "CPU_CLK_UNHALTED.REF_TSC", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x3", "Unit": "cpu_atom" }, @@ -64,6 +80,15 @@ "UMask": "0x3", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts the number of unhalted reference clock cycles", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", + "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses a programmable general purpose performance counter.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Reference cycles when the core is not in halt state.", "EventCode": "0x3c", @@ -74,9 +99,16 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Core cycles when the thread is not in halt state", + "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles", + "EventName": "CPU_CLK_UNHALTED.THREAD", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Core cycles when the thread is not in a halt state.", "EventName": "CPU_CLK_UNHALTED.THREAD", - "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events.", + "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the programmable counters available for other events.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -89,10 +121,10 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Thread cycles when thread is not in halt state", + "BriefDescription": "Thread cycles when thread is not in halt state [This event is alias to CPU_CLK_UNHALTED.CORE_P]", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. [This event is alias to CPU_CLK_UNHALTED.CORE_P]", "SampleAfterValue": "2000003", "Unit": "cpu_core" }, @@ -100,7 +132,7 @@ "BriefDescription": "Fixed Counter: Counts the number of instructions retired", "EventName": "INST_RETIRED.ANY", "PEBS": "1", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_atom" }, @@ -148,11 +180,29 @@ "UMask": "0x82", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL.", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "LBR record is inserted", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, { "BriefDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.\nSoftware can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "10000003", "UMask": "0x2", "Unit": "cpu_core" @@ -175,21 +225,35 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Fixed Counter: Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Fixed Counter: Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the IQ. Also, includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", "SampleAfterValue": "1000003", - "UMask": "0x5", "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0xa4", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0xa4", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, { "BriefDescription": "Fixed Counter: Counts the number of retirement slots not consumed due to front end stalls", "EventName": "TOPDOWN_FE_BOUND.ALL", @@ -198,18 +262,35 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "EventCode": "0x9c", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots.", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003", "UMask": "0x7", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of consumed retirement slots.", + "EventCode": "0xc2", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, { "BriefDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.\nSoftware can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index a219dc3bd1ef..710f8dfefeed 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -20,7 +20,7 @@ GenuineIntel-6-3A,v24,ivybridge,core GenuineIntel-6-3E,v24,ivytown,core GenuineIntel-6-2D,v24,jaketown,core GenuineIntel-6-(57|85),v16,knightslanding,core -GenuineIntel-6-BD,v1.00,lunarlake,core +GenuineIntel-6-BD,v1.01,lunarlake,core GenuineIntel-6-A[AC],v1.07,meteorlake,core GenuineIntel-6-1[AEF],v4,nehalemep,core GenuineIntel-6-2E,v4,nehalemex,core -- cgit v1.2.3 From 84d0e8c6db884733aaf59b7a224ed0dc208afeed Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:10 -0700 Subject: perf vendor events intel: Update meteorlake to 1.08 Update events from 1.07 to 1.08 as released in: https://github.com/intel/perfmon/commit/f0f8f3e163d9eb84e6ce8e2108a22cb43b2527e5 Various description updates. Adds topdown, offcore and uncore events OCR.DEMAND_DATA_RD.L3_HIT, OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD, OCR.DEMAND_RFO.L3_HIT, OCR.DEMAND_DATA_RD.L3_MISS, OCR.DEMAND_RFO.L3_MISS, OCR.DEMAND_DATA_RD.ANY_RESPONSE, OCR.DEMAND_DATA_RD.DRAM, OCR.DEMAND_RFO.ANY_RESPONSE, OCR.DEMAND_RFO.DRAM, TOPDOWN_BAD_SPECULATION.ALL_P, TOPDOWN_BE_BOUND.ALL_P, TOPDOWN_FE_BOUND.ALL_P, TOPDOWN_RETIRING.ALL_P, UNC_ARB_DAT_OCCUPANCY.RD and UNC_HAC_ARB_COH_TRK_REQUESTS.ALL. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../perf/pmu-events/arch/x86/meteorlake/cache.json | 30 +++++++++++++++ .../pmu-events/arch/x86/meteorlake/frontend.json | 4 +- .../pmu-events/arch/x86/meteorlake/memory.json | 20 ++++++++++ .../perf/pmu-events/arch/x86/meteorlake/other.json | 42 ++++++++++++++++++++- .../pmu-events/arch/x86/meteorlake/pipeline.json | 44 ++++++++++++++++++---- .../arch/x86/meteorlake/uncore-interconnect.json | 22 +++++++++-- 7 files changed, 150 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 710f8dfefeed..fedaacbe981a 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -21,7 +21,7 @@ GenuineIntel-6-3E,v24,ivytown,core GenuineIntel-6-2D,v24,jaketown,core GenuineIntel-6-(57|85),v16,knightslanding,core GenuineIntel-6-BD,v1.01,lunarlake,core -GenuineIntel-6-A[AC],v1.07,meteorlake,core +GenuineIntel-6-A[AC],v1.08,meteorlake,core GenuineIntel-6-1[AEF],v4,nehalemep,core GenuineIntel-6-2E,v4,nehalemex,core GenuineIntel-6-A7,v1.02,rocketlake,core diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json index 47861a6dd8e9..af7acb15f661 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json @@ -966,6 +966,16 @@ "UMask": "0x3", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.", "EventCode": "0xB7", @@ -986,6 +996,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x4003C0001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.", "EventCode": "0xB7", @@ -1006,6 +1026,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C0002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.", "EventCode": "0xB7", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json index 9da8689eda81..f3b7b211afb5 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json @@ -378,7 +378,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -455,7 +455,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.\nThe count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. The count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json index a5b83293f157..617d0e255fd5 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json @@ -296,6 +296,16 @@ "UMask": "0x4", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3FBFC00001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", "EventCode": "0x2A,0x2B", @@ -306,6 +316,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3FBFC00002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", "EventCode": "0x2A,0x2B", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/other.json b/tools/perf/pmu-events/arch/x86/meteorlake/other.json index 7effc1f271e7..0bc2cb2eabb3 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/other.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/other.json @@ -17,6 +17,16 @@ "UMask": "0x1", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts demand data reads that have any type of response.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that have any type of response.", "EventCode": "0x2A,0x2B", @@ -27,6 +37,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand data reads that were supplied by DRAM.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.DRAM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x184000001", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand data reads that were supplied by DRAM.", "EventCode": "0x2A,0x2B", @@ -37,6 +57,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", "EventCode": "0x2A,0x2B", @@ -47,6 +77,16 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by DRAM.", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.DRAM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x184000002", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts streaming stores that have any type of response.", "EventCode": "0xB7", @@ -97,7 +137,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state. For Tremont, UMWAIT and TPAUSE will only put the CPU into C0.1 activity state (not C0.2 activity state)", + "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.", "EventCode": "0x75", "EventName": "SERIALIZATION.C01_MS_SCB", "SampleAfterValue": "200003", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json index 24bbfcebd2be..5ff4a7a32250 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json @@ -1067,7 +1067,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.\nThe count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "10000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1116,10 +1116,18 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, @@ -1156,7 +1164,7 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003", @@ -1170,6 +1178,13 @@ "UMask": "0x1", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). This could be caused by RSV full or load/store buffer block.", "EventCode": "0x74", @@ -1211,12 +1226,19 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL_P]", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", "EventCode": "0x71", @@ -1299,13 +1321,21 @@ "Unit": "cpu_atom" }, { - "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL_P]", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, + { + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL]", + "EventCode": "0x72", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003", + "Unit": "cpu_atom" + }, { "BriefDescription": "Number of non dec-by-all uops decoded by decoder", "EventCode": "0x76", @@ -1591,7 +1621,7 @@ "BriefDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.\nSoftware can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", + "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json index 08b5c7574cfc..901d8510f90f 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/uncore-interconnect.json @@ -1,4 +1,20 @@ [ + { + "BriefDescription": "Each cycle counts number of coherent reads pending on data return from memory controller that were issued by any core.", + "EventCode": "0x85", + "EventName": "UNC_ARB_DAT_OCCUPANCY.RD", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "ARB" + }, + { + "BriefDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, etc.", + "EventCode": "0x84", + "EventName": "UNC_HAC_ARB_COH_TRK_REQUESTS.ALL", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "HAC_ARB" + }, { "BriefDescription": "Number of all coherent Data Read entries. Doesn't include prefetches", "EventCode": "0x81", @@ -9,7 +25,7 @@ }, { "BriefDescription": "Number of all CMI transactions", - "EventCode": "0x8a", + "EventCode": "0x8A", "EventName": "UNC_HAC_ARB_TRANSACTIONS.ALL", "PerPkg": "1", "UMask": "0x1", @@ -17,7 +33,7 @@ }, { "BriefDescription": "Number of all CMI reads", - "EventCode": "0x8a", + "EventCode": "0x8A", "EventName": "UNC_HAC_ARB_TRANSACTIONS.READS", "PerPkg": "1", "UMask": "0x2", @@ -25,7 +41,7 @@ }, { "BriefDescription": "Number of all CMI writes not including Mflush", - "EventCode": "0x8a", + "EventCode": "0x8A", "EventName": "UNC_HAC_ARB_TRANSACTIONS.WRITES", "PerPkg": "1", "UMask": "0x4", -- cgit v1.2.3 From 2edee9e666bb1cdb60d5d16f510ad9b51c84c33e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:11 -0700 Subject: perf vendor events intel: Update sapphirerapids to 1.20 Update events from 1.17 to 1.20 as released in: https://github.com/intel/perfmon/commit/6f674057745acf0125395638ca6be36458a59bda Various description updates. Adds uncore events UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL, UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_REMOTE, UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL, UNC_CHA_TOR_INSERTS.IO_ITOM_REMOTE, UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL, UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE, UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_LOCAL, UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_REMOTE, UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_LOCAL, UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_REMOTE, UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_LOCAL, UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_REMOTE and removes core events AMX_OPS_RETIRED.BF16 and AMX_OPS_RETIRED.INT8. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../pmu-events/arch/x86/sapphirerapids/cache.json | 1 + .../arch/x86/sapphirerapids/frontend.json | 2 +- .../pmu-events/arch/x86/sapphirerapids/memory.json | 1 + .../arch/x86/sapphirerapids/pipeline.json | 19 +--- .../arch/x86/sapphirerapids/uncore-cache.json | 112 ++++++++++++++++++++- .../x86/sapphirerapids/uncore-interconnect.json | 26 ++--- 7 files changed, 130 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index fedaacbe981a..e5de35c96358 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -26,7 +26,7 @@ GenuineIntel-6-1[AEF],v4,nehalemep,core GenuineIntel-6-2E,v4,nehalemex,core GenuineIntel-6-A7,v1.02,rocketlake,core GenuineIntel-6-2A,v19,sandybridge,core -GenuineIntel-6-8F,v1.17,sapphirerapids,core +GenuineIntel-6-8F,v1.20,sapphirerapids,core GenuineIntel-6-AF,v1.01,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v58,skylake,core diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json index 9606e76b98d6..b0447aad0dfc 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json @@ -432,6 +432,7 @@ "BriefDescription": "Retired load instructions with remote Intel(R) Optane(TM) DC persistent memory as the data source where the data request missed all caches.", "EventCode": "0xd3", "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM", + "PEBS": "1", "PublicDescription": "Counts retired load instructions with remote Intel(R) Optane(TM) DC persistent memory as the data source and the data request missed L3.", "SampleAfterValue": "100007", "UMask": "0x10" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json index 9e53da55d0c1..93d99318a623 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json @@ -267,7 +267,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json index e8bf7c9c44e1..5420f529f491 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json @@ -264,6 +264,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json index 2cfe814d2015..e2086bedeca8 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json @@ -1,20 +1,4 @@ [ - { - "BriefDescription": "AMX retired arithmetic BF16 operations.", - "EventCode": "0xce", - "EventName": "AMX_OPS_RETIRED.BF16", - "PublicDescription": "Number of AMX-based retired arithmetic bfloat16 (BF16) floating-point operations. Counts TDPBF16PS FP instructions. SW to use operation multiplier of 4", - "SampleAfterValue": "1000003", - "UMask": "0x2" - }, - { - "BriefDescription": "AMX retired arithmetic integer 8-bit operations.", - "EventCode": "0xce", - "EventName": "AMX_OPS_RETIRED.INT8", - "PublicDescription": "Number of AMX-based retired arithmetic integer operations of 8-bit width source operands. Counts TDPB[SS,UU,US,SU]D instructions. SW should use operation multiplier of 8.", - "SampleAfterValue": "1000003", - "UMask": "0x1" - }, { "BriefDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE", "CounterMask": "1", @@ -444,6 +428,7 @@ "BriefDescription": "INST_RETIRED.MACRO_FUSED", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", + "PEBS": "1", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -451,6 +436,7 @@ "BriefDescription": "Retired NOP instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", + "PEBS": "1", "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2" @@ -467,6 +453,7 @@ "BriefDescription": "Iterations of Repeat string retired instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", + "PEBS": "1", "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json index cf6fa70f37c1..25a2b9695135 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json @@ -4143,6 +4143,42 @@ "UMask": "0xcd43ff04", "Unit": "CHA" }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear (partial write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437f04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM (write) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; Misses from local IO", "EventCode": "0x35", @@ -4153,7 +4189,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; ItoM misses from local IO", + "BriefDescription": "TOR Inserts : ItoM, indicating a full cacheline write request, from IO Devices that missed the LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "PerPkg": "1", @@ -4171,7 +4207,7 @@ "Unit": "CHA" }, { - "BriefDescription": "TOR Inserts; RdCur and FsRdCur misses from local IO", + "BriefDescription": "TOR Inserts; RdCur and FsRdCur requests from local IO that miss LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR", "PerPkg": "1", @@ -4197,6 +4233,24 @@ "UMask": "0xc8f3ff04", "Unit": "CHA" }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on a remote socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCUR (read) transactions from an IO device that addresses memory on the local socket", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37f04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Inserts; RFO from local IO", "EventCode": "0x35", @@ -5565,6 +5619,42 @@ "UMask": "0xcd43fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcd437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; ITOM misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xcc437e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO", "EventCode": "0x36", @@ -5574,6 +5664,24 @@ "UMask": "0xc8f3fe04", "Unit": "CHA" }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets local memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f2fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy; RdCur and FsRdCur misses from local IO and targets remote memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", + "UMask": "0xc8f37e04", + "Unit": "CHA" + }, { "BriefDescription": "TOR Occupancy; RFO misses from local IO", "EventCode": "0x36", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json index 65d088556bae..22bb490e9666 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json @@ -4888,7 +4888,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AD)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AD)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AD", "PerPkg": "1", @@ -4897,7 +4897,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AK)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AK)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AK", "PerPkg": "1", @@ -4906,7 +4906,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (AKC)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (AKC)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.AKC", "PerPkg": "1", @@ -4915,7 +4915,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (BL)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (BL)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.BL", "PerPkg": "1", @@ -4924,7 +4924,7 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO\r\nIngress (V-EMIB) (IV)", + "BriefDescription": "Number of cycles incoming messages from the vertical ring that are bounced at the SBO Ingress (V-EMIB) (IV)", "EventCode": "0x4B", "EventName": "UNC_MDF_CRS_TxR_V_BOUNCES.IV", "PerPkg": "1", @@ -5291,7 +5291,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5300,7 +5300,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5309,7 +5309,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5318,7 +5318,7 @@ "EventCode": "0x05", "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Receive path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, @@ -5763,7 +5763,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xe", "Unit": "UPI" }, @@ -5772,7 +5772,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10e", "Unit": "UPI" }, @@ -5781,7 +5781,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0xf", "Unit": "UPI" }, @@ -5790,7 +5790,7 @@ "EventCode": "0x04", "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC", "PerPkg": "1", - "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port.\r\nMatch based on UMask specific bits:\r\nZ: Message Class (3-bit)\r\nY: Message Class Enable\r\nW: Opcode (4-bit)\r\nV: Opcode Enable\r\nU: Local Enable\r\nT: Remote Enable\r\nS: Data Hdr Enable\r\nR: Non-Data Hdr Enable\r\nQ: Dual Slot Hdr Enable\r\nP: Single Slot Hdr Enable\r\nLink Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases.\r\nNote: If Message Class is disabled, we expect opcode to also be disabled.", + "PublicDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode : Matches on Transmit path of a UPI port. Match based on UMask specific bits: Z: Message Class (3-bit) Y: Message Class Enable W: Opcode (4-bit) V: Opcode Enable U: Local Enable T: Remote Enable S: Data Hdr Enable R: Non-Data Hdr Enable Q: Dual Slot Hdr Enable P: Single Slot Hdr Enable Link Layer control types are excluded (LL CTRL, slot NULL, LLCRD) even under specific opcode match_en cases. Note: If Message Class is disabled, we expect opcode to also be disabled.", "UMask": "0x10f", "Unit": "UPI" }, -- cgit v1.2.3 From bf270b15c0e73bbd150f9010f601257952279d5c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:12 -0700 Subject: perf vendor events intel: Update sierraforest to 1.02 Update events from 1.01 to 1.02 as released in: https://github.com/intel/perfmon/commit/451dd41ae627b56433ad4065bf3632789eb70834 Various description updates. Adds topdown events TOPDOWN_BAD_SPECULATION.ALL_P, TOPDOWN_BE_BOUND.ALL_P, TOPDOWN_FE_BOUND.ALL_P and TOPDOWN_RETIRING.ALL_P. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../pmu-events/arch/x86/sierraforest/pipeline.json | 36 +++++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index e5de35c96358..ac32377ab01a 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -27,7 +27,7 @@ GenuineIntel-6-2E,v4,nehalemex,core GenuineIntel-6-A7,v1.02,rocketlake,core GenuineIntel-6-2A,v19,sandybridge,core GenuineIntel-6-8F,v1.20,sapphirerapids,core -GenuineIntel-6-AF,v1.01,sierraforest,core +GenuineIntel-6-AF,v1.02,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v58,skylake,core GenuineIntel-6-55-[01234],v1.32,skylakex,core diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json index ba9843110f07..90292dc03d33 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json @@ -249,10 +249,17 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", - "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL_P]", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL_P", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear. [This event is alias to TOPDOWN_BAD_SPECULATION.ALL]", "SampleAfterValue": "1000003" }, { @@ -284,7 +291,7 @@ "UMask": "0x1" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003" @@ -296,6 +303,12 @@ "SampleAfterValue": "1000003", "UMask": "0x1" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls [This event is alias to TOPDOWN_BE_BOUND.ALL]", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). This could be caused by RSV full or load/store buffer block.", "EventCode": "0x74", @@ -332,11 +345,17 @@ "UMask": "0x10" }, { - "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL_P]", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls [This event is alias to TOPDOWN_FE_BOUND.ALL]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ALL_P", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", "EventCode": "0x71", @@ -409,12 +428,19 @@ "UMask": "0x4" }, { - "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL_P]", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003" }, + { + "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL [This event is alias to TOPDOWN_RETIRING.ALL]", + "EventCode": "0x72", + "EventName": "TOPDOWN_RETIRING.ALL_P", + "PEBS": "1", + "SampleAfterValue": "1000003" + }, { "BriefDescription": "Counts the number of uops issued by the front end every cycle.", "EventCode": "0x0e", -- cgit v1.2.3 From d70cc755caef9c6f0ed05c9346df052decf250bd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:13 -0700 Subject: perf vendor events intel: Update skylakex to 1.33 Update events from 1.32 to 1.33 as released in: https://github.com/intel/perfmon/commit/3fe7390dd18496c35ec3a9cf17de0473fd5485cb Various description updates. Adds the event OFFCORE_RESPONSE.ALL_READS.L3_HIT.HIT_OTHER_CORE_FWD. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- tools/perf/pmu-events/arch/x86/skylakex/cache.json | 9 +++++++++ tools/perf/pmu-events/arch/x86/skylakex/frontend.json | 10 +++++----- tools/perf/pmu-events/arch/x86/skylakex/memory.json | 2 +- tools/perf/pmu-events/arch/x86/skylakex/other.json | 2 +- tools/perf/pmu-events/arch/x86/skylakex/pipeline.json | 2 +- .../pmu-events/arch/x86/skylakex/uncore-interconnect.json | 14 +++++++------- tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json | 2 +- .../perf/pmu-events/arch/x86/skylakex/virtual-memory.json | 2 +- 9 files changed, 27 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index ac32377ab01a..e67e7906b2cf 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -30,7 +30,7 @@ GenuineIntel-6-8F,v1.20,sapphirerapids,core GenuineIntel-6-AF,v1.02,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v58,skylake,core -GenuineIntel-6-55-[01234],v1.32,skylakex,core +GenuineIntel-6-55-[01234],v1.33,skylakex,core GenuineIntel-6-86,v1.21,snowridgex,core GenuineIntel-6-8[CD],v1.15,tigerlake,core GenuineIntel-6-2C,v5,westmereep-dp,core diff --git a/tools/perf/pmu-events/arch/x86/skylakex/cache.json b/tools/perf/pmu-events/arch/x86/skylakex/cache.json index d28d8822a51a..14229f4b29d8 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/cache.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/cache.json @@ -763,6 +763,15 @@ "SampleAfterValue": "100003", "UMask": "0x1" }, + { + "BriefDescription": "OFFCORE_RESPONSE.ALL_READS.L3_HIT.HIT_OTHER_CORE_FWD hit in the L3 and the snoop to one of the sibling cores hits the line in E/S/F state and the line is forwarded.", + "EventCode": "0xB7, 0xBB", + "EventName": "OFFCORE_RESPONSE.ALL_READS.L3_HIT.HIT_OTHER_CORE_FWD", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x8003C07F7", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, { "BriefDescription": "Counts all demand & prefetch RFOs that have any response type.", "EventCode": "0xB7, 0xBB", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/frontend.json b/tools/perf/pmu-events/arch/x86/skylakex/frontend.json index 095904c77001..d6f543471b24 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/frontend.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "Decode Stream Buffer (DSB)-to-MITE switches", "EventCode": "0xAB", "EventName": "DSB2MITE_SWITCHES.COUNT", - "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses.\nNote: Invoking MITE requires two or three cycles delay.", + "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses. Note: Invoking MITE requires two or three cycles delay.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -267,11 +267,11 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.DSB_CYCLES_OK]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.DSB_CYCLES_OK]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", "SampleAfterValue": "2000003", "UMask": "0x18" }, @@ -321,11 +321,11 @@ "UMask": "0x18" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "SampleAfterValue": "2000003", "UMask": "0x18" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/memory.json b/tools/perf/pmu-events/arch/x86/skylakex/memory.json index 2b797dbc75fe..dba3cd6b3690 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/memory.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/memory.json @@ -864,7 +864,7 @@ "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).", "EventCode": "0xC9", "EventName": "RTM_RETIRED.ABORTED", - "PEBS": "1", + "PEBS": "2", "PublicDescription": "Number of times RTM abort was triggered.", "SampleAfterValue": "2000003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/skylakex/other.json b/tools/perf/pmu-events/arch/x86/skylakex/other.json index cda8a7a45f0c..2511d722327a 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/other.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server michroarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json index 66d686cc933e..c50ddf5b40dd 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json @@ -396,7 +396,7 @@ "Errata": "SKL091, SKL044", "EventCode": "0xC0", "EventName": "INST_RETIRED.NOP", - "PEBS": "1", + "PEBS": "2", "SampleAfterValue": "2000003", "UMask": "0x2" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json index 3eece8a728b5..f32d4d9d283a 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-interconnect.json @@ -38,7 +38,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CLFLUSH", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x80", "Unit": "IRP" }, @@ -47,7 +47,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CRD", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x2", "Unit": "IRP" }, @@ -56,7 +56,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.DRD", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x4", "Unit": "IRP" }, @@ -65,7 +65,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIDCAHINT", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x20", "Unit": "IRP" }, @@ -74,7 +74,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.PCIRDCUR", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x1", "Unit": "IRP" }, @@ -101,7 +101,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.WBMTOI", "PerPkg": "1", - "PublicDescription": "Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Counts the number of coherency related operations serviced by the IRP", "UMask": "0x40", "Unit": "IRP" }, @@ -500,7 +500,7 @@ "EventCode": "0x11", "EventName": "UNC_I_TRANSACTIONS.WRITES", "PerPkg": "1", - "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Trackes only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", + "PublicDescription": "Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID.; Tracks only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", "UMask": "0x2", "Unit": "IRP" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json index 2a3a709018bb..743c91f3d2f0 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-io.json @@ -34,7 +34,7 @@ "EventCode": "0x1", "EventName": "UNC_IIO_CLOCKTICKS", "PerPkg": "1", - "PublicDescription": "Counts clockticks of the 1GHz trafiic controller clock in the IIO unit.", + "PublicDescription": "Counts clockticks of the 1GHz traffic controller clock in the IIO unit.", "Unit": "IIO" }, { diff --git a/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json b/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json index f59405877ae8..73feadaf7674 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json @@ -205,7 +205,7 @@ "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake.", "EventCode": "0x85", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake michroarchitecture.", + "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake microarchitecture.", "SampleAfterValue": "100003", "UMask": "0x10" }, -- cgit v1.2.3 From 70e7028c5b94c8d71b8df27ab663cdd72185af44 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:14 -0700 Subject: perf vendor events intel: Update skylake to v58 Update events from: https://github.com/intel/perfmon/commit/f2e5136e062a91ae554dc40530132e66f9271848 This change didn't increase the version number from v58. Updates various descriptions. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/skylake/frontend.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/skylake/frontend.json b/tools/perf/pmu-events/arch/x86/skylake/frontend.json index 095904c77001..d6f543471b24 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/skylake/frontend.json @@ -19,7 +19,7 @@ "BriefDescription": "Decode Stream Buffer (DSB)-to-MITE switches", "EventCode": "0xAB", "EventName": "DSB2MITE_SWITCHES.COUNT", - "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses.\nNote: Invoking MITE requires two or three cycles delay.", + "PublicDescription": "This event counts the number of the Decode Stream Buffer (DSB)-to-MITE switches including all misses because of missing Decode Stream Buffer (DSB) cache and u-arch forced misses. Note: Invoking MITE requires two or three cycles delay.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -267,11 +267,11 @@ "UMask": "0x4" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.DSB_CYCLES_OK]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.DSB_CYCLES_OK]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.DSB_CYCLES_OK]", "SampleAfterValue": "2000003", "UMask": "0x18" }, @@ -321,11 +321,11 @@ "UMask": "0x18" }, { - "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 or more Uops [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "CounterMask": "4", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", + "PublicDescription": "Counts the number of cycles 4 or more uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ. [This event is alias to IDQ.ALL_DSB_CYCLES_4_UOPS]", "SampleAfterValue": "2000003", "UMask": "0x18" }, -- cgit v1.2.3 From 7bce27f8d33ac370f74a2f9c36608c2ce7cd2fa7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:15 -0700 Subject: perf vendor events intel: Update snowridgex to 1.22 Update events from 1.21 to 1.22 as released in: https://github.com/intel/perfmon/commit/ba4f96039f96231b51e3eb69d5a21e2b00f6de5b Updates various descriptions and removes the event UNC_IIO_NUM_REQ_FROM_CPU.IRP. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json | 4 ++-- .../pmu-events/arch/x86/snowridgex/uncore-interconnect.json | 6 +++--- tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json | 11 ----------- 4 files changed, 6 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index e67e7906b2cf..c372e3594a69 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -31,7 +31,7 @@ GenuineIntel-6-AF,v1.02,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v58,skylake,core GenuineIntel-6-55-[01234],v1.33,skylakex,core -GenuineIntel-6-86,v1.21,snowridgex,core +GenuineIntel-6-86,v1.22,snowridgex,core GenuineIntel-6-8[CD],v1.15,tigerlake,core GenuineIntel-6-2C,v5,westmereep-dp,core GenuineIntel-6-25,v4,westmereep-sp,core diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json index a68a5bb05c22..4090e4da1bd0 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json @@ -1444,7 +1444,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. Refer to new event UNC_CHA_LLC_LOOKUP.DATA_READ_LOCAL", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.DMND_READ_LOCAL", @@ -1638,7 +1638,7 @@ "Unit": "CHA" }, { - "BriefDescription": "This event is deprecated. Refer to new event UNC_CHA_LLC_LOOKUP.RFO_LOCAL", + "BriefDescription": "This event is deprecated.", "Deprecated": "1", "EventCode": "0x34", "EventName": "UNC_CHA_LLC_LOOKUP.RFO_PREF_LOCAL", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json index 7e2895f7fe3d..7cc3635b118b 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json @@ -38,7 +38,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CLFLUSH", "PerPkg": "1", - "PublicDescription": "Coherent Ops : CLFlush : Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Coherent Ops : CLFlush : Counts the number of coherency related operations serviced by the IRP", "UMask": "0x80", "Unit": "IRP" }, @@ -65,7 +65,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.WBMTOI", "PerPkg": "1", - "PublicDescription": "Coherent Ops : WbMtoI : Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Coherent Ops : WbMtoI : Counts the number of coherency related operations serviced by the IRP", "UMask": "0x40", "Unit": "IRP" }, @@ -454,7 +454,7 @@ "EventCode": "0x11", "EventName": "UNC_I_TRANSACTIONS.WRITES", "PerPkg": "1", - "PublicDescription": "Inbound Transaction Count : Writes : Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID. : Trackes only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", + "PublicDescription": "Inbound Transaction Count : Writes : Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID. : Tracks only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", "UMask": "0x2", "Unit": "IRP" }, diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json index ecdd6f0f8e8f..de156e499f56 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json @@ -2505,17 +2505,6 @@ "UMask": "0x10", "Unit": "IIO" }, - { - "BriefDescription": "Number requests sent to PCIe from main die : From IRP", - "EventCode": "0xC2", - "EventName": "UNC_IIO_NUM_REQ_FROM_CPU.IRP", - "FCMask": "0x07", - "PerPkg": "1", - "PortMask": "0xFF", - "PublicDescription": "Number requests sent to PCIe from main die : From IRP : Captures Posted/Non-posted allocations from IRP. i.e. either non-confined P2P traffic or from the CPU", - "UMask": "0x1", - "Unit": "IIO" - }, { "BriefDescription": "Number requests sent to PCIe from main die : From ITC", "EventCode": "0xC2", -- cgit v1.2.3 From af34a16d3090f8a2f7eac24290209dacb9471132 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 23:00:16 -0700 Subject: perf vendor events intel: Remove info metrics erroneously in TopdownL1 Bug affected server metrics only. This doesn't impact default metrics but if the TopdownL1 metric group is specified. Passes on the fix in: https://github.com/intel/perfmon/commit/b09f0a3953234ec592b4a872b87764c78da05d8b Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Weilin Wang Link: https://lore.kernel.org/r/20240321060016.1464787-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/broadwellx/bdx-metrics.json | 35 +++--- .../arch/x86/cascadelakex/clx-metrics.json | 85 ++++++--------- .../pmu-events/arch/x86/haswellx/hsx-metrics.json | 35 +++--- .../pmu-events/arch/x86/icelakex/icx-metrics.json | 95 +++++++--------- .../arch/x86/sapphirerapids/spr-metrics.json | 119 ++++++++------------- .../pmu-events/arch/x86/skylakex/skx-metrics.json | 85 ++++++--------- 6 files changed, 181 insertions(+), 273 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index f2d378c9d68f..0aed533da882 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -732,9 +732,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "", @@ -745,9 +744,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -764,9 +762,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -807,9 +804,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -838,16 +834,14 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -867,9 +861,8 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index 7f88b156f73b..297046818efe 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -670,23 +670,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))", - "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1045,9 +1042,8 @@ { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", @@ -1088,9 +1084,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1107,9 +1102,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1132,23 +1126,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -1189,9 +1180,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1202,9 +1192,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -1233,16 +1222,14 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1253,9 +1240,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "Un-cacheable retired load per kilo instruction", @@ -1273,16 +1259,14 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1313,9 +1297,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index 21e2cb5e3178..83d50d80a148 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -618,9 +618,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "", @@ -631,9 +630,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -650,9 +648,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", @@ -669,9 +666,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -700,16 +696,14 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -729,9 +723,8 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index c015b8277dc7..769ba12bef87 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -667,23 +667,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))", - "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1045,16 +1042,14 @@ { "BriefDescription": "\"Bus lock\" per kilo instruction", "MetricExpr": "tma_info_memory_mix_bus_lock_pki", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_bus_lock_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_bus_lock_pki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", @@ -1095,9 +1090,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1114,9 +1108,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1139,23 +1132,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", @@ -1190,9 +1180,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1203,9 +1192,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -1240,23 +1228,20 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l3_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l3_miss_latency" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1267,9 +1252,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "\"Bus lock\" per kilo instruction", @@ -1293,16 +1277,14 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1332,9 +1314,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 6f0e6360e989..f8c0eac8b828 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -727,23 +727,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0) + 0 * slots", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots)) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))", - "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1113,16 +1110,14 @@ { "BriefDescription": "\"Bus lock\" per kilo instruction", "MetricExpr": "tma_info_memory_mix_bus_lock_pki", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_bus_lock_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_bus_lock_pki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", @@ -1163,9 +1158,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1182,9 +1176,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1207,23 +1200,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -1264,9 +1254,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1277,9 +1266,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -1314,23 +1302,20 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l3_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l3_miss_latency" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1341,9 +1326,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "\"Bus lock\" per kilo instruction", @@ -1385,53 +1369,46 @@ { "BriefDescription": "Off-core accesses per kilo instruction for modified write requests", "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / INST_RETIRED.ANY", - "MetricGroup": "Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_offcore_mwrite_any_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Offcore", + "MetricName": "tma_info_memory_offcore_mwrite_any_pki" }, { "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / INST_RETIRED.ANY", - "MetricGroup": "CacheHits;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_offcore_read_any_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "CacheHits;Offcore", + "MetricName": "tma_info_memory_offcore_read_any_pki" }, { "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_offcore_read_l3m_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Offcore", + "MetricName": "tma_info_memory_offcore_read_l3m_pki" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket", "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "tma_info_memory_r2c_dram_bw", - "MetricgroupNoGroup": "TopdownL1", "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW." }, { "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "tma_info_memory_r2c_l3m_bw", - "MetricgroupNoGroup": "TopdownL1", "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW." }, { "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", "MetricName": "tma_info_memory_r2c_offcore_bw", - "MetricgroupNoGroup": "TopdownL1", "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches." }, { @@ -1458,9 +1435,8 @@ { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1490,9 +1466,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 025e836a1c80..8126f952a30c 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -652,23 +652,20 @@ { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_core_bound_likely", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_core_bound_likely" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))", - "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_dsb_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_botlnk_dsb_misses" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", - "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", - "MetricName": "tma_info_botlnk_ic_misses", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_botlnk_ic_misses" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", @@ -1021,9 +1018,8 @@ { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", - "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_code_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_code_stlb_mpki" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", @@ -1064,9 +1060,8 @@ { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "tma_info_memory_latency_data_l2_mlp", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_data_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_data_l2_mlp" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", @@ -1083,9 +1078,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", @@ -1108,23 +1102,20 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki" }, { "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", - "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l2_evictions_silent_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_l2_evictions_silent_pki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", @@ -1165,9 +1156,8 @@ { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_access_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t" }, { "BriefDescription": "", @@ -1178,9 +1168,8 @@ { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", - "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t" }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", @@ -1209,16 +1198,14 @@ { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_miss_latency", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_l2_mlp", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_load_l2_mlp" }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", @@ -1229,9 +1216,8 @@ { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_load_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_load_stlb_mpki" }, { "BriefDescription": "Un-cacheable retired load per kilo instruction", @@ -1249,16 +1235,14 @@ { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_page_walks_utilization", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_page_walks_utilization" }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", - "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_store_stlb_mpki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_store_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1289,9 +1273,8 @@ { "BriefDescription": "Un-cacheable retired load per kilo instruction", "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", - "MetricGroup": "Mem;TopdownL1;tma_L1_group", - "MetricName": "tma_info_memory_uc_load_pki", - "MetricgroupNoGroup": "TopdownL1" + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_uc_load_pki" }, { "BriefDescription": "", -- cgit v1.2.3 From 2a5049b75d22c971e73501784f10548c1d69c407 Mon Sep 17 00:00:00 2001 From: Anne Macedo Date: Tue, 19 Mar 2024 14:36:26 +0000 Subject: perf lock contention: Trim backtrace by skipping traceiter functions The 'perf lock contention' program currently shows the caller of the locks as __traceiter_contention_begin+0x??. This caller can be ignored, as it is from the traceiter itself. Instead, it should show the real callers for the locks. When fiddling with the --stack-skip parameter, the actual callers for the locks start to show up. However, just ignore the __traceiter_contention_begin and the __traceiter_contention_end symbols so the actual callers will show up. Before this patch is applied: sudo perf lock con -a -b -- sleep 3 contended total wait max wait avg wait type caller 8 2.33 s 2.28 s 291.18 ms rwlock:W __traceiter_contention_begin+0x44 4 2.33 s 2.28 s 582.35 ms rwlock:W __traceiter_contention_begin+0x44 7 140.30 ms 46.77 ms 20.04 ms rwlock:W __traceiter_contention_begin+0x44 2 63.35 ms 33.76 ms 31.68 ms mutex trace_contention_begin+0x84 2 46.74 ms 46.73 ms 23.37 ms rwlock:W __traceiter_contention_begin+0x44 1 13.54 us 13.54 us 13.54 us mutex trace_contention_begin+0x84 1 3.67 us 3.67 us 3.67 us rwsem:R __traceiter_contention_begin+0x44 Before this patch is applied - using --stack-skip 5 sudo perf lock con --stack-skip 5 -a -b -- sleep 3 contended total wait max wait avg wait type caller 2 2.24 s 2.24 s 1.12 s rwlock:W do_epoll_wait+0x5a0 4 1.65 s 824.21 ms 412.08 ms rwlock:W do_exit+0x338 2 824.35 ms 824.29 ms 412.17 ms spinlock get_signal+0x108 2 824.14 ms 824.14 ms 412.07 ms rwlock:W release_task+0x68 1 25.22 ms 25.22 ms 25.22 ms mutex cgroup_kn_lock_live+0x58 1 24.71 us 24.71 us 24.71 us spinlock do_exit+0x44 1 22.04 us 22.04 us 22.04 us rwsem:R lock_mm_and_find_vma+0xb0 After this patch is applied: sudo ./perf lock con -a -b -- sleep 3 contended total wait max wait avg wait type caller 4 4.13 s 2.07 s 1.03 s rwlock:W release_task+0x68 2 2.07 s 2.07 s 1.03 s rwlock:R mm_update_next_owner+0x50 2 2.07 s 2.07 s 1.03 s rwlock:W do_exit+0x338 1 41.56 ms 41.56 ms 41.56 ms mutex cgroup_kn_lock_live+0x58 2 36.12 us 18.83 us 18.06 us rwlock:W do_exit+0x338 Signed-off-by: Anne Macedo Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240319143629.3422590-1-retpolanne@posteo.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 24 ++++++++++++++++++++++++ tools/perf/util/machine.h | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 527517db3182..5eb9044bc223 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -3266,6 +3266,17 @@ bool machine__is_lock_function(struct machine *machine, u64 addr) sym = machine__find_kernel_symbol_by_name(machine, "__lock_text_end", &kmap); machine->lock.text_end = map__unmap_ip(kmap, sym->start); + + sym = machine__find_kernel_symbol_by_name(machine, "__traceiter_contention_begin", &kmap); + if (sym) { + machine->traceiter.text_start = map__unmap_ip(kmap, sym->start); + machine->traceiter.text_end = map__unmap_ip(kmap, sym->end); + } + sym = machine__find_kernel_symbol_by_name(machine, "trace_contention_begin", &kmap); + if (sym) { + machine->trace.text_start = map__unmap_ip(kmap, sym->start); + machine->trace.text_end = map__unmap_ip(kmap, sym->end); + } } /* failed to get kernel symbols */ @@ -3280,5 +3291,18 @@ bool machine__is_lock_function(struct machine *machine, u64 addr) if (machine->lock.text_start <= addr && addr < machine->lock.text_end) return true; + /* traceiter functions currently don't have their own section + * but we consider them lock functions + */ + if (machine->traceiter.text_start != 0) { + if (machine->traceiter.text_start <= addr && addr < machine->traceiter.text_end) + return true; + } + + if (machine->trace.text_start != 0) { + if (machine->trace.text_start <= addr && addr < machine->trace.text_end) + return true; + } + return false; } diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index e28c787616fe..4312f6db6de0 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -49,7 +49,7 @@ struct machine { struct { u64 text_start; u64 text_end; - } sched, lock; + } sched, lock, traceiter, trace; pid_t *current_tid; size_t current_tid_sz; union { /* Tool specific area */ -- cgit v1.2.3 From b3ad832d8da583ff4237b04a1ba23cdbf8918907 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 21 Mar 2024 09:02:48 -0700 Subject: perf dso: Reorder members to save space in 'struct dso' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Save 40 bytes and move from 8 to 7 cache lines. Make member dwfl dependent on being a powerpc build. Squeeze bits of int/enum types when appropriate. Remove holes/padding by reordering variables. Before: struct dso { struct mutex lock; /* 0 40 */ struct list_head node; /* 40 16 */ struct rb_node rb_node __attribute__((__aligned__(8))); /* 56 24 */ /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ struct rb_root * root; /* 80 8 */ struct rb_root_cached symbols; /* 88 16 */ struct symbol * * symbol_names; /* 104 8 */ size_t symbol_names_len; /* 112 8 */ struct rb_root_cached inlined_nodes; /* 120 16 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ struct rb_root_cached srclines; /* 136 16 */ struct { u64 addr; /* 152 8 */ struct symbol * symbol; /* 160 8 */ } last_find_result; /* 152 16 */ void * a2l; /* 168 8 */ char * symsrc_filename; /* 176 8 */ unsigned int a2l_fails; /* 184 4 */ enum dso_space_type kernel; /* 188 4 */ /* --- cacheline 3 boundary (192 bytes) --- */ _Bool is_kmod; /* 192 1 */ /* XXX 3 bytes hole, try to pack */ enum dso_swap_type needs_swap; /* 196 4 */ enum dso_binary_type symtab_type; /* 200 4 */ enum dso_binary_type binary_type; /* 204 4 */ enum dso_load_errno load_errno; /* 208 4 */ u8 adjust_symbols:1; /* 212: 0 1 */ u8 has_build_id:1; /* 212: 1 1 */ u8 header_build_id:1; /* 212: 2 1 */ u8 has_srcline:1; /* 212: 3 1 */ u8 hit:1; /* 212: 4 1 */ u8 annotate_warned:1; /* 212: 5 1 */ u8 auxtrace_warned:1; /* 212: 6 1 */ u8 short_name_allocated:1; /* 212: 7 1 */ u8 long_name_allocated:1; /* 213: 0 1 */ u8 is_64_bit:1; /* 213: 1 1 */ /* XXX 6 bits hole, try to pack */ _Bool sorted_by_name; /* 214 1 */ _Bool loaded; /* 215 1 */ u8 rel; /* 216 1 */ /* XXX 7 bytes hole, try to pack */ struct build_id bid; /* 224 32 */ /* --- cacheline 4 boundary (256 bytes) --- */ u64 text_offset; /* 256 8 */ u64 text_end; /* 264 8 */ const char * short_name; /* 272 8 */ const char * long_name; /* 280 8 */ u16 long_name_len; /* 288 2 */ u16 short_name_len; /* 290 2 */ /* XXX 4 bytes hole, try to pack */ void * dwfl; /* 296 8 */ struct auxtrace_cache * auxtrace_cache; /* 304 8 */ int comp; /* 312 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 5 boundary (320 bytes) --- */ struct { struct rb_root cache; /* 320 8 */ int fd; /* 328 4 */ int status; /* 332 4 */ u32 status_seen; /* 336 4 */ /* XXX 4 bytes hole, try to pack */ u64 file_size; /* 344 8 */ struct list_head open_entry; /* 352 16 */ u64 elf_base_addr; /* 368 8 */ u64 debug_frame_offset; /* 376 8 */ /* --- cacheline 6 boundary (384 bytes) --- */ u64 eh_frame_hdr_addr; /* 384 8 */ u64 eh_frame_hdr_offset; /* 392 8 */ } data; /* 320 80 */ struct { u32 id; /* 400 4 */ u32 sub_id; /* 404 4 */ struct perf_env * env; /* 408 8 */ } bpf_prog; /* 400 16 */ union { void * priv; /* 416 8 */ u64 db_id; /* 416 8 */ }; /* 416 8 */ struct nsinfo * nsinfo; /* 424 8 */ struct dso_id id; /* 432 24 */ /* --- cacheline 7 boundary (448 bytes) was 8 bytes ago --- */ refcount_t refcnt; /* 456 4 */ char name[]; /* 460 0 */ /* size: 464, cachelines: 8, members: 49 */ /* sum members: 440, holes: 4, sum holes: 18 */ /* sum bitfield members: 10 bits, bit holes: 1, sum bit holes: 6 bits */ /* padding: 4 */ /* forced alignments: 1 */ /* last cacheline: 16 bytes */ } __attribute__((__aligned__(8))); After: struct dso { struct mutex lock; /* 0 40 */ struct list_head node; /* 40 16 */ struct rb_node rb_node __attribute__((__aligned__(8))); /* 56 24 */ /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ struct rb_root * root; /* 80 8 */ struct rb_root_cached symbols; /* 88 16 */ struct symbol * * symbol_names; /* 104 8 */ size_t symbol_names_len; /* 112 8 */ struct rb_root_cached inlined_nodes; /* 120 16 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ struct rb_root_cached srclines; /* 136 16 */ struct { u64 addr; /* 152 8 */ struct symbol * symbol; /* 160 8 */ } last_find_result; /* 152 16 */ struct build_id bid; /* 168 32 */ /* --- cacheline 3 boundary (192 bytes) was 8 bytes ago --- */ u64 text_offset; /* 200 8 */ u64 text_end; /* 208 8 */ const char * short_name; /* 216 8 */ const char * long_name; /* 224 8 */ void * a2l; /* 232 8 */ char * symsrc_filename; /* 240 8 */ struct nsinfo * nsinfo; /* 248 8 */ /* --- cacheline 4 boundary (256 bytes) --- */ struct auxtrace_cache * auxtrace_cache; /* 256 8 */ union { void * priv; /* 264 8 */ u64 db_id; /* 264 8 */ }; /* 264 8 */ struct { struct perf_env * env; /* 272 8 */ u32 id; /* 280 4 */ u32 sub_id; /* 284 4 */ } bpf_prog; /* 272 16 */ struct { struct rb_root cache; /* 288 8 */ struct list_head open_entry; /* 296 16 */ u64 file_size; /* 312 8 */ /* --- cacheline 5 boundary (320 bytes) --- */ u64 elf_base_addr; /* 320 8 */ u64 debug_frame_offset; /* 328 8 */ u64 eh_frame_hdr_addr; /* 336 8 */ u64 eh_frame_hdr_offset; /* 344 8 */ int fd; /* 352 4 */ int status; /* 356 4 */ u32 status_seen; /* 360 4 */ } data; /* 288 80 */ /* XXX last struct has 4 bytes of padding */ struct dso_id id; /* 368 24 */ /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ unsigned int a2l_fails; /* 392 4 */ int comp; /* 396 4 */ refcount_t refcnt; /* 400 4 */ enum dso_load_errno load_errno; /* 404 4 */ u16 long_name_len; /* 408 2 */ u16 short_name_len; /* 410 2 */ enum dso_binary_type symtab_type:8; /* 412: 0 4 */ enum dso_binary_type binary_type:8; /* 412: 8 4 */ enum dso_space_type kernel:2; /* 412:16 4 */ enum dso_swap_type needs_swap:2; /* 412:18 4 */ /* Bitfield combined with next fields */ _Bool is_kmod:1; /* 414: 4 1 */ u8 adjust_symbols:1; /* 414: 5 1 */ u8 has_build_id:1; /* 414: 6 1 */ u8 header_build_id:1; /* 414: 7 1 */ u8 has_srcline:1; /* 415: 0 1 */ u8 hit:1; /* 415: 1 1 */ u8 annotate_warned:1; /* 415: 2 1 */ u8 auxtrace_warned:1; /* 415: 3 1 */ u8 short_name_allocated:1; /* 415: 4 1 */ u8 long_name_allocated:1; /* 415: 5 1 */ u8 is_64_bit:1; /* 415: 6 1 */ /* XXX 1 bit hole, try to pack */ _Bool sorted_by_name; /* 416 1 */ _Bool loaded; /* 417 1 */ u8 rel; /* 418 1 */ char name[]; /* 419 0 */ /* size: 424, cachelines: 7, members: 48 */ /* sum members: 415 */ /* sum bitfield members: 31 bits, bit holes: 1, sum bit holes: 1 bits */ /* padding: 5 */ /* paddings: 1, sum paddings: 4 */ /* forced alignments: 1 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Markus Elfring Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Sun Haiyong Cc: Yanteng Si Cc: zhaimingbing Link: https://lore.kernel.org/r/20240321160300.1635121-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.h | 84 +++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 2cdcd1e2ef8b..17dab230a2ca 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -161,66 +161,66 @@ struct dso { u64 addr; struct symbol *symbol; } last_find_result; - void *a2l; - char *symsrc_filename; - unsigned int a2l_fails; - enum dso_space_type kernel; - bool is_kmod; - enum dso_swap_type needs_swap; - enum dso_binary_type symtab_type; - enum dso_binary_type binary_type; - enum dso_load_errno load_errno; - u8 adjust_symbols:1; - u8 has_build_id:1; - u8 header_build_id:1; - u8 has_srcline:1; - u8 hit:1; - u8 annotate_warned:1; - u8 auxtrace_warned:1; - u8 short_name_allocated:1; - u8 long_name_allocated:1; - u8 is_64_bit:1; - bool sorted_by_name; - bool loaded; - u8 rel; struct build_id bid; u64 text_offset; u64 text_end; const char *short_name; const char *long_name; - u16 long_name_len; - u16 short_name_len; + void *a2l; + char *symsrc_filename; +#if defined(__powerpc__) void *dwfl; /* DWARF debug info */ +#endif + struct nsinfo *nsinfo; struct auxtrace_cache *auxtrace_cache; - int comp; - + union { /* Tool specific area */ + void *priv; + u64 db_id; + }; + /* bpf prog information */ + struct { + struct perf_env *env; + u32 id; + u32 sub_id; + } bpf_prog; /* dso data file */ struct { struct rb_root cache; - int fd; - int status; - u32 status_seen; - u64 file_size; struct list_head open_entry; + u64 file_size; u64 elf_base_addr; u64 debug_frame_offset; u64 eh_frame_hdr_addr; u64 eh_frame_hdr_offset; + int fd; + int status; + u32 status_seen; } data; - /* bpf prog information */ - struct { - u32 id; - u32 sub_id; - struct perf_env *env; - } bpf_prog; - - union { /* Tool specific area */ - void *priv; - u64 db_id; - }; - struct nsinfo *nsinfo; struct dso_id id; + unsigned int a2l_fails; + int comp; refcount_t refcnt; + enum dso_load_errno load_errno; + u16 long_name_len; + u16 short_name_len; + enum dso_binary_type symtab_type:8; + enum dso_binary_type binary_type:8; + enum dso_space_type kernel:2; + enum dso_swap_type needs_swap:2; + bool is_kmod:1; + u8 adjust_symbols:1; + u8 has_build_id:1; + u8 header_build_id:1; + u8 has_srcline:1; + u8 hit:1; + u8 annotate_warned:1; + u8 auxtrace_warned:1; + u8 short_name_allocated:1; + u8 long_name_allocated:1; + u8 is_64_bit:1; + bool sorted_by_name; + bool loaded; + u8 rel; char name[]; }; -- cgit v1.2.3 From 4962e1949608fed932d661ef45803a5bee69bebe Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2024 17:07:33 -0300 Subject: perf beauty: Move uapi/linux/vhost.h copy out of the directory used to build perf It is only used to generate string tables, not to build perf, so move it to the tools/perf/trace/beauty/include/ hierarchy, that is used just for scraping. This is a something that should've have happened, as happened with the linux/socket.h scrapper, do it now as Ian suggested while doing an audit/refactor session in the headers used by perf. No other tools/ living code uses it, just coming from either 'make install_headers' or from the system /usr/include/ directory. Suggested-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fWZVrpRufO4w-S4EcSi9STXcTAN2ERLwTSN7yrSSA-otQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/vhost.h | 230 --------------------- tools/perf/Makefile.perf | 5 +- tools/perf/check-headers.sh | 2 +- tools/perf/trace/beauty/include/uapi/linux/vhost.h | 230 +++++++++++++++++++++ tools/perf/trace/beauty/vhost_virtio_ioctl.sh | 6 +- 5 files changed, 236 insertions(+), 237 deletions(-) delete mode 100644 tools/include/uapi/linux/vhost.h create mode 100644 tools/perf/trace/beauty/include/uapi/linux/vhost.h (limited to 'tools') diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h deleted file mode 100644 index 649560c685f1..000000000000 --- a/tools/include/uapi/linux/vhost.h +++ /dev/null @@ -1,230 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_VHOST_H -#define _LINUX_VHOST_H -/* Userspace interface for in-kernel virtio accelerators. */ - -/* vhost is used to reduce the number of system calls involved in virtio. - * - * Existing virtio net code is used in the guest without modification. - * - * This header includes interface used by userspace hypervisor for - * device configuration. - */ - -#include -#include -#include - -#define VHOST_FILE_UNBIND -1 - -/* ioctls */ - -#define VHOST_VIRTIO 0xAF - -/* Features bitmask for forward compatibility. Transport bits are used for - * vhost specific features. */ -#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) -#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) - -/* Set current process as the (exclusive) owner of this file descriptor. This - * must be called before any other vhost command. Further calls to - * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ -#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) -/* Give up ownership, and reset the device to default values. - * Allows subsequent call to VHOST_OWNER_SET to succeed. */ -#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) - -/* Set up/modify memory layout */ -#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) - -/* Write logging setup. */ -/* Memory writes can optionally be logged by setting bit at an offset - * (calculated from the physical address) from specified log base. - * The bit is set using an atomic 32 bit operation. */ -/* Set base address for logging. */ -#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) -/* Specify an eventfd file descriptor to signal on log write. */ -#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) -/* By default, a device gets one vhost_worker that its virtqueues share. This - * command allows the owner of the device to create an additional vhost_worker - * for the device. It can later be bound to 1 or more of its virtqueues using - * the VHOST_ATTACH_VRING_WORKER command. - * - * This must be called after VHOST_SET_OWNER and the caller must be the owner - * of the device. The new thread will inherit caller's cgroups and namespaces, - * and will share the caller's memory space. The new thread will also be - * counted against the caller's RLIMIT_NPROC value. - * - * The worker's ID used in other commands will be returned in - * vhost_worker_state. - */ -#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) -/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any - * virtqueue. If userspace is not able to call this for workers its created, - * the kernel will free all the device's workers when the device is closed. - */ -#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) - -/* Ring setup. */ -/* Set number of descriptors in ring. This parameter can not - * be modified while ring is running (bound to a device). */ -#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) -/* Set addresses for the ring. */ -#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) -/* Base value where queue looks for available descriptors */ -#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) -/* Get accessor: reads index, writes value in num */ -#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) - -/* Set the vring byte order in num. Valid values are VHOST_VRING_LITTLE_ENDIAN - * or VHOST_VRING_BIG_ENDIAN (other values return -EINVAL). - * The byte order cannot be changed while the device is active: trying to do so - * returns -EBUSY. - * This is a legacy only API that is simply ignored when VIRTIO_F_VERSION_1 is - * set. - * Not all kernel configurations support this ioctl, but all configurations that - * support SET also support GET. - */ -#define VHOST_VRING_LITTLE_ENDIAN 0 -#define VHOST_VRING_BIG_ENDIAN 1 -#define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) -#define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) -/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's - * virtqueues. - * - * This will replace the virtqueue's existing worker. If the replaced worker - * is no longer attached to any virtqueues, it can be freed with - * VHOST_FREE_WORKER. - */ -#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \ - struct vhost_vring_worker) -/* Return the vring worker's ID */ -#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ - struct vhost_vring_worker) - -/* The following ioctls use eventfd file descriptors to signal and poll - * for events. */ - -/* Set eventfd to poll for added buffers */ -#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) -/* Set eventfd to signal when buffers have beed used */ -#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) -/* Set eventfd to signal an error */ -#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) -/* Set busy loop timeout (in us) */ -#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \ - struct vhost_vring_state) -/* Get busy loop timeout (in us) */ -#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ - struct vhost_vring_state) - -/* Set or get vhost backend capability */ - -#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) -#define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) - -/* VHOST_NET specific defines */ - -/* Attach virtio net ring to a raw socket, or tap device. - * The socket must be already bound to an ethernet device, this device will be - * used for transmit. Pass fd -1 to unbind from the socket and the transmit - * device. This can be used to stop the ring (e.g. for migration). */ -#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) - -/* VHOST_SCSI specific defines */ - -#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) -#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) -/* Changing this breaks userspace. */ -#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) -/* Set and get the events missed flag */ -#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) -#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) - -/* VHOST_VSOCK specific defines */ - -#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) -#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) - -/* VHOST_VDPA specific defines */ - -/* Get the device id. The device ids follow the same definition of - * the device id defined in virtio-spec. - */ -#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32) -/* Get and set the status. The status bits follow the same definition - * of the device status defined in virtio-spec. - */ -#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8) -#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8) -/* Get and set the device config. The device config follows the same - * definition of the device config defined in virtio-spec. - */ -#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \ - struct vhost_vdpa_config) -#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \ - struct vhost_vdpa_config) -/* Enable/disable the ring. */ -#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \ - struct vhost_vring_state) -/* Get the max ring size. */ -#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16) - -/* Set event fd for config interrupt*/ -#define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) - -/* Get the valid iova range */ -#define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ - struct vhost_vdpa_iova_range) -/* Get the config size */ -#define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) - -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - -/* Get the number of address spaces. */ -#define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) - -/* Get the group for a virtqueue: read index, write group in num, - * The virtqueue index is stored in the index field of - * vhost_vring_state. The group for this specific virtqueue is - * returned via num field of vhost_vring_state. - */ -#define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ - struct vhost_vring_state) -/* Set the ASID for a virtqueue group. The group index is stored in - * the index field of vhost_vring_state, the ASID associated with this - * group is stored at num field of vhost_vring_state. - */ -#define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ - struct vhost_vring_state) - -/* Suspend a device so it does not process virtqueue requests anymore - * - * After the return of ioctl the device must preserve all the necessary state - * (the virtqueue vring base plus the possible device specific states) that is - * required for restoring in the future. The device must not change its - * configuration after that point. - */ -#define VHOST_VDPA_SUSPEND _IO(VHOST_VIRTIO, 0x7D) - -/* Resume a device so it can resume processing virtqueue requests - * - * After the return of this ioctl the device will have restored all the - * necessary states and it is fully operational to continue processing the - * virtqueue descriptors. - */ -#define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) - -/* Get the group for the descriptor table including driver & device areas - * of a virtqueue: read index, write group in num. - * The virtqueue index is stored in the index field of vhost_vring_state. - * The group ID of the descriptor table for this specific virtqueue - * is returned via num field of vhost_vring_state. - */ -#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ - struct vhost_vring_state) -#endif diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 0d2dbdfc44df..5c35c0d89306 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -580,11 +580,10 @@ $(sockaddr_arrays): $(beauty_linux_dir)/socket.h $(sockaddr_tbl) $(Q)$(SHELL) '$(sockaddr_tbl)' $(beauty_linux_dir) > $@ vhost_virtio_ioctl_array := $(beauty_ioctl_outdir)/vhost_virtio_ioctl_array.c -vhost_virtio_hdr_dir := $(srctree)/tools/include/uapi/linux vhost_virtio_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/vhost_virtio_ioctl.sh -$(vhost_virtio_ioctl_array): $(vhost_virtio_hdr_dir)/vhost.h $(vhost_virtio_ioctl_tbl) - $(Q)$(SHELL) '$(vhost_virtio_ioctl_tbl)' $(vhost_virtio_hdr_dir) > $@ +$(vhost_virtio_ioctl_array): $(beauty_uapi_linux_dir)/vhost.h $(vhost_virtio_ioctl_tbl) + $(Q)$(SHELL) '$(vhost_virtio_ioctl_tbl)' $(beauty_uapi_linux_dir) > $@ perf_ioctl_array := $(beauty_ioctl_outdir)/perf_ioctl_array.c perf_hdr_dir := $(srctree)/tools/include/uapi/linux diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 76726a5a7c78..95a3a404bc6f 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -16,7 +16,6 @@ FILES=( "include/uapi/linux/in.h" "include/uapi/linux/perf_event.h" "include/uapi/linux/seccomp.h" - "include/uapi/linux/vhost.h" "include/linux/bits.h" "include/vdso/bits.h" "include/linux/const.h" @@ -96,6 +95,7 @@ BEAUTY_FILES=( "include/uapi/linux/sched.h" "include/uapi/linux/stat.h" "include/uapi/linux/usbdevice_fs.h" + "include/uapi/linux/vhost.h" "include/uapi/sound/asound.h" ) diff --git a/tools/perf/trace/beauty/include/uapi/linux/vhost.h b/tools/perf/trace/beauty/include/uapi/linux/vhost.h new file mode 100644 index 000000000000..649560c685f1 --- /dev/null +++ b/tools/perf/trace/beauty/include/uapi/linux/vhost.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. + */ + +#include +#include +#include + +#define VHOST_FILE_UNBIND -1 + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +/* Features bitmask for forward compatibility. Transport bits are used for + * vhost specific features. */ +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) + +/* Set current process as the (exclusive) owner of this file descriptor. This + * must be called before any other vhost command. Further calls to + * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +/* Give up ownership, and reset the device to default values. + * Allows subsequent call to VHOST_OWNER_SET to succeed. */ +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) + +/* Set up/modify memory layout */ +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) + +/* Write logging setup. */ +/* Memory writes can optionally be logged by setting bit at an offset + * (calculated from the physical address) from specified log base. + * The bit is set using an atomic 32 bit operation. */ +/* Set base address for logging. */ +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Specify an eventfd file descriptor to signal on log write. */ +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +/* By default, a device gets one vhost_worker that its virtqueues share. This + * command allows the owner of the device to create an additional vhost_worker + * for the device. It can later be bound to 1 or more of its virtqueues using + * the VHOST_ATTACH_VRING_WORKER command. + * + * This must be called after VHOST_SET_OWNER and the caller must be the owner + * of the device. The new thread will inherit caller's cgroups and namespaces, + * and will share the caller's memory space. The new thread will also be + * counted against the caller's RLIMIT_NPROC value. + * + * The worker's ID used in other commands will be returned in + * vhost_worker_state. + */ +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any + * virtqueue. If userspace is not able to call this for workers its created, + * the kernel will free all the device's workers when the device is closed. + */ +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) + +/* Ring setup. */ +/* Set number of descriptors in ring. This parameter can not + * be modified while ring is running (bound to a device). */ +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +/* Set addresses for the ring. */ +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +/* Base value where queue looks for available descriptors */ +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +/* Get accessor: reads index, writes value in num */ +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) + +/* Set the vring byte order in num. Valid values are VHOST_VRING_LITTLE_ENDIAN + * or VHOST_VRING_BIG_ENDIAN (other values return -EINVAL). + * The byte order cannot be changed while the device is active: trying to do so + * returns -EBUSY. + * This is a legacy only API that is simply ignored when VIRTIO_F_VERSION_1 is + * set. + * Not all kernel configurations support this ioctl, but all configurations that + * support SET also support GET. + */ +#define VHOST_VRING_LITTLE_ENDIAN 0 +#define VHOST_VRING_BIG_ENDIAN 1 +#define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) +#define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's + * virtqueues. + * + * This will replace the virtqueue's existing worker. If the replaced worker + * is no longer attached to any virtqueues, it can be freed with + * VHOST_FREE_WORKER. + */ +#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \ + struct vhost_vring_worker) +/* Return the vring worker's ID */ +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ + struct vhost_vring_worker) + +/* The following ioctls use eventfd file descriptors to signal and poll + * for events. */ + +/* Set eventfd to poll for added buffers */ +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +/* Set eventfd to signal when buffers have beed used */ +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +/* Set eventfd to signal an error */ +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +/* Set busy loop timeout (in us) */ +#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \ + struct vhost_vring_state) +/* Get busy loop timeout (in us) */ +#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ + struct vhost_vring_state) + +/* Set or get vhost backend capability */ + +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) + +/* VHOST_NET specific defines */ + +/* Attach virtio net ring to a raw socket, or tap device. + * The socket must be already bound to an ethernet device, this device will be + * used for transmit. Pass fd -1 to unbind from the socket and the transmit + * device. This can be used to stop the ring (e.g. for migration). */ +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +/* VHOST_SCSI specific defines */ + +#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) +#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) +/* Changing this breaks userspace. */ +#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int) +/* Set and get the events missed flag */ +#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) +#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) + +/* VHOST_VSOCK specific defines */ + +#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) +#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) + +/* VHOST_VDPA specific defines */ + +/* Get the device id. The device ids follow the same definition of + * the device id defined in virtio-spec. + */ +#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32) +/* Get and set the status. The status bits follow the same definition + * of the device status defined in virtio-spec. + */ +#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8) +#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8) +/* Get and set the device config. The device config follows the same + * definition of the device config defined in virtio-spec. + */ +#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \ + struct vhost_vdpa_config) +#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \ + struct vhost_vdpa_config) +/* Enable/disable the ring. */ +#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \ + struct vhost_vring_state) +/* Get the max ring size. */ +#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16) + +/* Set event fd for config interrupt*/ +#define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) + +/* Get the valid iova range */ +#define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ + struct vhost_vdpa_iova_range) +/* Get the config size */ +#define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + +/* Get the number of address spaces. */ +#define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) + +/* Get the group for a virtqueue: read index, write group in num, + * The virtqueue index is stored in the index field of + * vhost_vring_state. The group for this specific virtqueue is + * returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ + struct vhost_vring_state) +/* Set the ASID for a virtqueue group. The group index is stored in + * the index field of vhost_vring_state, the ASID associated with this + * group is stored at num field of vhost_vring_state. + */ +#define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ + struct vhost_vring_state) + +/* Suspend a device so it does not process virtqueue requests anymore + * + * After the return of ioctl the device must preserve all the necessary state + * (the virtqueue vring base plus the possible device specific states) that is + * required for restoring in the future. The device must not change its + * configuration after that point. + */ +#define VHOST_VDPA_SUSPEND _IO(VHOST_VIRTIO, 0x7D) + +/* Resume a device so it can resume processing virtqueue requests + * + * After the return of this ioctl the device will have restored all the + * necessary states and it is fully operational to continue processing the + * virtqueue descriptors. + */ +#define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) + +/* Get the group for the descriptor table including driver & device areas + * of a virtqueue: read index, write group in num. + * The virtqueue index is stored in the index field of vhost_vring_state. + * The group ID of the descriptor table for this specific virtqueue + * is returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ + struct vhost_vring_state) +#endif diff --git a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh index 2dd0a3b1f55a..e4f395e7650a 100755 --- a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh +++ b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh @@ -1,18 +1,18 @@ #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 -[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ +[ $# -eq 1 ] && beauty_uapi_linux_dir=$1 || beauty_uapi_linux_dir=tools/perf/trace/beauty/include/uapi/linux printf "static const char *vhost_virtio_ioctl_cmds[] = {\n" regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*' -grep -E $regex ${header_dir}/vhost.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/vhost.h | \ sed -r "s/$regex/\2 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" printf "static const char *vhost_virtio_ioctl_read_cmds[] = {\n" regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?R\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*' -grep -E $regex ${header_dir}/vhost.h | \ +grep -E $regex ${beauty_uapi_linux_dir}/vhost.h | \ sed -r "s/$regex/\2 \1/g" | \ sort | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" -- cgit v1.2.3 From 1684d6eb99e480ff653af60e20ff5e7e55e69ccd Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 22 Mar 2024 09:57:28 +0000 Subject: selftests/bpf: Use syscall(SYS_gettid) instead of gettid() wrapper in bench MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With glibc 2.28, selftests compilation fails for benchs/bench_trigger.c: benchs/bench_trigger.c: In function ‘inc_counter’: benchs/bench_trigger.c:25:23: error: implicit declaration of function ‘gettid’; did you mean ‘getgid’? [-Werror=implicit-function-declaration] 25 | tid = gettid(); | ^~~~~~ | getgid cc1: all warnings being treated as errors It appears support for the gettid() wrapper is variable across glibc versions, so may be safer to use syscall(SYS_gettid) instead. Fixes: 520fad2e3206 ("selftests/bpf: scale benchmark counting by using per-CPU counters") Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240322095728.95671-1-alan.maguire@oracle.com --- tools/testing/selftests/bpf/benchs/bench_trigger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index b7aea79495ba..9c50412f72e9 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -22,7 +22,7 @@ static __always_inline void inc_counter(struct counter *counters) unsigned slot; if (unlikely(tid == 0)) - tid = gettid(); + tid = syscall(SYS_gettid); /* multiplicative hashing, it's fast */ slot = 2654435769U * tid; -- cgit v1.2.3 From af8d27bf15c8d68c60d830552055fcdba5b5f045 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 22 Mar 2024 14:49:36 +0100 Subject: selftests/bpf: Mark uprobe trigger functions with nocf_check attribute Some distros seem to enable the -fcf-protection=branch by default, which breaks our setup on first instruction of uprobe trigger functions and place there endbr64 instruction. Marking them with nocf_check attribute to skip that. Ignoring unknown attribute warning in gcc for bench objects, because nocf_check can be used only when -fcf-protection=branch is enabled, otherwise we get a warning and break compilation. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240322134936.1075395-1-jolsa@kernel.org --- tools/include/linux/compiler.h | 4 ++++ tools/testing/selftests/bpf/benchs/bench_trigger.c | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 7b65566f3e42..8a63a9913495 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -58,6 +58,10 @@ #define noinline #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 9c50412f72e9..d66eddacd642 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -9,6 +9,8 @@ /* adjust slot shift in inc_hits() if changing */ #define MAX_BUCKETS 256 +#pragma GCC diagnostic ignored "-Wattributes" + /* BPF triggering benchmarks */ static struct trigger_ctx { struct trigger_bench *skel; @@ -167,7 +169,7 @@ static void trigger_fmodret_setup(void) * GCC doesn't generate stack setup preample for these functions due to them * having no input arguments and doing nothing in the body. */ -__weak void uprobe_target_nop(void) +__nocf_check __weak void uprobe_target_nop(void) { asm volatile ("nop"); } @@ -176,7 +178,7 @@ __weak void opaque_noop_func(void) { } -__weak int uprobe_target_push(void) +__nocf_check __weak int uprobe_target_push(void) { /* overhead of function call is negligible compared to uprobe * triggering, so this shouldn't affect benchmark results much @@ -185,7 +187,7 @@ __weak int uprobe_target_push(void) return 1; } -__weak void uprobe_target_ret(void) +__nocf_check __weak void uprobe_target_ret(void) { asm volatile (""); } -- cgit v1.2.3 From 61df575632d6b39213f47810c441bddbd87c3606 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 19 Mar 2024 10:54:12 -0700 Subject: libbpf: Add new sec_def "sk_skb/verdict" The new sec_def specifies sk_skb program type with BPF_SK_SKB_VERDICT attachment type. This way, libbpf will set expected_attach_type properly for the program. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240319175412.2941149-1-yonghong.song@linux.dev --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 86df0d50cba7..8eb2cd4ef288 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9312,6 +9312,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/verdict", SK_SKB, BPF_SK_SKB_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), -- cgit v1.2.3 From 476a5e929119725fbfcf1307cc8517d9dd2a014d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 24 Mar 2024 20:38:42 -0700 Subject: bpf: Sync uapi bpf.h to tools directory There is a difference between kernel uapi bpf.h and tools uapi bpf.h. There is no functionality difference, but let us sync properly to make it easy for later bpf.h update. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240325033842.1693553-1-yonghong.song@linux.dev --- tools/include/uapi/linux/bpf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf80b614c4db..9585f5345353 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1662,9 +1662,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; - __aligned_u64 cookie; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ -- cgit v1.2.3 From c29083f3f5069d811b3f3c7592a0dc45ec42960c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 25 Mar 2024 09:56:15 +0800 Subject: selftests/bpf: Use start_server in bpf_tcp_ca To simplify the code, use BPF selftests helper start_server() in bpf_tcp_ca.c instead of open-coding it. This helper is defined in network_helpers.c, and exported in network_helpers.h, which is already included in bpf_tcp_ca.c. Signed-off-by: Geliang Tang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/9926a79118db27dd6d91c4854db011c599cabd0e.1711331517.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index a88e6e07e4f5..94cb22b01482 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -89,7 +89,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) WRITE_ONCE(stop, 0); - lfd = socket(AF_INET6, SOCK_STREAM, 0); + lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; @@ -103,21 +103,10 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) settimeo(lfd, 0) || settimeo(fd, 0)) goto done; - /* bind, listen and start server thread to accept */ - sa6.sin6_family = AF_INET6; - sa6.sin6_addr = in6addr_loopback; - err = bind(lfd, (struct sockaddr *)&sa6, addrlen); - if (!ASSERT_NEQ(err, -1, "bind")) - goto done; - err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); if (!ASSERT_NEQ(err, -1, "getsockname")) goto done; - err = listen(lfd, 1); - if (!ASSERT_NEQ(err, -1, "listen")) - goto done; - if (sk_stg_map) { err = bpf_map_update_elem(bpf_map__fd(sk_stg_map), &fd, &expected_stg, BPF_NOEXIST); -- cgit v1.2.3 From 14bb1e8c8d4ad5d9d2febb7d19c70a3cf536e1e5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Mar 2024 23:13:53 -0700 Subject: selftests/bpf: Fix flaky test btf_map_in_map/lookup_update Recently, I frequently hit the following test failure: [root@arch-fb-vm1 bpf]# ./test_progs -n 33/1 test_lookup_update:PASS:skel_open 0 nsec [...] test_lookup_update:PASS:sync_rcu 0 nsec test_lookup_update:FAIL:map1_leak inner_map1 leaked! #33/1 btf_map_in_map/lookup_update:FAIL #33 btf_map_in_map:FAIL In the test, after map is closed and then after two rcu grace periods, it is assumed that map_id is not available to user space. But the above assumption cannot be guaranteed. After zero or one or two rcu grace periods in different siturations, the actual freeing-map-work is put into a workqueue. Later on, when the work is dequeued, the map will be actually freed. See bpf_map_put() in kernel/bpf/syscall.c. By using workqueue, there is no ganrantee that map will be actually freed after a couple of rcu grace periods. This patch removed such map leak detection and then the test can pass consistently. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240322061353.632136-1-yonghong.song@linux.dev --- .../selftests/bpf/prog_tests/btf_map_in_map.c | 26 +--------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c index a8b53b8736f0..f66ceccd7029 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -25,7 +25,7 @@ static void test_lookup_update(void) int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd; struct test_btf_map_in_map *skel; - int err, key = 0, val, i, fd; + int err, key = 0, val, i; skel = test_btf_map_in_map__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) @@ -102,30 +102,6 @@ static void test_lookup_update(void) CHECK(map1_id == 0, "map1_id", "failed to get ID 1\n"); CHECK(map2_id == 0, "map2_id", "failed to get ID 2\n"); - test_btf_map_in_map__destroy(skel); - skel = NULL; - - /* we need to either wait for or force synchronize_rcu(), before - * checking for "still exists" condition, otherwise map could still be - * resolvable by ID, causing false positives. - * - * Older kernels (5.8 and earlier) freed map only after two - * synchronize_rcu()s, so trigger two, to be entirely sure. - */ - CHECK(kern_sync_rcu(), "sync_rcu", "failed\n"); - CHECK(kern_sync_rcu(), "sync_rcu", "failed\n"); - - fd = bpf_map_get_fd_by_id(map1_id); - if (CHECK(fd >= 0, "map1_leak", "inner_map1 leaked!\n")) { - close(fd); - goto cleanup; - } - fd = bpf_map_get_fd_by_id(map2_id); - if (CHECK(fd >= 0, "map2_leak", "inner_map2 leaked!\n")) { - close(fd); - goto cleanup; - } - cleanup: test_btf_map_in_map__destroy(skel); } -- cgit v1.2.3 From 8034b31464c53d6e182f65a293a87b50ddf6dd7e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 21 Mar 2024 23:04:20 +0800 Subject: workqueue: remove unnecessary import and function in wq_monitor.py Remove unnecessary import and function in wq_monitor.py Signed-off-by: Kemeng Shi Signed-off-by: Tejun Heo --- tools/workqueue/wq_monitor.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py index a8856a9c45dc..9e964c5be40c 100644 --- a/tools/workqueue/wq_monitor.py +++ b/tools/workqueue/wq_monitor.py @@ -32,16 +32,13 @@ https://github.com/osandov/drgn. rescued The number of work items executed by the rescuer. """ -import sys import signal -import os import re import time import json import drgn -from drgn.helpers.linux.list import list_for_each_entry,list_empty -from drgn.helpers.linux.cpumask import for_each_possible_cpu +from drgn.helpers.linux.list import list_for_each_entry import argparse parser = argparse.ArgumentParser(description=desc, @@ -54,10 +51,6 @@ parser.add_argument('-j', '--json', action='store_true', help='Output in json') args = parser.parse_args() -def err(s): - print(s, file=sys.stderr, flush=True) - sys.exit(1) - workqueues = prog['workqueues'] WQ_UNBOUND = prog['WQ_UNBOUND'] -- cgit v1.2.3 From fa61e9aec97f7e6873559f1d7766d3807fb2dc0e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:28 +0100 Subject: selftests: net: libs: Change variable fallback syntax The current syntax of X=${X:=X} first evaluates the ${X:=Y} expression, which either uses the existing value of $X if there is one, or uses the value of "Y" as a fallback, and assigns it to X. The expression is then replaced with the now-current value of $X. Assigning that value to X once more is meaningless. So avoid the outer X=... bit, and instead express the same idea though the do-nothing ":" built-in as : "${X:=Y}". This also cleans up the block nicely and makes it more readable. Signed-off-by: Petr Machata Reviewed-by: Benjamin Poirier Link: https://lore.kernel.org/r/1890ddc58420c2c0d5ba3154c87ecc6d9faf6947.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 48 +++++++++++----------- .../testing/selftests/net/forwarding/tc_common.sh | 2 +- tools/testing/selftests/net/lib.sh | 3 +- 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index e579c2e0c462..ee48c4603ff0 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -5,30 +5,30 @@ # Defines # Can be overridden by the configuration file. -PING=${PING:=ping} -PING6=${PING6:=ping6} -MZ=${MZ:=mausezahn} -MZ_DELAY=${MZ_DELAY:=0} -ARPING=${ARPING:=arping} -TEAMD=${TEAMD:=teamd} -WAIT_TIME=${WAIT_TIME:=5} -PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} -PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no} -NETIF_TYPE=${NETIF_TYPE:=veth} -NETIF_CREATE=${NETIF_CREATE:=yes} -MCD=${MCD:=smcrouted} -MC_CLI=${MC_CLI:=smcroutectl} -PING_COUNT=${PING_COUNT:=10} -PING_TIMEOUT=${PING_TIMEOUT:=5} -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} -INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600} -LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000} -REQUIRE_JQ=${REQUIRE_JQ:=yes} -REQUIRE_MZ=${REQUIRE_MZ:=yes} -REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no} -STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no} -TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=} -TROUTE6=${TROUTE6:=traceroute6} +: "${PING:=ping}" +: "${PING6:=ping6}" +: "${MZ:=mausezahn}" +: "${MZ_DELAY:=0}" +: "${ARPING:=arping}" +: "${TEAMD:=teamd}" +: "${WAIT_TIME:=5}" +: "${PAUSE_ON_FAIL:=no}" +: "${PAUSE_ON_CLEANUP:=no}" +: "${NETIF_TYPE:=veth}" +: "${NETIF_CREATE:=yes}" +: "${MCD:=smcrouted}" +: "${MC_CLI:=smcroutectl}" +: "${PING_COUNT:=10}" +: "${PING_TIMEOUT:=5}" +: "${WAIT_TIMEOUT:=20}" +: "${INTERFACE_TIMEOUT:=600}" +: "${LOW_AGEING_TIME:=1000}" +: "${REQUIRE_JQ:=yes}" +: "${REQUIRE_MZ:=yes}" +: "${REQUIRE_MTOOLS:=no}" +: "${STABLE_MAC_ADDRS:=no}" +: "${TCPDUMP_EXTRA_FLAGS:=}" +: "${TROUTE6:=traceroute6}" net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index bce8bb8d2b6f..2e3326edfa9a 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -4,7 +4,7 @@ CHECK_TC="yes" # Can be overridden by the configuration file. See lib.sh -TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms +: "${TC_HIT_TIMEOUT:=1000}" # ms tc_check_packets() { diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index f9fe182dfbd4..5b366cc4fc43 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -4,7 +4,8 @@ ############################################################################## # Defines -WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} +: "${WAIT_TIMEOUT:=20}" + BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms # Kselftest framework requirement - SKIP code is 4. -- cgit v1.2.3 From fd36fd26ae7c1eed956466e1821222fde49205e4 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:29 +0100 Subject: selftests: forwarding.config.sample: Move overrides to lib.sh forwarding.config.sample, net/lib.sh and net/forwarding/lib.sh contain definitions and redefinitions of some of the same variables. The overlap between net/forwarding/lib.sh and forwarding.config.sample is especially large. This duplication is a potential source of confusion and problems. It would be overall less error prone if each variable were defined in one place only. In this patch set, that place is the library itself. Therefore move all comments from forwarding.config.sample to net/forwarding/lib.sh. Move over also a definition of TC_FLAG, which was missing from lib.sh entirely. Additionally, add to lib.sh a default definition of the topology variables. The logic behind this is that forgetting to specify forwarding.config was a frequent source of frustration for the selftest users. But really, most of the time the default veth based topology is just fine. We considered just sourcing forwarding.config.sample instead if forwarding.config is not available, but this is a cleaner solution. That means the syntax of the forwarding.config.sample override has to change to an array assignment, so that the whole variable is overwritten, not just individual keys, which could leave the value of some keys unchanged. Do the same in lib.sh for any cut'n'pasters out there. The config file is then given a sort of carte blanche to redefine whatever variables it sees fit from the libraries. This is described in a comment in the file. Only a handful of variables are left behind, to illustrate the customization. The fact that the variables are now missing from forwarding.config.sample, and therefore would miss from forwarding.config derived from that file as well, should not change anything. This is just the sample file. Users that keep their own forwarding.config would retain it as before. The only observable change is introduction of TC_FLAG to lib.sh, because now the filters would not be attempted to install to HW datapath. For veth pairs this does not change anything. For HW deployments, users presumably have forwarding.config with this value overridden. Signed-off-by: Petr Machata Reviewed-by: Benjamin Poirier Link: https://lore.kernel.org/r/b9b8a11a22821a7aa532211ff461a34f596e26bf.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/forwarding/forwarding.config.sample | 53 +++++------------ tools/testing/selftests/net/forwarding/lib.sh | 69 ++++++++++++++++++---- 2 files changed, 74 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample index 1fc4f0242fc5..f1ca95e79a65 100644 --- a/tools/testing/selftests/net/forwarding/forwarding.config.sample +++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample @@ -3,51 +3,28 @@ ############################################################################## # Topology description. p1 looped back to p2, p3 to p4 and so on. -declare -A NETIFS -NETIFS[p1]=veth0 -NETIFS[p2]=veth1 -NETIFS[p3]=veth2 -NETIFS[p4]=veth3 -NETIFS[p5]=veth4 -NETIFS[p6]=veth5 -NETIFS[p7]=veth6 -NETIFS[p8]=veth7 -NETIFS[p9]=veth8 -NETIFS[p10]=veth9 +NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) # Port that does not have a cable connected. NETIF_NO_CABLE=eth8 ############################################################################## -# Defines +# In addition to the topology-related variables, it is also possible to override +# in this file other variables that net/lib.sh, net/forwarding/lib.sh or other +# libraries or selftests use. E.g.: -# IPv4 ping utility name -PING=ping -# IPv6 ping utility name. Some distributions use 'ping' for IPv6. PING6=ping6 -# Packet generator. Some distributions use 'mz'. MZ=mausezahn -# mausezahn delay between transmissions in microseconds. -MZ_DELAY=0 -# Time to wait after interfaces participating in the test are all UP WAIT_TIME=5 -# Whether to pause on failure or not. -PAUSE_ON_FAIL=no -# Whether to pause on cleanup or not. -PAUSE_ON_CLEANUP=no -# Type of network interface to create -NETIF_TYPE=veth -# Whether to create virtual interfaces (veth) or not -NETIF_CREATE=yes -# Timeout (in seconds) before ping exits regardless of how many packets have -# been sent or received -PING_TIMEOUT=5 -# Minimum ageing_time (in centiseconds) supported by hardware -LOW_AGEING_TIME=1000 -# Flag for tc match, supposed to be skip_sw/skip_hw which means do not process -# filter by software/hardware -TC_FLAG=skip_hw -# IPv6 traceroute utility name. -TROUTE6=traceroute6 - diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ee48c4603ff0..dbd4348f85b8 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1,34 +1,83 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +############################################################################## +# Topology description. p1 looped back to p2, p3 to p4 and so on. + +declare -A NETIFS=( + [p1]=veth0 + [p2]=veth1 + [p3]=veth2 + [p4]=veth3 + [p5]=veth4 + [p6]=veth5 + [p7]=veth6 + [p8]=veth7 + [p9]=veth8 + [p10]=veth9 +) + +# Port that does not have a cable connected. +: "${NETIF_NO_CABLE:=eth8}" + ############################################################################## # Defines -# Can be overridden by the configuration file. +# Networking utilities. : "${PING:=ping}" -: "${PING6:=ping6}" -: "${MZ:=mausezahn}" -: "${MZ_DELAY:=0}" +: "${PING6:=ping6}" # Some distros just use ping. : "${ARPING:=arping}" +: "${TROUTE6:=traceroute6}" + +# Packet generator. +: "${MZ:=mausezahn}" # Some distributions use 'mz'. +: "${MZ_DELAY:=0}" + +# Host configuration tools. : "${TEAMD:=teamd}" +: "${MCD:=smcrouted}" +: "${MC_CLI:=smcroutectl}" + +# Constants for netdevice bring-up: +# Default time in seconds to wait for an interface to come up before giving up +# and bailing out. Used during initial setup. +: "${INTERFACE_TIMEOUT:=600}" +# Like INTERFACE_TIMEOUT, but default for ad-hoc waiting in testing scripts. +: "${WAIT_TIMEOUT:=20}" +# Time to wait after interfaces participating in the test are all UP. : "${WAIT_TIME:=5}" + +# Whether to pause on, respectively, after a failure and before cleanup. : "${PAUSE_ON_FAIL:=no}" : "${PAUSE_ON_CLEANUP:=no}" -: "${NETIF_TYPE:=veth}" + +# Whether to create virtual interfaces, and what netdevice type they should be. : "${NETIF_CREATE:=yes}" -: "${MCD:=smcrouted}" -: "${MC_CLI:=smcroutectl}" +: "${NETIF_TYPE:=veth}" + +# Constants for ping tests: +# How many packets should be sent. : "${PING_COUNT:=10}" +# Timeout (in seconds) before ping exits regardless of how many packets have +# been sent or received : "${PING_TIMEOUT:=5}" -: "${WAIT_TIMEOUT:=20}" -: "${INTERFACE_TIMEOUT:=600}" + +# Minimum ageing_time (in centiseconds) supported by hardware : "${LOW_AGEING_TIME:=1000}" + +# Whether to check for availability of certain tools. : "${REQUIRE_JQ:=yes}" : "${REQUIRE_MZ:=yes}" : "${REQUIRE_MTOOLS:=no}" + +# Whether to override MAC addresses on interfaces participating in the test. : "${STABLE_MAC_ADDRS:=no}" + +# Flags for tcpdump : "${TCPDUMP_EXTRA_FLAGS:=}" -: "${TROUTE6:=traceroute6}" + +# Flags for TC filters. +: "${TC_FLAG:=skip_hw}" net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -- cgit v1.2.3 From 0cb862871fe7d338e0cf5229f0399d5c1630b8d0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:30 +0100 Subject: selftests: forwarding: README: Document customization That any sort of customization is possible at all, let alone how it should be done, is currently not at all clear. Document the whats and hows in README. Signed-off-by: Petr Machata Reviewed-by: Benjamin Poirier Link: https://lore.kernel.org/r/e819623af6aaeea49e9dc36cecd95694fad73bb8.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/README | 33 +++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index b8a2af8fcfb7..7fdb6a9ca543 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -56,3 +56,36 @@ o Checks shall be added to lib.sh for any external dependencies. o Code shall be checked using ShellCheck [1] prior to submission. 1. https://www.shellcheck.net/ + +Customization +============= + +The forwarding selftests framework uses a number of variables that +influence its behavior and tools it invokes, and how it invokes them, in +various ways. A number of these variables can be overridden. The way these +overridable variables are specified is typically one of the following two +syntaxes: + + : "${VARIABLE:=default_value}" + VARIABLE=${VARIABLE:=default_value} + +Any of these variables can be overridden. Notably net/forwarding/lib.sh and +net/lib.sh contain a number of overridable variables. + +One way of overriding these variables is through the environment: + + PAUSE_ON_FAIL=yes ./some_test.sh + +The variable NETIFS is special. Since it is an array variable, there is no +way to pass it through the environment. Its value can instead be given as +consecutive arguments to the selftest: + + ./some_test.sh swp{1..8} + +A way to customize variables in a persistent fashion is to create a file +named forwarding.config in this directory. lib.sh sources the file if +present, so it can contain any shell code. Typically it will contain +assignments of variables whose value should be overridden. + +forwarding.config.sample is available in the directory as an example of +how forwarding.config might look. -- cgit v1.2.3 From 0faa565bc4a708f4a81abfb6a4426c56609cfd2b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:31 +0100 Subject: selftests: forwarding: ipip_lib: Do not import lib.sh This library is always sourced in the context where lib.sh has already been sourced as well. Therefore drop the explicit sourcing and expect the client to already have done it. This will simplify moving some of the clients to a different directory. Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/a4da5e9cd42a34cbace917a048ca71081719d6ac.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/ipip_lib.sh | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/ipip_lib.sh b/tools/testing/selftests/net/forwarding/ipip_lib.sh index 30f36a57bae6..01e62c4ac94d 100644 --- a/tools/testing/selftests/net/forwarding/ipip_lib.sh +++ b/tools/testing/selftests/net/forwarding/ipip_lib.sh @@ -141,7 +141,6 @@ # | $h2 + | # | 192.0.2.18/28 | # +---------------------------+ -source lib.sh h1_create() { -- cgit v1.2.3 From 40d269c000bda9fcd276a0412a9cebd3f6e344c5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:32 +0100 Subject: selftests: forwarding: Move several selftests The tests in net/forwarding are generally expected to be HW-independent. There are however several tests that, while not depending on any HW in particular, nevertheless depend on being used on HW interfaces. Placing these selftests to net/forwarding is confusing, because the selftest will just report it can't be run on veth pairs. At the same time, placing them to a particular driver's selftests subdirectory would be wrong. Instead, add a new directory, drivers/net/hw, where these generic but HW independent selftests should be placed. Move over several such tests including one helper library. Since typically these tests will not be expected to run, omit the directory drivers/net/hw from the TARGETS list in selftests/Makefile. Retain a Makefile in the new directory itself, so that a user can make -C into that directory and act on those tests explicitly. Cc: Roger Quadros Cc: Tobias Waldekranz Cc: Danielle Ratson Cc: Davide Caratti Cc: Johannes Nixdorf Suggested-by: Jakub Kicinski Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/e11dae1f62703059e9fc2240004288ac7cc15756.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/Makefile | 25 ++ .../selftests/drivers/net/hw/devlink_port_split.py | 309 +++++++++++++++++++ tools/testing/selftests/drivers/net/hw/ethtool.sh | 302 ++++++++++++++++++ .../drivers/net/hw/ethtool_extended_state.sh | 118 +++++++ .../selftests/drivers/net/hw/ethtool_lib.sh | 120 ++++++++ .../testing/selftests/drivers/net/hw/ethtool_mm.sh | 341 +++++++++++++++++++++ .../selftests/drivers/net/hw/ethtool_rmon.sh | 144 +++++++++ .../selftests/drivers/net/hw/hw_stats_l3.sh | 341 +++++++++++++++++++++ .../selftests/drivers/net/hw/hw_stats_l3_gre.sh | 112 +++++++ tools/testing/selftests/drivers/net/hw/loopback.sh | 103 +++++++ tools/testing/selftests/drivers/net/hw/settings | 1 + tools/testing/selftests/net/Makefile | 1 - tools/testing/selftests/net/devlink_port_split.py | 309 ------------------- tools/testing/selftests/net/forwarding/Makefile | 8 - tools/testing/selftests/net/forwarding/ethtool.sh | 301 ------------------ .../net/forwarding/ethtool_extended_state.sh | 117 ------- .../selftests/net/forwarding/ethtool_lib.sh | 120 -------- .../testing/selftests/net/forwarding/ethtool_mm.sh | 340 -------------------- .../selftests/net/forwarding/ethtool_rmon.sh | 143 --------- .../selftests/net/forwarding/hw_stats_l3.sh | 340 -------------------- .../selftests/net/forwarding/hw_stats_l3_gre.sh | 111 ------- tools/testing/selftests/net/forwarding/loopback.sh | 102 ------ 22 files changed, 1916 insertions(+), 1892 deletions(-) create mode 100644 tools/testing/selftests/drivers/net/hw/Makefile create mode 100755 tools/testing/selftests/drivers/net/hw/devlink_port_split.py create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh create mode 100644 tools/testing/selftests/drivers/net/hw/ethtool_lib.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_mm.sh create mode 100755 tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh create mode 100755 tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh create mode 100755 tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh create mode 100755 tools/testing/selftests/drivers/net/hw/loopback.sh create mode 100644 tools/testing/selftests/drivers/net/hw/settings delete mode 100755 tools/testing/selftests/net/devlink_port_split.py delete mode 100755 tools/testing/selftests/net/forwarding/ethtool.sh delete mode 100755 tools/testing/selftests/net/forwarding/ethtool_extended_state.sh delete mode 100644 tools/testing/selftests/net/forwarding/ethtool_lib.sh delete mode 100755 tools/testing/selftests/net/forwarding/ethtool_mm.sh delete mode 100755 tools/testing/selftests/net/forwarding/ethtool_rmon.sh delete mode 100755 tools/testing/selftests/net/forwarding/hw_stats_l3.sh delete mode 100755 tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh delete mode 100755 tools/testing/selftests/net/forwarding/loopback.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile new file mode 100644 index 000000000000..2259a39a70ed --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0+ OR MIT + +TEST_PROGS = \ + devlink_port_split.py \ + ethtool.sh \ + ethtool_extended_state.sh \ + ethtool_mm.sh \ + ethtool_rmon.sh \ + hw_stats_l3.sh \ + hw_stats_l3_gre.sh \ + loopback.sh \ + # + +TEST_FILES := \ + ethtool_lib.sh \ + # + +TEST_INCLUDES := \ + ../../../net/lib.sh \ + ../../../net/forwarding/lib.sh \ + ../../../net/forwarding/ipip_lib.sh \ + ../../../net/forwarding/tc_common.sh \ + # + +include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/hw/devlink_port_split.py b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py new file mode 100755 index 000000000000..2d84c7a0be6b --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from subprocess import PIPE, Popen +import json +import time +import argparse +import collections +import sys + +# +# Test port split configuration using devlink-port lanes attribute. +# The test is skipped in case the attribute is not available. +# +# First, check that all the ports with 1 lane fail to split. +# Second, check that all the ports with more than 1 lane can be split +# to all valid configurations (e.g., split to 2, split to 4 etc.) +# + + +# Kselftest framework requirement - SKIP code is 4 +KSFT_SKIP=4 +Port = collections.namedtuple('Port', 'bus_info name') + + +def run_command(cmd, should_fail=False): + """ + Run a command in subprocess. + Return: Tuple of (stdout, stderr). + """ + + p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) + stdout, stderr = p.communicate() + stdout, stderr = stdout.decode(), stderr.decode() + + if stderr != "" and not should_fail: + print("Error sending command: %s" % cmd) + print(stdout) + print(stderr) + return stdout, stderr + + +class devlink_ports(object): + """ + Class that holds information on the devlink ports, required to the tests; + if_names: A list of interfaces in the devlink ports. + """ + + def get_if_names(dev): + """ + Get a list of physical devlink ports. + Return: Array of tuples (bus_info/port, if_name). + """ + + arr = [] + + cmd = "devlink -j port show" + stdout, stderr = run_command(cmd) + assert stderr == "" + ports = json.loads(stdout)['port'] + + validate_devlink_output(ports, 'flavour') + + for port in ports: + if dev in port: + if ports[port]['flavour'] == 'physical': + arr.append(Port(bus_info=port, name=ports[port]['netdev'])) + + return arr + + def __init__(self, dev): + self.if_names = devlink_ports.get_if_names(dev) + + +def get_max_lanes(port): + """ + Get the $port's maximum number of lanes. + Return: number of lanes, e.g. 1, 2, 4 and 8. + """ + + cmd = "devlink -j port show %s" % port + stdout, stderr = run_command(cmd) + assert stderr == "" + values = list(json.loads(stdout)['port'].values())[0] + + if 'lanes' in values: + lanes = values['lanes'] + else: + lanes = 0 + return lanes + + +def get_split_ability(port): + """ + Get the $port split ability. + Return: split ability, true or false. + """ + + cmd = "devlink -j port show %s" % port.name + stdout, stderr = run_command(cmd) + assert stderr == "" + values = list(json.loads(stdout)['port'].values())[0] + + return values['splittable'] + + +def split(k, port, should_fail=False): + """ + Split $port into $k ports. + If should_fail == True, the split should fail. Otherwise, should pass. + Return: Array of sub ports after splitting. + If the $port wasn't split, the array will be empty. + """ + + cmd = "devlink port split %s count %s" % (port.bus_info, k) + stdout, stderr = run_command(cmd, should_fail=should_fail) + + if should_fail: + if not test(stderr != "", "%s is unsplittable" % port.name): + print("split an unsplittable port %s" % port.name) + return create_split_group(port, k) + else: + if stderr == "": + return create_split_group(port, k) + print("didn't split a splittable port %s" % port.name) + + return [] + + +def unsplit(port): + """ + Unsplit $port. + """ + + cmd = "devlink port unsplit %s" % port + stdout, stderr = run_command(cmd) + test(stderr == "", "Unsplit port %s" % port) + + +def exists(port, dev): + """ + Check if $port exists in the devlink ports. + Return: True is so, False otherwise. + """ + + return any(dev_port.name == port + for dev_port in devlink_ports.get_if_names(dev)) + + +def exists_and_lanes(ports, lanes, dev): + """ + Check if every port in the list $ports exists in the devlink ports and has + $lanes number of lanes after splitting. + Return: True if both are True, False otherwise. + """ + + for port in ports: + max_lanes = get_max_lanes(port) + if not exists(port, dev): + print("port %s doesn't exist in devlink ports" % port) + return False + if max_lanes != lanes: + print("port %s has %d lanes, but %s were expected" + % (port, lanes, max_lanes)) + return False + return True + + +def test(cond, msg): + """ + Check $cond and print a message accordingly. + Return: True is pass, False otherwise. + """ + + if cond: + print("TEST: %-60s [ OK ]" % msg) + else: + print("TEST: %-60s [FAIL]" % msg) + + return cond + + +def create_split_group(port, k): + """ + Create the split group for $port. + Return: Array with $k elements, which are the split port group. + """ + + return list(port.name + "s" + str(i) for i in range(k)) + + +def split_unsplittable_port(port, k): + """ + Test that splitting of unsplittable port fails. + """ + + # split to max + new_split_group = split(k, port, should_fail=True) + + if new_split_group != []: + unsplit(port.bus_info) + + +def split_splittable_port(port, k, lanes, dev): + """ + Test that splitting of splittable port passes correctly. + """ + + new_split_group = split(k, port) + + # Once the split command ends, it takes some time to the sub ifaces' + # to get their names. Use udevadm to continue only when all current udev + # events are handled. + cmd = "udevadm settle" + stdout, stderr = run_command(cmd) + assert stderr == "" + + if new_split_group != []: + test(exists_and_lanes(new_split_group, lanes/k, dev), + "split port %s into %s" % (port.name, k)) + + unsplit(port.bus_info) + + +def validate_devlink_output(devlink_data, target_property=None): + """ + Determine if test should be skipped by checking: + 1. devlink_data contains values + 2. The target_property exist in devlink_data + """ + skip_reason = None + if any(devlink_data.values()): + if target_property: + skip_reason = "{} not found in devlink output, test skipped".format(target_property) + for key in devlink_data: + if target_property in devlink_data[key]: + skip_reason = None + else: + skip_reason = 'devlink output is empty, test skipped' + + if skip_reason: + print(skip_reason) + sys.exit(KSFT_SKIP) + + +def make_parser(): + parser = argparse.ArgumentParser(description='A test for port splitting.') + parser.add_argument('--dev', + help='The devlink handle of the device under test. ' + + 'The default is the first registered devlink ' + + 'handle.') + + return parser + + +def main(cmdline=None): + parser = make_parser() + args = parser.parse_args(cmdline) + + dev = args.dev + if not dev: + cmd = "devlink -j dev show" + stdout, stderr = run_command(cmd) + assert stderr == "" + + validate_devlink_output(json.loads(stdout)) + devs = json.loads(stdout)['dev'] + dev = list(devs.keys())[0] + + cmd = "devlink dev show %s" % dev + stdout, stderr = run_command(cmd) + if stderr != "": + print("devlink device %s can not be found" % dev) + sys.exit(1) + + ports = devlink_ports(dev) + + found_max_lanes = False + for port in ports.if_names: + max_lanes = get_max_lanes(port.name) + + # If max lanes is 0, do not test port splitting at all + if max_lanes == 0: + continue + + # If 1 lane, shouldn't be able to split + elif max_lanes == 1: + test(not get_split_ability(port), + "%s should not be able to split" % port.name) + split_unsplittable_port(port, max_lanes) + + # Else, splitting should pass and all the split ports should exist. + else: + lane = max_lanes + test(get_split_ability(port), + "%s should be able to split" % port.name) + while lane > 1: + split_splittable_port(port, lane, max_lanes, dev) + + lane //= 2 + found_max_lanes = True + + if not found_max_lanes: + print(f"Test not started, no port of device {dev} reports max_lanes") + sys.exit(KSFT_SKIP) + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh new file mode 100755 index 000000000000..187429670ee7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh @@ -0,0 +1,302 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + same_speeds_autoneg_off + different_speeds_autoneg_off + combination_of_neg_on_and_off + advertise_subset_of_speeds + check_highest_speed_is_chosen + different_speeds_autoneg_on +" +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source ethtool_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/24 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/24 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy +} + +same_speeds_autoneg_off() +{ + # Check that when each of the reported speeds is forced, the links come + # up and are operational. + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0)) + + for speed in "${speeds_arr[@]}"; do + RET=0 + ethtool_set $h1 speed $speed autoneg off + ethtool_set $h2 speed $speed autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? "speed $speed autoneg off" + log_test "force of same speed autoneg off" + log_info "speed = $speed" + done + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +different_speeds_autoneg_off() +{ + # Test that when we force different speeds, links are not up and ping + # fails. + RET=0 + + local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0)) + local speed1=${speeds_arr[0]} + local speed2=${speeds_arr[1]} + + ethtool_set $h1 speed $speed1 autoneg off + ethtool_set $h2 speed $speed2 autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_fail $? "ping with different speeds" + + log_test "force of different speeds autoneg off" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +combination_of_neg_on_and_off() +{ + # Test that when one device is forced to a speed supported by both + # endpoints and the other device is configured to autoneg on, the links + # are up and ping passes. + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) + + for speed in "${speeds_arr[@]}"; do + RET=0 + ethtool_set $h1 speed $speed autoneg off + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? "h1-speed=$speed autoneg off, h2 autoneg on" + log_test "one side with autoneg off and another with autoneg on" + log_info "force speed = $speed" + done + + ethtool -s $h1 autoneg on +} + +hex_speed_value_get() +{ + local speed=$1; shift + + local shift_size=${speed_values[$speed]} + speed=$((0x1 << $"shift_size")) + printf "%#x" "$speed" +} + +subset_of_common_speeds_get() +{ + local dev1=$1; shift + local dev2=$1; shift + local adver=$1; shift + + local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver)) + local speed_to_advertise=0 + local speed_to_remove=${speeds_arr[0]} + speed_to_remove+='base' + + local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver)) + + for speed in ${speeds_mode_arr[@]}; do + if [[ $speed != $speed_to_remove* ]]; then + speed=$(hex_speed_value_get $speed) + speed_to_advertise=$(($speed_to_advertise | \ + $speed)) + fi + + done + + # Convert to hex. + printf "%#x" "$speed_to_advertise" +} + +speed_to_advertise_get() +{ + # The function returns the hex number that is composed by OR-ing all + # the modes corresponding to the provided speed. + local speed_without_mode=$1; shift + local supported_speeds=("$@"); shift + local speed_to_advertise=0 + + speed_without_mode+='base' + + for speed in ${supported_speeds[@]}; do + if [[ $speed == $speed_without_mode* ]]; then + speed=$(hex_speed_value_get $speed) + speed_to_advertise=$(($speed_to_advertise | \ + $speed)) + fi + + done + + # Convert to hex. + printf "%#x" "$speed_to_advertise" +} + +advertise_subset_of_speeds() +{ + # Test that when one device advertises a subset of speeds and another + # advertises a specific speed (but all modes of this speed), the links + # are up and ping passes. + RET=0 + + local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) + ethtool_set $h1 advertise $speed_1_to_advertise + + if [ $RET != 0 ]; then + log_test "advertise subset of speeds" + return + fi + + local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1)) + # Check only speeds that h1 advertised. Remove the first speed. + unset speeds_arr_without_mode[0] + local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1)) + + for speed_value in ${speeds_arr_without_mode[@]}; do + RET=0 + local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \ + "${speeds_arr_with_mode[@]}") + ethtool_set $h2 advertise $speed_2_to_advertise + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" + + log_test "advertise subset of speeds" + log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise" + done + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +check_highest_speed_is_chosen() +{ + # Test that when one device advertises a subset of speeds, the other + # chooses the highest speed. This test checks configuration without + # traffic. + RET=0 + + local max_speed + local chosen_speed + local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) + + ethtool_set $h1 advertise $speed_to_advertise + + if [ $RET != 0 ]; then + log_test "check highest speed" + return + fi + + local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) + + max_speed=${speeds_arr[0]} + for current in ${speeds_arr[@]}; do + if [[ $current -gt $max_speed ]]; then + max_speed=$current + fi + done + + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + chosen_speed=$(ethtool $h1 | grep 'Speed:') + chosen_speed=${chosen_speed%"Mb/s"*} + chosen_speed=${chosen_speed#*"Speed: "} + ((chosen_speed == max_speed)) + check_err $? "h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed" + + log_test "check highest speed" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +different_speeds_autoneg_on() +{ + # Test that when we configure links to advertise different speeds, + # links are not up and ping fails. + RET=0 + + local -a speeds=($(different_speeds_get $h1 $h2 1 1)) + local speed1=${speeds[0]} + local speed2=${speeds[1]} + + speed1=$(hex_speed_value_get $speed1) + speed2=$(hex_speed_value_get $speed2) + + ethtool_set $h1 advertise $speed1 + ethtool_set $h2 advertise $speed2 + + if (($RET)); then + setup_wait_dev_with_timeout $h1 + setup_wait_dev_with_timeout $h2 + ping_do $h1 192.0.2.2 + check_fail $? "ping with different speeds autoneg on" + fi + + log_test "advertise different speeds autoneg on" + + ethtool -s $h2 autoneg on + ethtool -s $h1 autoneg on +} + +skip_on_veth + +trap cleanup EXIT + +setup_prepare +setup_wait + +declare -gA speed_values +eval "speed_values=($(speeds_arr_get))" + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh new file mode 100755 index 000000000000..b0f931260a27 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + autoneg + autoneg_force_mode + no_cable +" + +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source ethtool_lib.sh + +TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + swp3=$NETIF_NO_CABLE +} + +ethtool_ext_state() +{ + local dev=$1; shift + local expected_ext_state=$1; shift + local expected_ext_substate=${1:-""}; shift + + local ext_state=$(ethtool $dev | grep "Link detected" \ + | cut -d "(" -f2 | cut -d ")" -f1) + local ext_substate=$(echo $ext_state | cut -sd "," -f2 \ + | sed -e 's/^[[:space:]]*//') + ext_state=$(echo $ext_state | cut -d "," -f1) + + if [[ $ext_state != $expected_ext_state ]]; then + echo "Expected \"$expected_ext_state\", got \"$ext_state\"" + return 1 + fi + if [[ $ext_substate != $expected_ext_substate ]]; then + echo "Expected \"$expected_ext_substate\", got \"$ext_substate\"" + return 1 + fi +} + +autoneg() +{ + local msg + + RET=0 + + ip link set dev $swp1 up + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected") + check_err $? "$msg" + + log_test "Autoneg, No partner detected" + + ip link set dev $swp1 down +} + +autoneg_force_mode() +{ + local msg + + RET=0 + + ip link set dev $swp1 up + ip link set dev $swp2 up + + local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0)) + local speed1=${speeds_arr[0]} + local speed2=${speeds_arr[1]} + + ethtool_set $swp1 speed $speed1 autoneg off + ethtool_set $swp2 speed $speed2 autoneg off + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp2 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" + + log_test "Autoneg, No partner detected during force mode" + + ethtool -s $swp2 autoneg on + ethtool -s $swp1 autoneg on + + ip link set dev $swp2 down + ip link set dev $swp1 down +} + +no_cable() +{ + local msg + + RET=0 + + ip link set dev $swp3 up + + msg=$(busywait $TIMEOUT ethtool_ext_state $swp3 "No cable") + check_err $? "$msg" + + log_test "No cable" + + ip link set dev $swp3 down +} + +skip_on_veth + +setup_prepare + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh new file mode 100644 index 000000000000..b9bfb45085af --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +speeds_arr_get() +{ + cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \ + {sub(/,$/, "") \ + sub(/ETHTOOL_LINK_MODE_/,"") \ + sub(/_BIT/,"") \ + sub(/_Full/,"/Full") \ + sub(/_Half/,"/Half");\ + print "["$1"]="$3}' + + awk "${cmd}" /usr/include/linux/ethtool.h +} + +ethtool_set() +{ + local cmd="$@" + local out=$(ethtool -s $cmd 2>&1 | wc -l) + + check_err $out "error in configuration. $cmd" +} + +dev_linkmodes_params_get() +{ + local dev=$1; shift + local adver=$1; shift + local -a linkmodes_params + local param_count + local arr + + if (($adver)); then + mode="Advertised link modes" + else + mode="Supported link modes" + fi + + local -a dev_linkmodes=($(dev_speeds_get $dev 1 $adver)) + for ((i=0; i<${#dev_linkmodes[@]}; i++)); do + linkmodes_params[$i]=$(echo -e "${dev_linkmodes[$i]}" | \ + # Replaces all non numbers with spaces + sed -e 's/[^0-9]/ /g' | \ + # Squeeze spaces in sequence to 1 space + tr -s ' ') + # Count how many numbers were found in the linkmode + param_count=$(echo "${linkmodes_params[$i]}" | wc -w) + if [[ $param_count -eq 1 ]]; then + linkmodes_params[$i]="${linkmodes_params[$i]} 1" + elif [[ $param_count -ge 3 ]]; then + arr=(${linkmodes_params[$i]}) + # Take only first two params + linkmodes_params[$i]=$(echo "${arr[@]:0:2}") + fi + done + echo ${linkmodes_params[@]} +} + +dev_speeds_get() +{ + local dev=$1; shift + local with_mode=$1; shift + local adver=$1; shift + local speeds_str + + if (($adver)); then + mode="Advertised link modes" + else + mode="Supported link modes" + fi + + speeds_str=$(ethtool "$dev" | \ + # Snip everything before the link modes section. + sed -n '/'"$mode"':/,$p' | \ + # Quit processing the rest at the start of the next section. + # When checking, skip the header of this section (hence the 2,). + sed -n '2,${/^[\t][^ \t]/q};p' | \ + # Drop the section header of the current section. + cut -d':' -f2) + + local -a speeds_arr=($speeds_str) + if [[ $with_mode -eq 0 ]]; then + for ((i=0; i<${#speeds_arr[@]}; i++)); do + speeds_arr[$i]=${speeds_arr[$i]%base*} + done + fi + echo ${speeds_arr[@]} +} + +common_speeds_get() +{ + dev1=$1; shift + dev2=$1; shift + with_mode=$1; shift + adver=$1; shift + + local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver)) + local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver)) + + comm -12 \ + <(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \ + <(printf '%s\n' "${dev2_speeds[@]}" | sort -u) +} + +different_speeds_get() +{ + local dev1=$1; shift + local dev2=$1; shift + local with_mode=$1; shift + local adver=$1; shift + + local -a speeds_arr + + speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver)) + if [[ ${#speeds_arr[@]} < 2 ]]; then + check_err 1 "cannot check different speeds. There are not enough speeds" + fi + + echo ${speeds_arr[0]} ${speeds_arr[1]} +} diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh new file mode 100755 index 000000000000..c301e735c8ab --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh @@ -0,0 +1,341 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + manual_with_verification_h1_to_h2 + manual_with_verification_h2_to_h1 + manual_without_verification_h1_to_h2 + manual_without_verification_h2_to_h1 + manual_failed_verification_h1_to_h2 + manual_failed_verification_h2_to_h1 + lldp +" + +NUM_NETIFS=2 +REQUIRE_MZ=no +PREEMPTIBLE_PRIO=0 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +traffic_test() +{ + local if=$1; shift + local src=$1; shift + local num_pkts=10000 + local before= + local after= + local delta= + + if [ ${has_pmac_stats[$if]} = false ]; then + src="aggregate" + fi + + before=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) + + $MZ $if -q -c $num_pkts -p 64 -b bcast -t ip -R $PREEMPTIBLE_PRIO + + after=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) + + delta=$((after - before)) + + # Allow an extra 1% tolerance for random packets sent by the stack + [ $delta -ge $num_pkts ] && [ $delta -le $((num_pkts + 100)) ] +} + +manual_with_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + # It isn't completely clear from IEEE 802.3-2018 Figure 99-5: Transmit + # Processing state diagram whether the "send_r" variable (send response + # to verification frame) should be taken into consideration while the + # MAC Merge TX direction is disabled. That being said, at least the + # NXP ENETC does not, and requires tx-enabled on in order to respond to + # the link partner's verification frames. + ethtool --set-mm $rx tx-enabled on + ethtool --set-mm $tx verify-enabled on tx-enabled on + + # Wait for verification to finish + sleep 1 + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'SUCCEEDED' + check_err "$?" "Verification did not succeed" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_err "$?" "pMAC TX is not active" + + traffic_test $tx "pmac" + check_err "$?" "Traffic did not get sent through $tx's pMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + ethtool --set-mm $rx tx-enabled off + + log_test "Manual configuration with verification: $tx to $rx" +} + +manual_with_verification_h1_to_h2() +{ + manual_with_verification $h1 $h2 +} + +manual_with_verification_h2_to_h1() +{ + manual_with_verification $h2 $h1 +} + +manual_without_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + ethtool --set-mm $tx verify-enabled off tx-enabled on + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'DISABLED' + check_err "$?" "Verification is not disabled" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_err "$?" "pMAC TX is not active" + + traffic_test $tx "pmac" + check_err "$?" "Traffic did not get sent through $tx's pMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + + log_test "Manual configuration without verification: $tx to $rx" +} + +manual_without_verification_h1_to_h2() +{ + manual_without_verification $h1 $h2 +} + +manual_without_verification_h2_to_h1() +{ + manual_without_verification $h2 $h1 +} + +manual_failed_verification() +{ + local tx=$1; shift + local rx=$1; shift + + RET=0 + + ethtool --set-mm $rx pmac-enabled off + ethtool --set-mm $tx verify-enabled on tx-enabled on + + # Wait for verification to time out + sleep 1 + + ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ + grep -q 'SUCCEEDED' + check_fail "$?" "Verification succeeded when it shouldn't have" + + ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' + check_fail "$?" "pMAC TX is active when it shouldn't have" + + traffic_test $tx "emac" + check_err "$?" "Traffic did not get sent through $tx's eMAC" + + ethtool --set-mm $tx verify-enabled off tx-enabled off + ethtool --set-mm $rx pmac-enabled on + + log_test "Manual configuration with failed verification: $tx to $rx" +} + +manual_failed_verification_h1_to_h2() +{ + manual_failed_verification $h1 $h2 +} + +manual_failed_verification_h2_to_h1() +{ + manual_failed_verification $h2 $h1 +} + +smallest_supported_add_frag_size() +{ + local iface=$1 + local rx_min_frag_size= + + rx_min_frag_size=$(ethtool --json --show-mm $iface | \ + jq '.[]."rx-min-frag-size"') + + if [ $rx_min_frag_size -le 60 ]; then + echo 0 + elif [ $rx_min_frag_size -le 124 ]; then + echo 1 + elif [ $rx_min_frag_size -le 188 ]; then + echo 2 + elif [ $rx_min_frag_size -le 252 ]; then + echo 3 + else + echo "$iface: RX min frag size $rx_min_frag_size cannot be advertised over LLDP" + exit 1 + fi +} + +expected_add_frag_size() +{ + local iface=$1 + local requested=$2 + local min=$(smallest_supported_add_frag_size $iface) + + [ $requested -le $min ] && echo $min || echo $requested +} + +lldp_change_add_frag_size() +{ + local add_frag_size=$1 + local pattern= + + lldptool -T -i $h1 -V addEthCaps addFragSize=$add_frag_size >/dev/null + # Wait for TLVs to be received + sleep 2 + pattern=$(printf "Additional fragment size: %d" \ + $(expected_add_frag_size $h1 $add_frag_size)) + lldptool -i $h2 -t -n -V addEthCaps | grep -q "$pattern" +} + +lldp() +{ + RET=0 + + systemctl start lldpad + + # Configure the interfaces to receive and transmit LLDPDUs + lldptool -L -i $h1 adminStatus=rxtx >/dev/null + lldptool -L -i $h2 adminStatus=rxtx >/dev/null + + # Enable the transmission of Additional Ethernet Capabilities TLV + lldptool -T -i $h1 -V addEthCaps enableTx=yes >/dev/null + lldptool -T -i $h2 -V addEthCaps enableTx=yes >/dev/null + + # Wait for TLVs to be received + sleep 2 + + lldptool -i $h1 -t -n -V addEthCaps | \ + grep -q "Preemption capability active" + check_err "$?" "$h1 pMAC TX is not active" + + lldptool -i $h2 -t -n -V addEthCaps | \ + grep -q "Preemption capability active" + check_err "$?" "$h2 pMAC TX is not active" + + lldp_change_add_frag_size 3 + check_err "$?" "addFragSize 3" + + lldp_change_add_frag_size 2 + check_err "$?" "addFragSize 2" + + lldp_change_add_frag_size 1 + check_err "$?" "addFragSize 1" + + lldp_change_add_frag_size 0 + check_err "$?" "addFragSize 0" + + traffic_test $h1 "pmac" + check_err "$?" "Traffic did not get sent through $h1's pMAC" + + traffic_test $h2 "pmac" + check_err "$?" "Traffic did not get sent through $h2's pMAC" + + systemctl stop lldpad + + log_test "LLDP" +} + +h1_create() +{ + ip link set dev $h1 up + + tc qdisc add dev $h1 root mqprio num_tc 4 map 0 1 2 3 \ + queues 1@0 1@1 1@2 1@3 \ + fp P E E E \ + hw 1 + + ethtool --set-mm $h1 pmac-enabled on tx-enabled off verify-enabled off +} + +h2_create() +{ + ip link set dev $h2 up + + ethtool --set-mm $h2 pmac-enabled on tx-enabled off verify-enabled off + + tc qdisc add dev $h2 root mqprio num_tc 4 map 0 1 2 3 \ + queues 1@0 1@1 1@2 1@3 \ + fp P E E E \ + hw 1 +} + +h1_destroy() +{ + ethtool --set-mm $h1 pmac-enabled off tx-enabled off verify-enabled off + + tc qdisc del dev $h1 root + + ip link set dev $h1 down +} + +h2_destroy() +{ + tc qdisc del dev $h2 root + + ethtool --set-mm $h2 pmac-enabled off tx-enabled off verify-enabled off + + ip link set dev $h2 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy +} + +check_ethtool_mm_support +check_tc_fp_support +require_command lldptool +bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually" + +for netif in ${NETIFS[@]}; do + ethtool --show-mm $netif 2>&1 &> /dev/null + if [[ $? -ne 0 ]]; then + echo "SKIP: $netif does not support MAC Merge" + exit $ksft_skip + fi + + if check_ethtool_pmac_std_stats_support $netif eth-mac; then + has_pmac_stats[$netif]=true + else + has_pmac_stats[$netif]=false + echo "$netif does not report pMAC statistics, falling back to aggregate" + fi +done + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh new file mode 100755 index 000000000000..709433a4c886 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + rmon_rx_histogram + rmon_tx_histogram +" + +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +ETH_FCS_LEN=4 +ETH_HLEN=$((6+6+2)) + +declare -A netif_mtu + +ensure_mtu() +{ + local iface=$1; shift + local len=$1; shift + local current=$(ip -j link show dev $iface | jq -r '.[0].mtu') + local required=$((len - ETH_HLEN - ETH_FCS_LEN)) + + if [ $current -lt $required ]; then + ip link set dev $iface mtu $required || return 1 + fi +} + +bucket_test() +{ + local iface=$1; shift + local neigh=$1; shift + local set=$1; shift + local bucket=$1; shift + local len=$1; shift + local num_rx=10000 + local num_tx=20000 + local expected= + local before= + local after= + local delta= + + # Mausezahn does not include FCS bytes in its length - but the + # histogram counters do + len=$((len - ETH_FCS_LEN)) + + before=$(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") + + # Send 10k one way and 20k in the other, to detect counters + # mapped to the wrong direction + $MZ $neigh -q -c $num_rx -p $len -a own -b bcast -d 10us + $MZ $iface -q -c $num_tx -p $len -a own -b bcast -d 10us + + after=$(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") + + delta=$((after - before)) + + expected=$([ $set = rx ] && echo $num_rx || echo $num_tx) + + # Allow some extra tolerance for other packets sent by the stack + [ $delta -ge $expected ] && [ $delta -le $((expected + 100)) ] +} + +rmon_histogram() +{ + local iface=$1; shift + local neigh=$1; shift + local set=$1; shift + local nbuckets=0 + local step= + + RET=0 + + while read -r -a bucket; do + step="$set-pkts${bucket[0]}to${bucket[1]} on $iface" + + for if in $iface $neigh; do + if ! ensure_mtu $if ${bucket[0]}; then + log_test_skip "$if does not support the required MTU for $step" + return + fi + done + + if ! bucket_test $iface $neigh $set $nbuckets ${bucket[0]}; then + check_err 1 "$step failed" + return 1 + fi + log_test "$step" + nbuckets=$((nbuckets + 1)) + done < <(ethtool --json -S $iface --groups rmon | \ + jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null) + + if [ $nbuckets -eq 0 ]; then + log_test_skip "$iface does not support $set histogram counters" + return + fi +} + +rmon_rx_histogram() +{ + rmon_histogram $h1 $h2 rx + rmon_histogram $h2 $h1 rx +} + +rmon_tx_histogram() +{ + rmon_histogram $h1 $h2 tx + rmon_histogram $h2 $h1 tx +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + for iface in $h1 $h2; do + netif_mtu[$iface]=$(ip -j link show dev $iface | jq -r '.[0].mtu') + ip link set dev $iface up + done +} + +cleanup() +{ + pre_cleanup + + for iface in $h2 $h1; do + ip link set dev $iface \ + mtu ${netif_mtu[$iface]} \ + down + done +} + +check_ethtool_counter_group_support +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh new file mode 100755 index 000000000000..744760117005 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh @@ -0,0 +1,341 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +----------------------+ +# | H1 | | H2 | +# | | | | +# | $h1.200 + | | + $h2.200 | +# | 192.0.2.1/28 | | | | 192.0.2.18/28 | +# | 2001:db8:1::1/64 | | | | 2001:db8:2::1/64 | +# | | | | | | +# | $h1 + | | + $h2 | +# | | | | | | +# +------------------|-+ +-|--------------------+ +# | | +# +------------------|-------------------------|--------------------+ +# | SW | | | +# | | | | +# | $rp1 + + $rp2 | +# | | | | +# | $rp1.200 + + $rp2.200 | +# | 192.0.2.2/28 192.0.2.17/28 | +# | 2001:db8:1::2/64 2001:db8:2::2/64 | +# | | +# +-----------------------------------------------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + respin_enablement + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + reapply_config + ping_ipv4 + ping_ipv6 + test_stats_rx_ipv4 + test_stats_tx_ipv4 + test_stats_rx_ipv6 + test_stats_tx_ipv6 + test_stats_report_rx + test_stats_report_tx + test_destroy_enabled + test_double_enable +" +NUM_NETIFS=4 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1_create() +{ + simple_if_init $h1 + vlan_create $h1 200 v$h1 192.0.2.1/28 2001:db8:1::1/64 + ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 +} + +h1_destroy() +{ + ip -6 route del 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 + ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + vlan_destroy $h1 200 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + vlan_create $h2 200 v$h2 192.0.2.18/28 2001:db8:2::1/64 + ip route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 + ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 +} + +h2_destroy() +{ + ip -6 route del 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 + ip route del 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 + vlan_destroy $h2 200 + simple_if_fini $h2 +} + +router_rp1_200_create() +{ + ip link add name $rp1.200 link $rp1 type vlan id 200 + ip link set dev $rp1.200 addrgenmode eui64 + ip link set dev $rp1.200 up + ip address add dev $rp1.200 192.0.2.2/28 + ip address add dev $rp1.200 2001:db8:1::2/64 + ip stats set dev $rp1.200 l3_stats on +} + +router_rp1_200_destroy() +{ + ip stats set dev $rp1.200 l3_stats off + ip address del dev $rp1.200 2001:db8:1::2/64 + ip address del dev $rp1.200 192.0.2.2/28 + ip link del dev $rp1.200 +} + +router_create() +{ + ip link set dev $rp1 up + router_rp1_200_create + + ip link set dev $rp2 up + vlan_create $rp2 200 "" 192.0.2.17/28 2001:db8:2::2/64 +} + +router_destroy() +{ + vlan_destroy $rp2 200 + ip link set dev $rp2 down + + router_rp1_200_destroy + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1mac=$(mac_get $rp1) + rp2mac=$(mac_get $rp2) + + vrf_prepare + + h1_create + h2_create + + router_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + router_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1.200 192.0.2.18 " IPv4" +} + +ping_ipv6() +{ + ping_test $h1.200 2001:db8:2::1 " IPv6" +} + +send_packets_rx_ipv4() +{ + # Send 21 packets instead of 20, because the first one might trap and go + # through the SW datapath, which might not bump the HW counter. + $MZ $h1.200 -c 21 -d 20msec -p 100 \ + -a own -b $rp1mac -A 192.0.2.1 -B 192.0.2.18 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_rx_ipv6() +{ + $MZ $h1.200 -6 -c 21 -d 20msec -p 100 \ + -a own -b $rp1mac -A 2001:db8:1::1 -B 2001:db8:2::1 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_tx_ipv4() +{ + $MZ $h2.200 -c 21 -d 20msec -p 100 \ + -a own -b $rp2mac -A 192.0.2.18 -B 192.0.2.1 \ + -q -t udp sp=54321,dp=12345 +} + +send_packets_tx_ipv6() +{ + $MZ $h2.200 -6 -c 21 -d 20msec -p 100 \ + -a own -b $rp2mac -A 2001:db8:2::1 -B 2001:db8:1::1 \ + -q -t udp sp=54321,dp=12345 +} + +___test_stats() +{ + local dir=$1; shift + local prot=$1; shift + + local a + local b + + a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) + send_packets_${dir}_${prot} + "$@" + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $rp1.200 ${dir} packets) + check_err $? "Traffic not reflected in the counter: $a -> $b" +} + +__test_stats() +{ + local dir=$1; shift + local prot=$1; shift + + RET=0 + ___test_stats "$dir" "$prot" + log_test "Test $dir packets: $prot" +} + +test_stats_rx_ipv4() +{ + __test_stats rx ipv4 +} + +test_stats_tx_ipv4() +{ + __test_stats tx ipv4 +} + +test_stats_rx_ipv6() +{ + __test_stats rx ipv6 +} + +test_stats_tx_ipv6() +{ + __test_stats tx ipv6 +} + +# Make sure everything works well even after stats have been disabled and +# reenabled on the same device without touching the L3 configuration. +respin_enablement() +{ + log_info "Turning stats off and on again" + ip stats set dev $rp1.200 l3_stats off + ip stats set dev $rp1.200 l3_stats on +} + +# For the initial run, l3_stats is enabled on a completely set up netdevice. Now +# do it the other way around: enabling the L3 stats on an L2 netdevice, and only +# then apply the L3 configuration. +reapply_config() +{ + log_info "Reapplying configuration" + + router_rp1_200_destroy + + ip link add name $rp1.200 link $rp1 type vlan id 200 + ip link set dev $rp1.200 addrgenmode none + ip stats set dev $rp1.200 l3_stats on + ip link set dev $rp1.200 addrgenmode eui64 + ip link set dev $rp1.200 up + ip address add dev $rp1.200 192.0.2.2/28 + ip address add dev $rp1.200 2001:db8:1::2/64 +} + +__test_stats_report() +{ + local dir=$1; shift + local prot=$1; shift + + local a + local b + + RET=0 + + a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) + send_packets_${dir}_${prot} + ip address flush dev $rp1.200 + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $rp1.200 ${dir} packets) + check_err $? "Traffic not reflected in the counter: $a -> $b" + log_test "Test ${dir} packets: stats pushed on loss of L3" + + ip stats set dev $rp1.200 l3_stats off + ip link del dev $rp1.200 + router_rp1_200_create +} + +test_stats_report_rx() +{ + __test_stats_report rx ipv4 +} + +test_stats_report_tx() +{ + __test_stats_report tx ipv4 +} + +test_destroy_enabled() +{ + RET=0 + + ip link del dev $rp1.200 + router_rp1_200_create + + log_test "Destroy l3_stats-enabled netdev" +} + +test_double_enable() +{ + RET=0 + ___test_stats rx ipv4 \ + ip stats set dev $rp1.200 l3_stats on + log_test "Test stat retention across a spurious enablement" +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info | + jq '.[].info.l3_stats.used') +kind=$(ip -j -d link show dev $rp1 | + jq -r '.[].linkinfo.info_kind') +if [[ $used != true ]]; then + if [[ $kind == veth ]]; then + log_test_skip "l3_stats not offloaded on veth interface" + EXIT_STATUS=$ksft_skip + else + RET=1 log_test "l3_stats not offloaded" + fi +else + tests_run +fi + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh new file mode 100755 index 000000000000..354be353455f --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test L3 stats on IP-in-IP GRE tunnel without key. + +# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more +# details. + +ALL_TESTS=" + ping_ipv4 + test_stats_rx + test_stats_tx +" +NUM_NETIFS=6 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/ipip_lib.sh + +setup_prepare() +{ + h1=${NETIFS[p1]} + ol1=${NETIFS[p2]} + + ul1=${NETIFS[p3]} + ul2=${NETIFS[p4]} + + ol2=${NETIFS[p5]} + h2=${NETIFS[p6]} + + ol1mac=$(mac_get $ol1) + + forwarding_enable + vrf_prepare + h1_create + h2_create + sw1_flat_create gre $ol1 $ul1 + sw2_flat_create gre $ol2 $ul2 + ip stats set dev g1a l3_stats on + ip stats set dev g2a l3_stats on +} + +cleanup() +{ + pre_cleanup + + ip stats set dev g1a l3_stats off + ip stats set dev g2a l3_stats off + + sw2_flat_destroy $ol2 $ul2 + sw1_flat_destroy $ol1 $ul1 + h2_destroy + h1_destroy + + vrf_cleanup + forwarding_restore +} + +ping_ipv4() +{ + RET=0 + + ping_test $h1 192.0.2.18 " gre flat" +} + +send_packets_ipv4() +{ + # Send 21 packets instead of 20, because the first one might trap and go + # through the SW datapath, which might not bump the HW counter. + $MZ $h1 -c 21 -d 20msec -p 100 \ + -a own -b $ol1mac -A 192.0.2.1 -B 192.0.2.18 \ + -q -t udp sp=54321,dp=12345 +} + +test_stats() +{ + local dev=$1; shift + local dir=$1; shift + + local a + local b + + RET=0 + + a=$(hw_stats_get l3_stats $dev $dir packets) + send_packets_ipv4 + b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ + hw_stats_get l3_stats $dev $dir packets) + check_err $? "Traffic not reflected in the counter: $a -> $b" + + log_test "Test $dir packets: $prot" +} + +test_stats_tx() +{ + test_stats g1a tx +} + +test_stats_rx() +{ + test_stats g2a rx +} + +skip_on_veth + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/loopback.sh b/tools/testing/selftests/drivers/net/hw/loopback.sh new file mode 100755 index 000000000000..5acc3ff820aa --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/loopback.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +ALL_TESTS="loopback_test" +NUM_NETIFS=2 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/tc_common.sh +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 + tc qdisc add dev $h1 clsact +} + +h1_destroy() +{ + tc qdisc del dev $h1 clsact + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 +} + +h2_destroy() +{ + simple_if_fini $h2 +} + +loopback_test() +{ + RET=0 + + tc filter add dev $h1 ingress protocol arp pref 1 handle 101 flower \ + skip_hw arp_op reply arp_tip 192.0.2.1 action drop + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 1 + check_fail $? "Matched on a filter without loopback setup" + + ethtool -K $h1 loopback on + check_err $? "Failed to enable loopback" + + setup_wait_dev $h1 + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 1 + check_err $? "Did not match on filter with loopback" + + ethtool -K $h1 loopback off + check_err $? "Failed to disable loopback" + + $MZ $h1 -c 1 -t arp -q + + tc_check_packets "dev $h1 ingress" 101 2 + check_fail $? "Matched on a filter after loopback was removed" + + tc filter del dev $h1 ingress protocol arp pref 1 handle 101 flower + + log_test "loopback" +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + vrf_prepare + + h1_create + h2_create + + if ethtool -k $h1 | grep loopback | grep -q fixed; then + log_test "SKIP: dev $h1 does not support loopback feature" + exit $ksft_skip + fi +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + + vrf_cleanup +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/settings b/tools/testing/selftests/drivers/net/hw/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7b6918d5f4af..cb418a2346bc 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -20,7 +20,6 @@ TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS += rxtimestamp.sh -TEST_PROGS += devlink_port_split.py TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py deleted file mode 100755 index 2d84c7a0be6b..000000000000 --- a/tools/testing/selftests/net/devlink_port_split.py +++ /dev/null @@ -1,309 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -from subprocess import PIPE, Popen -import json -import time -import argparse -import collections -import sys - -# -# Test port split configuration using devlink-port lanes attribute. -# The test is skipped in case the attribute is not available. -# -# First, check that all the ports with 1 lane fail to split. -# Second, check that all the ports with more than 1 lane can be split -# to all valid configurations (e.g., split to 2, split to 4 etc.) -# - - -# Kselftest framework requirement - SKIP code is 4 -KSFT_SKIP=4 -Port = collections.namedtuple('Port', 'bus_info name') - - -def run_command(cmd, should_fail=False): - """ - Run a command in subprocess. - Return: Tuple of (stdout, stderr). - """ - - p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) - stdout, stderr = p.communicate() - stdout, stderr = stdout.decode(), stderr.decode() - - if stderr != "" and not should_fail: - print("Error sending command: %s" % cmd) - print(stdout) - print(stderr) - return stdout, stderr - - -class devlink_ports(object): - """ - Class that holds information on the devlink ports, required to the tests; - if_names: A list of interfaces in the devlink ports. - """ - - def get_if_names(dev): - """ - Get a list of physical devlink ports. - Return: Array of tuples (bus_info/port, if_name). - """ - - arr = [] - - cmd = "devlink -j port show" - stdout, stderr = run_command(cmd) - assert stderr == "" - ports = json.loads(stdout)['port'] - - validate_devlink_output(ports, 'flavour') - - for port in ports: - if dev in port: - if ports[port]['flavour'] == 'physical': - arr.append(Port(bus_info=port, name=ports[port]['netdev'])) - - return arr - - def __init__(self, dev): - self.if_names = devlink_ports.get_if_names(dev) - - -def get_max_lanes(port): - """ - Get the $port's maximum number of lanes. - Return: number of lanes, e.g. 1, 2, 4 and 8. - """ - - cmd = "devlink -j port show %s" % port - stdout, stderr = run_command(cmd) - assert stderr == "" - values = list(json.loads(stdout)['port'].values())[0] - - if 'lanes' in values: - lanes = values['lanes'] - else: - lanes = 0 - return lanes - - -def get_split_ability(port): - """ - Get the $port split ability. - Return: split ability, true or false. - """ - - cmd = "devlink -j port show %s" % port.name - stdout, stderr = run_command(cmd) - assert stderr == "" - values = list(json.loads(stdout)['port'].values())[0] - - return values['splittable'] - - -def split(k, port, should_fail=False): - """ - Split $port into $k ports. - If should_fail == True, the split should fail. Otherwise, should pass. - Return: Array of sub ports after splitting. - If the $port wasn't split, the array will be empty. - """ - - cmd = "devlink port split %s count %s" % (port.bus_info, k) - stdout, stderr = run_command(cmd, should_fail=should_fail) - - if should_fail: - if not test(stderr != "", "%s is unsplittable" % port.name): - print("split an unsplittable port %s" % port.name) - return create_split_group(port, k) - else: - if stderr == "": - return create_split_group(port, k) - print("didn't split a splittable port %s" % port.name) - - return [] - - -def unsplit(port): - """ - Unsplit $port. - """ - - cmd = "devlink port unsplit %s" % port - stdout, stderr = run_command(cmd) - test(stderr == "", "Unsplit port %s" % port) - - -def exists(port, dev): - """ - Check if $port exists in the devlink ports. - Return: True is so, False otherwise. - """ - - return any(dev_port.name == port - for dev_port in devlink_ports.get_if_names(dev)) - - -def exists_and_lanes(ports, lanes, dev): - """ - Check if every port in the list $ports exists in the devlink ports and has - $lanes number of lanes after splitting. - Return: True if both are True, False otherwise. - """ - - for port in ports: - max_lanes = get_max_lanes(port) - if not exists(port, dev): - print("port %s doesn't exist in devlink ports" % port) - return False - if max_lanes != lanes: - print("port %s has %d lanes, but %s were expected" - % (port, lanes, max_lanes)) - return False - return True - - -def test(cond, msg): - """ - Check $cond and print a message accordingly. - Return: True is pass, False otherwise. - """ - - if cond: - print("TEST: %-60s [ OK ]" % msg) - else: - print("TEST: %-60s [FAIL]" % msg) - - return cond - - -def create_split_group(port, k): - """ - Create the split group for $port. - Return: Array with $k elements, which are the split port group. - """ - - return list(port.name + "s" + str(i) for i in range(k)) - - -def split_unsplittable_port(port, k): - """ - Test that splitting of unsplittable port fails. - """ - - # split to max - new_split_group = split(k, port, should_fail=True) - - if new_split_group != []: - unsplit(port.bus_info) - - -def split_splittable_port(port, k, lanes, dev): - """ - Test that splitting of splittable port passes correctly. - """ - - new_split_group = split(k, port) - - # Once the split command ends, it takes some time to the sub ifaces' - # to get their names. Use udevadm to continue only when all current udev - # events are handled. - cmd = "udevadm settle" - stdout, stderr = run_command(cmd) - assert stderr == "" - - if new_split_group != []: - test(exists_and_lanes(new_split_group, lanes/k, dev), - "split port %s into %s" % (port.name, k)) - - unsplit(port.bus_info) - - -def validate_devlink_output(devlink_data, target_property=None): - """ - Determine if test should be skipped by checking: - 1. devlink_data contains values - 2. The target_property exist in devlink_data - """ - skip_reason = None - if any(devlink_data.values()): - if target_property: - skip_reason = "{} not found in devlink output, test skipped".format(target_property) - for key in devlink_data: - if target_property in devlink_data[key]: - skip_reason = None - else: - skip_reason = 'devlink output is empty, test skipped' - - if skip_reason: - print(skip_reason) - sys.exit(KSFT_SKIP) - - -def make_parser(): - parser = argparse.ArgumentParser(description='A test for port splitting.') - parser.add_argument('--dev', - help='The devlink handle of the device under test. ' + - 'The default is the first registered devlink ' + - 'handle.') - - return parser - - -def main(cmdline=None): - parser = make_parser() - args = parser.parse_args(cmdline) - - dev = args.dev - if not dev: - cmd = "devlink -j dev show" - stdout, stderr = run_command(cmd) - assert stderr == "" - - validate_devlink_output(json.loads(stdout)) - devs = json.loads(stdout)['dev'] - dev = list(devs.keys())[0] - - cmd = "devlink dev show %s" % dev - stdout, stderr = run_command(cmd) - if stderr != "": - print("devlink device %s can not be found" % dev) - sys.exit(1) - - ports = devlink_ports(dev) - - found_max_lanes = False - for port in ports.if_names: - max_lanes = get_max_lanes(port.name) - - # If max lanes is 0, do not test port splitting at all - if max_lanes == 0: - continue - - # If 1 lane, shouldn't be able to split - elif max_lanes == 1: - test(not get_split_ability(port), - "%s should not be able to split" % port.name) - split_unsplittable_port(port, max_lanes) - - # Else, splitting should pass and all the split ports should exist. - else: - lane = max_lanes - test(get_split_ability(port), - "%s should be able to split" % port.name) - while lane > 1: - split_splittable_port(port, lane, max_lanes, dev) - - lane //= 2 - found_max_lanes = True - - if not found_max_lanes: - print(f"Test not started, no port of device {dev} reports max_lanes") - sys.exit(KSFT_SKIP) - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 535865b3d1d6..56e3557ba8a6 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -15,18 +15,12 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ bridge_vlan_unaware.sh \ custom_multipath_hash.sh \ dual_vxlan_bridge.sh \ - ethtool_extended_state.sh \ - ethtool_mm.sh \ - ethtool_rmon.sh \ - ethtool.sh \ gre_custom_multipath_hash.sh \ gre_inner_v4_multipath.sh \ gre_inner_v6_multipath.sh \ gre_multipath_nh_res.sh \ gre_multipath_nh.sh \ gre_multipath.sh \ - hw_stats_l3.sh \ - hw_stats_l3_gre.sh \ ip6_forward_instats_vrf.sh \ ip6gre_custom_multipath_hash.sh \ ip6gre_flat_key.sh \ @@ -44,7 +38,6 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ ipip_hier_gre_keys.sh \ ipip_hier_gre.sh \ local_termination.sh \ - loopback.sh \ mirror_gre_bound.sh \ mirror_gre_bridge_1d.sh \ mirror_gre_bridge_1d_vlan.sh \ @@ -113,7 +106,6 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_symmetric.sh TEST_FILES := devlink_lib.sh \ - ethtool_lib.sh \ fib_offload_lib.sh \ forwarding.config.sample \ ip6gre_lib.sh \ diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/net/forwarding/ethtool.sh deleted file mode 100755 index aa2eafb7b243..000000000000 --- a/tools/testing/selftests/net/forwarding/ethtool.sh +++ /dev/null @@ -1,301 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - same_speeds_autoneg_off - different_speeds_autoneg_off - combination_of_neg_on_and_off - advertise_subset_of_speeds - check_highest_speed_is_chosen - different_speeds_autoneg_on -" -NUM_NETIFS=2 -source lib.sh -source ethtool_lib.sh - -h1_create() -{ - simple_if_init $h1 192.0.2.1/24 -} - -h1_destroy() -{ - simple_if_fini $h1 192.0.2.1/24 -} - -h2_create() -{ - simple_if_init $h2 192.0.2.2/24 -} - -h2_destroy() -{ - simple_if_fini $h2 192.0.2.2/24 -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - h1_create - h2_create -} - -cleanup() -{ - pre_cleanup - - h2_destroy - h1_destroy -} - -same_speeds_autoneg_off() -{ - # Check that when each of the reported speeds is forced, the links come - # up and are operational. - local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0)) - - for speed in "${speeds_arr[@]}"; do - RET=0 - ethtool_set $h1 speed $speed autoneg off - ethtool_set $h2 speed $speed autoneg off - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_err $? "speed $speed autoneg off" - log_test "force of same speed autoneg off" - log_info "speed = $speed" - done - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -different_speeds_autoneg_off() -{ - # Test that when we force different speeds, links are not up and ping - # fails. - RET=0 - - local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0)) - local speed1=${speeds_arr[0]} - local speed2=${speeds_arr[1]} - - ethtool_set $h1 speed $speed1 autoneg off - ethtool_set $h2 speed $speed2 autoneg off - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_fail $? "ping with different speeds" - - log_test "force of different speeds autoneg off" - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -combination_of_neg_on_and_off() -{ - # Test that when one device is forced to a speed supported by both - # endpoints and the other device is configured to autoneg on, the links - # are up and ping passes. - local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) - - for speed in "${speeds_arr[@]}"; do - RET=0 - ethtool_set $h1 speed $speed autoneg off - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_err $? "h1-speed=$speed autoneg off, h2 autoneg on" - log_test "one side with autoneg off and another with autoneg on" - log_info "force speed = $speed" - done - - ethtool -s $h1 autoneg on -} - -hex_speed_value_get() -{ - local speed=$1; shift - - local shift_size=${speed_values[$speed]} - speed=$((0x1 << $"shift_size")) - printf "%#x" "$speed" -} - -subset_of_common_speeds_get() -{ - local dev1=$1; shift - local dev2=$1; shift - local adver=$1; shift - - local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver)) - local speed_to_advertise=0 - local speed_to_remove=${speeds_arr[0]} - speed_to_remove+='base' - - local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver)) - - for speed in ${speeds_mode_arr[@]}; do - if [[ $speed != $speed_to_remove* ]]; then - speed=$(hex_speed_value_get $speed) - speed_to_advertise=$(($speed_to_advertise | \ - $speed)) - fi - - done - - # Convert to hex. - printf "%#x" "$speed_to_advertise" -} - -speed_to_advertise_get() -{ - # The function returns the hex number that is composed by OR-ing all - # the modes corresponding to the provided speed. - local speed_without_mode=$1; shift - local supported_speeds=("$@"); shift - local speed_to_advertise=0 - - speed_without_mode+='base' - - for speed in ${supported_speeds[@]}; do - if [[ $speed == $speed_without_mode* ]]; then - speed=$(hex_speed_value_get $speed) - speed_to_advertise=$(($speed_to_advertise | \ - $speed)) - fi - - done - - # Convert to hex. - printf "%#x" "$speed_to_advertise" -} - -advertise_subset_of_speeds() -{ - # Test that when one device advertises a subset of speeds and another - # advertises a specific speed (but all modes of this speed), the links - # are up and ping passes. - RET=0 - - local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) - ethtool_set $h1 advertise $speed_1_to_advertise - - if [ $RET != 0 ]; then - log_test "advertise subset of speeds" - return - fi - - local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1)) - # Check only speeds that h1 advertised. Remove the first speed. - unset speeds_arr_without_mode[0] - local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1)) - - for speed_value in ${speeds_arr_without_mode[@]}; do - RET=0 - local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \ - "${speeds_arr_with_mode[@]}") - ethtool_set $h2 advertise $speed_2_to_advertise - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" - - log_test "advertise subset of speeds" - log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise" - done - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -check_highest_speed_is_chosen() -{ - # Test that when one device advertises a subset of speeds, the other - # chooses the highest speed. This test checks configuration without - # traffic. - RET=0 - - local max_speed - local chosen_speed - local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1) - - ethtool_set $h1 advertise $speed_to_advertise - - if [ $RET != 0 ]; then - log_test "check highest speed" - return - fi - - local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1)) - - max_speed=${speeds_arr[0]} - for current in ${speeds_arr[@]}; do - if [[ $current -gt $max_speed ]]; then - max_speed=$current - fi - done - - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - chosen_speed=$(ethtool $h1 | grep 'Speed:') - chosen_speed=${chosen_speed%"Mb/s"*} - chosen_speed=${chosen_speed#*"Speed: "} - ((chosen_speed == max_speed)) - check_err $? "h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed" - - log_test "check highest speed" - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -different_speeds_autoneg_on() -{ - # Test that when we configure links to advertise different speeds, - # links are not up and ping fails. - RET=0 - - local -a speeds=($(different_speeds_get $h1 $h2 1 1)) - local speed1=${speeds[0]} - local speed2=${speeds[1]} - - speed1=$(hex_speed_value_get $speed1) - speed2=$(hex_speed_value_get $speed2) - - ethtool_set $h1 advertise $speed1 - ethtool_set $h2 advertise $speed2 - - if (($RET)); then - setup_wait_dev_with_timeout $h1 - setup_wait_dev_with_timeout $h2 - ping_do $h1 192.0.2.2 - check_fail $? "ping with different speeds autoneg on" - fi - - log_test "advertise different speeds autoneg on" - - ethtool -s $h2 autoneg on - ethtool -s $h1 autoneg on -} - -skip_on_veth - -trap cleanup EXIT - -setup_prepare -setup_wait - -declare -gA speed_values -eval "speed_values=($(speeds_arr_get))" - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh deleted file mode 100755 index 17f89c3b7c02..000000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - autoneg - autoneg_force_mode - no_cable -" - -NUM_NETIFS=2 -source lib.sh -source ethtool_lib.sh - -TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms - -setup_prepare() -{ - swp1=${NETIFS[p1]} - swp2=${NETIFS[p2]} - swp3=$NETIF_NO_CABLE -} - -ethtool_ext_state() -{ - local dev=$1; shift - local expected_ext_state=$1; shift - local expected_ext_substate=${1:-""}; shift - - local ext_state=$(ethtool $dev | grep "Link detected" \ - | cut -d "(" -f2 | cut -d ")" -f1) - local ext_substate=$(echo $ext_state | cut -sd "," -f2 \ - | sed -e 's/^[[:space:]]*//') - ext_state=$(echo $ext_state | cut -d "," -f1) - - if [[ $ext_state != $expected_ext_state ]]; then - echo "Expected \"$expected_ext_state\", got \"$ext_state\"" - return 1 - fi - if [[ $ext_substate != $expected_ext_substate ]]; then - echo "Expected \"$expected_ext_substate\", got \"$ext_substate\"" - return 1 - fi -} - -autoneg() -{ - local msg - - RET=0 - - ip link set dev $swp1 up - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ - "Autoneg" "No partner detected") - check_err $? "$msg" - - log_test "Autoneg, No partner detected" - - ip link set dev $swp1 down -} - -autoneg_force_mode() -{ - local msg - - RET=0 - - ip link set dev $swp1 up - ip link set dev $swp2 up - - local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0)) - local speed1=${speeds_arr[0]} - local speed2=${speeds_arr[1]} - - ethtool_set $swp1 speed $speed1 autoneg off - ethtool_set $swp2 speed $speed2 autoneg off - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ - "Autoneg" "No partner detected during force mode") - check_err $? "$msg" - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp2 \ - "Autoneg" "No partner detected during force mode") - check_err $? "$msg" - - log_test "Autoneg, No partner detected during force mode" - - ethtool -s $swp2 autoneg on - ethtool -s $swp1 autoneg on - - ip link set dev $swp2 down - ip link set dev $swp1 down -} - -no_cable() -{ - local msg - - RET=0 - - ip link set dev $swp3 up - - msg=$(busywait $TIMEOUT ethtool_ext_state $swp3 "No cable") - check_err $? "$msg" - - log_test "No cable" - - ip link set dev $swp3 down -} - -skip_on_veth - -setup_prepare - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/net/forwarding/ethtool_lib.sh deleted file mode 100644 index b9bfb45085af..000000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_lib.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -speeds_arr_get() -{ - cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \ - {sub(/,$/, "") \ - sub(/ETHTOOL_LINK_MODE_/,"") \ - sub(/_BIT/,"") \ - sub(/_Full/,"/Full") \ - sub(/_Half/,"/Half");\ - print "["$1"]="$3}' - - awk "${cmd}" /usr/include/linux/ethtool.h -} - -ethtool_set() -{ - local cmd="$@" - local out=$(ethtool -s $cmd 2>&1 | wc -l) - - check_err $out "error in configuration. $cmd" -} - -dev_linkmodes_params_get() -{ - local dev=$1; shift - local adver=$1; shift - local -a linkmodes_params - local param_count - local arr - - if (($adver)); then - mode="Advertised link modes" - else - mode="Supported link modes" - fi - - local -a dev_linkmodes=($(dev_speeds_get $dev 1 $adver)) - for ((i=0; i<${#dev_linkmodes[@]}; i++)); do - linkmodes_params[$i]=$(echo -e "${dev_linkmodes[$i]}" | \ - # Replaces all non numbers with spaces - sed -e 's/[^0-9]/ /g' | \ - # Squeeze spaces in sequence to 1 space - tr -s ' ') - # Count how many numbers were found in the linkmode - param_count=$(echo "${linkmodes_params[$i]}" | wc -w) - if [[ $param_count -eq 1 ]]; then - linkmodes_params[$i]="${linkmodes_params[$i]} 1" - elif [[ $param_count -ge 3 ]]; then - arr=(${linkmodes_params[$i]}) - # Take only first two params - linkmodes_params[$i]=$(echo "${arr[@]:0:2}") - fi - done - echo ${linkmodes_params[@]} -} - -dev_speeds_get() -{ - local dev=$1; shift - local with_mode=$1; shift - local adver=$1; shift - local speeds_str - - if (($adver)); then - mode="Advertised link modes" - else - mode="Supported link modes" - fi - - speeds_str=$(ethtool "$dev" | \ - # Snip everything before the link modes section. - sed -n '/'"$mode"':/,$p' | \ - # Quit processing the rest at the start of the next section. - # When checking, skip the header of this section (hence the 2,). - sed -n '2,${/^[\t][^ \t]/q};p' | \ - # Drop the section header of the current section. - cut -d':' -f2) - - local -a speeds_arr=($speeds_str) - if [[ $with_mode -eq 0 ]]; then - for ((i=0; i<${#speeds_arr[@]}; i++)); do - speeds_arr[$i]=${speeds_arr[$i]%base*} - done - fi - echo ${speeds_arr[@]} -} - -common_speeds_get() -{ - dev1=$1; shift - dev2=$1; shift - with_mode=$1; shift - adver=$1; shift - - local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver)) - local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver)) - - comm -12 \ - <(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \ - <(printf '%s\n' "${dev2_speeds[@]}" | sort -u) -} - -different_speeds_get() -{ - local dev1=$1; shift - local dev2=$1; shift - local with_mode=$1; shift - local adver=$1; shift - - local -a speeds_arr - - speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver)) - if [[ ${#speeds_arr[@]} < 2 ]]; then - check_err 1 "cannot check different speeds. There are not enough speeds" - fi - - echo ${speeds_arr[0]} ${speeds_arr[1]} -} diff --git a/tools/testing/selftests/net/forwarding/ethtool_mm.sh b/tools/testing/selftests/net/forwarding/ethtool_mm.sh deleted file mode 100755 index 50d5bfb17ef1..000000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_mm.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - manual_with_verification_h1_to_h2 - manual_with_verification_h2_to_h1 - manual_without_verification_h1_to_h2 - manual_without_verification_h2_to_h1 - manual_failed_verification_h1_to_h2 - manual_failed_verification_h2_to_h1 - lldp -" - -NUM_NETIFS=2 -REQUIRE_MZ=no -PREEMPTIBLE_PRIO=0 -source lib.sh - -traffic_test() -{ - local if=$1; shift - local src=$1; shift - local num_pkts=10000 - local before= - local after= - local delta= - - if [ ${has_pmac_stats[$if]} = false ]; then - src="aggregate" - fi - - before=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) - - $MZ $if -q -c $num_pkts -p 64 -b bcast -t ip -R $PREEMPTIBLE_PRIO - - after=$(ethtool_std_stats_get $if "eth-mac" "FramesTransmittedOK" $src) - - delta=$((after - before)) - - # Allow an extra 1% tolerance for random packets sent by the stack - [ $delta -ge $num_pkts ] && [ $delta -le $((num_pkts + 100)) ] -} - -manual_with_verification() -{ - local tx=$1; shift - local rx=$1; shift - - RET=0 - - # It isn't completely clear from IEEE 802.3-2018 Figure 99-5: Transmit - # Processing state diagram whether the "send_r" variable (send response - # to verification frame) should be taken into consideration while the - # MAC Merge TX direction is disabled. That being said, at least the - # NXP ENETC does not, and requires tx-enabled on in order to respond to - # the link partner's verification frames. - ethtool --set-mm $rx tx-enabled on - ethtool --set-mm $tx verify-enabled on tx-enabled on - - # Wait for verification to finish - sleep 1 - - ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ - grep -q 'SUCCEEDED' - check_err "$?" "Verification did not succeed" - - ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' - check_err "$?" "pMAC TX is not active" - - traffic_test $tx "pmac" - check_err "$?" "Traffic did not get sent through $tx's pMAC" - - ethtool --set-mm $tx verify-enabled off tx-enabled off - ethtool --set-mm $rx tx-enabled off - - log_test "Manual configuration with verification: $tx to $rx" -} - -manual_with_verification_h1_to_h2() -{ - manual_with_verification $h1 $h2 -} - -manual_with_verification_h2_to_h1() -{ - manual_with_verification $h2 $h1 -} - -manual_without_verification() -{ - local tx=$1; shift - local rx=$1; shift - - RET=0 - - ethtool --set-mm $tx verify-enabled off tx-enabled on - - ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ - grep -q 'DISABLED' - check_err "$?" "Verification is not disabled" - - ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' - check_err "$?" "pMAC TX is not active" - - traffic_test $tx "pmac" - check_err "$?" "Traffic did not get sent through $tx's pMAC" - - ethtool --set-mm $tx verify-enabled off tx-enabled off - - log_test "Manual configuration without verification: $tx to $rx" -} - -manual_without_verification_h1_to_h2() -{ - manual_without_verification $h1 $h2 -} - -manual_without_verification_h2_to_h1() -{ - manual_without_verification $h2 $h1 -} - -manual_failed_verification() -{ - local tx=$1; shift - local rx=$1; shift - - RET=0 - - ethtool --set-mm $rx pmac-enabled off - ethtool --set-mm $tx verify-enabled on tx-enabled on - - # Wait for verification to time out - sleep 1 - - ethtool --json --show-mm $tx | jq -r '.[]."verify-status"' | \ - grep -q 'SUCCEEDED' - check_fail "$?" "Verification succeeded when it shouldn't have" - - ethtool --json --show-mm $tx | jq -r '.[]."tx-active"' | grep -q 'true' - check_fail "$?" "pMAC TX is active when it shouldn't have" - - traffic_test $tx "emac" - check_err "$?" "Traffic did not get sent through $tx's eMAC" - - ethtool --set-mm $tx verify-enabled off tx-enabled off - ethtool --set-mm $rx pmac-enabled on - - log_test "Manual configuration with failed verification: $tx to $rx" -} - -manual_failed_verification_h1_to_h2() -{ - manual_failed_verification $h1 $h2 -} - -manual_failed_verification_h2_to_h1() -{ - manual_failed_verification $h2 $h1 -} - -smallest_supported_add_frag_size() -{ - local iface=$1 - local rx_min_frag_size= - - rx_min_frag_size=$(ethtool --json --show-mm $iface | \ - jq '.[]."rx-min-frag-size"') - - if [ $rx_min_frag_size -le 60 ]; then - echo 0 - elif [ $rx_min_frag_size -le 124 ]; then - echo 1 - elif [ $rx_min_frag_size -le 188 ]; then - echo 2 - elif [ $rx_min_frag_size -le 252 ]; then - echo 3 - else - echo "$iface: RX min frag size $rx_min_frag_size cannot be advertised over LLDP" - exit 1 - fi -} - -expected_add_frag_size() -{ - local iface=$1 - local requested=$2 - local min=$(smallest_supported_add_frag_size $iface) - - [ $requested -le $min ] && echo $min || echo $requested -} - -lldp_change_add_frag_size() -{ - local add_frag_size=$1 - local pattern= - - lldptool -T -i $h1 -V addEthCaps addFragSize=$add_frag_size >/dev/null - # Wait for TLVs to be received - sleep 2 - pattern=$(printf "Additional fragment size: %d" \ - $(expected_add_frag_size $h1 $add_frag_size)) - lldptool -i $h2 -t -n -V addEthCaps | grep -q "$pattern" -} - -lldp() -{ - RET=0 - - systemctl start lldpad - - # Configure the interfaces to receive and transmit LLDPDUs - lldptool -L -i $h1 adminStatus=rxtx >/dev/null - lldptool -L -i $h2 adminStatus=rxtx >/dev/null - - # Enable the transmission of Additional Ethernet Capabilities TLV - lldptool -T -i $h1 -V addEthCaps enableTx=yes >/dev/null - lldptool -T -i $h2 -V addEthCaps enableTx=yes >/dev/null - - # Wait for TLVs to be received - sleep 2 - - lldptool -i $h1 -t -n -V addEthCaps | \ - grep -q "Preemption capability active" - check_err "$?" "$h1 pMAC TX is not active" - - lldptool -i $h2 -t -n -V addEthCaps | \ - grep -q "Preemption capability active" - check_err "$?" "$h2 pMAC TX is not active" - - lldp_change_add_frag_size 3 - check_err "$?" "addFragSize 3" - - lldp_change_add_frag_size 2 - check_err "$?" "addFragSize 2" - - lldp_change_add_frag_size 1 - check_err "$?" "addFragSize 1" - - lldp_change_add_frag_size 0 - check_err "$?" "addFragSize 0" - - traffic_test $h1 "pmac" - check_err "$?" "Traffic did not get sent through $h1's pMAC" - - traffic_test $h2 "pmac" - check_err "$?" "Traffic did not get sent through $h2's pMAC" - - systemctl stop lldpad - - log_test "LLDP" -} - -h1_create() -{ - ip link set dev $h1 up - - tc qdisc add dev $h1 root mqprio num_tc 4 map 0 1 2 3 \ - queues 1@0 1@1 1@2 1@3 \ - fp P E E E \ - hw 1 - - ethtool --set-mm $h1 pmac-enabled on tx-enabled off verify-enabled off -} - -h2_create() -{ - ip link set dev $h2 up - - ethtool --set-mm $h2 pmac-enabled on tx-enabled off verify-enabled off - - tc qdisc add dev $h2 root mqprio num_tc 4 map 0 1 2 3 \ - queues 1@0 1@1 1@2 1@3 \ - fp P E E E \ - hw 1 -} - -h1_destroy() -{ - ethtool --set-mm $h1 pmac-enabled off tx-enabled off verify-enabled off - - tc qdisc del dev $h1 root - - ip link set dev $h1 down -} - -h2_destroy() -{ - tc qdisc del dev $h2 root - - ethtool --set-mm $h2 pmac-enabled off tx-enabled off verify-enabled off - - ip link set dev $h2 down -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - h1_create - h2_create -} - -cleanup() -{ - pre_cleanup - - h2_destroy - h1_destroy -} - -check_ethtool_mm_support -check_tc_fp_support -require_command lldptool -bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually" - -for netif in ${NETIFS[@]}; do - ethtool --show-mm $netif 2>&1 &> /dev/null - if [[ $? -ne 0 ]]; then - echo "SKIP: $netif does not support MAC Merge" - exit $ksft_skip - fi - - if check_ethtool_pmac_std_stats_support $netif eth-mac; then - has_pmac_stats[$netif]=true - else - has_pmac_stats[$netif]=false - echo "$netif does not report pMAC statistics, falling back to aggregate" - fi -done - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh b/tools/testing/selftests/net/forwarding/ethtool_rmon.sh deleted file mode 100755 index 41a34a61f763..000000000000 --- a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -ALL_TESTS=" - rmon_rx_histogram - rmon_tx_histogram -" - -NUM_NETIFS=2 -source lib.sh - -ETH_FCS_LEN=4 -ETH_HLEN=$((6+6+2)) - -declare -A netif_mtu - -ensure_mtu() -{ - local iface=$1; shift - local len=$1; shift - local current=$(ip -j link show dev $iface | jq -r '.[0].mtu') - local required=$((len - ETH_HLEN - ETH_FCS_LEN)) - - if [ $current -lt $required ]; then - ip link set dev $iface mtu $required || return 1 - fi -} - -bucket_test() -{ - local iface=$1; shift - local neigh=$1; shift - local set=$1; shift - local bucket=$1; shift - local len=$1; shift - local num_rx=10000 - local num_tx=20000 - local expected= - local before= - local after= - local delta= - - # Mausezahn does not include FCS bytes in its length - but the - # histogram counters do - len=$((len - ETH_FCS_LEN)) - - before=$(ethtool --json -S $iface --groups rmon | \ - jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") - - # Send 10k one way and 20k in the other, to detect counters - # mapped to the wrong direction - $MZ $neigh -q -c $num_rx -p $len -a own -b bcast -d 10us - $MZ $iface -q -c $num_tx -p $len -a own -b bcast -d 10us - - after=$(ethtool --json -S $iface --groups rmon | \ - jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") - - delta=$((after - before)) - - expected=$([ $set = rx ] && echo $num_rx || echo $num_tx) - - # Allow some extra tolerance for other packets sent by the stack - [ $delta -ge $expected ] && [ $delta -le $((expected + 100)) ] -} - -rmon_histogram() -{ - local iface=$1; shift - local neigh=$1; shift - local set=$1; shift - local nbuckets=0 - local step= - - RET=0 - - while read -r -a bucket; do - step="$set-pkts${bucket[0]}to${bucket[1]} on $iface" - - for if in $iface $neigh; do - if ! ensure_mtu $if ${bucket[0]}; then - log_test_skip "$if does not support the required MTU for $step" - return - fi - done - - if ! bucket_test $iface $neigh $set $nbuckets ${bucket[0]}; then - check_err 1 "$step failed" - return 1 - fi - log_test "$step" - nbuckets=$((nbuckets + 1)) - done < <(ethtool --json -S $iface --groups rmon | \ - jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null) - - if [ $nbuckets -eq 0 ]; then - log_test_skip "$iface does not support $set histogram counters" - return - fi -} - -rmon_rx_histogram() -{ - rmon_histogram $h1 $h2 rx - rmon_histogram $h2 $h1 rx -} - -rmon_tx_histogram() -{ - rmon_histogram $h1 $h2 tx - rmon_histogram $h2 $h1 tx -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - for iface in $h1 $h2; do - netif_mtu[$iface]=$(ip -j link show dev $iface | jq -r '.[0].mtu') - ip link set dev $iface up - done -} - -cleanup() -{ - pre_cleanup - - for iface in $h2 $h1; do - ip link set dev $iface \ - mtu ${netif_mtu[$iface]} \ - down - done -} - -check_ethtool_counter_group_support -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh b/tools/testing/selftests/net/forwarding/hw_stats_l3.sh deleted file mode 100755 index 48584a51388f..000000000000 --- a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# +--------------------+ +----------------------+ -# | H1 | | H2 | -# | | | | -# | $h1.200 + | | + $h2.200 | -# | 192.0.2.1/28 | | | | 192.0.2.18/28 | -# | 2001:db8:1::1/64 | | | | 2001:db8:2::1/64 | -# | | | | | | -# | $h1 + | | + $h2 | -# | | | | | | -# +------------------|-+ +-|--------------------+ -# | | -# +------------------|-------------------------|--------------------+ -# | SW | | | -# | | | | -# | $rp1 + + $rp2 | -# | | | | -# | $rp1.200 + + $rp2.200 | -# | 192.0.2.2/28 192.0.2.17/28 | -# | 2001:db8:1::2/64 2001:db8:2::2/64 | -# | | -# +-----------------------------------------------------------------+ - -ALL_TESTS=" - ping_ipv4 - ping_ipv6 - test_stats_rx_ipv4 - test_stats_tx_ipv4 - test_stats_rx_ipv6 - test_stats_tx_ipv6 - respin_enablement - test_stats_rx_ipv4 - test_stats_tx_ipv4 - test_stats_rx_ipv6 - test_stats_tx_ipv6 - reapply_config - ping_ipv4 - ping_ipv6 - test_stats_rx_ipv4 - test_stats_tx_ipv4 - test_stats_rx_ipv6 - test_stats_tx_ipv6 - test_stats_report_rx - test_stats_report_tx - test_destroy_enabled - test_double_enable -" -NUM_NETIFS=4 -source lib.sh - -h1_create() -{ - simple_if_init $h1 - vlan_create $h1 200 v$h1 192.0.2.1/28 2001:db8:1::1/64 - ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 - ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 -} - -h1_destroy() -{ - ip -6 route del 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2 - ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 - vlan_destroy $h1 200 - simple_if_fini $h1 -} - -h2_create() -{ - simple_if_init $h2 - vlan_create $h2 200 v$h2 192.0.2.18/28 2001:db8:2::1/64 - ip route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 - ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 -} - -h2_destroy() -{ - ip -6 route del 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::2 - ip route del 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.17 - vlan_destroy $h2 200 - simple_if_fini $h2 -} - -router_rp1_200_create() -{ - ip link add name $rp1.200 link $rp1 type vlan id 200 - ip link set dev $rp1.200 addrgenmode eui64 - ip link set dev $rp1.200 up - ip address add dev $rp1.200 192.0.2.2/28 - ip address add dev $rp1.200 2001:db8:1::2/64 - ip stats set dev $rp1.200 l3_stats on -} - -router_rp1_200_destroy() -{ - ip stats set dev $rp1.200 l3_stats off - ip address del dev $rp1.200 2001:db8:1::2/64 - ip address del dev $rp1.200 192.0.2.2/28 - ip link del dev $rp1.200 -} - -router_create() -{ - ip link set dev $rp1 up - router_rp1_200_create - - ip link set dev $rp2 up - vlan_create $rp2 200 "" 192.0.2.17/28 2001:db8:2::2/64 -} - -router_destroy() -{ - vlan_destroy $rp2 200 - ip link set dev $rp2 down - - router_rp1_200_destroy - ip link set dev $rp1 down -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - rp1=${NETIFS[p2]} - - rp2=${NETIFS[p3]} - h2=${NETIFS[p4]} - - rp1mac=$(mac_get $rp1) - rp2mac=$(mac_get $rp2) - - vrf_prepare - - h1_create - h2_create - - router_create - - forwarding_enable -} - -cleanup() -{ - pre_cleanup - - forwarding_restore - - router_destroy - - h2_destroy - h1_destroy - - vrf_cleanup -} - -ping_ipv4() -{ - ping_test $h1.200 192.0.2.18 " IPv4" -} - -ping_ipv6() -{ - ping_test $h1.200 2001:db8:2::1 " IPv6" -} - -send_packets_rx_ipv4() -{ - # Send 21 packets instead of 20, because the first one might trap and go - # through the SW datapath, which might not bump the HW counter. - $MZ $h1.200 -c 21 -d 20msec -p 100 \ - -a own -b $rp1mac -A 192.0.2.1 -B 192.0.2.18 \ - -q -t udp sp=54321,dp=12345 -} - -send_packets_rx_ipv6() -{ - $MZ $h1.200 -6 -c 21 -d 20msec -p 100 \ - -a own -b $rp1mac -A 2001:db8:1::1 -B 2001:db8:2::1 \ - -q -t udp sp=54321,dp=12345 -} - -send_packets_tx_ipv4() -{ - $MZ $h2.200 -c 21 -d 20msec -p 100 \ - -a own -b $rp2mac -A 192.0.2.18 -B 192.0.2.1 \ - -q -t udp sp=54321,dp=12345 -} - -send_packets_tx_ipv6() -{ - $MZ $h2.200 -6 -c 21 -d 20msec -p 100 \ - -a own -b $rp2mac -A 2001:db8:2::1 -B 2001:db8:1::1 \ - -q -t udp sp=54321,dp=12345 -} - -___test_stats() -{ - local dir=$1; shift - local prot=$1; shift - - local a - local b - - a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) - send_packets_${dir}_${prot} - "$@" - b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ - hw_stats_get l3_stats $rp1.200 ${dir} packets) - check_err $? "Traffic not reflected in the counter: $a -> $b" -} - -__test_stats() -{ - local dir=$1; shift - local prot=$1; shift - - RET=0 - ___test_stats "$dir" "$prot" - log_test "Test $dir packets: $prot" -} - -test_stats_rx_ipv4() -{ - __test_stats rx ipv4 -} - -test_stats_tx_ipv4() -{ - __test_stats tx ipv4 -} - -test_stats_rx_ipv6() -{ - __test_stats rx ipv6 -} - -test_stats_tx_ipv6() -{ - __test_stats tx ipv6 -} - -# Make sure everything works well even after stats have been disabled and -# reenabled on the same device without touching the L3 configuration. -respin_enablement() -{ - log_info "Turning stats off and on again" - ip stats set dev $rp1.200 l3_stats off - ip stats set dev $rp1.200 l3_stats on -} - -# For the initial run, l3_stats is enabled on a completely set up netdevice. Now -# do it the other way around: enabling the L3 stats on an L2 netdevice, and only -# then apply the L3 configuration. -reapply_config() -{ - log_info "Reapplying configuration" - - router_rp1_200_destroy - - ip link add name $rp1.200 link $rp1 type vlan id 200 - ip link set dev $rp1.200 addrgenmode none - ip stats set dev $rp1.200 l3_stats on - ip link set dev $rp1.200 addrgenmode eui64 - ip link set dev $rp1.200 up - ip address add dev $rp1.200 192.0.2.2/28 - ip address add dev $rp1.200 2001:db8:1::2/64 -} - -__test_stats_report() -{ - local dir=$1; shift - local prot=$1; shift - - local a - local b - - RET=0 - - a=$(hw_stats_get l3_stats $rp1.200 ${dir} packets) - send_packets_${dir}_${prot} - ip address flush dev $rp1.200 - b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ - hw_stats_get l3_stats $rp1.200 ${dir} packets) - check_err $? "Traffic not reflected in the counter: $a -> $b" - log_test "Test ${dir} packets: stats pushed on loss of L3" - - ip stats set dev $rp1.200 l3_stats off - ip link del dev $rp1.200 - router_rp1_200_create -} - -test_stats_report_rx() -{ - __test_stats_report rx ipv4 -} - -test_stats_report_tx() -{ - __test_stats_report tx ipv4 -} - -test_destroy_enabled() -{ - RET=0 - - ip link del dev $rp1.200 - router_rp1_200_create - - log_test "Destroy l3_stats-enabled netdev" -} - -test_double_enable() -{ - RET=0 - ___test_stats rx ipv4 \ - ip stats set dev $rp1.200 l3_stats on - log_test "Test stat retention across a spurious enablement" -} - -trap cleanup EXIT - -setup_prepare -setup_wait - -used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info | - jq '.[].info.l3_stats.used') -kind=$(ip -j -d link show dev $rp1 | - jq -r '.[].linkinfo.info_kind') -if [[ $used != true ]]; then - if [[ $kind == veth ]]; then - log_test_skip "l3_stats not offloaded on veth interface" - EXIT_STATUS=$ksft_skip - else - RET=1 log_test "l3_stats not offloaded" - fi -else - tests_run -fi - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh b/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh deleted file mode 100755 index 7594bbb49029..000000000000 --- a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Test L3 stats on IP-in-IP GRE tunnel without key. - -# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more -# details. - -ALL_TESTS=" - ping_ipv4 - test_stats_rx - test_stats_tx -" -NUM_NETIFS=6 -source lib.sh -source ipip_lib.sh - -setup_prepare() -{ - h1=${NETIFS[p1]} - ol1=${NETIFS[p2]} - - ul1=${NETIFS[p3]} - ul2=${NETIFS[p4]} - - ol2=${NETIFS[p5]} - h2=${NETIFS[p6]} - - ol1mac=$(mac_get $ol1) - - forwarding_enable - vrf_prepare - h1_create - h2_create - sw1_flat_create gre $ol1 $ul1 - sw2_flat_create gre $ol2 $ul2 - ip stats set dev g1a l3_stats on - ip stats set dev g2a l3_stats on -} - -cleanup() -{ - pre_cleanup - - ip stats set dev g1a l3_stats off - ip stats set dev g2a l3_stats off - - sw2_flat_destroy $ol2 $ul2 - sw1_flat_destroy $ol1 $ul1 - h2_destroy - h1_destroy - - vrf_cleanup - forwarding_restore -} - -ping_ipv4() -{ - RET=0 - - ping_test $h1 192.0.2.18 " gre flat" -} - -send_packets_ipv4() -{ - # Send 21 packets instead of 20, because the first one might trap and go - # through the SW datapath, which might not bump the HW counter. - $MZ $h1 -c 21 -d 20msec -p 100 \ - -a own -b $ol1mac -A 192.0.2.1 -B 192.0.2.18 \ - -q -t udp sp=54321,dp=12345 -} - -test_stats() -{ - local dev=$1; shift - local dir=$1; shift - - local a - local b - - RET=0 - - a=$(hw_stats_get l3_stats $dev $dir packets) - send_packets_ipv4 - b=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= $a + 20" \ - hw_stats_get l3_stats $dev $dir packets) - check_err $? "Traffic not reflected in the counter: $a -> $b" - - log_test "Test $dir packets: $prot" -} - -test_stats_tx() -{ - test_stats g1a tx -} - -test_stats_rx() -{ - test_stats g2a rx -} - -skip_on_veth - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/loopback.sh b/tools/testing/selftests/net/forwarding/loopback.sh deleted file mode 100755 index 8f4057310b5b..000000000000 --- a/tools/testing/selftests/net/forwarding/loopback.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -ALL_TESTS="loopback_test" -NUM_NETIFS=2 -source tc_common.sh -source lib.sh - -h1_create() -{ - simple_if_init $h1 192.0.2.1/24 - tc qdisc add dev $h1 clsact -} - -h1_destroy() -{ - tc qdisc del dev $h1 clsact - simple_if_fini $h1 192.0.2.1/24 -} - -h2_create() -{ - simple_if_init $h2 -} - -h2_destroy() -{ - simple_if_fini $h2 -} - -loopback_test() -{ - RET=0 - - tc filter add dev $h1 ingress protocol arp pref 1 handle 101 flower \ - skip_hw arp_op reply arp_tip 192.0.2.1 action drop - - $MZ $h1 -c 1 -t arp -q - - tc_check_packets "dev $h1 ingress" 101 1 - check_fail $? "Matched on a filter without loopback setup" - - ethtool -K $h1 loopback on - check_err $? "Failed to enable loopback" - - setup_wait_dev $h1 - - $MZ $h1 -c 1 -t arp -q - - tc_check_packets "dev $h1 ingress" 101 1 - check_err $? "Did not match on filter with loopback" - - ethtool -K $h1 loopback off - check_err $? "Failed to disable loopback" - - $MZ $h1 -c 1 -t arp -q - - tc_check_packets "dev $h1 ingress" 101 2 - check_fail $? "Matched on a filter after loopback was removed" - - tc filter del dev $h1 ingress protocol arp pref 1 handle 101 flower - - log_test "loopback" -} - -setup_prepare() -{ - h1=${NETIFS[p1]} - h2=${NETIFS[p2]} - - vrf_prepare - - h1_create - h2_create - - if ethtool -k $h1 | grep loopback | grep -q fixed; then - log_test "SKIP: dev $h1 does not support loopback feature" - exit $ksft_skip - fi -} - -cleanup() -{ - pre_cleanup - - h2_destroy - h1_destroy - - vrf_cleanup -} - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS -- cgit v1.2.3 From 0c499a351777efb98a738272c6e43fef5484cc44 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:33 +0100 Subject: selftests: forwarding: Ditch skip_on_veth() Since the selftests that are not supposed to run on veth pairs are now in their own dedicated directory, the skip_on_veth logic can go away. Drop it from the selftests, and from lib.sh. Cc: Danielle Ratson Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/63b470e10d65270571ee7de709b31672ce314872.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/ethtool.sh | 2 -- .../selftests/drivers/net/hw/ethtool_extended_state.sh | 2 -- tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh | 16 ++++------------ .../testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh | 2 -- tools/testing/selftests/net/forwarding/lib.sh | 11 ----------- 5 files changed, 4 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh index 187429670ee7..bb12d5d70949 100755 --- a/tools/testing/selftests/drivers/net/hw/ethtool.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh @@ -287,8 +287,6 @@ different_speeds_autoneg_on() ethtool -s $h1 autoneg on } -skip_on_veth - trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh index b0f931260a27..a7584448416e 100755 --- a/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh @@ -109,8 +109,6 @@ no_cable() ip link set dev $swp3 down } -skip_on_veth - setup_prepare tests_run diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh index 744760117005..7dfc50366c99 100755 --- a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh @@ -325,17 +325,9 @@ setup_wait used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info | jq '.[].info.l3_stats.used') -kind=$(ip -j -d link show dev $rp1 | - jq -r '.[].linkinfo.info_kind') -if [[ $used != true ]]; then - if [[ $kind == veth ]]; then - log_test_skip "l3_stats not offloaded on veth interface" - EXIT_STATUS=$ksft_skip - else - RET=1 log_test "l3_stats not offloaded" - fi -else - tests_run -fi +[[ $used = true ]] +check_err $? "hw_stats_info.used=$used" +log_test "l3_stats offloaded" +tests_run exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh index 354be353455f..ab8d04855af5 100755 --- a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh @@ -100,8 +100,6 @@ test_stats_rx() test_stats g2a rx } -skip_on_veth - trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index dbd4348f85b8..ca433ba3612e 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -254,17 +254,6 @@ check_port_mab_support() fi } -skip_on_veth() -{ - local kind=$(ip -j -d link show dev ${NETIFS[p1]} | - jq -r '.[].linkinfo.info_kind') - - if [[ $kind == veth ]]; then - echo "SKIP: Test cannot be run with veth pairs" - exit $ksft_skip - fi -} - if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" exit $ksft_skip -- cgit v1.2.3 From 677f394956e808c709c18b92bd01d19f14a96dd5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:34 +0100 Subject: selftests: forwarding: Change inappropriate log_test_skip() calls The SKIP return should be used for cases where tooling of the machine under test is lacking. For cases where HW is lacking, the appropriate outcome is XFAIL. This is the case with ethtool_rmon and mlxsw_lib. For these, introduce a new helper, log_test_xfail(). Do the same for router_mpath_nh_lib. Note that it will be fixed using a more reusable way in a following patch. For the two resource_scale selftests, the log should simply not be written, because there is no problem. Cc: Tobias Waldekranz Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/3d668d8fb6fa0d9eeb47ce6d9e54114348c7c179.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh | 4 ++-- tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh | 2 +- .../selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh | 1 - .../selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 1 - tools/testing/selftests/net/forwarding/lib.sh | 9 +++++++++ tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh | 2 +- 6 files changed, 13 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh index 709433a4c886..e2a1c10d3503 100755 --- a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh @@ -79,7 +79,7 @@ rmon_histogram() for if in $iface $neigh; do if ! ensure_mtu $if ${bucket[0]}; then - log_test_skip "$if does not support the required MTU for $step" + log_test_xfail "$if does not support the required MTU for $step" return fi done @@ -94,7 +94,7 @@ rmon_histogram() jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null) if [ $nbuckets -eq 0 ]; then - log_test_skip "$iface does not support $set histogram counters" + log_test_xfail "$iface does not support $set histogram counters" return fi } diff --git a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh index 6369927e9c37..48395cfd4f95 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh @@ -42,7 +42,7 @@ __mlxsw_only_on_spectrum() local src=$1; shift if ! mlxsw_on_spectrum "$rev"; then - log_test_skip $src:$caller "(Spectrum-$rev only)" + log_test_xfail $src:$caller "(Spectrum-$rev only)" return 1 fi } diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index a88d8a8c85f2..899b6892603f 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -47,7 +47,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do RET=0 target=$(${current_test}_get_target "$should_fail") if ((target == 0)); then - log_test_skip "'$current_test' should_fail=$should_fail test" continue fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index f981c957f097..482ebb744eba 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -52,7 +52,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do RET=0 target=$(${current_test}_get_target "$should_fail") if ((target == 0)); then - log_test_skip "'$current_test' [$profile] should_fail=$should_fail test" continue fi ${current_test}_setup_prepare diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ca433ba3612e..5415b8d29862 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -467,6 +467,15 @@ log_test_skip() return 0 } +log_test_xfail() +{ + local test_name=$1 + local opt_str=$2 + + printf "TEST: %-60s [XFAIL]\n" "$test_name $opt_str" + return 0 +} + log_info() { local msg=$1 diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index 7e7d62161c34..b2d2c6cecc01 100644 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -69,7 +69,7 @@ nh_stats_test_dispatch_swhw() nh_stats_do_test "HW $what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get_hw "${mz[@]}" elif [[ $kind == veth ]]; then - log_test_skip "HW stats not offloaded on veth topology" + log_test_xfail "HW stats not offloaded on veth topology" fi } -- cgit v1.2.3 From 51ccf267beb2bbd4775974475aed4bffa8312951 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:35 +0100 Subject: selftests: lib: Define more kselftest exit codes The following patches will operate with more exit codes besides ksft_skip. Add them here. Additionally, move a duplicated skip exit code definition from forwarding/tc_tunnel_key.sh. Keep a similar duplicate in forwarding/devlink_lib.sh, because even though lib.sh will have been sourced in all cases where devlink_lib is, the inclusion is not visible in the file itself, and relying on it would be confusing. Cc: Davide Caratti Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/545a03046c7aca0628a51a389a9b81949ab288ce.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/tc_tunnel_key.sh | 2 -- tools/testing/selftests/net/lib.sh | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh index 5a5dd9034819..79775b10b99f 100755 --- a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh +++ b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh @@ -1,7 +1,5 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 ALL_TESTS="tunnel_key_nofrag_test" diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 5b366cc4fc43..d9bdf6aa3bf1 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -8,8 +8,12 @@ BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms -# Kselftest framework requirement - SKIP code is 4. +# Kselftest framework constants. +ksft_pass=0 +ksft_fail=1 +ksft_xfail=2 ksft_skip=4 + # namespace list created by setup_ns NS_LIST="" -- cgit v1.2.3 From 596c8819cb78c047acc0cab03a863a9983a3cc62 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:36 +0100 Subject: selftests: forwarding: Have RET track kselftest framework constants The variable RET keeps track of whether the test under execution has so far failed or not. Currently it works in binary fashion: zero means everything is fine, non-zero means something failed. log_test() then uses the value to given a human-readable message. In order to allow log_test() to report skips and xfails, the semantics of RET need to be more fine-grained. Therefore have RET value be one of kselftest framework constants: $ksft_fail, $ksft_xfail, etc. The current logic in check_err() is such that first non-zero value of RET trumps all those that follow. But that is not right when RET has more fine-grained value semantics. Different outcomes have different weights. The results of PASS and XFAIL are mostly the same: they both communicate a test that did not go wrong. SKIP communicates lack of tooling, which the user should go and try to fix, and as such should not be overridden by the passes. So far, the higher-numbered statuses can be considered weightier. But FAIL should be the weightiest. Add a helper, ksft_status_merge(), which merges two statuses in a way that respects the above conditions. Express it in a generic manner, because exit status merge is subtly different, and we want to reuse the same logic. Use the new helper when setting RET in check_err(). Re-express check_fail() in terms of check_err() to avoid duplication. Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/7dfff51cc925c7a3ac879b9050a0d6a327c8d21f.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 21 ++++++++++++------- tools/testing/selftests/net/lib.sh | 30 +++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 5415b8d29862..ee8153651b38 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -396,14 +396,24 @@ EXIT_STATUS=0 # Per-test return value. Clear at the beginning of each test. RET=0 +ret_set_ksft_status() +{ + local ksft_status=$1; shift + local msg=$1; shift + + RET=$(ksft_status_merge $RET $ksft_status) + if (( $? )); then + retmsg=$msg + fi +} + check_err() { local err=$1 local msg=$2 - if [[ $RET -eq 0 && $err -ne 0 ]]; then - RET=$err - retmsg=$msg + if ((err)); then + ret_set_ksft_status $ksft_fail "$msg" fi } @@ -412,10 +422,7 @@ check_fail() local err=$1 local msg=$2 - if [[ $RET -eq 0 && $err -eq 0 ]]; then - RET=1 - retmsg=$msg - fi + check_err $((!err)) "$msg" } check_err_fail() diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index d9bdf6aa3bf1..88f6133ca319 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -19,6 +19,36 @@ NS_LIST="" ############################################################################## # Helpers + +__ksft_status_merge() +{ + local a=$1; shift + local b=$1; shift + local -A weights + local weight=0 + + for i in "$@"; do + weights[$i]=$((weight++)) + done + + if [[ ${weights[$a]} > ${weights[$b]} ]]; then + echo "$a" + return 0 + else + echo "$b" + return 1 + fi +} + +ksft_status_merge() +{ + local a=$1; shift + local b=$1; shift + + __ksft_status_merge "$a" "$b" \ + $ksft_pass $ksft_xfail $ksft_skip $ksft_fail +} + busywait() { local timeout=$1; shift -- cgit v1.2.3 From a923af1ceee744c187d1c08a0d7dc9e8ab7ca482 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:37 +0100 Subject: selftests: forwarding: Convert log_test() to recognize RET values In a previous patch, the interpretation of RET value was changed to mean the kselftest framework constant with the test outcome: $ksft_pass, $ksft_xfail, etc. Update log_test() to recognize the various possible RET values. Then have EXIT_STATUS track the RET value of the current test. This differs subtly from the way RET tracks the value: while for RET we want to recognize XFAIL as a separate status, for purposes of exit code, we want to to conflate XFAIL and PASS, because they both communicate non-failure. Thus add a new helper, ksft_exit_status_merge(). With this log_test_skip() and log_test_xfail() can be reexpressed as thin wrappers around log_test. Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/e5f807cb5476ab795fd14ac74da53a731a9fc432.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 92 ++++++++++++++++++++------- tools/testing/selftests/net/lib.sh | 9 +++ 2 files changed, 77 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ee8153651b38..370fc377249b 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -438,6 +438,62 @@ check_err_fail() fi } +log_test_result() +{ + local test_name=$1; shift + local opt_str=$1; shift + local result=$1; shift + local retmsg=$1; shift + + printf "TEST: %-60s [%s]\n" "$test_name $opt_str" "$result" + if [[ $retmsg ]]; then + printf "\t%s\n" "$retmsg" + fi +} + +pause_on_fail() +{ + if [[ $PAUSE_ON_FAIL == yes ]]; then + echo "Hit enter to continue, 'q' to quit" + read a + [[ $a == q ]] && exit 1 + fi +} + +handle_test_result_pass() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" " OK " +} + +handle_test_result_fail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" FAIL "$retmsg" + pause_on_fail +} + +handle_test_result_xfail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" XFAIL "$retmsg" + pause_on_fail +} + +handle_test_result_skip() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" SKIP "$retmsg" +} + log_test() { local test_name=$1 @@ -447,40 +503,28 @@ log_test() opt_str="($opt_str)" fi - if [[ $RET -ne 0 ]]; then - EXIT_STATUS=1 - printf "TEST: %-60s [FAIL]\n" "$test_name $opt_str" - if [[ ! -z "$retmsg" ]]; then - printf "\t%s\n" "$retmsg" - fi - if [ "${PAUSE_ON_FAIL}" = "yes" ]; then - echo "Hit enter to continue, 'q' to quit" - read a - [ "$a" = "q" ] && exit 1 - fi - return 1 + if ((RET == ksft_pass)); then + handle_test_result_pass "$test_name" "$opt_str" + elif ((RET == ksft_xfail)); then + handle_test_result_xfail "$test_name" "$opt_str" + elif ((RET == ksft_skip)); then + handle_test_result_skip "$test_name" "$opt_str" + else + handle_test_result_fail "$test_name" "$opt_str" fi - printf "TEST: %-60s [ OK ]\n" "$test_name $opt_str" - return 0 + EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET) + return $RET } log_test_skip() { - local test_name=$1 - local opt_str=$2 - - printf "TEST: %-60s [SKIP]\n" "$test_name $opt_str" - return 0 + RET=$ksft_skip retmsg= log_test "$@" } log_test_xfail() { - local test_name=$1 - local opt_str=$2 - - printf "TEST: %-60s [XFAIL]\n" "$test_name $opt_str" - return 0 + RET=$ksft_xfail retmsg= log_test "$@" } log_info() diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 88f6133ca319..b7f7b8695165 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -49,6 +49,15 @@ ksft_status_merge() $ksft_pass $ksft_xfail $ksft_skip $ksft_fail } +ksft_exit_status_merge() +{ + local a=$1; shift + local b=$1; shift + + __ksft_status_merge "$a" "$b" \ + $ksft_xfail $ksft_pass $ksft_skip $ksft_fail +} + busywait() { local timeout=$1; shift -- cgit v1.2.3 From e16a8d209c33de2f5b739d01148738b973371daf Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:38 +0100 Subject: selftests: forwarding: Support for performance sensitive tests Several tests in the suite use large amounts of traffic to e.g. cause congestion and evaluate RED or shaper performance. These tests will not run well on a slow machine, be it one with heavy debug kernel, or a VM, or e.g. a single-board computer. Allow users to specify an environment variable, KSFT_MACHINE_SLOW=yes, to indicate that the tests are being run on one such machine. Performance sensitive tests can then use a new helper, xfail_on_slow(), to mark parts of the test that are sensitive to low-performance machines. The helper can be used to just mark the whole suite, like so: xfail_on_slow tests_run ... or, on the other side of the granularity spectrum, to override individual checks: xfail_on_slow check_err $? "Expected much, got little." Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/99a376a2d2ffdaeee7752b1910cb0c3ea5d80fbe.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 370fc377249b..58df57855bff 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -79,6 +79,11 @@ declare -A NETIFS=( # Flags for TC filters. : "${TC_FLAG:=skip_hw}" +# Whether the machine is "slow" -- i.e. might be incapable of running tests +# involving heavy traffic. This might be the case on a debug kernel, a VM, or +# e.g. a low-power board. +: "${KSFT_MACHINE_SLOW:=no}" + net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") if [[ -f $net_forwarding_dir/forwarding.config ]]; then @@ -407,13 +412,20 @@ ret_set_ksft_status() fi } +# Whether FAILs should be interpreted as XFAILs. Internal. +FAIL_TO_XFAIL= + check_err() { local err=$1 local msg=$2 if ((err)); then - ret_set_ksft_status $ksft_fail "$msg" + if [[ $FAIL_TO_XFAIL = yes ]]; then + ret_set_ksft_status $ksft_xfail "$msg" + else + ret_set_ksft_status $ksft_fail "$msg" + fi fi } @@ -438,6 +450,15 @@ check_err_fail() fi } +xfail_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW = yes ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + log_test_result() { local test_name=$1; shift -- cgit v1.2.3 From e10391092aca4b00b59b1650b899b0ea0a77c81a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:39 +0100 Subject: selftests: forwarding: Mark performance-sensitive tests When run on a slow machine, the scheduler traffic tests can be expected to fail, and should be reported as XFAIL in that case. Therefore run these tests through the perf_sensitive wrapper. Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/9a357f8cf34f5ececac08d43a3eb023008996035.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/forwarding/sch_ets_tests.sh | 19 +++++++++++-------- tools/testing/selftests/net/forwarding/sch_red.sh | 10 +++++----- .../testing/selftests/net/forwarding/sch_tbf_core.sh | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index cdf689e99458..f9d26a7911bb 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -199,25 +199,28 @@ ets_set_dwrr_two_bands() ets_test_strict() { ets_set_strict - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_mixed() { ets_set_mixed - ets_dwrr_test_01 - ets_dwrr_test_12 + xfail_on_slow ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_12 } ets_test_dwrr() { ets_set_dwrr_uniform - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_varying - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_change_quantum - ets_dwrr_test_012 + xfail_on_slow ets_dwrr_test_012 + ets_set_dwrr_two_bands - ets_dwrr_test_01 + xfail_on_slow ets_dwrr_test_01 } diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh index 81f31179ac88..17f28644568e 100755 --- a/tools/testing/selftests/net/forwarding/sch_red.sh +++ b/tools/testing/selftests/net/forwarding/sch_red.sh @@ -451,35 +451,35 @@ uninstall_qdisc() ecn_test() { install_qdisc ecn - do_ecn_test $BACKLOG + xfail_on_slow do_ecn_test $BACKLOG uninstall_qdisc } ecn_nodrop_test() { install_qdisc ecn nodrop - do_ecn_nodrop_test $BACKLOG + xfail_on_slow do_ecn_nodrop_test $BACKLOG uninstall_qdisc } red_test() { install_qdisc - do_red_test $BACKLOG + xfail_on_slow do_red_test $BACKLOG uninstall_qdisc } red_qevent_test() { install_qdisc qevent early_drop block 10 - do_red_qevent_test $BACKLOG + xfail_on_slow do_red_qevent_test $BACKLOG uninstall_qdisc } ecn_qevent_test() { install_qdisc ecn qevent mark block 10 - do_ecn_qevent_test $BACKLOG + xfail_on_slow do_ecn_qevent_test $BACKLOG uninstall_qdisc } diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh index d1f26cb7cd73..9cd884d4a5de 100644 --- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh +++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh @@ -227,7 +227,7 @@ do_tbf_test() local nr=$(rate $t2 $t3 10) local nr_pct=$((100 * (nr - er) / er)) ((-5 <= nr_pct && nr_pct <= 5)) - check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." + xfail_on_slow check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%." log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit" } -- cgit v1.2.3 From 6db870bbf788e0867d6462f28e8d7385694620b2 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:40 +0100 Subject: selftests: forwarding: router_mpath_nh_lib: Don't skip, xfail on veth When the NH group stats tests are currently run on a veth topology, the HW-stats leg of each test is SKIP'ped. But kernel networking CI interprets skips as a sign that tooling is missing, and prompts maintainer investigation. Lack of capability to pass a test should be expressed as XFAIL. Selftests that require HW should normally be put in drivers/net/hw, but doing so for the NH counter selftests would just lead to a lot of duplicity. So instead, introduce a helper, xfail_on_veth(), which can be used to mark selftests that should XFAIL instead of FAILing when run on a veth topology. On non-veth topology, they don't do anything. Use the helper in the HW-stats part of router_mpath_nh_lib selftest. Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/15f0ab9637aa0497f164ec30e83c1c8f53d53719.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 14 ++++++++++++++ .../selftests/net/forwarding/router_mpath_nh_lib.sh | 12 +----------- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 58df57855bff..4103ed7afcde 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -459,6 +459,20 @@ xfail_on_slow() fi } +xfail_on_veth() +{ + local dev=$1; shift + local kind + + kind=$(ip -j -d link show dev $dev | + jq -r '.[].linkinfo.info_kind') + if [[ $kind = veth ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + log_test_result() { local test_name=$1; shift diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index b2d2c6cecc01..2903294d8bca 100644 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -56,21 +56,12 @@ nh_stats_test_dispatch_swhw() local group_id=$1; shift local mz="$@" - local used - nh_stats_do_test "$what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get "${mz[@]}" - used=$(ip -s -j -d nexthop show id $group_id | - jq '.[].hw_stats.used') - kind=$(ip -j -d link show dev $rp11 | - jq -r '.[].linkinfo.info_kind') - if [[ $used == true ]]; then + xfail_on_veth $rp11 \ nh_stats_do_test "HW $what" "$nh1_id" "$nh2_id" "$group_id" \ nh_stats_get_hw "${mz[@]}" - elif [[ $kind == veth ]]; then - log_test_xfail "HW stats not offloaded on veth topology" - fi } nh_stats_test_dispatch() @@ -83,7 +74,6 @@ nh_stats_test_dispatch() local mz="$@" local enabled - local kind if ! ip nexthop help 2>&1 | grep -q hw_stats; then log_test_skip "NH stats test: ip doesn't support HW stats" -- cgit v1.2.3 From 8ff2d7abfb6f7c01d7161765f4152ecf143c9448 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 26 Mar 2024 17:54:41 +0100 Subject: selftests: forwarding: Add a test for testing lib.sh functionality Rerunning various scenarios to make sure lib.sh changes do not impact the observable behavior is no fun. Add a selftest at least for the bare basics -- the mechanics of setting RET, retmsg, and EXIT_STATUS. Since the selftest itself uses lib.sh, it would be possible to break lib.sh in such a way that invalidates result of the selftest. Since the metatest only uses the bare basics (just pass/fail), hopefully such fundamental breakages would be noticed. Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/6d25cedbf2d4b83614944809a34fe023fbe8db38.1711464583.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/Makefile | 1 + .../selftests/net/forwarding/lib_sh_test.sh | 208 +++++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/lib_sh_test.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 56e3557ba8a6..fa7b59ff4029 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -37,6 +37,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ ipip_hier_gre_key.sh \ ipip_hier_gre_keys.sh \ ipip_hier_gre.sh \ + lib_sh_test.sh \ local_termination.sh \ mirror_gre_bound.sh \ mirror_gre_bridge_1d.sh \ diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh new file mode 100755 index 000000000000..ff2accccaf4d --- /dev/null +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -0,0 +1,208 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This tests the operation of lib.sh itself. + +ALL_TESTS=" + test_ret + test_exit_status +" +NUM_NETIFS=0 +source lib.sh + +# Simulated checks. + +do_test() +{ + local msg=$1; shift + + "$@" + check_err $? "$msg" +} + +tpass() +{ + do_test "tpass" true +} + +tfail() +{ + do_test "tfail" false +} + +txfail() +{ + FAIL_TO_XFAIL=yes do_test "txfail" false +} + +# Simulated tests. + +pass() +{ + RET=0 + do_test "true" true + log_test "true" +} + +fail() +{ + RET=0 + do_test "false" false + log_test "false" +} + +xfail() +{ + RET=0 + FAIL_TO_XFAIL=yes do_test "xfalse" false + log_test "xfalse" +} + +skip() +{ + RET=0 + log_test_skip "skip" +} + +slow_xfail() +{ + RET=0 + xfail_on_slow do_test "slow_false" false + log_test "slow_false" +} + +# lib.sh tests. + +ret_tests_run() +{ + local t + + RET=0 + retmsg= + for t in "$@"; do + $t + done + echo "$retmsg" + return $RET +} + +ret_subtest() +{ + local expect_ret=$1; shift + local expect_retmsg=$1; shift + local -a tests=( "$@" ) + + local status_names=(pass fail xfail xpass skip) + local ret + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(ret_tests_run "${tests[@]}") + ret=$? + + (( ret == expect_ret )) + check_err $? "RET=$ret expected $expect_ret" + + [[ $out == $expect_retmsg ]] + check_err $? "retmsg=$out expected $expect_retmsg" + + log_test "RET $(echo ${tests[@]}) -> ${status_names[$ret]}" +} + +test_ret() +{ + ret_subtest $ksft_pass "" + + ret_subtest $ksft_pass "" tpass + ret_subtest $ksft_fail "tfail" tfail + ret_subtest $ksft_xfail "txfail" txfail + + ret_subtest $ksft_pass "" tpass tpass + ret_subtest $ksft_fail "tfail" tpass tfail + ret_subtest $ksft_xfail "txfail" tpass txfail + + ret_subtest $ksft_fail "tfail" tfail tpass + ret_subtest $ksft_xfail "txfail" txfail tpass + + ret_subtest $ksft_fail "tfail" tfail tfail + ret_subtest $ksft_fail "tfail" tfail txfail + + ret_subtest $ksft_fail "tfail" txfail tfail + + ret_subtest $ksft_xfail "txfail" txfail txfail +} + +exit_status_tests_run() +{ + EXIT_STATUS=0 + tests_run > /dev/null + return $EXIT_STATUS +} + +exit_status_subtest() +{ + local expect_exit_status=$1; shift + local tests=$1; shift + local what=$1; shift + + local status_names=(pass fail xfail xpass skip) + local exit_status + local out + + RET=0 + + # Run this in a subshell, so that our environment is intact. + out=$(TESTS="$tests" exit_status_tests_run) + exit_status=$? + + (( exit_status == expect_exit_status )) + check_err $? "EXIT_STATUS=$exit_status, expected $expect_exit_status" + + log_test "EXIT_STATUS $tests$what -> ${status_names[$exit_status]}" +} + +test_exit_status() +{ + exit_status_subtest $ksft_pass ":" + + exit_status_subtest $ksft_pass "pass" + exit_status_subtest $ksft_fail "fail" + exit_status_subtest $ksft_pass "xfail" + exit_status_subtest $ksft_skip "skip" + + exit_status_subtest $ksft_pass "pass pass" + exit_status_subtest $ksft_fail "pass fail" + exit_status_subtest $ksft_pass "pass xfail" + exit_status_subtest $ksft_skip "pass skip" + + exit_status_subtest $ksft_fail "fail pass" + exit_status_subtest $ksft_pass "xfail pass" + exit_status_subtest $ksft_skip "skip pass" + + exit_status_subtest $ksft_fail "fail fail" + exit_status_subtest $ksft_fail "fail xfail" + exit_status_subtest $ksft_fail "fail skip" + + exit_status_subtest $ksft_fail "xfail fail" + exit_status_subtest $ksft_fail "skip fail" + + exit_status_subtest $ksft_pass "xfail xfail" + exit_status_subtest $ksft_skip "xfail skip" + exit_status_subtest $ksft_skip "skip xfail" + + exit_status_subtest $ksft_skip "skip skip" + + KSFT_MACHINE_SLOW=yes \ + exit_status_subtest $ksft_pass "slow_xfail" ": slow" + + KSFT_MACHINE_SLOW=no \ + exit_status_subtest $ksft_fail "slow_xfail" ": fast" +} + +trap pre_cleanup EXIT + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From b334f5ed3d914d46652db2b8aad7d135ad4a50ad Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 27 Mar 2024 20:31:28 +0800 Subject: ynl: support hex display_hint for integer Some times it would be convenient to read the integer as hex, like mask values. Suggested-by: Donald Hunter Reviewed-by: Donald Hunter Signed-off-by: Hangbin Liu Link: https://lore.kernel.org/r/20240327123130.1322921-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 5fa7957f6e0f..e73b027c5624 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -819,7 +819,10 @@ class YnlFamily(SpecFamily): if display_hint == 'mac': formatted = ':'.join('%02x' % b for b in raw) elif display_hint == 'hex': - formatted = bytes.hex(raw, ' ') + if isinstance(raw, int): + formatted = hex(raw) + else: + formatted = bytes.hex(raw, ' ') elif display_hint in [ 'ipv4', 'ipv6' ]: formatted = format(ipaddress.ip_address(raw)) elif display_hint == 'uuid': -- cgit v1.2.3 From 5311591fbb349fe9f5c555dcba3b13a5831aa72d Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 26 Mar 2024 10:17:40 +0000 Subject: bpf: Add support for passing mark with bpf_fib_lookup Extend the bpf_fib_lookup() helper by making it to utilize mark if the BPF_FIB_LOOKUP_MARK flag is set. In order to pass the mark the four bytes of struct bpf_fib_lookup are used, shared with the output-only smac/dmac fields. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Reviewed-by: David Ahern Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240326101742.17421-2-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 20 ++++++++++++++++++-- net/core/filter.c | 12 +++++++++--- tools/include/uapi/linux/bpf.h | 20 ++++++++++++++++++-- 3 files changed, 45 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9585f5345353..96d57e483133 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3394,6 +3394,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -7120,6 +7124,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7197,8 +7202,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { diff --git a/net/core/filter.c b/net/core/filter.c index 0c66e4a3fc5b..1205dd777dc2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5884,7 +5884,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6027,7 +6030,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6105,7 +6111,7 @@ set_fwd_params: #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9585f5345353..96d57e483133 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3394,6 +3394,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -7120,6 +7124,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7197,8 +7202,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { -- cgit v1.2.3 From 6efec2cb06411a577125b5f531a852c08ead1209 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 26 Mar 2024 10:17:41 +0000 Subject: selftests/bpf: Add BPF_FIB_LOOKUP_MARK tests This patch extends the fib_lookup test suite by adding a few test cases for each IP family to test the new BPF_FIB_LOOKUP_MARK flag to the bpf_fib_lookup: * Test destination IP address selection with and without a mark and/or the BPF_FIB_LOOKUP_MARK flag set Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240326101742.17421-3-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/fib_lookup.c | 132 ++++++++++++++++----- 1 file changed, 103 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c index 3379df2d4cf2..bd7658958004 100644 --- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c @@ -26,6 +26,17 @@ #define IPV6_TBID_ADDR "fd00::FFFF" #define IPV6_TBID_NET "fd00::" #define IPV6_TBID_DST "fd00::2" +#define MARK_NO_POLICY 33 +#define MARK 42 +#define MARK_TABLE "200" +#define IPV4_REMOTE_DST "1.2.3.4" +#define IPV4_LOCAL "10.4.0.3" +#define IPV4_GW1 "10.4.0.1" +#define IPV4_GW2 "10.4.0.2" +#define IPV6_REMOTE_DST "be:ef::b0:10" +#define IPV6_LOCAL "fd01::3" +#define IPV6_GW1 "fd01::1" +#define IPV6_GW2 "fd01::2" #define DMAC "11:11:11:11:11:11" #define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, } #define DMAC2 "01:01:01:01:01:01" @@ -36,9 +47,11 @@ struct fib_lookup_test { const char *daddr; int expected_ret; const char *expected_src; + const char *expected_dst; int lookup_flags; __u32 tbid; __u8 dmac[6]; + __u32 mark; }; static const struct fib_lookup_test tests[] = { @@ -90,10 +103,47 @@ static const struct fib_lookup_test tests[] = { .daddr = IPV6_ADDR_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, .expected_src = IPV6_IFACE_ADDR_SEC, .lookup_flags = BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + /* policy routing */ + { .desc = "IPv4 policy routing, default", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv4 policy routing, mark doesn't point to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv4 policy routing, mark points to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv4 policy routing, mark points to a policy, but no flag", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, default", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv6 policy routing, mark doesn't point to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv6 policy routing, mark points to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, mark points to a policy, but no flag", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, }; -static int ifindex; - static int setup_netns(void) { int err; @@ -144,12 +194,24 @@ static int setup_netns(void) if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)")) goto fail; + /* Setup for policy routing tests */ + SYS(fail, "ip addr add %s/24 dev veth1", IPV4_LOCAL); + SYS(fail, "ip addr add %s/64 dev veth1 nodad", IPV6_LOCAL); + SYS(fail, "ip route add %s/32 via %s", IPV4_REMOTE_DST, IPV4_GW1); + SYS(fail, "ip route add %s/32 via %s table %s", IPV4_REMOTE_DST, IPV4_GW2, MARK_TABLE); + SYS(fail, "ip -6 route add %s/128 via %s", IPV6_REMOTE_DST, IPV6_GW1); + SYS(fail, "ip -6 route add %s/128 via %s table %s", IPV6_REMOTE_DST, IPV6_GW2, MARK_TABLE); + SYS(fail, "ip rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + SYS(fail, "ip -6 rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + return 0; fail: return -1; } -static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_lookup_test *test) +static int set_lookup_params(struct bpf_fib_lookup *params, + const struct fib_lookup_test *test, + int ifindex) { int ret; @@ -158,6 +220,7 @@ static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_loo params->l4_protocol = IPPROTO_TCP; params->ifindex = ifindex; params->tbid = test->tbid; + params->mark = test->mark; if (inet_pton(AF_INET6, test->daddr, params->ipv6_dst) == 1) { params->family = AF_INET6; @@ -190,40 +253,45 @@ static void mac_str(char *b, const __u8 *mac) mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } -static void assert_src_ip(struct bpf_fib_lookup *fib_params, const char *expected_src) +static void assert_ip_address(int family, void *addr, const char *expected_str) { + char str[INET6_ADDRSTRLEN]; + u8 expected_addr[16]; + int addr_len = 0; int ret; - __u32 src6[4]; - __be32 src4; - switch (fib_params->family) { + switch (family) { case AF_INET6: - ret = inet_pton(AF_INET6, expected_src, src6); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ret = memcmp(src6, fib_params->ipv6_src, sizeof(fib_params->ipv6_src)); - if (!ASSERT_EQ(ret, 0, "fib_lookup ipv6 src")) { - char str_src6[64]; - - inet_ntop(AF_INET6, fib_params->ipv6_src, str_src6, - sizeof(str_src6)); - printf("ipv6 expected %s actual %s ", expected_src, - str_src6); - } - + ret = inet_pton(AF_INET6, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET6, expected_str)"); + addr_len = 16; break; case AF_INET: - ret = inet_pton(AF_INET, expected_src, &src4); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ASSERT_EQ(fib_params->ipv4_src, src4, "fib_lookup ipv4 src"); - + ret = inet_pton(AF_INET, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET, expected_str)"); + addr_len = 4; break; default: - PRINT_FAIL("invalid addr family: %d", fib_params->family); + PRINT_FAIL("invalid address family: %d", family); + break; + } + + if (memcmp(addr, expected_addr, addr_len)) { + inet_ntop(family, addr, str, sizeof(str)); + PRINT_FAIL("expected %s actual %s ", expected_str, str); } } +static void assert_src_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_src, expected); +} + +static void assert_dst_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_dst, expected); +} + void test_fib_lookup(void) { struct bpf_fib_lookup *fib_params; @@ -256,15 +324,18 @@ void test_fib_lookup(void) if (setup_netns()) goto fail; - ifindex = if_nametoindex("veth1"); - skb.ifindex = ifindex; + skb.ifindex = if_nametoindex("veth1"); + if (!ASSERT_NEQ(skb.ifindex, 0, "if_nametoindex(veth1)")) + goto fail; + fib_params = &skel->bss->fib_params; for (i = 0; i < ARRAY_SIZE(tests); i++) { printf("Testing %s ", tests[i].desc); - if (set_lookup_params(fib_params, &tests[i])) + if (set_lookup_params(fib_params, &tests[i], skb.ifindex)) continue; + skel->bss->fib_lookup_ret = -1; skel->bss->lookup_flags = tests[i].lookup_flags; @@ -278,6 +349,9 @@ void test_fib_lookup(void) if (tests[i].expected_src) assert_src_ip(fib_params, tests[i].expected_src); + if (tests[i].expected_dst) + assert_dst_ip(fib_params, tests[i].expected_dst); + ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac)); if (!ASSERT_EQ(ret, 0, "dmac not match")) { char expected[18], actual[18]; -- cgit v1.2.3 From 786bf0e7e2ec90b349a7bab6e97947982ab31f2c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 25 Mar 2024 15:22:10 +0000 Subject: bpf: improve error message for unsupported helper BPF verifier emits "unknown func" message when given BPF program type does not support BPF helper. This message may be confusing for users, as important context that helper is unknown only to current program type is not provided. This patch changes message to "program of this type cannot use helper " and aligns dependent code in libbpf and tests. Any suggestions on improving/changing this message are welcome. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20240325152210.377548-1-yatsenko@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- tools/bpf/bpftool/feature.c | 3 ++- tools/lib/bpf/libbpf_probes.c | 6 ++++-- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- tools/testing/selftests/bpf/progs/verifier_helper_restricted.c | 8 ++++---- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17f26ba1a9e0..edb650667f44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10201,8 +10201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), - func_id); + verbose(env, "program of this type cannot use helper %s#%d\n", + func_id_name(func_id), func_id); return -EINVAL; } diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 708733b0ea06..c754a428c8c6 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") && + !grep(buf, "program of this type cannot use helper "); switch (get_vendor_id(ifindex)) { case 0x19ee: /* Netronome specific */ diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 302188122439..9dfbe7750f56 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. */ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 94cb22b01482..99dc60fe59d5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -304,7 +304,7 @@ static void test_rel_setsockopt(void) struct bpf_dctcp_release *rel_skel; libbpf_print_fn_t old_print_fn; - err_str = "unknown func bpf_setsockopt"; + err_str = "program of this type cannot use helper bpf_setsockopt"; found = false; old_print_fn = libbpf_set_print(libbpf_debug_print); diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 0ede0ccd090c..059aa716e3d0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -30,7 +30,7 @@ struct { SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_kprobe_1(void) { asm volatile (" \ @@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void) SEC("tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_tracepoint_1(void) { asm volatile (" \ @@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void) SEC("perf_event") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_perf_event_1(void) { asm volatile (" \ @@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void) SEC("raw_tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_raw_tracepoint_1(void) { asm volatile (" \ -- cgit v1.2.3 From 1175f8dea349e5999d99727346db24f38306a793 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:46 -0700 Subject: selftests/bpf: rename and clean up userspace-triggered benchmarks Rename uprobe-base to more precise usermode-count (it will match other baseline-like benchmarks, kernel-count and syscall-count). Also use BENCH_TRIG_USERMODE() macro to define all usermode-based triggering benchmarks, which include usermode-count and uprobe/uretprobe benchmarks. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 14 ++- tools/testing/selftests/bpf/benchs/bench_trigger.c | 106 +++++++-------------- .../selftests/bpf/benchs/run_bench_uprobes.sh | 2 +- 3 files changed, 49 insertions(+), 73 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index b2b4c391eb0a..7ca1e1eb5c30 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -491,7 +491,12 @@ extern const struct bench bench_rename_kretprobe; extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; + +/* pure counting benchmarks to establish theoretical lmits */ +extern const struct bench bench_trig_usermode_count; extern const struct bench bench_trig_base; + +/* kernel-side syscall-triggered benchmarks */ extern const struct bench bench_trig_tp; extern const struct bench bench_trig_rawtp; extern const struct bench bench_trig_kprobe; @@ -502,13 +507,15 @@ extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; -extern const struct bench bench_trig_uprobe_base; + +/* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; extern const struct bench bench_trig_uretprobe_nop; extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const struct bench bench_trig_uretprobe_ret; + extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; extern const struct bench bench_pb_libbpf; @@ -539,7 +546,10 @@ static const struct bench *benchs[] = { &bench_rename_rawtp, &bench_rename_fentry, &bench_rename_fexit, + /* pure counting benchmarks for establishing theoretical limits */ + &bench_trig_usermode_count, &bench_trig_base, + /* syscall-driven triggering benchmarks */ &bench_trig_tp, &bench_trig_rawtp, &bench_trig_kprobe, @@ -550,7 +560,7 @@ static const struct bench *benchs[] = { &bench_trig_fexit, &bench_trig_fentry_sleep, &bench_trig_fmodret, - &bench_trig_uprobe_base, + /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, &bench_trig_uprobe_push, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index d66eddacd642..97aba7e6458d 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -14,6 +14,7 @@ /* BPF triggering benchmarks */ static struct trigger_ctx { struct trigger_bench *skel; + bool usermode_counters; } ctx; static struct counter base_hits[MAX_BUCKETS]; @@ -74,7 +75,10 @@ static void *trigger_producer(void *input) static void trigger_measure(struct bench_res *res) { - res->hits = sum_and_reset_counters(ctx.skel->bss->hits); + if (ctx.usermode_counters) + res->hits = sum_and_reset_counters(base_hits); + else + res->hits = sum_and_reset_counters(ctx.skel->bss->hits); } static void setup_ctx(void) @@ -192,7 +196,7 @@ __nocf_check __weak void uprobe_target_ret(void) asm volatile (""); } -static void *uprobe_base_producer(void *input) +static void *uprobe_producer_count(void *input) { while (true) { uprobe_target_nop(); @@ -248,32 +252,37 @@ static void usetup(bool use_retprobe, void *target_addr) ctx.skel->links.bench_trigger_uprobe = link; } -static void uprobe_setup_nop(void) +static void usermode_count_setup(void) +{ + ctx.usermode_counters = true; +} + +static void uprobe_nop_setup(void) { usetup(false, &uprobe_target_nop); } -static void uretprobe_setup_nop(void) +static void uretprobe_nop_setup(void) { usetup(true, &uprobe_target_nop); } -static void uprobe_setup_push(void) +static void uprobe_push_setup(void) { usetup(false, &uprobe_target_push); } -static void uretprobe_setup_push(void) +static void uretprobe_push_setup(void) { usetup(true, &uprobe_target_push); } -static void uprobe_setup_ret(void) +static void uprobe_ret_setup(void) { usetup(false, &uprobe_target_ret); } -static void uretprobe_setup_ret(void) +static void uretprobe_ret_setup(void) { usetup(true, &uprobe_target_ret); } @@ -387,65 +396,22 @@ const struct bench bench_trig_fmodret = { .report_final = hits_drops_report_final, }; -const struct bench bench_trig_uprobe_base = { - .name = "trig-uprobe-base", - .setup = NULL, /* no uprobe/uretprobe is attached */ - .producer_thread = uprobe_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_nop = { - .name = "trig-uprobe-nop", - .setup = uprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_nop = { - .name = "trig-uretprobe-nop", - .setup = uretprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_push = { - .name = "trig-uprobe-push", - .setup = uprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_push = { - .name = "trig-uretprobe-push", - .setup = uretprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_ret = { - .name = "trig-uprobe-ret", - .setup = uprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_ret = { - .name = "trig-uretprobe-ret", - .setup = uretprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; +/* uprobe benchmarks */ +#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .validate = trigger_validate, \ + .setup = KIND##_setup, \ + .producer_thread = uprobe_producer_##PRODUCER, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ +} + +BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count"); +BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop"); +BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push"); +BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); +BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); +BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); +BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 9bdcc74e03a4..36021d243397 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in base {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count base {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" -- cgit v1.2.3 From 7df4e597ea2cfd677e65730948153d5544986a10 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:47 -0700 Subject: selftests/bpf: add batched, mostly in-kernel BPF triggering benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between one syscall execution and BPF program run. While we use a fast get_pgid() syscall, syscall overhead can still be non-trivial. This patch adds kprobe/fentry set of benchmarks significantly amortizing the cost of syscall vs actual BPF triggering overhead. We do this by employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program which does a tight parameterized loop calling cheap BPF helper (bpf_get_numa_node_id()), to which kprobe/fentry programs are attached for benchmarking. This way 1 bpf() syscall causes N executions of BPF program being benchmarked. N defaults to 100, but can be adjusted with --trig-batch-iters CLI argument. For comparison we also implement a new baseline program that instead of triggering another BPF program just does N atomic per-CPU counter increments, establishing the limit for all other types of program within this batched benchmarking setup. Taking the final set of benchmarks added in this patch set (including tp/raw_tp/fmodret, added in later patch), and keeping for now "legacy" syscall-driven benchmarks, we can capture all triggering benchmarks in one place for comparison, before we remove the legacy ones (and rename xxx-batched into just xxx). $ benchs/run_bench_trigger.sh usermode-count : 79.500 ± 0.024M/s kernel-count : 49.949 ± 0.081M/s syscall-count : 9.009 ± 0.007M/s fentry-batch : 31.002 ± 0.015M/s fexit-batch : 20.372 ± 0.028M/s fmodret-batch : 21.651 ± 0.659M/s rawtp-batch : 36.775 ± 0.264M/s tp-batch : 19.411 ± 0.248M/s kprobe-batch : 12.949 ± 0.220M/s kprobe-multi-batch : 15.400 ± 0.007M/s kretprobe-batch : 5.559 ± 0.011M/s kretprobe-multi-batch: 5.861 ± 0.003M/s fentry-legacy : 8.329 ± 0.004M/s fexit-legacy : 6.239 ± 0.003M/s fmodret-legacy : 6.595 ± 0.001M/s rawtp-legacy : 8.305 ± 0.004M/s tp-legacy : 6.382 ± 0.001M/s kprobe-legacy : 5.528 ± 0.003M/s kprobe-multi-legacy : 5.864 ± 0.022M/s kretprobe-legacy : 3.081 ± 0.001M/s kretprobe-multi-legacy: 3.193 ± 0.001M/s Note how xxx-batch variants are measured with significantly higher throughput, even though it's exactly the same in-kernel overhead. As such, results can be compared only between benchmarks of the same kind (syscall vs batched): fentry-legacy : 8.329 ± 0.004M/s fentry-batch : 31.002 ± 0.015M/s kprobe-multi-legacy : 5.864 ± 0.022M/s kprobe-multi-batch : 15.400 ± 0.007M/s Note also that syscall-count is setting a theoretical limit for syscall-triggered benchmarks, while kernel-count is setting similar limits for batch variants. usermode-count is a happy and unachievable case of user space counting without doing any syscalls, and is mostly the measure of CPU speed for such a trivial benchmark. As was mentioned, tp/raw_tp/fmodret require kernel-side kfunc to produce similar benchmark, which we address in a separate patch. Note that run_bench_trigger.sh allows to override a list of benchmarks to run, which is very useful for performance work. Cc: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 21 +++- tools/testing/selftests/bpf/benchs/bench_trigger.c | 133 ++++++++++++++++++++- .../selftests/bpf/benchs/run_bench_trigger.sh | 24 +++- tools/testing/selftests/bpf/progs/trigger_bench.c | 67 ++++++++++- 4 files changed, 238 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 7ca1e1eb5c30..484bcbeaa819 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -280,6 +280,7 @@ extern struct argp bench_strncmp_argp; extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; +extern struct argp bench_trigger_batch_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -292,6 +293,7 @@ static const struct argp_child bench_parsers[] = { { &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 }, { &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 }, { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, + { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, {}, }; @@ -508,6 +510,15 @@ extern const struct bench bench_trig_fexit; extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; +/* batched, staying mostly in-kernel benchmarks */ +extern const struct bench bench_trig_kernel_count; +extern const struct bench bench_trig_kprobe_batch; +extern const struct bench bench_trig_kretprobe_batch; +extern const struct bench bench_trig_kprobe_multi_batch; +extern const struct bench bench_trig_kretprobe_multi_batch; +extern const struct bench bench_trig_fentry_batch; +extern const struct bench bench_trig_fexit_batch; + /* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; extern const struct bench bench_trig_uretprobe_nop; @@ -548,7 +559,7 @@ static const struct bench *benchs[] = { &bench_rename_fexit, /* pure counting benchmarks for establishing theoretical limits */ &bench_trig_usermode_count, - &bench_trig_base, + &bench_trig_kernel_count, /* syscall-driven triggering benchmarks */ &bench_trig_tp, &bench_trig_rawtp, @@ -560,6 +571,13 @@ static const struct bench *benchs[] = { &bench_trig_fexit, &bench_trig_fentry_sleep, &bench_trig_fmodret, + /* batched, staying mostly in-kernel triggers */ + &bench_trig_kprobe_batch, + &bench_trig_kretprobe_batch, + &bench_trig_kprobe_multi_batch, + &bench_trig_kretprobe_multi_batch, + &bench_trig_fentry_batch, + &bench_trig_fexit_batch, /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, @@ -567,6 +585,7 @@ static const struct bench *benchs[] = { &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, &bench_pb_libbpf, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 97aba7e6458d..20277dabdaf9 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -1,11 +1,57 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ #define _GNU_SOURCE +#include #include +#include #include "bench.h" #include "trigger_bench.skel.h" #include "trace_helpers.h" +#define MAX_TRIG_BATCH_ITERS 1000 + +static struct { + __u32 batch_iters; +} args = { + .batch_iters = 100, +}; + +enum { + ARG_TRIG_BATCH_ITERS = 7000, +}; + +static const struct argp_option opts[] = { + { "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0, + "Number of in-kernel iterations per one driver test run"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_TRIG_BATCH_ITERS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > MAX_TRIG_BATCH_ITERS) { + fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n", + 1, MAX_TRIG_BATCH_ITERS); + argp_usage(state); + } + args.batch_iters = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_trigger_batch_argp = { + .options = opts, + .parser = parse_arg, +}; + /* adjust slot shift in inc_hits() if changing */ #define MAX_BUCKETS 256 @@ -15,6 +61,7 @@ static struct trigger_ctx { struct trigger_bench *skel; bool usermode_counters; + int driver_prog_fd; } ctx; static struct counter base_hits[MAX_BUCKETS]; @@ -73,6 +120,16 @@ static void *trigger_producer(void *input) return NULL; } +static void *trigger_producer_batch(void *input) +{ + int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver); + + while (true) + bpf_prog_test_run_opts(fd, NULL); + + return NULL; +} + static void trigger_measure(struct bench_res *res) { if (ctx.usermode_counters) @@ -83,13 +140,23 @@ static void trigger_measure(struct bench_res *res) static void setup_ctx(void) { + int err; + setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + + ctx.skel->rodata->batch_iters = args.batch_iters; + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } } static void attach_bpf(struct bpf_program *prog) @@ -163,6 +230,50 @@ static void trigger_fmodret_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fmodret); } +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) +{ + setup_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); +} + +static void trigger_kprobe_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kprobe_batch); +} + +static void trigger_kretprobe_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_batch); +} + +static void trigger_kprobe_multi_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_batch); +} + +static void trigger_kretprobe_multi_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_batch); +} + +static void trigger_fentry_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fentry_batch); +} + +static void trigger_fexit_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fexit_batch); +} + /* make sure call is not inlined and not avoided by compiler, so __weak and * inline asm volatile in the body of the function * @@ -396,6 +507,26 @@ const struct bench bench_trig_fmodret = { .report_final = hits_drops_report_final, }; +/* batched (staying mostly in kernel) kprobe/fentry benchmarks */ +#define BENCH_TRIG_BATCH(KIND, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .setup = trigger_##KIND##_setup, \ + .producer_thread = trigger_producer_batch, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ + .argp = &bench_trigger_batch_argp, \ +} + +BENCH_TRIG_BATCH(kernel_count, "kernel-count"); +BENCH_TRIG_BATCH(kprobe_batch, "kprobe-batch"); +BENCH_TRIG_BATCH(kretprobe_batch, "kretprobe-batch"); +BENCH_TRIG_BATCH(kprobe_multi_batch, "kprobe-multi-batch"); +BENCH_TRIG_BATCH(kretprobe_multi_batch, "kretprobe-multi-batch"); +BENCH_TRIG_BATCH(fentry_batch, "fentry-batch"); +BENCH_TRIG_BATCH(fexit_batch, "fexit-batch"); + /* uprobe benchmarks */ #define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ const struct bench bench_trig_##KIND = { \ diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 78e83f243294..b58ec33ea18c 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -2,8 +2,24 @@ set -eufo pipefail -for i in base tp rawtp kprobe fentry fmodret -do - summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-10s: %s\n" $i "$summary" +def_tests=( \ + usermode-count kernel-count syscall-count \ + fentry-batch fexit-batch \ + kprobe-batch kprobe-multi-batch \ + kretprobe-batch kretprobe-multi-batch \ + fentry fexit fmodret \ + rawtp tp \ + kprobe kprobe-multi kretprobe kretprobe-multi \ +) + +tests=("$@") +if [ ${#tests[@]} -eq 0 ]; then + tests=("${def_tests[@]}") +fi + +p=${PROD_CNT:-1} + +for t in "${tests[@]}"; do + summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-21s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 42ec202015ed..f0b76afa5017 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook - #include #include #include @@ -103,3 +102,69 @@ int bench_trigger_uprobe(void *ctx) inc_counter(); return 0; } + +const volatile int batch_iters = 0; + +SEC("raw_tp") +int trigger_count(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + inc_counter(); + + return 0; +} + +SEC("raw_tp") +int trigger_driver(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_get_numa_node_id(); /* attach point for benchmarking */ + + return 0; +} + +SEC("kprobe/bpf_get_numa_node_id") +int bench_trigger_kprobe_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("kretprobe/bpf_get_numa_node_id") +int bench_trigger_kretprobe_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("kprobe.multi/bpf_get_numa_node_id") +int bench_trigger_kprobe_multi_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("kretprobe.multi/bpf_get_numa_node_id") +int bench_trigger_kretprobe_multi_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("fentry/bpf_get_numa_node_id") +int bench_trigger_fentry_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("fexit/bpf_get_numa_node_id") +int bench_trigger_fexit_batch(void *ctx) +{ + inc_counter(); + return 0; +} -- cgit v1.2.3 From 208c4391204d25d9178fbc87f216daffad00cd15 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:48 -0700 Subject: selftests/bpf: remove syscall-driven benchs, keep syscall-count only Remove "legacy" benchmarks triggered by syscalls in favor of newly added in-kernel/batched benchmarks. Drop -batched suffix now as well. Next patch will restore "feature parity" by adding back tp/raw_tp/fmodret benchmarks based on in-kernel kfunc approach. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 32 +--- tools/testing/selftests/bpf/benchs/bench_trigger.c | 213 +++------------------ .../selftests/bpf/benchs/run_bench_trigger.sh | 11 +- .../selftests/bpf/benchs/run_bench_uprobes.sh | 2 +- tools/testing/selftests/bpf/progs/trigger_bench.c | 83 +------- 5 files changed, 42 insertions(+), 299 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 484bcbeaa819..8b16841a3dec 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -496,28 +496,16 @@ extern const struct bench bench_rename_fexit; /* pure counting benchmarks to establish theoretical lmits */ extern const struct bench bench_trig_usermode_count; -extern const struct bench bench_trig_base; +extern const struct bench bench_trig_syscall_count; +extern const struct bench bench_trig_kernel_count; -/* kernel-side syscall-triggered benchmarks */ -extern const struct bench bench_trig_tp; -extern const struct bench bench_trig_rawtp; +/* batched, staying mostly in-kernel benchmarks */ extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; -extern const struct bench bench_trig_fentry_sleep; -extern const struct bench bench_trig_fmodret; - -/* batched, staying mostly in-kernel benchmarks */ -extern const struct bench bench_trig_kernel_count; -extern const struct bench bench_trig_kprobe_batch; -extern const struct bench bench_trig_kretprobe_batch; -extern const struct bench bench_trig_kprobe_multi_batch; -extern const struct bench bench_trig_kretprobe_multi_batch; -extern const struct bench bench_trig_fentry_batch; -extern const struct bench bench_trig_fexit_batch; /* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; @@ -560,24 +548,14 @@ static const struct bench *benchs[] = { /* pure counting benchmarks for establishing theoretical limits */ &bench_trig_usermode_count, &bench_trig_kernel_count, - /* syscall-driven triggering benchmarks */ - &bench_trig_tp, - &bench_trig_rawtp, + &bench_trig_syscall_count, + /* batched, staying mostly in-kernel triggers */ &bench_trig_kprobe, &bench_trig_kretprobe, &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, &bench_trig_fexit, - &bench_trig_fentry_sleep, - &bench_trig_fmodret, - /* batched, staying mostly in-kernel triggers */ - &bench_trig_kprobe_batch, - &bench_trig_kretprobe_batch, - &bench_trig_kprobe_multi_batch, - &bench_trig_kretprobe_multi_batch, - &bench_trig_fentry_batch, - &bench_trig_fexit_batch, /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 20277dabdaf9..7d4f34adfd64 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -99,24 +99,17 @@ static void trigger_validate(void) } } -static void *trigger_base_producer(void *input) -{ - while (true) { - (void)syscall(__NR_getpgid); - inc_counter(base_hits); - } - return NULL; -} - -static void trigger_base_measure(struct bench_res *res) -{ - res->hits = sum_and_reset_counters(base_hits); -} - static void *trigger_producer(void *input) { - while (true) - (void)syscall(__NR_getpgid); + if (ctx.usermode_counters) { + while (true) { + (void)syscall(__NR_getpgid); + inc_counter(base_hits); + } + } else { + while (true) + (void)syscall(__NR_getpgid); + } return NULL; } @@ -170,16 +163,17 @@ static void attach_bpf(struct bpf_program *prog) } } -static void trigger_tp_setup(void) +static void trigger_syscall_count_setup(void) { - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_tp); + ctx.usermode_counters = true; } -static void trigger_rawtp_setup(void) +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } static void trigger_kprobe_setup(void) @@ -218,62 +212,6 @@ static void trigger_fexit_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fexit); } -static void trigger_fentry_sleep_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep); -} - -static void trigger_fmodret_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fmodret); -} - -/* Batched, staying mostly in-kernel triggering setups */ -static void trigger_kernel_count_setup(void) -{ - setup_ctx(); - /* override driver program */ - ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); -} - -static void trigger_kprobe_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kprobe_batch); -} - -static void trigger_kretprobe_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_batch); -} - -static void trigger_kprobe_multi_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_batch); -} - -static void trigger_kretprobe_multi_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_batch); -} - -static void trigger_fentry_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_batch); -} - -static void trigger_fexit_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fexit_batch); -} - /* make sure call is not inlined and not avoided by compiler, so __weak and * inline asm volatile in the body of the function * @@ -398,109 +336,10 @@ static void uretprobe_ret_setup(void) usetup(true, &uprobe_target_ret); } -const struct bench bench_trig_base = { - .name = "trig-base", - .validate = trigger_validate, - .producer_thread = trigger_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_tp = { - .name = "trig-tp", - .validate = trigger_validate, - .setup = trigger_tp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_rawtp = { - .name = "trig-rawtp", - .validate = trigger_validate, - .setup = trigger_rawtp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe = { - .name = "trig-kprobe", - .validate = trigger_validate, - .setup = trigger_kprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe = { - .name = "trig-kretprobe", - .validate = trigger_validate, - .setup = trigger_kretprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe_multi = { - .name = "trig-kprobe-multi", - .validate = trigger_validate, - .setup = trigger_kprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe_multi = { - .name = "trig-kretprobe-multi", - .validate = trigger_validate, - .setup = trigger_kretprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry = { - .name = "trig-fentry", - .validate = trigger_validate, - .setup = trigger_fentry_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fexit = { - .name = "trig-fexit", - .validate = trigger_validate, - .setup = trigger_fexit_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry_sleep = { - .name = "trig-fentry-sleep", - .validate = trigger_validate, - .setup = trigger_fentry_sleep_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fmodret = { - .name = "trig-fmodret", +const struct bench bench_trig_syscall_count = { + .name = "trig-syscall-count", .validate = trigger_validate, - .setup = trigger_fmodret_setup, + .setup = trigger_syscall_count_setup, .producer_thread = trigger_producer, .measure = trigger_measure, .report_progress = hits_drops_report_progress, @@ -508,7 +347,7 @@ const struct bench bench_trig_fmodret = { }; /* batched (staying mostly in kernel) kprobe/fentry benchmarks */ -#define BENCH_TRIG_BATCH(KIND, NAME) \ +#define BENCH_TRIG_KERNEL(KIND, NAME) \ const struct bench bench_trig_##KIND = { \ .name = "trig-" NAME, \ .setup = trigger_##KIND##_setup, \ @@ -519,13 +358,13 @@ const struct bench bench_trig_##KIND = { \ .argp = &bench_trigger_batch_argp, \ } -BENCH_TRIG_BATCH(kernel_count, "kernel-count"); -BENCH_TRIG_BATCH(kprobe_batch, "kprobe-batch"); -BENCH_TRIG_BATCH(kretprobe_batch, "kretprobe-batch"); -BENCH_TRIG_BATCH(kprobe_multi_batch, "kprobe-multi-batch"); -BENCH_TRIG_BATCH(kretprobe_multi_batch, "kretprobe-multi-batch"); -BENCH_TRIG_BATCH(fentry_batch, "fentry-batch"); -BENCH_TRIG_BATCH(fexit_batch, "fexit-batch"); +BENCH_TRIG_KERNEL(kernel_count, "kernel-count"); +BENCH_TRIG_KERNEL(kprobe, "kprobe"); +BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); +BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); +BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); +BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(fexit, "fexit"); /* uprobe benchmarks */ #define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index b58ec33ea18c..6e790e294260 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -4,12 +4,9 @@ set -eufo pipefail def_tests=( \ usermode-count kernel-count syscall-count \ - fentry-batch fexit-batch \ - kprobe-batch kprobe-multi-batch \ - kretprobe-batch kretprobe-multi-batch \ - fentry fexit fmodret \ - rawtp tp \ - kprobe kprobe-multi kretprobe kretprobe-multi \ + fentry fexit \ + kprobe kprobe-multi \ + kretprobe kretprobe-multi \ ) tests=("$@") @@ -21,5 +18,5 @@ p=${PROD_CNT:-1} for t in "${tests[@]}"; do summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-21s: %s\n" $t "$summary" + printf "%-15s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 36021d243397..af169f831f2f 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in usermode-count base {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index f0b76afa5017..81990e45b547 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,77 +25,6 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } -SEC("tp/syscalls/sys_enter_getpgid") -int bench_trigger_tp(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("raw_tp/sys_enter") -int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) -{ - if (id == __NR_getpgid) - inc_counter(); - return 0; -} - -SEC("kprobe/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kprobe(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("kretprobe/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kretprobe(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kprobe_multi(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kretprobe_multi(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fentry/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fexit/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fexit(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fentry.s/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry_sleep(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fmod_ret/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fmodret(void *ctx) -{ - inc_counter(); - return -22; -} - SEC("uprobe") int bench_trigger_uprobe(void *ctx) { @@ -128,42 +57,42 @@ int trigger_driver(void *ctx) } SEC("kprobe/bpf_get_numa_node_id") -int bench_trigger_kprobe_batch(void *ctx) +int bench_trigger_kprobe(void *ctx) { inc_counter(); return 0; } SEC("kretprobe/bpf_get_numa_node_id") -int bench_trigger_kretprobe_batch(void *ctx) +int bench_trigger_kretprobe(void *ctx) { inc_counter(); return 0; } SEC("kprobe.multi/bpf_get_numa_node_id") -int bench_trigger_kprobe_multi_batch(void *ctx) +int bench_trigger_kprobe_multi(void *ctx) { inc_counter(); return 0; } SEC("kretprobe.multi/bpf_get_numa_node_id") -int bench_trigger_kretprobe_multi_batch(void *ctx) +int bench_trigger_kretprobe_multi(void *ctx) { inc_counter(); return 0; } SEC("fentry/bpf_get_numa_node_id") -int bench_trigger_fentry_batch(void *ctx) +int bench_trigger_fentry(void *ctx) { inc_counter(); return 0; } SEC("fexit/bpf_get_numa_node_id") -int bench_trigger_fexit_batch(void *ctx) +int bench_trigger_fexit(void *ctx) { inc_counter(); return 0; -- cgit v1.2.3 From b4ccf9158f5893dedb898687272fabfe80f58907 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:49 -0700 Subject: selftests/bpf: lazy-load trigger bench BPF programs Instead of front-loading all possible benchmarking BPF programs for trigger benchmarks, explicitly specify which BPF programs are used by specific benchmark and load only it. This allows to be more flexible in supporting older kernels, where some program types might not be possible to load (e.g., those that rely on newly added kfunc). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/benchs/bench_trigger.c | 36 ++++++++++++++++++++-- tools/testing/selftests/bpf/progs/trigger_bench.c | 18 +++++------ 2 files changed, 42 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 7d4f34adfd64..2c477808a6f0 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -133,8 +133,6 @@ static void trigger_measure(struct bench_res *res) static void setup_ctx(void) { - int err; - setup_libbpf(); ctx.skel = trigger_bench__open(); @@ -143,7 +141,15 @@ static void setup_ctx(void) exit(1); } + /* default "driver" BPF program */ + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); + ctx.skel->rodata->batch_iters = args.batch_iters; +} + +static void load_ctx(void) +{ + int err; err = trigger_bench__load(ctx.skel); if (err) { @@ -172,6 +178,9 @@ static void trigger_syscall_count_setup(void) static void trigger_kernel_count_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + load_ctx(); /* override driver program */ ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } @@ -179,36 +188,48 @@ static void trigger_kernel_count_setup(void) static void trigger_kprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe); } static void trigger_kretprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe); } static void trigger_kprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi); } static void trigger_kretprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi); } static void trigger_fentry_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fentry); } static void trigger_fexit_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fexit); } @@ -279,15 +300,24 @@ static void usetup(bool use_retprobe, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; + int err; setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + uprobe_offset = get_uprobe_offset(target_addr); link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, use_retprobe, diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 81990e45b547..07587cb3c9f5 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,7 +25,7 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } -SEC("uprobe") +SEC("?uprobe") int bench_trigger_uprobe(void *ctx) { inc_counter(); @@ -34,7 +34,7 @@ int bench_trigger_uprobe(void *ctx) const volatile int batch_iters = 0; -SEC("raw_tp") +SEC("?raw_tp") int trigger_count(void *ctx) { int i; @@ -45,7 +45,7 @@ int trigger_count(void *ctx) return 0; } -SEC("raw_tp") +SEC("?raw_tp") int trigger_driver(void *ctx) { int i; @@ -56,42 +56,42 @@ int trigger_driver(void *ctx) return 0; } -SEC("kprobe/bpf_get_numa_node_id") +SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { inc_counter(); return 0; } -SEC("kretprobe/bpf_get_numa_node_id") +SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { inc_counter(); return 0; } -SEC("kprobe.multi/bpf_get_numa_node_id") +SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { inc_counter(); return 0; } -SEC("kretprobe.multi/bpf_get_numa_node_id") +SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { inc_counter(); return 0; } -SEC("fentry/bpf_get_numa_node_id") +SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { inc_counter(); return 0; } -SEC("fexit/bpf_get_numa_node_id") +SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { inc_counter(); -- cgit v1.2.3 From 985d0681b46be7db5ccc330d9a7f318b96ce0029 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:51 -0700 Subject: selftests/bpf: add batched tp/raw_tp/fmodret tests Utilize bpf_modify_return_test_tp() kfunc to have a fast way to trigger tp/raw_tp/fmodret programs from another BPF program, which gives us comparable batched benchmarks to (batched) kprobe/fentry benchmarks. We don't switch kprobe/fentry batched benchmarks to this kfunc to make bench tool usable on older kernels as well. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 6 ++++ tools/testing/selftests/bpf/benchs/bench_trigger.c | 39 ++++++++++++++++++++++ .../selftests/bpf/benchs/run_bench_trigger.sh | 3 +- tools/testing/selftests/bpf/progs/trigger_bench.c | 34 +++++++++++++++++++ 4 files changed, 81 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 8b16841a3dec..82de56c8162e 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -506,6 +506,9 @@ extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; +extern const struct bench bench_trig_fmodret; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; /* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; @@ -556,6 +559,9 @@ static const struct bench *benchs[] = { &bench_trig_kretprobe_multi, &bench_trig_fentry, &bench_trig_fexit, + &bench_trig_fmodret, + &bench_trig_tp, + &bench_trig_rawtp, /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 2c477808a6f0..4b05539f167d 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -233,6 +233,42 @@ static void trigger_fexit_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fexit); } +static void trigger_fmodret_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); +} + +static void trigger_tp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void trigger_rawtp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_rawtp); +} + /* make sure call is not inlined and not avoided by compiler, so __weak and * inline asm volatile in the body of the function * @@ -395,6 +431,9 @@ BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); BENCH_TRIG_KERNEL(fentry, "fentry"); BENCH_TRIG_KERNEL(fexit, "fexit"); +BENCH_TRIG_KERNEL(fmodret, "fmodret"); +BENCH_TRIG_KERNEL(tp, "tp"); +BENCH_TRIG_KERNEL(rawtp, "rawtp"); /* uprobe benchmarks */ #define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 6e790e294260..a690f5a68b6b 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -4,7 +4,8 @@ set -eufo pipefail def_tests=( \ usermode-count kernel-count syscall-count \ - fentry fexit \ + fentry fexit fmodret \ + rawtp tp \ kprobe kprobe-multi \ kretprobe kretprobe-multi \ ) diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 07587cb3c9f5..2619ed193c65 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -56,6 +56,19 @@ int trigger_driver(void *ctx) return 0; } +extern int bpf_modify_return_test_tp(int nonce) __ksym __weak; + +SEC("?raw_tp") +int trigger_driver_kfunc(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_modify_return_test_tp(0); /* attach point for benchmarking */ + + return 0; +} + SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { @@ -97,3 +110,24 @@ int bench_trigger_fexit(void *ctx) inc_counter(); return 0; } + +SEC("?fmod_ret/bpf_modify_return_test_tp") +int bench_trigger_fmodret(void *ctx) +{ + inc_counter(); + return -22; +} + +SEC("?tp/bpf_test_run/bpf_trigger_tp") +int bench_trigger_tp(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("?raw_tp/bpf_trigger_tp") +int bench_trigger_rawtp(void *ctx) +{ + inc_counter(); + return 0; +} -- cgit v1.2.3 From 5da7fb04902b0f0fcd13bc5ef216e232fa971efa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 Mar 2024 12:14:33 -0700 Subject: selftests/bpf: Test loading bpf-tcp-cc prog calling the kernel tcp-cc kfuncs This patch adds a test to ensure all static tcp-cc kfuncs is visible to the struct_ops bpf programs. It is checked by successfully loading the struct_ops programs calling these tcp-cc kfuncs. This patch needs to enable the CONFIG_TCP_CONG_DCTCP and the CONFIG_TCP_CONG_BBR. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240322191433.4133280-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 2 + .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 12 ++ tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 121 +++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 01f241ea2c67..afd675b1bf80 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -88,3 +88,5 @@ CONFIG_VSOCKETS=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y CONFIG_XFRM_INTERFACE=y +CONFIG_TCP_CONG_DCTCP=y +CONFIG_TCP_CONG_BBR=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 99dc60fe59d5..332ed54cc6af 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -13,6 +13,7 @@ #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" +#include "tcp_ca_kfunc.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -518,6 +519,15 @@ static void test_link_replace(void) tcp_ca_update__destroy(skel); } +static void test_tcp_ca_kfunc(void) +{ + struct tcp_ca_kfunc *skel; + + skel = tcp_ca_kfunc__open_and_load(); + ASSERT_OK_PTR(skel, "tcp_ca_kfunc__open_and_load"); + tcp_ca_kfunc__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -546,4 +556,6 @@ void test_bpf_tcp_ca(void) test_multi_links(); if (test__start_subtest("link_replace")) test_link_replace(); + if (test__start_subtest("tcp_ca_kfunc")) + test_tcp_ca_kfunc(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c new file mode 100644 index 000000000000..fcfbfe0336b4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Facebook */ + +#include "vmlinux.h" +#include + +extern void bbr_init(struct sock *sk) __ksym; +extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym; +extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; +extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; +extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern u32 bbr_ssthresh(struct sock *sk) __ksym; +extern u32 bbr_min_tso_segs(struct sock *sk) __ksym; +extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym; + +extern void dctcp_init(struct sock *sk) __ksym; +extern void dctcp_update_alpha(struct sock *sk, u32 flags) __ksym; +extern void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) __ksym; +extern u32 dctcp_ssthresh(struct sock *sk) __ksym; +extern u32 dctcp_cwnd_undo(struct sock *sk) __ksym; +extern void dctcp_state(struct sock *sk, u8 new_state) __ksym; + +extern void cubictcp_init(struct sock *sk) __ksym; +extern u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) __ksym; +extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; + +SEC("struct_ops/init") +void BPF_PROG(init, struct sock *sk) +{ + bbr_init(sk); + dctcp_init(sk); + cubictcp_init(sk); +} + +SEC("struct_ops/in_ack_event") +void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) +{ + dctcp_update_alpha(sk, flags); +} + +SEC("struct_ops/cong_control") +void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs) +{ + bbr_main(sk, rs); +} + +SEC("struct_ops/cong_avoid") +void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) +{ + cubictcp_cong_avoid(sk, ack, acked); +} + +SEC("struct_ops/sndbuf_expand") +u32 BPF_PROG(sndbuf_expand, struct sock *sk) +{ + return bbr_sndbuf_expand(sk); +} + +SEC("struct_ops/undo_cwnd") +u32 BPF_PROG(undo_cwnd, struct sock *sk) +{ + bbr_undo_cwnd(sk); + return dctcp_cwnd_undo(sk); +} + +SEC("struct_ops/cwnd_event") +void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + bbr_cwnd_event(sk, event); + dctcp_cwnd_event(sk, event); + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops/ssthresh") +u32 BPF_PROG(ssthresh, struct sock *sk) +{ + bbr_ssthresh(sk); + dctcp_ssthresh(sk); + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops/min_tso_segs") +u32 BPF_PROG(min_tso_segs, struct sock *sk) +{ + return bbr_min_tso_segs(sk); +} + +SEC("struct_ops/set_state") +void BPF_PROG(set_state, struct sock *sk, u8 new_state) +{ + bbr_set_state(sk, new_state); + dctcp_state(sk, new_state); + cubictcp_state(sk, new_state); +} + +SEC("struct_ops/pkts_acked") +void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC(".struct_ops") +struct tcp_congestion_ops tcp_ca_kfunc = { + .init = (void *)init, + .in_ack_event = (void *)in_ack_event, + .cong_control = (void *)cong_control, + .cong_avoid = (void *)cong_avoid, + .sndbuf_expand = (void *)sndbuf_expand, + .undo_cwnd = (void *)undo_cwnd, + .cwnd_event = (void *)cwnd_event, + .ssthresh = (void *)ssthresh, + .min_tso_segs = (void *)min_tso_segs, + .set_state = (void *)set_state, + .pkts_acked = (void *)pkts_acked, + .name = "tcp_ca_kfunc", +}; + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From cdfd9cc3ba147ceea650afa6b7031e31a98d500e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:14:48 -0700 Subject: selftests/bpf: Replace CHECK with ASSERT macros for ksyms test Replace CHECK with ASSERT macros for ksyms tests. This test failed earlier with clang lto kernel, but the issue is gone with latest code base. But replacing CHECK with ASSERT still improves code as ASSERT is preferred in selftests. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041448.1197812-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/ksyms.c | 30 ++++++++++---------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b295969b263b..dc7aab532fb1 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -5,8 +5,6 @@ #include "test_ksyms.skel.h" #include -static int duration; - void test_ksyms(void) { const char *btf_path = "/sys/kernel/btf/vmlinux"; @@ -18,43 +16,37 @@ void test_ksyms(void) int err; err = kallsyms_find("bpf_link_fops", &link_fops_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "bpf_link_fops: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "bpf_link_fops: ksym_find")) return; err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "__per_cpu_start: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "__per_cpu_start: ksym_find")) return; - if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) + if (!ASSERT_OK(stat(btf_path, &st), "stat_btf")) return; btf_size = st.st_size; skel = test_ksyms__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "test_ksyms__open_and_load")) return; err = test_ksyms__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ksyms__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); data = skel->data; - CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops", - "got 0x%llx, exp 0x%llx\n", - data->out__bpf_link_fops, link_fops_addr); - CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1", - "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); - CHECK(data->out__btf_size != btf_size, "btf_size", - "got %llu, exp %llu\n", data->out__btf_size, btf_size); - CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, - per_cpu_start_addr); + ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); + ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); + ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); + ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); cleanup: test_ksyms__destroy(skel); -- cgit v1.2.3 From ad2b05286e94485070475e473963724fa657491c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:14:53 -0700 Subject: libbpf: Mark libbpf_kallsyms_parse static function Currently libbpf_kallsyms_parse() function is declared as a global function but actually it is not a API and there is no external users in bpftool/bpf-selftests. So let us mark the function as static. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041453.1197949-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 5 ++++- tools/lib/bpf/libbpf_internal.h | 5 ----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d7d8f78f8846..1822fb8a02d2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7986,7 +7986,10 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) return 0; } -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) { char sym_type, sym_name[500]; unsigned long long sym_addr; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 864b36177424..a0dcfb82e455 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -518,11 +518,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx); - -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); - /* handle direct returned errors */ static inline int libbpf_err(int ret) { -- cgit v1.2.3 From c56e59776f46d77984329488878a52baf4969457 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:14:58 -0700 Subject: libbpf: Handle .llvm. symbol properly With CONFIG_LTO_CLANG_THIN enabled, with some of previous version of kernel code base ([1]), I hit the following error: test_ksyms:PASS:kallsyms_fopen 0 nsec test_ksyms:FAIL:ksym_find symbol 'bpf_link_fops' not found #118 ksyms:FAIL The reason is that 'bpf_link_fops' is renamed to bpf_link_fops.llvm.8325593422554671469 Due to cross-file inlining, the static variable 'bpf_link_fops' in syscall.c is used by a function in another file. To avoid potential duplicated names, the llvm added suffix '.llvm.' ([2]) to 'bpf_link_fops' variable. Such renaming caused a problem in libbpf if 'bpf_link_fops' is used in bpf prog as a ksym but 'bpf_link_fops' does not match any symbol in /proc/kallsyms. To fix this issue, libbpf needs to understand that suffix '.llvm.' is caused by clang lto kernel and to process such symbols properly. With latest bpf-next code base built with CONFIG_LTO_CLANG_THIN, I cannot reproduce the above failure any more. But such an issue could happen with other symbols or in the future for bpf_link_fops symbol. For example, with my current kernel, I got the following from /proc/kallsyms: ffffffff84782154 d __func__.net_ratelimit.llvm.6135436931166841955 ffffffff85f0a500 d tk_core.llvm.726630847145216431 ffffffff85fdb960 d __fs_reclaim_map.llvm.10487989720912350772 ffffffff864c7300 d fake_dst_ops.llvm.54750082607048300 I could not easily create a selftest to test newly-added libbpf functionality with a static C test since I do not know which symbol is cross-file inlined. But based on my particular kernel, the following test change can run successfully. > diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c > index 6a86d1f07800..904a103f7b1d 100644 > --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c > +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c > @@ -42,6 +42,7 @@ void test_ksyms(void) > ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); > ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); > ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); > + ASSERT_NEQ(data->out__fake_dst_ops, 0, "fake_dst_ops"); > ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); > > cleanup: > diff --git a/tools/testing/selftests/bpf/progs/test_ksyms.c b/tools/testing/selftests/bpf/progs/test_ksyms.c > index 6c9cbb5a3bdf..fe91eef54b66 100644 > --- a/tools/testing/selftests/bpf/progs/test_ksyms.c > +++ b/tools/testing/selftests/bpf/progs/test_ksyms.c > @@ -9,11 +9,13 @@ __u64 out__bpf_link_fops = -1; > __u64 out__bpf_link_fops1 = -1; > __u64 out__btf_size = -1; > __u64 out__per_cpu_start = -1; > +__u64 out__fake_dst_ops = -1; > > extern const void bpf_link_fops __ksym; > extern const void __start_BTF __ksym; > extern const void __stop_BTF __ksym; > extern const void __per_cpu_start __ksym; > +extern const void fake_dst_ops __ksym; > /* non-existing symbol, weak, default to zero */ > extern const void bpf_link_fops1 __ksym __weak; > > @@ -23,6 +25,7 @@ int handler(const void *ctx) > out__bpf_link_fops = (__u64)&bpf_link_fops; > out__btf_size = (__u64)(&__stop_BTF - &__start_BTF); > out__per_cpu_start = (__u64)&__per_cpu_start; > + out__fake_dst_ops = (__u64)&fake_dst_ops; > > out__bpf_link_fops1 = (__u64)&bpf_link_fops1; This patch fixed the issue in libbpf such that the suffix '.llvm.' will be ignored during comparison of bpf prog ksym vs. symbols in /proc/kallsyms, this resolved the issue. Currently, only static variables in /proc/kallsyms are checked with '.llvm.' suffix since in bpf programs function ksyms with '.llvm.' suffix are most likely kfunc's and unlikely to be cross-file inlined. Note that currently kernel does not support gcc build with lto. [1] https://lore.kernel.org/bpf/20240302165017.1627295-1-yonghong.song@linux.dev/ [2] https://github.com/llvm/llvm-project/blob/release/18.x/llvm/include/llvm/IR/ModuleSummaryIndex.h#L1714-L1719 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041458.1198161-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1822fb8a02d2..b091154bc5b5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1970,6 +1970,20 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, return NULL; } +static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj, + const void *name, int len) +{ + const char *ext_name; + int i; + + for (i = 0; i < obj->nr_extern; i++) { + ext_name = obj->externs[i].name; + if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0) + return &obj->externs[i]; + } + return NULL; +} + static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, char value) { @@ -8029,8 +8043,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; + char *res; - ext = find_extern_by_name(obj, sym_name); + res = strstr(sym_name, ".llvm."); + if (sym_type == 'd' && res) + ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name); + else + ext = find_extern_by_name(obj, sym_name); if (!ext || ext->type != EXT_KSYM) return 0; -- cgit v1.2.3 From d1320649346c36c5ba7b579533bf518960ef71e1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:03 -0700 Subject: selftests/bpf: Refactor some functions for kprobe_multi_test Refactor some functions in kprobe_multi_test.c to extract some helper functions who will be used in later patches to avoid code duplication. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041503.1198982-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/kprobe_multi_test.c | 94 +++++++++++++--------- 1 file changed, 57 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 05000810e28e..46e28edda595 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -336,6 +336,37 @@ static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) return strcmp((const char *) key1, (const char *) key2) == 0; } +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static bool skip_entry(char *name) +{ + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. + */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + return false; +} + static int get_syms(char ***symsp, size_t *cntp, bool kernel) { size_t cap = 0, cnt = 0, i; @@ -368,30 +399,13 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) } while (fgets(buf, sizeof(buf), f)) { - if (kernel && strchr(buf, '[')) - continue; - if (!kernel && !strchr(buf, '[')) + if (is_invalid_entry(buf, kernel)) continue; free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. - */ - if (!strcmp(name, "arch_cpu_idle")) - continue; - if (!strcmp(name, "default_idle")) - continue; - if (!strncmp(name, "rcu_", 4)) - continue; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - continue; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) + if (skip_entry(name)) continue; err = hashmap__add(map, name, 0); @@ -426,34 +440,20 @@ error: return err; } -static void test_kprobe_multi_bench_attach(bool kernel) +static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { - LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); - struct kprobe_multi_empty *skel = NULL; long attach_start_ns, attach_end_ns; long detach_start_ns, detach_end_ns; double attach_delta, detach_delta; struct bpf_link *link = NULL; - char **syms = NULL; - size_t cnt = 0, i; - - if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) - return; - - skel = kprobe_multi_empty__open_and_load(); - if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) - goto cleanup; - - opts.syms = (const char **) syms; - opts.cnt = cnt; attach_start_ns = get_time_ns(); link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty, - NULL, &opts); + NULL, opts); attach_end_ns = get_time_ns(); if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts")) - goto cleanup; + return; detach_start_ns = get_time_ns(); bpf_link__destroy(link); @@ -462,9 +462,29 @@ static void test_kprobe_multi_bench_attach(bool kernel) attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; - printf("%s: found %lu functions\n", __func__, cnt); + printf("%s: found %lu functions\n", __func__, opts->cnt); printf("%s: attached in %7.3lfs\n", __func__, attach_delta); printf("%s: detached in %7.3lfs\n", __func__, detach_delta); +} + +static void test_kprobe_multi_bench_attach(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + char **syms = NULL; + size_t cnt = 0, i; + + if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.syms = (const char **) syms; + opts.cnt = cnt; + + do_bench_test(skel, &opts); cleanup: kprobe_multi_empty__destroy(skel); -- cgit v1.2.3 From 9475dacb75e0b5efae086dc904f4d27c31f15157 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:08 -0700 Subject: selftests/bpf: Refactor trace helper func load_kallsyms_local() Refactor trace helper function load_kallsyms_local() such that it invokes a common function with a compare function as input. The common function will be used later for other local functions. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041508.1199239-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/trace_helpers.c | 19 ++++++++++++------- tools/testing/selftests/bpf/trace_helpers.h | 2 ++ 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 27fd7ed3e4b0..fc9de4a89aea 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -61,12 +61,7 @@ void free_kallsyms_local(struct ksyms *ksyms) free(ksyms); } -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -struct ksyms *load_kallsyms_local(void) +static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb) { FILE *f; char func[256], buf[256]; @@ -100,7 +95,7 @@ struct ksyms *load_kallsyms_local(void) goto error; } fclose(f); - qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), ksym_cmp); + qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb); return ksyms; error: @@ -109,6 +104,16 @@ error: return NULL; } +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +struct ksyms *load_kallsyms_local(void) +{ + return load_kallsyms_local_common(ksym_cmp); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 04fd1da7079d..9da0d0484eed 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -13,6 +13,8 @@ struct ksym { }; struct ksyms; +typedef int (*ksym_cmp_t)(const void *p1, const void *p2); + int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); -- cgit v1.2.3 From d1f02581059e42d8daf944aae2a296254cc7a5d5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:13 -0700 Subject: selftests/bpf: Add {load,search}_kallsyms_custom_local() These two functions allow selftests to do loading/searching kallsyms based on their specific compare functions. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041513.1199440-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/trace_helpers.c | 27 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/trace_helpers.h | 5 +++++ 2 files changed, 32 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index fc9de4a89aea..7f45b4cb41fe 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -114,6 +114,11 @@ struct ksyms *load_kallsyms_local(void) return load_kallsyms_local_common(ksym_cmp); } +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb) +{ + return load_kallsyms_local_common(cmp_cb); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); @@ -153,6 +158,28 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key) return &ksyms->syms[0]; } +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p, + ksym_search_cmp_t cmp_cb) +{ + int start = 0, mid, end = ksyms->sym_cnt; + struct ksym *ks; + int result; + + while (start < end) { + mid = start + (end - start) / 2; + ks = &ksyms->syms[mid]; + result = cmp_cb(p, ks); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return ks; + } + + return NULL; +} + struct ksym *ksym_search(long key) { if (!ksyms) diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 9da0d0484eed..d1ed71789049 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -14,6 +14,7 @@ struct ksym { struct ksyms; typedef int (*ksym_cmp_t)(const void *p1, const void *p2); +typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); int load_kallsyms(void); struct ksym *ksym_search(long key); @@ -24,6 +25,10 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key); long ksym_get_addr_local(struct ksyms *ksyms, const char *name); void free_kallsyms_local(struct ksyms *ksyms); +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb); +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, + ksym_search_cmp_t cmp_cb); + /* open kallsyms and find addresses on the fly, faster than load + search. */ int kallsyms_find(const char *sym, unsigned long long *addr); -- cgit v1.2.3 From 9edaafadc2c50f2af99ee5b3bad6831e1b6ad54f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:18 -0700 Subject: selftests/bpf: Fix kprobe_multi_bench_attach test failure with LTO kernel In my locally build clang LTO kernel (enabling CONFIG_LTO and CONFIG_LTO_CLANG_THIN), kprobe_multi_bench_attach/kernel subtest failed like: test_kprobe_multi_bench_attach:PASS:get_syms 0 nsec test_kprobe_multi_bench_attach:PASS:kprobe_multi_empty__open_and_load 0 nsec libbpf: prog 'test_kprobe_empty': failed to attach: No such process test_kprobe_multi_bench_attach:FAIL:bpf_program__attach_kprobe_multi_opts unexpected error: -3 #117/1 kprobe_multi_bench_attach/kernel:FAIL There are multiple symbols in /sys/kernel/debug/tracing/available_filter_functions are renamed in /proc/kallsyms due to cross file inlining. One example is for static function __access_remote_vm in mm/memory.c. In a non-LTO kernel, we have the following call stack: ptrace_access_vm (global, kernel/ptrace.c) access_remote_vm (global, mm/memory.c) __access_remote_vm (static, mm/memory.c) With LTO kernel, it is possible that access_remote_vm() is inlined by ptrace_access_vm(). So we end up with the following call stack: ptrace_access_vm (global, kernel/ptrace.c) __access_remote_vm (static, mm/memory.c) The compiler renames __access_remote_vm to __access_remote_vm.llvm. to prevent potential name collision. The kernel bpf_kprobe_multi_link_attach() and ftrace_lookup_symbols() try to find addresses based on /proc/kallsyms, hence the current test failed with LTO kenrel. This patch consulted /proc/kallsyms to find the corresponding entries for the ksym and this solved the issue. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041518.1199758-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/kprobe_multi_test.c | 62 +++++++++++++++++----- 1 file changed, 48 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 46e28edda595..3b9059164360 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -367,15 +367,49 @@ static bool skip_entry(char *name) return false; } +/* Do comparision by ignoring '.llvm.' suffixes. */ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ? 1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + static int get_syms(char ***symsp, size_t *cntp, bool kernel) { - size_t cap = 0, cnt = 0, i; - char *name = NULL, **syms = NULL; + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; char buf[256]; FILE *f; int err = 0; + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) + return -EINVAL; + /* * The available_filter_functions contains many duplicates, * but other than that all symbols are usable in kprobe multi @@ -408,7 +442,14 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) if (skip_entry(name)) continue; - err = hashmap__add(map, name, 0); + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); if (err == -EEXIST) { err = 0; continue; @@ -421,8 +462,7 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) if (err) goto error; - syms[cnt++] = name; - name = NULL; + syms[cnt++] = ksym_name; } *symsp = syms; @@ -432,11 +472,8 @@ error: free(name); fclose(f); hashmap__free(map); - if (err) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (err) free(syms); - } return err; } @@ -472,7 +509,7 @@ static void test_kprobe_multi_bench_attach(bool kernel) LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); struct kprobe_multi_empty *skel = NULL; char **syms = NULL; - size_t cnt = 0, i; + size_t cnt = 0; if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) return; @@ -488,11 +525,8 @@ static void test_kprobe_multi_bench_attach(bool kernel) cleanup: kprobe_multi_empty__destroy(skel); - if (syms) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (syms) free(syms); - } } static void test_attach_override(void) -- cgit v1.2.3 From 6302bdeb91df9b4484b9d537c29f8b6117f3f73d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:23 -0700 Subject: selftests/bpf: Add a kprobe_multi subtest to use addrs instead of syms Get addrs directly from available_filter_functions_addrs and send to the kernel during kprobe_multi_attach. This avoids consultation of /proc/kallsyms. But available_filter_functions_addrs is introduced in 6.5, i.e., it is introduced recently, so I skip the test if the kernel does not support it. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041523.1200301-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/kprobe_multi_test.c | 98 ++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 3b9059164360..51628455b6f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -477,6 +477,69 @@ error: return err; } +static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+ so Let us initially + * allocate space to hold 64k entries. If 64k is not enough, incrementally + * increase 1k each time. + */ + max_cnt = 65536; + inc_cnt = 1024; + addrs = malloc(max_cnt * sizeof(long)); + if (addrs == NULL) { + err = -ENOMEM; + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) + continue; + if (skip_entry(name)) + continue; + + if (cnt == max_cnt) { + max_cnt += inc_cnt; + tmp_addrs = realloc(addrs, max_cnt); + if (!tmp_addrs) { + err = -ENOMEM; + goto error; + } + addrs = tmp_addrs; + } + + addrs[cnt++] = (unsigned long)addr; + } + + *addrsp = addrs; + *cntp = cnt; + +error: + free(name); + fclose(f); + if (err) + free(addrs); + return err; +} + static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { long attach_start_ns, attach_end_ns; @@ -529,6 +592,37 @@ cleanup: free(syms); } +static void test_kprobe_multi_bench_attach_addr(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + unsigned long *addrs = NULL; + size_t cnt = 0; + int err; + + err = get_addrs(&addrs, &cnt, kernel); + if (err == -ENOENT) { + test__skip(); + return; + } + + if (!ASSERT_OK(err, "get_addrs")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.addrs = addrs; + opts.cnt = cnt; + + do_bench_test(skel, &opts); + +cleanup: + kprobe_multi_empty__destroy(skel); + free(addrs); +} + static void test_attach_override(void) { struct kprobe_multi_override *skel = NULL; @@ -569,6 +663,10 @@ void serial_test_kprobe_multi_bench_attach(void) test_kprobe_multi_bench_attach(true); if (test__start_subtest("modules")) test_kprobe_multi_bench_attach(false); + if (test__start_subtest("kernel")) + test_kprobe_multi_bench_attach_addr(true); + if (test__start_subtest("modules")) + test_kprobe_multi_bench_attach_addr(false); } void test_kprobe_multi_test(void) -- cgit v1.2.3 From e5e1a3aa56773d55dfb71c4d58176bc19ecfa739 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 26 Mar 2024 18:03:38 +0800 Subject: selftests/bpf: Use connect_fd_to_fd in bpf_tcp_ca To simplify the code, use BPF selftests helper connect_fd_to_fd() in bpf_tcp_ca.c instead of open-coding it. This helper is defined in network_helpers.c, and exported in network_helpers.h, which is already included in bpf_tcp_ca.c. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/e105d1f225c643bee838409378dd90fd9aabb6dc.1711447102.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 332ed54cc6af..2f6b8b2780e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -79,11 +79,9 @@ done: static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { - struct sockaddr_in6 sa6 = {}; ssize_t nr_recv = 0, bytes = 0; int lfd = -1, fd = -1; pthread_t srv_thread; - socklen_t addrlen = sizeof(sa6); void *thread_ret; char batch[1500]; int err; @@ -104,10 +102,6 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) settimeo(lfd, 0) || settimeo(fd, 0)) goto done; - err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); - if (!ASSERT_NEQ(err, -1, "getsockname")) - goto done; - if (sk_stg_map) { err = bpf_map_update_elem(bpf_map__fd(sk_stg_map), &fd, &expected_stg, BPF_NOEXIST); @@ -116,7 +110,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } /* connect to server */ - err = connect(fd, (struct sockaddr *)&sa6, addrlen); + err = connect_fd_to_fd(fd, lfd, 0); if (!ASSERT_NEQ(err, -1, "connect")) goto done; -- cgit v1.2.3 From 426670929fda4485a23094e03cea5d9b3ca918aa Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 26 Mar 2024 18:03:39 +0800 Subject: selftests/bpf: Drop settimeo in do_test settimeo is invoked in start_server() and in connect_fd_to_fd() already, no need to invoke settimeo(lfd, 0) and settimeo(fd, 0) in do_test() anymore. This patch drops them. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/dbc3613bee3b1c78f95ac9ff468bf47c92f106ea.1711447102.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 2f6b8b2780e6..077b107130f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -98,8 +98,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) return; } - if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd, 0) || settimeo(fd, 0)) + if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca)) goto done; if (sk_stg_map) { -- cgit v1.2.3 From 2aa0cff26ed53bc8d4855292b501759435ffdd38 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:25 -0700 Subject: selftest: af_unix: Test GC for SCM_RIGHTS. This patch adds test cases to verify the new GC. We run each test for the following cases: * SOCK_DGRAM * SOCK_STREAM without embryo socket * SOCK_STREAM without embryo socket + MSG_OOB * SOCK_STREAM with embryo sockets * SOCK_STREAM with embryo sockets + MSG_OOB Before and after running each test case, we ensure that there is no AF_UNIX socket left in the netns by reading /proc/net/protocols. We cannot use /proc/net/unix and UNIX_DIAG because the embryo socket does not show up there. Each test creates multiple sockets in an array. We pass sockets in the even index using the peer sockets in the odd index. So, send_fd(0, 1) actually sends fd[0] to fd[2] via fd[0 + 1]. Test 1 : A <-> A Test 2 : A <-> B Test 3 : A -> B -> C <- D ^.___|___.' ^ `---------' Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-16-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/af_unix/Makefile | 2 +- tools/testing/selftests/net/af_unix/scm_rights.c | 286 +++++++++++++++++++++++ 3 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/af_unix/scm_rights.c (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 2f9d378edec3..d996a0ab0765 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -31,6 +31,7 @@ reuseport_dualstack rxtimestamp sctp_hello scm_pidfd +scm_rights sk_bind_sendto_listen sk_connect_zero_addr socket diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index 221c387a7d7f..3b83c797650d 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -1,4 +1,4 @@ CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd +TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c new file mode 100644 index 000000000000..bab606c9f1eb --- /dev/null +++ b/tools/testing/selftests/net/af_unix/scm_rights.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. */ +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include + +#include "../../kselftest_harness.h" + +FIXTURE(scm_rights) +{ + int fd[16]; +}; + +FIXTURE_VARIANT(scm_rights) +{ + char name[16]; + int type; + int flags; + bool test_listener; +}; + +FIXTURE_VARIANT_ADD(scm_rights, dgram) +{ + .name = "UNIX ", + .type = SOCK_DGRAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = true, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = true, +}; + +static int count_sockets(struct __test_metadata *_metadata, + const FIXTURE_VARIANT(scm_rights) *variant) +{ + int sockets = -1, len, ret; + char *line = NULL; + size_t unused; + FILE *f; + + f = fopen("/proc/net/protocols", "r"); + ASSERT_NE(NULL, f); + + len = strlen(variant->name); + + while (getline(&line, &unused, f) != -1) { + int unused2; + + if (strncmp(line, variant->name, len)) + continue; + + ret = sscanf(line + len, "%d %d", &unused2, &sockets); + ASSERT_EQ(2, ret); + + break; + } + + free(line); + + ret = fclose(f); + ASSERT_EQ(0, ret); + + return sockets; +} + +FIXTURE_SETUP(scm_rights) +{ + int ret; + + ret = unshare(CLONE_NEWNET); + ASSERT_EQ(0, ret); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(scm_rights) +{ + int ret; + + sleep(1); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +static void create_listeners(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + socklen_t addrlen; + int i, ret; + + for (i = 0; i < n * 2; i += 2) { + self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i]); + + addrlen = sizeof(addr.sun_family); + ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + + ret = listen(self->fd[i], -1); + ASSERT_EQ(0, ret); + + addrlen = sizeof(addr); + ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen); + ASSERT_EQ(0, ret); + + self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i + 1]); + + ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + } +} + +static void create_socketpairs(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i += 2) { + ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i); + ASSERT_EQ(0, ret); + } +} + +static void __create_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + if (variant->test_listener) + create_listeners(_metadata, self, n); + else + create_socketpairs(_metadata, self, variant, n); +} + +static void __close_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i++) { + ret = close(self->fd[i]); + ASSERT_EQ(0, ret); + } +} + +void __send_fd(struct __test_metadata *_metadata, + const FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int inflight, int receiver) +{ +#define MSG "nop" +#define MSGLEN 3 + struct { + struct cmsghdr cmsghdr; + int fd[2]; + } cmsg = { + .cmsghdr = { + .cmsg_len = CMSG_LEN(sizeof(cmsg.fd)), + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_RIGHTS, + }, + .fd = { + self->fd[inflight * 2], + self->fd[inflight * 2], + }, + }; + struct iovec iov = { + .iov_base = MSG, + .iov_len = MSGLEN, + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &cmsg, + .msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)), + }; + int ret; + + ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags); + ASSERT_EQ(MSGLEN, ret); +} + +#define create_sockets(n) \ + __create_sockets(_metadata, self, variant, n) +#define close_sockets(n) \ + __close_sockets(_metadata, self, n) +#define send_fd(inflight, receiver) \ + __send_fd(_metadata, self, variant, inflight, receiver) + +TEST_F(scm_rights, self_ref) +{ + create_sockets(2); + + send_fd(0, 0); + + send_fd(1, 1); + + close_sockets(2); +} + +TEST_F(scm_rights, triangle) +{ + create_sockets(6); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + + send_fd(3, 4); + send_fd(4, 5); + send_fd(5, 3); + + close_sockets(6); +} + +TEST_F(scm_rights, cross_edge) +{ + create_sockets(8); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + send_fd(1, 3); + send_fd(3, 2); + + send_fd(4, 5); + send_fd(5, 6); + send_fd(6, 4); + send_fd(5, 7); + send_fd(7, 6); + + close_sockets(8); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From bd3ce405fecc6f2f9ce09c789386e326346899a0 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 28 Mar 2024 15:56:36 +0000 Subject: tools/net/ynl: Add extack policy attribute decoding The NLMSGERR_ATTR_POLICY extack attribute has been ignored by ynl up to now. Extend extack decoding to include _POLICY and the nested NL_POLICY_TYPE_ATTR_* attributes. For example: ./tools/net/ynl/cli.py \ --spec Documentation/netlink/specs/rt_link.yaml \ --create --do newlink --json '{ "ifname": "12345678901234567890", "linkinfo": {"kind": "bridge"} }' Netlink error: Numerical result out of range nl_len = 104 (88) nl_flags = 0x300 nl_type = 2 error: -34 extack: {'msg': 'Attribute failed policy validation', 'policy': {'max-length': 15, 'type': 'string'}, 'bad-attr': '.ifname'} Signed-off-by: Donald Hunter Link: https://lore.kernel.org/r/20240328155636.64688-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index e73b027c5624..82d3c98067aa 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause from collections import namedtuple +from enum import Enum import functools import os import random @@ -76,6 +77,25 @@ class Netlink: NLMSGERR_ATTR_MISS_TYPE = 5 NLMSGERR_ATTR_MISS_NEST = 6 + # Policy types + NL_POLICY_TYPE_ATTR_TYPE = 1 + NL_POLICY_TYPE_ATTR_MIN_VALUE_S = 2 + NL_POLICY_TYPE_ATTR_MAX_VALUE_S = 3 + NL_POLICY_TYPE_ATTR_MIN_VALUE_U = 4 + NL_POLICY_TYPE_ATTR_MAX_VALUE_U = 5 + NL_POLICY_TYPE_ATTR_MIN_LENGTH = 6 + NL_POLICY_TYPE_ATTR_MAX_LENGTH = 7 + NL_POLICY_TYPE_ATTR_POLICY_IDX = 8 + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE = 9 + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK = 10 + NL_POLICY_TYPE_ATTR_PAD = 11 + NL_POLICY_TYPE_ATTR_MASK = 12 + + AttrType = Enum('AttrType', ['flag', 'u8', 'u16', 'u32', 'u64', + 's8', 's16', 's32', 's64', + 'binary', 'string', 'nul-string', + 'nested', 'nested-array', + 'bitfield32', 'sint', 'uint']) class NlError(Exception): def __init__(self, nl_msg): @@ -198,6 +218,8 @@ class NlMsg: self.extack['miss-nest'] = extack.as_scalar('u32') elif extack.type == Netlink.NLMSGERR_ATTR_OFFS: self.extack['bad-attr-offs'] = extack.as_scalar('u32') + elif extack.type == Netlink.NLMSGERR_ATTR_POLICY: + self.extack['policy'] = self._decode_policy(extack.raw) else: if 'unknown' not in self.extack: self.extack['unknown'] = [] @@ -214,6 +236,30 @@ class NlMsg: desc += f" ({spec['doc']})" self.extack['miss-type'] = desc + def _decode_policy(self, raw): + policy = {} + for attr in NlAttrs(raw): + if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE: + type = attr.as_scalar('u32') + policy['type'] = Netlink.AttrType(type).name + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S: + policy['min-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S: + policy['max-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_U: + policy['min-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_U: + policy['max-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_LENGTH: + policy['min-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_LENGTH: + policy['max-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: + policy['bitfield32-mask'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MASK: + policy['mask'] = attr.as_scalar('u64') + return policy + def cmd(self): return self.nl_type -- cgit v1.2.3 From 623bdd58be3727318d374f0052f9dfff1e87b854 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 Mar 2024 12:04:10 -0700 Subject: selftests/bpf: make multi-uprobe tests work in RELEASE=1 mode When BPF selftests are built in RELEASE=1 mode with -O2 optimization level, uprobe_multi binary, called from multi-uprobe tests is optimized to the point that all the thousands of target uprobe_multi_func_XXX functions are eliminated, breaking tests. So ensure they are preserved by using weak attribute. But, actually, compiling uprobe_multi binary with -O2 takes a really long time, and is quite useless (it's not a benchmark). So in addition to ensuring that uprobe_multi_func_XXX functions are preserved, opt-out of -O2 explicitly in Makefile and stick to -O0. This saves a lot of compilation time. With -O2, just recompiling uprobe_multi: $ touch uprobe_multi.c $ time make RELEASE=1 -j90 make RELEASE=1 -j90 291.66s user 2.54s system 99% cpu 4:55.52 total With -O0: $ touch uprobe_multi.c $ time make RELEASE=1 -j90 make RELEASE=1 -j90 22.40s user 1.91s system 99% cpu 24.355 total 5 minutes vs (still slow, but...) 24 seconds. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240329190410.4191353-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/uprobe_multi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b9eb40d6343..b0ac3dd80acf 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -759,7 +759,7 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index a61ceab60b68..7ffa563ffeba 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -9,7 +9,7 @@ #define NAME(name, idx) PASTE(name, idx) -#define DEF(name, idx) int NAME(name, idx)(void) { return 0; } +#define DEF(name, idx) int __attribute__((weak)) NAME(name, idx)(void) { return 0; } #define CALL(name, idx) NAME(name, idx)(); #define F(body, name, idx) body(name, idx) -- cgit v1.2.3 From 7d8296b250f2eed73f1758607926d4d258dea5d4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:42 +0100 Subject: bitops: make BYTES_TO_BITS() treewide-available Avoid open-coding that simple expression each time by moving BYTES_TO_BITS() from the probes code to to export it to the rest of the kernel. Simplify the macro while at it. `BITS_PER_LONG / sizeof(long)` always equals to %BITS_PER_BYTE, regardless of the target architecture. Do the same for the tools ecosystem as well (incl. its version of bitops.h). The previous implementation had its implicit type of long, while the new one is int, so adjust the format literal accordingly in the perf code. Suggested-by: Andy Shevchenko Reviewed-by: Przemek Kitszel Acked-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitops.h | 2 ++ kernel/trace/trace_probe.c | 2 -- tools/include/linux/bitops.h | 2 ++ tools/perf/util/probe-finder.c | 4 +--- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index f7f5a783da2a..e0cd09eb91cd 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -21,6 +21,8 @@ #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) + extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dfe3ee6035ec..87337a0c8e03 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1180,8 +1180,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type, return ret; } -#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) - /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index 7319f6ced108..272f15d0e434 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -20,6 +20,8 @@ #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) + extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index c8923375e30d..630e16c54ed5 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -186,8 +186,6 @@ static_var: return ret2; } -#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_LONG / sizeof(long)) - static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg *tvar, const char *cast, bool user_access) @@ -217,7 +215,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, total = dwarf_bytesize(vr_die); if (boffs < 0 || total < 0) return -ENOENT; - ret = snprintf(buf, 16, "b%d@%d/%zd", bsize, boffs, + ret = snprintf(buf, 16, "b%d@%d/%d", bsize, boffs, BYTES_TO_BITS(total)); goto formatted; } -- cgit v1.2.3 From 10a04ff09bcc39e0044190ffe9f00f998f13647c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:48 +0100 Subject: tools: move alignment-related macros to new Currently, tools have *ALIGN*() macros scattered across the unrelated headers, as there are only 3 of them and they were added separately each time on an as-needed basis. Anyway, let's make it more consistent with the kernel headers and allow using those macros outside of the mentioned headers. Create inside the tools/ folder and include it where needed. Signed-off-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- tools/include/linux/align.h | 12 ++++++++++++ tools/include/linux/bitmap.h | 2 +- tools/include/linux/mm.h | 5 +---- 3 files changed, 14 insertions(+), 5 deletions(-) create mode 100644 tools/include/linux/align.h (limited to 'tools') diff --git a/tools/include/linux/align.h b/tools/include/linux/align.h new file mode 100644 index 000000000000..14e34ace80dd --- /dev/null +++ b/tools/include/linux/align.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _TOOLS_LINUX_ALIGN_H +#define _TOOLS_LINUX_ALIGN_H + +#include + +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* _TOOLS_LINUX_ALIGN_H */ diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index f3566ea0f932..8c6852dba04f 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -3,6 +3,7 @@ #define _TOOLS_LINUX_BITMAP_H #include +#include #include #include #include @@ -126,7 +127,6 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) static inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index f3c82ab5b14c..7a6b98f4e579 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -2,8 +2,8 @@ #ifndef _TOOLS_LINUX_MM_H #define _TOOLS_LINUX_MM_H +#include #include -#include #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) @@ -11,9 +11,6 @@ #define PHYS_ADDR_MAX (~(phys_addr_t)0) -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) - #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) #define __va(x) ((void *)((unsigned long)(x))) -- cgit v1.2.3 From a37fbe666c016fd89e4460d0ebfcea05baba46dc Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:49 +0100 Subject: bitmap: introduce generic optimized bitmap_size() The number of times yet another open coded `BITS_TO_LONGS(nbits) * sizeof(long)` can be spotted is huge. Some generic helper is long overdue. Add one, bitmap_size(), but with one detail. BITS_TO_LONGS() uses DIV_ROUND_UP(). The latter works well when both divident and divisor are compile-time constants or when the divisor is not a pow-of-2. When it is however, the compilers sometimes tend to generate suboptimal code (GCC 13): 48 83 c0 3f add $0x3f,%rax 48 c1 e8 06 shr $0x6,%rax 48 8d 14 c5 00 00 00 00 lea 0x0(,%rax,8),%rdx %BITS_PER_LONG is always a pow-2 (either 32 or 64), but GCC still does full division of `nbits + 63` by it and then multiplication by 8. Instead of BITS_TO_LONGS(), use ALIGN() and then divide by 8. GCC: 8d 50 3f lea 0x3f(%rax),%edx c1 ea 03 shr $0x3,%edx 81 e2 f8 ff ff 1f and $0x1ffffff8,%edx Now it shifts `nbits + 63` by 3 positions (IOW performs fast division by 8) and then masks bits[2:0]. bloat-o-meter: add/remove: 0/0 grow/shrink: 20/133 up/down: 156/-773 (-617) Clang does it better and generates the same code before/after starting from -O1, except that with the ALIGN() approach it uses %edx and thus still saves some bytes: add/remove: 0/0 grow/shrink: 9/133 up/down: 18/-538 (-520) Note that we can't expand DIV_ROUND_UP() by adding a check and using this approach there, as it's used in array declarations where expressions are not allowed. Add this helper to tools/ as well. Reviewed-by: Przemek Kitszel Acked-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- drivers/md/dm-clone-metadata.c | 5 ----- drivers/s390/cio/idset.c | 2 +- include/linux/bitmap.h | 8 +++++--- include/linux/cpumask.h | 2 +- lib/math/prime_numbers.c | 2 -- tools/include/linux/bitmap.h | 7 ++++--- 6 files changed, 11 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index c43d55672bce..47c1fa7aad8b 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -465,11 +465,6 @@ static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd) /*---------------------------------------------------------------------------*/ -static size_t bitmap_size(unsigned long nr_bits) -{ - return BITS_TO_LONGS(nr_bits) * sizeof(long); -} - static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words, unsigned long nr_regions) { diff --git a/drivers/s390/cio/idset.c b/drivers/s390/cio/idset.c index 0a1105a483bf..e5f28370a903 100644 --- a/drivers/s390/cio/idset.c +++ b/drivers/s390/cio/idset.c @@ -18,7 +18,7 @@ struct idset { static inline unsigned long idset_bitmap_size(int num_ssid, int num_id) { - return BITS_TO_LONGS(num_ssid * num_id) * sizeof(unsigned long); + return bitmap_size(size_mul(num_ssid, num_id)); } static struct idset *idset_new(int num_ssid, int num_id) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 914c23e96f26..363e0b184a45 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -226,9 +226,11 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = 0; @@ -238,7 +240,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = ~0UL; @@ -249,7 +251,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = *src; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1c29947db848..6519f9c77709 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -853,7 +853,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) */ static inline unsigned int cpumask_size(void) { - return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long); + return bitmap_size(large_cpumask_bits); } /* diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c index d42cebf7407f..d3b64b10da1c 100644 --- a/lib/math/prime_numbers.c +++ b/lib/math/prime_numbers.c @@ -6,8 +6,6 @@ #include #include -#define bitmap_size(nbits) (BITS_TO_LONGS(nbits) * sizeof(unsigned long)) - struct primes { struct rcu_head rcu; unsigned long last, sz; diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 8c6852dba04f..210c13b1b857 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -26,13 +26,14 @@ bool __bitmap_intersects(const unsigned long *bitmap1, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - memset(dst, 0, len); + memset(dst, 0, bitmap_size(nbits)); } } @@ -84,7 +85,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, */ static inline unsigned long *bitmap_zalloc(int nbits) { - return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long)); + return calloc(1, bitmap_size(nbits)); } /* -- cgit v1.2.3 From 4cc1730a90fcdeac14eb44857b566a96e1170f79 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 29 Mar 2024 13:50:19 +0000 Subject: doc: netlink: Change generated docs to limit TOC to depth 3 The tables of contents in the generated Netlink docs include individual attribute definitions. This can make the contents exceedingly long and repeats a lot of what is on the rest of the pages. See for example: https://docs.kernel.org/networking/netlink_spec/tc.html Add a depth limit to the contents directive in generated .rst files to limit the contents depth to 3 levels. This reduces the contents to: - Family - Summary - Operations - op-one - op-two - ... - Definitions - struct-one - struct-two - enum-one - ... - Attribute sets - attrs-one - attrs-two - ... Signed-off-by: Donald Hunter Reviewed-by: Breno Leitao Link: https://lore.kernel.org/r/20240329135021.52534-2-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-rst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/ynl-gen-rst.py index 927407b3efb3..5825a8b3bfb4 100755 --- a/tools/net/ynl/ynl-gen-rst.py +++ b/tools/net/ynl/ynl-gen-rst.py @@ -291,7 +291,7 @@ def parse_yaml(obj: Dict[str, Any]) -> str: title = f"Family ``{obj['name']}`` netlink specification" lines.append(rst_title(title)) - lines.append(rst_paragraph(".. contents::\n")) + lines.append(rst_paragraph(".. contents:: :depth: 3\n")) if "doc" in obj: lines.append(rst_subtitle("Summary")) -- cgit v1.2.3 From 8c1b74a26d96705ac55a43afe7a41fc6e44be390 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 29 Mar 2024 13:50:20 +0000 Subject: doc: netlink: Add hyperlinks to generated Netlink docs Update ynl-gen-rst to generate hyperlinks to definitions, attribute sets and sub-messages from all the places that reference them. Note that there is a single label namespace for all of the kernel docs. Hyperlinks within a single netlink doc need to be qualified by the family name to avoid collisions. The label format is 'family-type-name' which gives, for example, 'rt-link-attribute-set-link-attrs' as the link id. Signed-off-by: Donald Hunter Link: https://lore.kernel.org/r/20240329135021.52534-3-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-rst.py | 60 +++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/ynl-gen-rst.py index 5825a8b3bfb4..657e881d2ea4 100755 --- a/tools/net/ynl/ynl-gen-rst.py +++ b/tools/net/ynl/ynl-gen-rst.py @@ -82,9 +82,9 @@ def rst_subsubsection(title: str) -> str: return f"{title}\n" + "~" * len(title) -def rst_section(title: str) -> str: +def rst_section(namespace: str, prefix: str, title: str) -> str: """Add a section to the document""" - return f"\n{title}\n" + "=" * len(title) + return f".. _{namespace}-{prefix}-{title}:\n\n{title}\n" + "=" * len(title) def rst_subtitle(title: str) -> str: @@ -102,6 +102,17 @@ def rst_list_inline(list_: List[str], level: int = 0) -> str: return headroom(level) + "[" + ", ".join(inline(i) for i in list_) + "]" +def rst_ref(namespace: str, prefix: str, name: str) -> str: + """Add a hyperlink to the document""" + mappings = {'enum': 'definition', + 'fixed-header': 'definition', + 'nested-attributes': 'attribute-set', + 'struct': 'definition'} + if prefix in mappings: + prefix = mappings[prefix] + return f":ref:`{namespace}-{prefix}-{name}`" + + def rst_header() -> str: """The headers for all the auto generated RST files""" lines = [] @@ -159,20 +170,24 @@ def parse_do_attributes(attrs: Dict[str, Any], level: int = 0) -> str: return "\n".join(lines) -def parse_operations(operations: List[Dict[str, Any]]) -> str: +def parse_operations(operations: List[Dict[str, Any]], namespace: str) -> str: """Parse operations block""" preprocessed = ["name", "doc", "title", "do", "dump"] + linkable = ["fixed-header", "attribute-set"] lines = [] for operation in operations: - lines.append(rst_section(operation["name"])) + lines.append(rst_section(namespace, 'operation', operation["name"])) lines.append(rst_paragraph(sanitize(operation["doc"])) + "\n") for key in operation.keys(): if key in preprocessed: # Skip the special fields continue - lines.append(rst_fields(key, operation[key], 0)) + value = operation[key] + if key in linkable: + value = rst_ref(namespace, key, value) + lines.append(rst_fields(key, value, 0)) if "do" in operation: lines.append(rst_paragraph(":do:", 0)) @@ -212,14 +227,14 @@ def parse_entries(entries: List[Dict[str, Any]], level: int) -> str: return "\n".join(lines) -def parse_definitions(defs: Dict[str, Any]) -> str: +def parse_definitions(defs: Dict[str, Any], namespace: str) -> str: """Parse definitions section""" preprocessed = ["name", "entries", "members"] ignored = ["render-max"] # This is not printed lines = [] for definition in defs: - lines.append(rst_section(definition["name"])) + lines.append(rst_section(namespace, 'definition', definition["name"])) for k in definition.keys(): if k in preprocessed + ignored: continue @@ -237,14 +252,15 @@ def parse_definitions(defs: Dict[str, Any]) -> str: return "\n".join(lines) -def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: +def parse_attr_sets(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse attribute from attribute-set""" preprocessed = ["name", "type"] + linkable = ["enum", "nested-attributes", "struct", "sub-message"] ignored = ["checks"] lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'attribute-set', entry["name"])) for attr in entry["attributes"]: type_ = attr.get("type") attr_line = attr["name"] @@ -257,25 +273,31 @@ def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: for k in attr.keys(): if k in preprocessed + ignored: continue - lines.append(rst_fields(k, sanitize(attr[k]), 0)) + if k in linkable: + value = rst_ref(namespace, k, attr[k]) + else: + value = sanitize(attr[k]) + lines.append(rst_fields(k, value, 0)) lines.append("\n") return "\n".join(lines) -def parse_sub_messages(entries: List[Dict[str, Any]]) -> str: +def parse_sub_messages(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse sub-message definitions""" lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'sub-message', entry["name"])) for fmt in entry["formats"]: value = fmt["value"] lines.append(rst_bullet(bold(value))) for attr in ['fixed-header', 'attribute-set']: if attr in fmt: - lines.append(rst_fields(attr, fmt[attr], 1)) + lines.append(rst_fields(attr, + rst_ref(namespace, attr, fmt[attr]), + 1)) lines.append("\n") return "\n".join(lines) @@ -289,7 +311,9 @@ def parse_yaml(obj: Dict[str, Any]) -> str: lines.append(rst_header()) - title = f"Family ``{obj['name']}`` netlink specification" + family = obj['name'] + + title = f"Family ``{family}`` netlink specification" lines.append(rst_title(title)) lines.append(rst_paragraph(".. contents:: :depth: 3\n")) @@ -300,7 +324,7 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Operations if "operations" in obj: lines.append(rst_subtitle("Operations")) - lines.append(parse_operations(obj["operations"]["list"])) + lines.append(parse_operations(obj["operations"]["list"], family)) # Multicast groups if "mcast-groups" in obj: @@ -310,17 +334,17 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Definitions if "definitions" in obj: lines.append(rst_subtitle("Definitions")) - lines.append(parse_definitions(obj["definitions"])) + lines.append(parse_definitions(obj["definitions"], family)) # Attributes set if "attribute-sets" in obj: lines.append(rst_subtitle("Attribute sets")) - lines.append(parse_attr_sets(obj["attribute-sets"])) + lines.append(parse_attr_sets(obj["attribute-sets"], family)) # Sub-messages if "sub-messages" in obj: lines.append(rst_subtitle("Sub-messages")) - lines.append(parse_sub_messages(obj["sub-messages"])) + lines.append(parse_sub_messages(obj["sub-messages"], family)) return "\n".join(lines) -- cgit v1.2.3 From f7b68543642136164ce7348945d3ada707c4e635 Mon Sep 17 00:00:00 2001 From: Rameez Rehman Date: Sun, 31 Mar 2024 21:03:44 +0100 Subject: bpftool: Use simpler indentation in source rST for documentation The rST manual pages for bpftool would use a mix of tabs and spaces for indentation. While this is the norm in C code, this is rather unusual for rST documents, and over time we've seen many contributors use a wrong level of indentation for documentation update. Let's fix bpftool's indentation in docs once and for all: - Let's use spaces, that are more common in rST files. - Remove one level of indentation for the synopsis, the command description, and the "see also" section. As a result, all sections start with the same indentation level in the generated man page. - Rewrap the paragraphs after the changes. There is no content change in this patch, only indentation and rewrapping changes. The wrapping in the generated source files for the manual pages is changed, but the pages displayed with "man" remain the same, apart from the adjusted indentation level on relevant sections. [ Quentin: rebased on bpf-next, removed indent level for command description and options, updated synopsis, command summary, and "see also" sections. ] Signed-off-by: Rameez Rehman Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240331200346.29118-2-qmo@kernel.org --- tools/bpf/bpftool/Documentation/Makefile | 6 +- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 104 +++-- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 224 ++++++----- .../bpf/bpftool/Documentation/bpftool-feature.rst | 115 +++--- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 338 ++++++++-------- tools/bpf/bpftool/Documentation/bpftool-iter.rst | 58 ++- tools/bpf/bpftool/Documentation/bpftool-link.rst | 73 ++-- tools/bpf/bpftool/Documentation/bpftool-map.rst | 232 ++++++----- tools/bpf/bpftool/Documentation/bpftool-net.rst | 112 +++--- tools/bpf/bpftool/Documentation/bpftool-perf.rst | 34 +- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 436 ++++++++++----------- .../bpftool/Documentation/bpftool-struct_ops.rst | 81 ++-- tools/bpf/bpftool/Documentation/bpftool.rst | 60 +-- tools/bpf/bpftool/Documentation/common_options.rst | 26 +- 14 files changed, 908 insertions(+), 991 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index ac8487dcff1d..4315652678b9 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -31,9 +31,9 @@ see_also = $(subst " ",, \ "\n" \ "SEE ALSO\n" \ "========\n" \ - "\t**bpf**\ (2),\n" \ - "\t**bpf-helpers**\\ (7)" \ - $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "**bpf**\ (2),\n" \ + "**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n**$(page)**\\ (8)") \ "\n") $(OUTPUT)%.8: %.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 342716f74ec4..1be6b9aae053 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -14,82 +14,76 @@ tool for inspection of BTF data SYNOPSIS ======== - **bpftool** [*OPTIONS*] **btf** *COMMAND* +**bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } - *COMMANDS* := { **dump** | **help** } +*COMMANDS* := { **dump** | **help** } BTF COMMANDS ============= -| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] -| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] -| **bpftool** **btf help** +| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] +| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] +| **bpftool** **btf help** | -| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } -| *FORMAT* := { **raw** | **c** } -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } +| *FORMAT* := { **raw** | **c** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } DESCRIPTION =========== - **bpftool btf { show | list }** [**id** *BTF_ID*] - Show information about loaded BTF objects. If a BTF ID is - specified, show information only about given BTF object, - otherwise list all BTF objects currently loaded on the - system. +**bpftool btf { show | list }** [**id** *BTF_ID*] + Show information about loaded BTF objects. If a BTF ID is specified, show + information only about given BTF object, otherwise list all BTF objects + currently loaded on the system. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BTF - objects. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BTF objects. On such kernels + bpftool will automatically emit this information as well. - **bpftool btf dump** *BTF_SRC* - Dump BTF entries from a given *BTF_SRC*. +**bpftool btf dump** *BTF_SRC* + Dump BTF entries from a given *BTF_SRC*. - When **id** is specified, BTF object with that ID will be - loaded and all its BTF types emitted. + When **id** is specified, BTF object with that ID will be loaded and all + its BTF types emitted. - When **map** is provided, it's expected that map has - associated BTF object with BTF types describing key and - value. It's possible to select whether to dump only BTF - type(s) associated with key (**key**), value (**value**), - both key and value (**kv**), or all BTF types present in - associated BTF object (**all**). If not specified, **kv** - is assumed. + When **map** is provided, it's expected that map has associated BTF object + with BTF types describing key and value. It's possible to select whether to + dump only BTF type(s) associated with key (**key**), value (**value**), + both key and value (**kv**), or all BTF types present in associated BTF + object (**all**). If not specified, **kv** is assumed. - When **prog** is provided, it's expected that program has - associated BTF object with BTF types. + When **prog** is provided, it's expected that program has associated BTF + object with BTF types. - When specifying *FILE*, an ELF file is expected, containing - .BTF section with well-defined BTF binary format data, - typically produced by clang or pahole. + When specifying *FILE*, an ELF file is expected, containing .BTF section + with well-defined BTF binary format data, typically produced by clang or + pahole. - **format** option can be used to override default (raw) - output format. Raw (**raw**) or C-syntax (**c**) output - formats are supported. + **format** option can be used to override default (raw) output format. Raw + (**raw**) or C-syntax (**c**) output formats are supported. - **bpftool btf help** - Print short help message. +**bpftool btf help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -B, --base-btf *FILE* - Pass a base BTF object. Base BTF objects are typically used - with BTF objects for kernel modules. To avoid duplicating - all kernel symbols required by modules, BTF objects for - modules are "split", they are built incrementally on top of - the kernel (vmlinux) BTF object. So the base BTF reference - should usually point to the kernel BTF. - - When the main BTF object to process (for example, the - module BTF to dump) is passed as a *FILE*, bpftool attempts - to autodetect the path for the base object, and passing - this option is optional. When the main BTF object is passed - through other handles, this option becomes necessary. +.. include:: common_options.rst + +-B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used with BTF + objects for kernel modules. To avoid duplicating all kernel symbols + required by modules, BTF objects for modules are "split", they are + built incrementally on top of the kernel (vmlinux) BTF object. So the + base BTF reference should usually point to the kernel BTF. + + When the main BTF object to process (for example, the module BTF to + dump) is passed as a *FILE*, bpftool attempts to autodetect the path + for the base object, and passing this option is optional. When the main + BTF object is passed through other handles, this option becomes + necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 2ce900f66d6e..1dbabe33e56a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -14,134 +14,130 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **cgroup** *COMMAND* +**bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } - *COMMANDS* := - { **show** | **list** | **tree** | **attach** | **detach** | **help** } +*COMMANDS* := +{ **show** | **list** | **tree** | **attach** | **detach** | **help** } CGROUP COMMANDS =============== -| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] -| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] -| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] -| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* -| **bpftool** **cgroup help** +| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] +| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] +| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +| **bpftool** **cgroup help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | -| **cgroup_inet_sock_create** | **cgroup_sock_ops** | -| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | -| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | -| **cgroup_inet4_connect** | **cgroup_inet6_connect** | -| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | -| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | -| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | -| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | -| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | -| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | -| **cgroup_unix_recvmsg** | **cgroup_sysctl** | -| **cgroup_getsockopt** | **cgroup_setsockopt** | -| **cgroup_inet_sock_release** } -| *ATTACH_FLAGS* := { **multi** | **override** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | +| **cgroup_inet_sock_create** | **cgroup_sock_ops** | +| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | +| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | +| **cgroup_inet4_connect** | **cgroup_inet6_connect** | +| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | +| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | +| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | +| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | +| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | +| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | +| **cgroup_unix_recvmsg** | **cgroup_sysctl** | +| **cgroup_getsockopt** | **cgroup_setsockopt** | +| **cgroup_inet_sock_release** } +| *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION =========== - **bpftool cgroup { show | list }** *CGROUP* [**effective**] - List all programs attached to the cgroup *CGROUP*. - - Output will start with program ID followed by attach type, - attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] - Iterate over all cgroups in *CGROUP_ROOT* and list all - attached programs. If *CGROUP_ROOT* is not specified, - bpftool uses cgroup v2 mountpoint. - - The output is similar to the output of cgroup show/list - commands: it starts with absolute cgroup path, followed by - program ID, attach type, attach flags and program name. - - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. - - **bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] - Attach program *PROG* to the cgroup *CGROUP* with attach type - *ATTACH_TYPE* and optional *ATTACH_FLAGS*. - - *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs - some bpf program, the program in this cgroup yields to sub-cgroup - program; **multi** if a sub-cgroup installs some bpf program, - that cgroup program gets run in addition to the program in this - cgroup. - - Only one program is allowed to be attached to a cgroup with - no attach flags or the **override** flag. Attaching another - program will release old program and attach the new one. - - Multiple programs are allowed to be attached to a cgroup with - **multi**. They are executed in FIFO order (those that were - attached first, run first). - - Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 - and later. - - *ATTACH_TYPE* can be on of: - **ingress** ingress path of the inet socket (since 4.10); - **egress** egress path of the inet socket (since 4.10); - **sock_create** opening of an inet socket (since 4.10); - **sock_ops** various socket operations (since 4.12); - **device** device access (since 4.15); - **bind4** call to bind(2) for an inet4 socket (since 4.17); - **bind6** call to bind(2) for an inet6 socket (since 4.17); - **post_bind4** return from bind(2) for an inet4 socket (since 4.17); - **post_bind6** return from bind(2) for an inet6 socket (since 4.17); - **connect4** call to connect(2) for an inet4 socket (since 4.17); - **connect6** call to connect(2) for an inet6 socket (since 4.17); - **connect_unix** call to connect(2) for a unix socket (since 6.7); - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp4 socket (since 4.18); - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp6 socket (since 4.18); - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for - an unconnected unix socket (since 6.7); - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp4 socket (since 5.2); - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp6 socket (since 5.2); - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected unix socket (since 6.7); - **sysctl** sysctl access (since 5.2); - **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3); - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); - **sock_release** closing an userspace inet socket (since 5.9). - - **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* - Detach *PROG* from the cgroup *CGROUP* and attach type - *ATTACH_TYPE*. - - **bpftool prog help** - Print short help message. +**bpftool cgroup { show | list }** *CGROUP* [**effective**] + List all programs attached to the cgroup *CGROUP*. + + Output will start with program ID followed by attach type, attach flags and + program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +**bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] + Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs. + If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint. + + The output is similar to the output of cgroup show/list commands: it starts + with absolute cgroup path, followed by program ID, attach type, attach + flags and program name. + + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. + +**bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] + Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE* + and optional *ATTACH_FLAGS*. + + *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs some + bpf program, the program in this cgroup yields to sub-cgroup program; + **multi** if a sub-cgroup installs some bpf program, that cgroup program + gets run in addition to the program in this cgroup. + + Only one program is allowed to be attached to a cgroup with no attach flags + or the **override** flag. Attaching another program will release old + program and attach the new one. + + Multiple programs are allowed to be attached to a cgroup with **multi**. + They are executed in FIFO order (those that were attached first, run + first). + + Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later. + + *ATTACH_TYPE* can be on of: + **ingress** ingress path of the inet socket (since 4.10); + **egress** egress path of the inet socket (since 4.10); + **sock_create** opening of an inet socket (since 4.10); + **sock_ops** various socket operations (since 4.12); + **device** device access (since 4.15); + **bind4** call to bind(2) for an inet4 socket (since 4.17); + **bind6** call to bind(2) for an inet6 socket (since 4.17); + **post_bind4** return from bind(2) for an inet4 socket (since 4.17); + **post_bind6** return from bind(2) for an inet6 socket (since 4.17); + **connect4** call to connect(2) for an inet4 socket (since 4.17); + **connect6** call to connect(2) for an inet6 socket (since 4.17); + **connect_unix** call to connect(2) for a unix socket (since 6.7); + **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected + udp4 socket (since 4.18); + **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected + udp6 socket (since 4.18); + **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an + unconnected unix socket (since 6.7); + **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an + unconnected udp4 socket (since 5.2); + **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an + unconnected udp6 socket (since 5.2); + **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an + unconnected unix socket (since 6.7); + **sysctl** sysctl access (since 5.2); + **getsockopt** call to getsockopt (since 5.3); + **setsockopt** call to setsockopt (since 5.3); + **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); + **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); + **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); + **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); + **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). + **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); + **sock_release** closing an userspace inet socket (since 5.9). + +**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* + Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. + +**bpftool prog help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned programs. +-f, --bpffs + Show file names of pinned programs. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index e44039f89be7..5c2433147ba4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -14,77 +14,70 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device SYNOPSIS ======== - **bpftool** [*OPTIONS*] **feature** *COMMAND* +**bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **probe** | **help** } +*COMMANDS* := { **probe** | **help** } FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] -| **bpftool** **feature list_builtins** *GROUP* -| **bpftool** **feature help** +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list_builtins** *GROUP* +| **bpftool** **feature help** | -| *COMPONENT* := { **kernel** | **dev** *NAME* } -| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } +| *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } DESCRIPTION =========== - **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] - Probe the running kernel and dump a number of eBPF-related - parameters, such as availability of the **bpf**\ () system call, - JIT status, eBPF program types availability, eBPF helper - functions availability, and more. - - By default, bpftool **does not run probes** for - **bpf_probe_write_user**\ () and **bpf_trace_printk**\() - helpers which print warnings to kernel logs. To enable them - and run all probes, the **full** keyword should be used. - - If the **macros** keyword (but not the **-j** option) is - passed, a subset of the output is dumped as a list of - **#define** macros that are ready to be included in a C - header file, for example. If, additionally, **prefix** is - used to define a *PREFIX*, the provided string will be used - as a prefix to the names of the macros: this can be used to - avoid conflicts on macro names when including the output of - this command as a header file. - - Keyword **kernel** can be omitted. If no probe target is - specified, probing the kernel is the default behaviour. - - When the **unprivileged** keyword is used, bpftool will dump - only the features available to a user who does not have the - **CAP_SYS_ADMIN** capability set. The features available in - that case usually represent a small subset of the parameters - supported by the system. Unprivileged users MUST use the - **unprivileged** keyword: This is to avoid misdetection if - bpftool is inadvertently run as non-root, for example. This - keyword is unavailable if bpftool was compiled without - libcap. - - **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] - Probe network device for supported eBPF features and dump - results to the console. - - The keywords **full**, **macros** and **prefix** have the - same role as when probing the kernel. - - **bpftool feature list_builtins** *GROUP* - List items known to bpftool. These can be BPF program types - (**prog_types**), BPF map types (**map_types**), attach types - (**attach_types**), link types (**link_types**), or BPF helper - functions (**helpers**). The command does not probe the system, but - simply lists the elements that bpftool knows from compilation time, - as provided from libbpf (for all object types) or from the BPF UAPI - header (list of helpers). This can be used in scripts to iterate over - BPF types or helpers. - - **bpftool feature help** - Print short help message. +**bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] + Probe the running kernel and dump a number of eBPF-related parameters, such + as availability of the **bpf**\ () system call, JIT status, eBPF program + types availability, eBPF helper functions availability, and more. + + By default, bpftool **does not run probes** for **bpf_probe_write_user**\ + () and **bpf_trace_printk**\() helpers which print warnings to kernel logs. + To enable them and run all probes, the **full** keyword should be used. + + If the **macros** keyword (but not the **-j** option) is passed, a subset + of the output is dumped as a list of **#define** macros that are ready to + be included in a C header file, for example. If, additionally, **prefix** + is used to define a *PREFIX*, the provided string will be used as a prefix + to the names of the macros: this can be used to avoid conflicts on macro + names when including the output of this command as a header file. + + Keyword **kernel** can be omitted. If no probe target is specified, probing + the kernel is the default behaviour. + + When the **unprivileged** keyword is used, bpftool will dump only the + features available to a user who does not have the **CAP_SYS_ADMIN** + capability set. The features available in that case usually represent a + small subset of the parameters supported by the system. Unprivileged users + MUST use the **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. This keyword is + unavailable if bpftool was compiled without libcap. + +**bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] + Probe network device for supported eBPF features and dump results to the + console. + + The keywords **full**, **macros** and **prefix** have the same role as when + probing the kernel. + +**bpftool feature list_builtins** *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper functions + (**helpers**). The command does not probe the system, but simply lists the + elements that bpftool knows from compilation time, as provided from libbpf + (for all object types) or from the BPF UAPI header (list of helpers). This + can be used in scripts to iterate over BPF types or helpers. + +**bpftool feature help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 5e60825818dd..68104347d592 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,199 +14,177 @@ tool for BPF code-generation SYNOPSIS ======== - **bpftool** [*OPTIONS*] **gen** *COMMAND* +**bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } - *COMMAND* := { **object** | **skeleton** | **help** } +*COMMAND* := { **object** | **skeleton** | **help** } GEN COMMANDS ============= -| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] -| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] -| **bpftool** **gen help** +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +| **bpftool** **gen help** DESCRIPTION =========== - **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] - Statically link (combine) together one or more *INPUT_FILE*'s - into a single resulting *OUTPUT_FILE*. All the files involved - are BPF ELF object files. - - The rules of BPF static linking are mostly the same as for - user-space object files, but in addition to combining data - and instruction sections, .BTF and .BTF.ext (if present in - any of the input files) data are combined together. .BTF - data is deduplicated, so all the common types across - *INPUT_FILE*'s will only be represented once in the resulting - BTF information. - - BPF static linking allows to partition BPF source code into - individually compiled files that are then linked into - a single resulting BPF object file, which can be used to - generated BPF skeleton (with **gen skeleton** command) or - passed directly into **libbpf** (using **bpf_object__open()** - family of APIs). - - **bpftool gen skeleton** *FILE* - Generate BPF skeleton C header file for a given *FILE*. - - BPF skeleton is an alternative interface to existing libbpf - APIs for working with BPF objects. Skeleton code is intended - to significantly shorten and simplify code to load and work - with BPF programs from userspace side. Generated code is - tailored to specific input BPF object *FILE*, reflecting its - structure by listing out available maps, program, variables, - etc. Skeleton eliminates the need to lookup mentioned - components by name. Instead, if skeleton instantiation - succeeds, they are populated in skeleton structure as valid - libbpf types (e.g., **struct bpf_map** pointer) and can be - passed to existing generic libbpf APIs. - - In addition to simple and reliable access to maps and - programs, skeleton provides a storage for BPF links (**struct - bpf_link**) for each BPF program within BPF object. When - requested, supported BPF programs will be automatically - attached and resulting BPF links stored for further use by - user in pre-allocated fields in skeleton struct. For BPF - programs that can't be automatically attached by libbpf, - user can attach them manually, but store resulting BPF link - in per-program link field. All such set up links will be - automatically destroyed on BPF skeleton destruction. This - eliminates the need for users to manage links manually and - rely on libbpf support to detach programs and free up - resources. - - Another facility provided by BPF skeleton is an interface to - global variables of all supported kinds: mutable, read-only, - as well as extern ones. This interface allows to pre-setup - initial values of variables before BPF object is loaded and - verified by kernel. For non-read-only variables, the same - interface can be used to fetch values of global variables on - userspace side, even if they are modified by BPF code. - - During skeleton generation, contents of source BPF object - *FILE* is embedded within generated code and is thus not - necessary to keep around. This ensures skeleton and BPF - object file are matching 1-to-1 and always stay in sync. - Generated code is dual-licensed under LGPL-2.1 and - BSD-2-Clause licenses. - - It is a design goal and guarantee that skeleton interfaces - are interoperable with generic libbpf APIs. User should - always be able to use skeleton API to create and load BPF - object, and later use libbpf APIs to keep working with - specific maps, programs, etc. - - As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name. Object name can - either be derived from object file name, i.e., if BPF object - file name is **example.o**, BPF object name will be - **example**. Object name can be also specified explicitly - through **name** *OBJECT_NAME* parameter. The following - custom functions are provided (assuming **example** as - the object name): - - - **example__open** and **example__open_opts**. - These functions are used to instantiate skeleton. It - corresponds to libbpf's **bpf_object__open**\ () API. - **_opts** variants accepts extra **bpf_object_open_opts** - options. - - - **example__load**. - This function creates maps, loads and verifies BPF - programs, initializes global data maps. It corresponds to - libppf's **bpf_object__load**\ () API. - - - **example__open_and_load** combines **example__open** and - **example__load** invocations in one commonly used - operation. - - - **example__attach** and **example__detach** - This pair of functions allow to attach and detach, - correspondingly, already loaded BPF object. Only BPF - programs of types supported by libbpf for auto-attachment - will be auto-attached and their corresponding BPF links - instantiated. For other BPF programs, user can manually - create a BPF link and assign it to corresponding fields in - skeleton struct. **example__detach** will detach both - links created automatically, as well as those populated by - user manually. - - - **example__destroy** - Detach and unload BPF programs, free up all the resources - used by skeleton and BPF object. - - If BPF object has global variables, corresponding structs - with memory layout corresponding to global data data section - layout will be created. Currently supported ones are: *.data*, - *.bss*, *.rodata*, and *.kconfig* structs/data sections. - These data sections/structs can be used to set up initial - values of variables, if set before **example__load**. - Afterwards, if target kernel supports memory-mapped BPF - arrays, same structs can be used to fetch and update - (non-read-only) data from userspace, with same simplicity - as for BPF side. - - **bpftool gen subskeleton** *FILE* - Generate BPF subskeleton C header file for a given *FILE*. - - Subskeletons are similar to skeletons, except they do not own - the corresponding maps, programs, or global variables. They - require that the object file used to generate them is already - loaded into a *bpf_object* by some other means. - - This functionality is useful when a library is included into a - larger BPF program. A subskeleton for the library would have - access to all objects and globals defined in it, without - having to know about the larger program. - - Consequently, there are only two functions defined - for subskeletons: - - - **example__open(bpf_object\*)** - Instantiates a subskeleton from an already opened (but not - necessarily loaded) **bpf_object**. - - - **example__destroy()** - Frees the storage for the subskeleton but *does not* unload - any BPF programs or maps. - - **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] - Generate a minimum BTF file as *OUTPUT*, derived from a given - *INPUT* BTF file, containing all needed BTF types so one, or - more, given eBPF objects CO-RE relocations may be satisfied. - - When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, - libbpf, when loading an eBPF object, has to rely on external - BTF files to be able to calculate CO-RE relocations. - - Usually, an external BTF file is built from existing kernel - DWARF data using pahole. It contains all the types used by - its respective kernel image and, because of that, is big. - - The min_core_btf feature builds smaller BTF files, customized - to one or multiple eBPF objects, so they can be distributed - together with an eBPF CO-RE based application, turning the - application portable to different kernel versions. - - Check examples bellow for more information how to use it. - - **bpftool gen help** - Print short help message. +**bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s into a single + resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for user-space + object files, but in addition to combining data and instruction sections, + .BTF and .BTF.ext (if present in any of the input files) data are combined + together. .BTF data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting BTF + information. + + BPF static linking allows to partition BPF source code into individually + compiled files that are then linked into a single resulting BPF object + file, which can be used to generated BPF skeleton (with **gen skeleton** + command) or passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + +**bpftool gen skeleton** *FILE* + Generate BPF skeleton C header file for a given *FILE*. + + BPF skeleton is an alternative interface to existing libbpf APIs for + working with BPF objects. Skeleton code is intended to significantly + shorten and simplify code to load and work with BPF programs from userspace + side. Generated code is tailored to specific input BPF object *FILE*, + reflecting its structure by listing out available maps, program, variables, + etc. Skeleton eliminates the need to lookup mentioned components by name. + Instead, if skeleton instantiation succeeds, they are populated in skeleton + structure as valid libbpf types (e.g., **struct bpf_map** pointer) and can + be passed to existing generic libbpf APIs. + + In addition to simple and reliable access to maps and programs, skeleton + provides a storage for BPF links (**struct bpf_link**) for each BPF program + within BPF object. When requested, supported BPF programs will be + automatically attached and resulting BPF links stored for further use by + user in pre-allocated fields in skeleton struct. For BPF programs that + can't be automatically attached by libbpf, user can attach them manually, + but store resulting BPF link in per-program link field. All such set up + links will be automatically destroyed on BPF skeleton destruction. This + eliminates the need for users to manage links manually and rely on libbpf + support to detach programs and free up resources. + + Another facility provided by BPF skeleton is an interface to global + variables of all supported kinds: mutable, read-only, as well as extern + ones. This interface allows to pre-setup initial values of variables before + BPF object is loaded and verified by kernel. For non-read-only variables, + the same interface can be used to fetch values of global variables on + userspace side, even if they are modified by BPF code. + + During skeleton generation, contents of source BPF object *FILE* is + embedded within generated code and is thus not necessary to keep around. + This ensures skeleton and BPF object file are matching 1-to-1 and always + stay in sync. Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. + + It is a design goal and guarantee that skeleton interfaces are + interoperable with generic libbpf APIs. User should always be able to use + skeleton API to create and load BPF object, and later use libbpf APIs to + keep working with specific maps, programs, etc. + + As part of skeleton, few custom functions are generated. Each of them is + prefixed with object name. Object name can either be derived from object + file name, i.e., if BPF object file name is **example.o**, BPF object name + will be **example**. Object name can be also specified explicitly through + **name** *OBJECT_NAME* parameter. The following custom functions are + provided (assuming **example** as the object name): + + - **example__open** and **example__open_opts**. + These functions are used to instantiate skeleton. It corresponds to + libbpf's **bpf_object__open**\ () API. **_opts** variants accepts extra + **bpf_object_open_opts** options. + + - **example__load**. + This function creates maps, loads and verifies BPF programs, initializes + global data maps. It corresponds to libppf's **bpf_object__load**\ () + API. + + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used operation. + + - **example__attach** and **example__detach** + This pair of functions allow to attach and detach, correspondingly, + already loaded BPF object. Only BPF programs of types supported by libbpf + for auto-attachment will be auto-attached and their corresponding BPF + links instantiated. For other BPF programs, user can manually create a + BPF link and assign it to corresponding fields in skeleton struct. + **example__detach** will detach both links created automatically, as well + as those populated by user manually. + + - **example__destroy** + Detach and unload BPF programs, free up all the resources used by + skeleton and BPF object. + + If BPF object has global variables, corresponding structs with memory + layout corresponding to global data data section layout will be created. + Currently supported ones are: *.data*, *.bss*, *.rodata*, and *.kconfig* + structs/data sections. These data sections/structs can be used to set up + initial values of variables, if set before **example__load**. Afterwards, + if target kernel supports memory-mapped BPF arrays, same structs can be + used to fetch and update (non-read-only) data from userspace, with same + simplicity as for BPF side. + +**bpftool gen subskeleton** *FILE* + Generate BPF subskeleton C header file for a given *FILE*. + + Subskeletons are similar to skeletons, except they do not own the + corresponding maps, programs, or global variables. They require that the + object file used to generate them is already loaded into a *bpf_object* by + some other means. + + This functionality is useful when a library is included into a larger BPF + program. A subskeleton for the library would have access to all objects and + globals defined in it, without having to know about the larger program. + + Consequently, there are only two functions defined for subskeletons: + + - **example__open(bpf_object\*)** + Instantiates a subskeleton from an already opened (but not necessarily + loaded) **bpf_object**. + + - **example__destroy()** + Frees the storage for the subskeleton but *does not* unload any BPF + programs or maps. + +**bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] + Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF + file, containing all needed BTF types so one, or more, given eBPF objects + CO-RE relocations may be satisfied. + + When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, libbpf, when + loading an eBPF object, has to rely on external BTF files to be able to + calculate CO-RE relocations. + + Usually, an external BTF file is built from existing kernel DWARF data + using pahole. It contains all the types used by its respective kernel image + and, because of that, is big. + + The min_core_btf feature builds smaller BTF files, customized to one or + multiple eBPF objects, so they can be distributed together with an eBPF + CO-RE based application, turning the application portable to different + kernel versions. + + Check examples bellow for more information how to use it. + +**bpftool gen help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -L, --use-loader - For skeletons, generate a "light" skeleton (also known as "loader" - skeleton). A light skeleton contains a loader eBPF program. It does - not use the majority of the libbpf infrastructure, and does not need - libelf. +-L, --use-loader + For skeletons, generate a "light" skeleton (also known as "loader" + skeleton). A light skeleton contains a loader eBPF program. It does not use + the majority of the libbpf infrastructure, and does not need libelf. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 84839d488621..af17cc0201d4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -14,50 +14,46 @@ tool to create BPF iterators SYNOPSIS ======== - **bpftool** [*OPTIONS*] **iter** *COMMAND* +**bpftool** [*OPTIONS*] **iter** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **pin** | **help** } +*COMMANDS* := { **pin** | **help** } ITER COMMANDS =================== -| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] -| **bpftool** **iter help** +| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] +| **bpftool** **iter help** | -| *OBJ* := /a/file/of/bpf_iter_target.o -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *OBJ* := /a/file/of/bpf_iter_target.o +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] - A bpf iterator combines a kernel iterating of - particular kernel data (e.g., tasks, bpf_maps, etc.) - and a bpf program called for each kernel data object - (e.g., one task, one bpf_map, etc.). User space can - *read* kernel iterator output through *read()* syscall. - - The *pin* command creates a bpf iterator from *OBJ*, - and pin it to *PATH*. The *PATH* should be located - in *bpffs* mount. It must not contain a dot - character ('.'), which is reserved for future extensions - of *bpffs*. - - Map element bpf iterator requires an additional parameter - *MAP* so bpf program can iterate over map elements for - that map. User can have a bpf program in kernel to run - with each map element, do checking, filtering, aggregation, - etc. without copying data to user space. - - User can then *cat PATH* to see the bpf iterator output. - - **bpftool iter help** - Print short help message. +**bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] + A bpf iterator combines a kernel iterating of particular kernel data (e.g., + tasks, bpf_maps, etc.) and a bpf program called for each kernel data object + (e.g., one task, one bpf_map, etc.). User space can *read* kernel iterator + output through *read()* syscall. + + The *pin* command creates a bpf iterator from *OBJ*, and pin it to *PATH*. + The *PATH* should be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + + Map element bpf iterator requires an additional parameter *MAP* so bpf + program can iterate over map elements for that map. User can have a bpf + program in kernel to run with each map element, do checking, filtering, + aggregation, etc. without copying data to user space. + + User can then *cat PATH* to see the bpf iterator output. + +**bpftool iter help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 52a4eee4af54..6d39ad890fea 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -14,67 +14,62 @@ tool for inspection and simple manipulation of eBPF links SYNOPSIS ======== - **bpftool** [*OPTIONS*] **link** *COMMAND* +**bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := { **show** | **list** | **pin** | **help** } +*COMMANDS* := { **show** | **list** | **pin** | **help** } LINK COMMANDS ============= -| **bpftool** **link { show | list }** [*LINK*] -| **bpftool** **link pin** *LINK* *FILE* -| **bpftool** **link detach** *LINK* -| **bpftool** **link help** +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link detach** *LINK* +| **bpftool** **link help** | -| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool link { show | list }** [*LINK*] - Show information about active links. If *LINK* is - specified show information only about given link, - otherwise list all links currently active on the system. +**bpftool link { show | list }** [*LINK*] + Show information about active links. If *LINK* is specified show + information only about given link, otherwise list all links currently + active on the system. - Output will start with link ID followed by link type and - zero or more named attributes, some of which depend on type - of link. + Output will start with link ID followed by link type and zero or more named + attributes, some of which depend on type of link. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - links. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF links. On such kernels + bpftool will automatically emit this information as well. - **bpftool link pin** *LINK* *FILE* - Pin link *LINK* as *FILE*. +**bpftool link pin** *LINK* *FILE* + Pin link *LINK* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool link detach** *LINK* - Force-detach link *LINK*. BPF link and its underlying BPF - program will stay valid, but they will be detached from the - respective BPF hook and BPF link will transition into - a defunct state until last open file descriptor for that - link is closed. +**bpftool link detach** *LINK* + Force-detach link *LINK*. BPF link and its underlying BPF program will stay + valid, but they will be detached from the respective BPF hook and BPF link + will transition into a defunct state until last open file descriptor for + that link is closed. - **bpftool link help** - Print short help message. +**bpftool link help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst + .. include:: common_options.rst - -f, --bpffs - When showing BPF links, show file names of pinned - links. + -f, --bpffs + When showing BPF links, show file names of pinned links. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. + -n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 9d6a314dfd7a..864257169942 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -14,166 +14,160 @@ tool for inspection and simple manipulation of eBPF maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] **map** *COMMAND* +**bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **help** } +*COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **help** } MAP COMMANDS ============= -| **bpftool** **map** { **show** | **list** } [*MAP*] -| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ -| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ -| [**offload_dev** *NAME*] -| **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* [**key** *DATA*] -| **bpftool** **map getnext** *MAP* [**key** *DATA*] -| **bpftool** **map delete** *MAP* **key** *DATA* -| **bpftool** **map pin** *MAP* *FILE* -| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] -| **bpftool** **map peek** *MAP* -| **bpftool** **map push** *MAP* **value** *VALUE* -| **bpftool** **map pop** *MAP* -| **bpftool** **map enqueue** *MAP* **value** *VALUE* -| **bpftool** **map dequeue** *MAP* -| **bpftool** **map freeze** *MAP* -| **bpftool** **map help** +| **bpftool** **map** { **show** | **list** } [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**offload_dev** *NAME*] +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* [**key** *DATA*] +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +| **bpftool** **map peek** *MAP* +| **bpftool** **map push** *MAP* **value** *VALUE* +| **bpftool** **map pop** *MAP* +| **bpftool** **map enqueue** *MAP* **value** *VALUE* +| **bpftool** **map dequeue** *MAP* +| **bpftool** **map freeze** *MAP* +| **bpftool** **map help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } -| *DATA* := { [**hex**] *BYTES* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *VALUE* := { *DATA* | *MAP* | *PROG* } -| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } -| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** -| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** -| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** -| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** -| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *DATA* := { [**hex**] *BYTES* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } +| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== - **bpftool map { show | list }** [*MAP*] - Show information about loaded maps. If *MAP* is specified - show information only about given maps, otherwise list all - maps currently loaded on the system. In case of **name**, - *MAP* may match several maps which will all be shown. +**bpftool map { show | list }** [*MAP*] + Show information about loaded maps. If *MAP* is specified show information + only about given maps, otherwise list all maps currently loaded on the + system. In case of **name**, *MAP* may match several maps which will all + be shown. - Output will start with map ID followed by map type and - zero or more named attributes (depending on kernel version). + Output will start with map ID followed by map type and zero or more named + attributes (depending on kernel version). - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - maps. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF maps. On such kernels + bpftool will automatically emit this information as well. - **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] - Create a new map with given parameters and pin it to *bpffs* - as *FILE*. +**bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] + Create a new map with given parameters and pin it to *bpffs* as *FILE*. - *FLAGS* should be an integer which is the combination of - desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h - UAPI header for existing flags). + *FLAGS* should be an integer which is the combination of desired flags, + e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing + flags). - To create maps of type array-of-maps or hash-of-maps, the - **inner_map** keyword must be used to pass an inner map. The - kernel needs it to collect metadata related to the inner maps - that the new map will work with. + To create maps of type array-of-maps or hash-of-maps, the **inner_map** + keyword must be used to pass an inner map. The kernel needs it to collect + metadata related to the inner maps that the new map will work with. - Keyword **offload_dev** expects a network interface name, - and is used to request hardware offload for the map. + Keyword **offload_dev** expects a network interface name, and is used to + request hardware offload for the map. - **bpftool map dump** *MAP* - Dump all entries in a given *MAP*. In case of **name**, - *MAP* may match several maps which will all be dumped. +**bpftool map dump** *MAP* + Dump all entries in a given *MAP*. In case of **name**, *MAP* may match + several maps which will all be dumped. - **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] - Update map entry for a given *KEY*. +**bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] + Update map entry for a given *KEY*. - *UPDATE_FLAGS* can be one of: **any** update existing entry - or add if doesn't exit; **exist** update only if entry already - exists; **noexist** update only if entry doesn't exist. + *UPDATE_FLAGS* can be one of: **any** update existing entry or add if + doesn't exit; **exist** update only if entry already exists; **noexist** + update only if entry doesn't exist. - If the **hex** keyword is provided in front of the bytes - sequence, the bytes are parsed as hexadecimal values, even if - no "0x" prefix is added. If the keyword is not provided, then - the bytes are parsed as decimal values, unless a "0x" prefix - (for hexadecimal) or a "0" prefix (for octal) is provided. + If the **hex** keyword is provided in front of the bytes sequence, the + bytes are parsed as hexadecimal values, even if no "0x" prefix is added. If + the keyword is not provided, then the bytes are parsed as decimal values, + unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is + provided. - **bpftool map lookup** *MAP* [**key** *DATA*] - Lookup **key** in the map. +**bpftool map lookup** *MAP* [**key** *DATA*] + Lookup **key** in the map. - **bpftool map getnext** *MAP* [**key** *DATA*] - Get next key. If *key* is not specified, get first key. +**bpftool map getnext** *MAP* [**key** *DATA*] + Get next key. If *key* is not specified, get first key. - **bpftool map delete** *MAP* **key** *DATA* - Remove entry from the map. +**bpftool map delete** *MAP* **key** *DATA* + Remove entry from the map. - **bpftool map pin** *MAP* *FILE* - Pin map *MAP* as *FILE*. +**bpftool map pin** *MAP* *FILE* + Pin map *MAP* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] - Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. +**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. - Install perf rings into a perf event array map and dump - output of any **bpf_perf_event_output**\ () call in the kernel. - By default read the number of CPUs on the system and - install perf ring for each CPU in the corresponding index - in the array. + Install perf rings into a perf event array map and dump output of any + **bpf_perf_event_output**\ () call in the kernel. By default read the + number of CPUs on the system and install perf ring for each CPU in the + corresponding index in the array. - If **cpu** and **index** are specified, install perf ring - for given **cpu** at **index** in the array (single ring). + If **cpu** and **index** are specified, install perf ring for given **cpu** + at **index** in the array (single ring). - Note that installing a perf ring into an array will silently - replace any existing ring. Any other application will stop - receiving events if it installed its rings earlier. + Note that installing a perf ring into an array will silently replace any + existing ring. Any other application will stop receiving events if it + installed its rings earlier. - **bpftool map peek** *MAP* - Peek next value in the queue or stack. +**bpftool map peek** *MAP* + Peek next value in the queue or stack. - **bpftool map push** *MAP* **value** *VALUE* - Push *VALUE* onto the stack. +**bpftool map push** *MAP* **value** *VALUE* + Push *VALUE* onto the stack. - **bpftool map pop** *MAP* - Pop and print value from the stack. +**bpftool map pop** *MAP* + Pop and print value from the stack. - **bpftool map enqueue** *MAP* **value** *VALUE* - Enqueue *VALUE* into the queue. +**bpftool map enqueue** *MAP* **value** *VALUE* + Enqueue *VALUE* into the queue. - **bpftool map dequeue** *MAP* - Dequeue and print value from the queue. +**bpftool map dequeue** *MAP* + Dequeue and print value from the queue. - **bpftool map freeze** *MAP* - Freeze the map as read-only from user space. Entries from a - frozen map can not longer be updated or deleted with the - **bpf**\ () system call. This operation is not reversible, - and the map remains immutable from user space until its - destruction. However, read and write permissions for BPF - programs to the map remain unchanged. +**bpftool map freeze** *MAP* + Freeze the map as read-only from user space. Entries from a frozen map can + not longer be updated or deleted with the **bpf**\ () system call. This + operation is not reversible, and the map remains immutable from user space + until its destruction. However, read and write permissions for BPF programs + to the map remain unchanged. - **bpftool map help** - Print short help message. +**bpftool map help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned maps. +-f, --bpffs + Show file names of pinned maps. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index dd3f9469765b..43d2c858fdf8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -14,76 +14,74 @@ tool for inspection of networking related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **net** *COMMAND* +**bpftool** [*OPTIONS*] **net** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **attach** | **detach** | **help** } +*COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } NET COMMANDS ============ -| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] -| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* -| **bpftool** **net help** +| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +| **bpftool** **net help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } DESCRIPTION =========== - **bpftool net { show | list }** [ **dev** *NAME* ] - List bpf program attachments in the kernel networking subsystem. - - Currently, device driver xdp attachments, tcx, netkit and old-style tc - classifier/action attachments, flow_dissector as well as netfilter - attachments are implemented, i.e., for - program types **BPF_PROG_TYPE_XDP**, **BPF_PROG_TYPE_SCHED_CLS**, - **BPF_PROG_TYPE_SCHED_ACT**, **BPF_PROG_TYPE_FLOW_DISSECTOR**, - **BPF_PROG_TYPE_NETFILTER**. - - For programs attached to a particular cgroup, e.g., - **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, - **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, - users can use **bpftool cgroup** to dump cgroup attachments. - For sk_{filter, skb, msg, reuseport} and lwt/seg6 - bpf programs, users should consult other tools, e.g., iproute2. - - The current output will start with all xdp program attachments, followed by - all tcx, netkit, then tc class/qdisc bpf program attachments, then flow_dissector - and finally netfilter programs. Both xdp programs and tcx/netkit/tc programs are - ordered based on ifindex number. If multiple bpf programs attached - to the same networking device through **tc**, the order will be first - all bpf programs attached to tcx, netkit, then tc classes, then all bpf programs - attached to non clsact qdiscs, and finally all bpf programs attached - to root and clsact qdisc. - - **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] - Attach bpf program *PROG* to network interface *NAME* with - type specified by *ATTACH_TYPE*. Previously attached bpf program - can be replaced by the command used with **overwrite** option. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - *ATTACH_TYPE* can be of: - **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; - **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; - **xdpdrv** - Native XDP. runs earliest point in driver's receive path; - **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; - - **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* - Detach bpf program attached to network interface *NAME* with - type specified by *ATTACH_TYPE*. To detach bpf program, same - *ATTACH_TYPE* previously used for attach must be specified. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. - - **bpftool net help** - Print short help message. +**bpftool net { show | list }** [ **dev** *NAME* ] + List bpf program attachments in the kernel networking subsystem. + + Currently, device driver xdp attachments, tcx, netkit and old-style tc + classifier/action attachments, flow_dissector as well as netfilter + attachments are implemented, i.e., for program types **BPF_PROG_TYPE_XDP**, + **BPF_PROG_TYPE_SCHED_CLS**, **BPF_PROG_TYPE_SCHED_ACT**, + **BPF_PROG_TYPE_FLOW_DISSECTOR**, **BPF_PROG_TYPE_NETFILTER**. + + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, users + can use **bpftool cgroup** to dump cgroup attachments. For sk_{filter, skb, + msg, reuseport} and lwt/seg6 bpf programs, users should consult other + tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by + all tcx, netkit, then tc class/qdisc bpf program attachments, then + flow_dissector and finally netfilter programs. Both xdp programs and + tcx/netkit/tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc**, the order + will be first all bpf programs attached to tcx, netkit, then tc classes, + then all bpf programs attached to non clsact qdiscs, and finally all bpf + programs attached to root and clsact qdisc. + +**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] + Attach bpf program *PROG* to network interface *NAME* with type specified + by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the + command used with **overwrite** option. Currently, only XDP-related modes + are supported for *ATTACH_TYPE*. + + *ATTACH_TYPE* can be of: + **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; + **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; + **xdpdrv** - Native XDP. runs earliest point in driver's receive path; + **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + +**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* + Detach bpf program attached to network interface *NAME* with type specified + by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used + for attach must be specified. Currently, only XDP-related modes are + supported for *ATTACH_TYPE*. + +**bpftool net help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 5fea633a82f1..841789e30acd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -14,37 +14,37 @@ tool for inspection of perf related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **perf** *COMMAND* +**bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **help** } +*COMMANDS* := +{ **show** | **list** | **help** } PERF COMMANDS ============= -| **bpftool** **perf** { **show** | **list** } -| **bpftool** **perf help** +| **bpftool** **perf** { **show** | **list** } +| **bpftool** **perf help** DESCRIPTION =========== - **bpftool perf { show | list }** - List all raw_tracepoint, tracepoint, kprobe attachment in the system. +**bpftool perf { show | list }** + List all raw_tracepoint, tracepoint, kprobe attachment in the system. - Output will start with process id and file descriptor in that process, - followed by bpf program id, attachment information, and attachment point. - The attachment point for raw_tracepoint/tracepoint is the trace probe name. - The attachment point for k[ret]probe is either symbol name and offset, - or a kernel virtual address. - The attachment point for u[ret]probe is the file name and the file offset. + Output will start with process id and file descriptor in that process, + followed by bpf program id, attachment information, and attachment point. + The attachment point for raw_tracepoint/tracepoint is the trace probe name. + The attachment point for k[ret]probe is either symbol name and offset, or a + kernel virtual address. The attachment point for u[ret]probe is the file + name and the file offset. - **bpftool perf help** - Print short help message. +**bpftool perf help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 58e6a5b10ef7..cc456d357db3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -14,250 +14,226 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **prog** *COMMAND* +**bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | - { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | - { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | +{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | +{ **-L** | **--use-loader** } } - *COMMANDS* := - { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | - **loadall** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | +**loadall** | **help** } PROG COMMANDS ============= -| **bpftool** **prog** { **show** | **list** } [*PROG*] -| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] -| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] -| **bpftool** **prog pin** *PROG* *FILE* -| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] -| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog tracelog** -| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] -| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* -| **bpftool** **prog help** +| **bpftool** **prog** { **show** | **list** } [*PROG*] +| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +| **bpftool** **prog pin** *PROG* *FILE* +| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog tracelog** +| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* +| **bpftool** **prog help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *TYPE* := { -| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | -| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | -| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | -| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | -| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | -| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | -| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | -| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | -| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | -| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | -| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** -| } -| *ATTACH_TYPE* := { -| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | -| **sk_skb_stream_parser** | **flow_dissector** -| } -| *METRICs* := { -| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | -| **itlb_misses** | **dtlb_misses** -| } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *TYPE* := { +| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | +| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | +| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | +| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | +| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | +| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | +| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | +| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | +| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** +| } +| *ATTACH_TYPE* := { +| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | +| **sk_skb_stream_parser** | **flow_dissector** +| } +| *METRICs* := { +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** +| } DESCRIPTION =========== - **bpftool prog { show | list }** [*PROG*] - Show information about loaded programs. If *PROG* is - specified show information only about given programs, - otherwise list all programs currently loaded on the system. - In case of **tag** or **name**, *PROG* may match several - programs which will all be shown. - - Output will start with program ID followed by program type and - zero or more named attributes (depending on kernel version). - - Since Linux 5.1 the kernel can collect statistics on BPF - programs (such as the total time spent running the program, - and the number of times it was run). If available, bpftool - shows such statistics. However, the kernel does not collect - them by defaults, as it slightly impacts performance on each - program run. Activation or deactivation of the feature is - performed via the **kernel.bpf_stats_enabled** sysctl knob. - - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - programs. On such kernels bpftool will automatically emit this - information as well. - - **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] - Dump eBPF instructions of the programs from the kernel. By - default, eBPF will be disassembled and printed to standard - output in human-readable format. In this case, **opcodes** - controls if raw opcodes should be printed as well. - - In case of **tag** or **name**, *PROG* may match several - programs which will all be dumped. However, if **file** or - **visual** is specified, *PROG* must match a single program. - - If **file** is specified, the binary image will instead be - written to *FILE*. - - If **visual** is specified, control flow graph (CFG) will be - built instead, and eBPF instructions will be presented with - CFG in DOT format, on standard output. - - If the programs have line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. - - **bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] - Dump jited image (host machine code) of the program. - - If *FILE* is specified image will be written to a file, - otherwise it will be disassembled and printed to stdout. - *PROG* must match a single program when **file** is specified. - - **opcodes** controls if raw opcodes will be printed. - - If the prog has line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. - - **bpftool prog pin** *PROG* *FILE* - Pin program *PROG* as *FILE*. - - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] - Load bpf program(s) from binary *OBJ* and pin as *PATH*. - **bpftool prog load** pins only the first program from the - *OBJ* as *PATH*. **bpftool prog loadall** pins all programs - from the *OBJ* under *PATH* directory. - **type** is optional, if not specified program type will be - inferred from section names. - By default bpftool will create new maps as declared in the ELF - object being loaded. **map** parameter allows for the reuse - of existing maps. It can be specified multiple times, each - time for a different map. *IDX* refers to index of the map - to be replaced in the ELF file counting from 0, while *NAME* - allows to replace a map by name. *MAP* specifies the map to - use, referring to it by **id** or through a **pinned** file. - If **offload_dev** *NAME* is specified program will be loaded - onto given networking device (offload). - If **xdpmeta_dev** *NAME* is specified program will become - device-bound without offloading, this facilitates access - to XDP metadata. - Optional **pinmaps** argument can be provided to pin all - maps under *MAP_DIR* directory. - - If **autoattach** is specified program will be attached - before pin. In that case, only the link (representing the - program attached to its hook) is pinned, not the program as - such, so the path won't show in **bpftool prog show -f**, - only show in **bpftool link show -f**. Also, this only works - when bpftool (libbpf) is able to infer all necessary - information from the object file, in particular, it's not - supported for all program types. If a program does not - support autoattach, bpftool falls back to regular pinning - for that program instead. - - Note: *PATH* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. - - **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] - Attach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - attached to current networking name space. - - **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] - Detach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - detached from the current networking name space. - - **bpftool prog tracelog** - Dump the trace pipe of the system to the console (stdout). - Hit to stop printing. BPF programs can write to this - trace pipe at runtime with the **bpf_trace_printk**\ () helper. - This should be used only for debugging purposes. For - streaming data from BPF programs to user space, one can use - perf events (see also **bpftool-map**\ (8)). - - **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] - Run BPF program *PROG* in the kernel testing infrastructure - for BPF, meaning that the program works on the data and - context provided by the user, and not on actual packets or - monitored functions etc. Return value and duration for the - test run are printed out to the console. - - Input data is read from the *FILE* passed with **data_in**. - If this *FILE* is "**-**", input data is read from standard - input. Input context, if any, is read from *FILE* passed with - **ctx_in**. Again, "**-**" can be used to read from standard - input, but only if standard input is not already in use for - input data. If a *FILE* is passed with **data_out**, output - data is written to that file. Similarly, output context is - written to the *FILE* passed with **ctx_out**. For both - output flows, "**-**" can be used to print to the standard - output (as plain text, or JSON if relevant option was - passed). If output keywords are omitted, output data and - context are discarded. Keywords **data_size_out** and - **ctx_size_out** are used to pass the size (in bytes) for the - output buffers to the kernel, although the default of 32 kB - should be more than enough for most cases. - - Keyword **repeat** is used to indicate the number of - consecutive runs to perform. Note that output data and - context printed to files correspond to the last of those - runs. The duration printed out at the end of the runs is an - average over all runs performed by the command. - - Not all program types support test run. Among those which do, - not all of them can take the **ctx_in**/**ctx_out** - arguments. bpftool does not perform checks on program types. - - **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* - Profile *METRICs* for bpf program *PROG* for *DURATION* - seconds or until user hits . *DURATION* is optional. - If *DURATION* is not specified, the profiling will run up to - **UINT_MAX** seconds. - - **bpftool prog help** - Print short help message. +**bpftool prog { show | list }** [*PROG*] + Show information about loaded programs. If *PROG* is specified show + information only about given programs, otherwise list all programs + currently loaded on the system. In case of **tag** or **name**, *PROG* may + match several programs which will all be shown. + + Output will start with program ID followed by program type and zero or more + named attributes (depending on kernel version). + + Since Linux 5.1 the kernel can collect statistics on BPF programs (such as + the total time spent running the program, and the number of times it was + run). If available, bpftool shows such statistics. However, the kernel does + not collect them by defaults, as it slightly impacts performance on each + program run. Activation or deactivation of the feature is performed via the + **kernel.bpf_stats_enabled** sysctl knob. + + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF programs. On such kernels + bpftool will automatically emit this information as well. + +**bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] + Dump eBPF instructions of the programs from the kernel. By default, eBPF + will be disassembled and printed to standard output in human-readable + format. In this case, **opcodes** controls if raw opcodes should be printed + as well. + + In case of **tag** or **name**, *PROG* may match several programs which + will all be dumped. However, if **file** or **visual** is specified, + *PROG* must match a single program. + + If **file** is specified, the binary image will instead be written to + *FILE*. + + If **visual** is specified, control flow graph (CFG) will be built instead, + and eBPF instructions will be presented with CFG in DOT format, on standard + output. + + If the programs have line_info available, the source line will be + displayed. If **linum** is specified, the filename, line number and line + column will also be displayed. + +**bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] + Dump jited image (host machine code) of the program. + + If *FILE* is specified image will be written to a file, otherwise it will + be disassembled and printed to stdout. *PROG* must match a single program + when **file** is specified. + + **opcodes** controls if raw opcodes will be printed. + + If the prog has line_info available, the source line will be displayed. If + **linum** is specified, the filename, line number and line column will also + be displayed. + +**bpftool prog pin** *PROG* *FILE* + Pin program *PROG* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +**bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] + Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog + load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog + loadall** pins all programs from the *OBJ* under *PATH* directory. **type** + is optional, if not specified program type will be inferred from section + names. By default bpftool will create new maps as declared in the ELF + object being loaded. **map** parameter allows for the reuse of existing + maps. It can be specified multiple times, each time for a different map. + *IDX* refers to index of the map to be replaced in the ELF file counting + from 0, while *NAME* allows to replace a map by name. *MAP* specifies the + map to use, referring to it by **id** or through a **pinned** file. If + **offload_dev** *NAME* is specified program will be loaded onto given + networking device (offload). If **xdpmeta_dev** *NAME* is specified program + will become device-bound without offloading, this facilitates access to XDP + metadata. Optional **pinmaps** argument can be provided to pin all maps + under *MAP_DIR* directory. + + If **autoattach** is specified program will be attached before pin. In that + case, only the link (representing the program attached to its hook) is + pinned, not the program as such, so the path won't show in **bpftool prog + show -f**, only show in **bpftool link show -f**. Also, this only works + when bpftool (libbpf) is able to infer all necessary information from the + object file, in particular, it's not supported for all program types. If a + program does not support autoattach, bpftool falls back to regular pinning + for that program instead. + + Note: *PATH* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. + +**bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is attached to current networking name space. + +**bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is detached from the current networking name space. + +**bpftool prog tracelog** + Dump the trace pipe of the system to the console (stdout). Hit to + stop printing. BPF programs can write to this trace pipe at runtime with + the **bpf_trace_printk**\ () helper. This should be used only for debugging + purposes. For streaming data from BPF programs to user space, one can use + perf events (see also **bpftool-map**\ (8)). + +**bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] + Run BPF program *PROG* in the kernel testing infrastructure for BPF, + meaning that the program works on the data and context provided by the + user, and not on actual packets or monitored functions etc. Return value + and duration for the test run are printed out to the console. + + Input data is read from the *FILE* passed with **data_in**. If this *FILE* + is "**-**", input data is read from standard input. Input context, if any, + is read from *FILE* passed with **ctx_in**. Again, "**-**" can be used to + read from standard input, but only if standard input is not already in use + for input data. If a *FILE* is passed with **data_out**, output data is + written to that file. Similarly, output context is written to the *FILE* + passed with **ctx_out**. For both output flows, "**-**" can be used to + print to the standard output (as plain text, or JSON if relevant option was + passed). If output keywords are omitted, output data and context are + discarded. Keywords **data_size_out** and **ctx_size_out** are used to pass + the size (in bytes) for the output buffers to the kernel, although the + default of 32 kB should be more than enough for most cases. + + Keyword **repeat** is used to indicate the number of consecutive runs to + perform. Note that output data and context printed to files correspond to + the last of those runs. The duration printed out at the end of the runs is + an average over all runs performed by the command. + + Not all program types support test run. Among those which do, not all of + them can take the **ctx_in**/**ctx_out** arguments. bpftool does not + perform checks on program types. + +**bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* + Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until + user hits . *DURATION* is optional. If *DURATION* is not specified, + the profiling will run up to **UINT_MAX** seconds. + +**bpftool prog help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst - - -f, --bpffs - When showing BPF programs, show file names of pinned - programs. - - -m, --mapcompat - Allow loading maps with unknown map definitions. - - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. - - -L, --use-loader - Load program as a "loader" program. This is useful to debug - the generation of such programs. When this option is in - use, bpftool attempts to load the programs from the object - file into the kernel, but does not pin them (therefore, the - *PATH* must not be provided). - - When combined with the **-d**\ \|\ **--debug** option, - additional debug messages are generated, and the execution - of the loader program will use the **bpf_trace_printk**\ () - helper to log each step of loading BTF, creating the maps, - and loading the programs (see **bpftool prog tracelog** as - a way to dump those messages). +.. include:: common_options.rst + +-f, --bpffs + When showing BPF programs, show file names of pinned programs. + +-m, --mapcompat + Allow loading maps with unknown map definitions. + +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. + +-L, --use-loader + Load program as a "loader" program. This is useful to debug the generation + of such programs. When this option is in use, bpftool attempts to load the + programs from the object file into the kernel, but does not pin them + (therefore, the *PATH* must not be provided). + + When combined with the **-d**\ \|\ **--debug** option, additional debug + messages are generated, and the execution of the loader program will use + the **bpf_trace_printk**\ () helper to log each step of loading BTF, + creating the maps, and loading the programs (see **bpftool prog tracelog** + as a way to dump those messages). EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 8022b5321dbe..61ec1286446c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -14,61 +14,60 @@ tool to register/unregister/introspect BPF struct_ops SYNOPSIS ======== - **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* +**bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump** | **register** | **unregister** | **help** } STRUCT_OPS COMMANDS =================== -| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] -| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* -| **bpftool** **struct_ops help** +| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] +| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* +| **bpftool** **struct_ops help** | -| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } -| *OBJ* := /a/file/of/bpf_struct_ops.o +| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } +| *OBJ* := /a/file/of/bpf_struct_ops.o DESCRIPTION =========== - **bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] - Show brief information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it shows information only - for the given struct_ops. Otherwise, it lists all struct_ops - currently existing in the system. - - Output will start with struct_ops map ID, followed by its map - name and its struct_ops's kernel type. - - **bpftool struct_ops dump** [*STRUCT_OPS_MAP*] - Dump details information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it dumps information only - for the given struct_ops. Otherwise, it dumps all struct_ops - currently existing in the system. - - **bpftool struct_ops register** *OBJ* [*LINK_DIR*] - Register bpf struct_ops from *OBJ*. All struct_ops under - the ELF section ".struct_ops" and ".struct_ops.link" will - be registered to its kernel subsystem. For each - struct_ops in the ".struct_ops.link" section, a link - will be created. You can give *LINK_DIR* to provide a - directory path where these links will be pinned with the - same name as their corresponding map name. - - **bpftool struct_ops unregister** *STRUCT_OPS_MAP* - Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. - - **bpftool struct_ops help** - Print short help message. +**bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] + Show brief information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it shows information only for the given + struct_ops. Otherwise, it lists all struct_ops currently existing in the + system. + + Output will start with struct_ops map ID, followed by its map name and its + struct_ops's kernel type. + +**bpftool struct_ops dump** [*STRUCT_OPS_MAP*] + Dump details information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it dumps information only for the given + struct_ops. Otherwise, it dumps all struct_ops currently existing in the + system. + +**bpftool struct_ops register** *OBJ* [*LINK_DIR*] + Register bpf struct_ops from *OBJ*. All struct_ops under the ELF section + ".struct_ops" and ".struct_ops.link" will be registered to its kernel + subsystem. For each struct_ops in the ".struct_ops.link" section, a link + will be created. You can give *LINK_DIR* to provide a directory path where + these links will be pinned with the same name as their corresponding map + name. + +**bpftool struct_ops unregister** *STRUCT_OPS_MAP* + Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. + +**bpftool struct_ops help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 09e4f2ff5658..f38ae5c40439 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -14,57 +14,57 @@ tool for inspection and simple manipulation of eBPF programs and maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } +**bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } - **bpftool** **batch file** *FILE* +**bpftool** **batch file** *FILE* - **bpftool** **version** +**bpftool** **version** - *OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | - **btf** | **gen** | **struct_ops** | **iter** } +*OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | +**btf** | **gen** | **struct_ops** | **iter** } - *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } +*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } - *MAP-COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **event_pipe** | **help** } +*MAP-COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **event_pipe** | **help** } - *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | - **load** | **attach** | **detach** | **help** } +*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | +**load** | **attach** | **detach** | **help** } - *LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } +*LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } - *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } +*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } - *PERF-COMMANDS* := { **show** | **list** | **help** } +*PERF-COMMANDS* := { **show** | **list** | **help** } - *NET-COMMANDS* := { **show** | **list** | **help** } +*NET-COMMANDS* := { **show** | **list** | **help** } - *FEATURE-COMMANDS* := { **probe** | **help** } +*FEATURE-COMMANDS* := { **probe** | **help** } - *BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } +*BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } - *GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } +*GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } - *STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } - *ITER-COMMANDS* := { **pin** | **help** } +*ITER-COMMANDS* := { **pin** | **help** } DESCRIPTION =========== - *bpftool* allows for inspection and simple modification of BPF objects - on the system. +*bpftool* allows for inspection and simple modification of BPF objects on the +system. - Note that format of the output of all tools is not guaranteed to be - stable and should not be depended upon. +Note that format of the output of all tools is not guaranteed to be stable and +should not be depended upon. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -m, --mapcompat - Allow loading maps with unknown map definitions. +-m, --mapcompat + Allow loading maps with unknown map definitions. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst index 30df7a707f02..9234b9dab768 100644 --- a/tools/bpf/bpftool/Documentation/common_options.rst +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -1,25 +1,23 @@ .. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -h, --help - Print short help message (similar to **bpftool help**). + Print short help message (similar to **bpftool help**). -V, --version - Print bpftool's version number (similar to **bpftool version**), the - number of the libbpf version in use, and optional features that were - included when bpftool was compiled. Optional features include linking - against LLVM or libbfd to provide the disassembler for JIT-ted - programs (**bpftool prog dump jited**) and usage of BPF skeletons - (some features like **bpftool prog profile** or showing pids - associated to BPF objects may rely on it). + Print bpftool's version number (similar to **bpftool version**), the number + of the libbpf version in use, and optional features that were included when + bpftool was compiled. Optional features include linking against LLVM or + libbfd to provide the disassembler for JIT-ted programs (**bpftool prog + dump jited**) and usage of BPF skeletons (some features like **bpftool prog + profile** or showing pids associated to BPF objects may rely on it). -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. + Generate JSON output. For commands that cannot produce JSON, this option + has no effect. -p, --pretty - Generate human-readable JSON output. Implies **-j**. + Generate human-readable JSON output. Implies **-j**. -d, --debug - Print all logs available, even debug-level information. This includes - logs from libbpf as well as from the verifier, when attempting to - load programs. + Print all logs available, even debug-level information. This includes logs + from libbpf as well as from the verifier, when attempting to load programs. -- cgit v1.2.3 From ea379b3ccc2e4dff9c3d616f0611b5312fe389ad Mon Sep 17 00:00:00 2001 From: Rameez Rehman Date: Sun, 31 Mar 2024 21:03:45 +0100 Subject: bpftool: Remove useless emphasis on command description in man pages As it turns out, the terms in definition lists in the rST file are already rendered with bold-ish formatting when generating the man pages; all double-star sequences we have in the commands for the command description are unnecessary, and can be removed to make the documentation easier to read. The rST files were automatically processed with: sed -i '/DESCRIPTION/,/OPTIONS/ { /^\*/ s/\*\*//g }' b*.rst Signed-off-by: Rameez Rehman Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240331200346.29118-3-qmo@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 6 ++-- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 10 +++---- .../bpf/bpftool/Documentation/bpftool-feature.rst | 8 +++--- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 10 +++---- tools/bpf/bpftool/Documentation/bpftool-iter.rst | 4 +-- tools/bpf/bpftool/Documentation/bpftool-link.rst | 8 +++--- tools/bpf/bpftool/Documentation/bpftool-map.rst | 32 +++++++++++----------- tools/bpf/bpftool/Documentation/bpftool-net.rst | 8 +++--- tools/bpf/bpftool/Documentation/bpftool-perf.rst | 4 +-- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 22 +++++++-------- .../bpftool/Documentation/bpftool-struct_ops.rst | 10 +++---- 11 files changed, 61 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 1be6b9aae053..f66781f20af2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -34,7 +34,7 @@ BTF COMMANDS DESCRIPTION =========== -**bpftool btf { show | list }** [**id** *BTF_ID*] +bpftool btf { show | list } [id *BTF_ID*] Show information about loaded BTF objects. If a BTF ID is specified, show information only about given BTF object, otherwise list all BTF objects currently loaded on the system. @@ -43,7 +43,7 @@ DESCRIPTION that hold open file descriptors (FDs) against BTF objects. On such kernels bpftool will automatically emit this information as well. -**bpftool btf dump** *BTF_SRC* +bpftool btf dump *BTF_SRC* Dump BTF entries from a given *BTF_SRC*. When **id** is specified, BTF object with that ID will be loaded and all @@ -65,7 +65,7 @@ DESCRIPTION **format** option can be used to override default (raw) output format. Raw (**raw**) or C-syntax (**c**) output formats are supported. -**bpftool btf help** +bpftool btf help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 1dbabe33e56a..1acf9f58fca2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -49,7 +49,7 @@ CGROUP COMMANDS DESCRIPTION =========== -**bpftool cgroup { show | list }** *CGROUP* [**effective**] +bpftool cgroup { show | list } *CGROUP* [effective] List all programs attached to the cgroup *CGROUP*. Output will start with program ID followed by attach type, attach flags and @@ -59,7 +59,7 @@ DESCRIPTION for events within a cgroup. This includes inherited along with attached ones. -**bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] +bpftool cgroup tree [*CGROUP_ROOT*] [effective] Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs. If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint. @@ -71,7 +71,7 @@ DESCRIPTION for events within a cgroup. This includes inherited along with attached ones. -**bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE* and optional *ATTACH_FLAGS*. @@ -126,10 +126,10 @@ DESCRIPTION **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); **sock_release** closing an userspace inet socket (since 5.9). -**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. -**bpftool prog help** +bpftool prog help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 5c2433147ba4..c7f837898bc7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -32,7 +32,7 @@ FEATURE COMMANDS DESCRIPTION =========== -**bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] +bpftool feature probe [kernel] [full] [macros [prefix *PREFIX*]] Probe the running kernel and dump a number of eBPF-related parameters, such as availability of the **bpf**\ () system call, JIT status, eBPF program types availability, eBPF helper functions availability, and more. @@ -59,14 +59,14 @@ DESCRIPTION bpftool is inadvertently run as non-root, for example. This keyword is unavailable if bpftool was compiled without libcap. -**bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] +bpftool feature probe dev *NAME* [full] [macros [prefix *PREFIX*]] Probe network device for supported eBPF features and dump results to the console. The keywords **full**, **macros** and **prefix** have the same role as when probing the kernel. -**bpftool feature list_builtins** *GROUP* +bpftool feature list_builtins *GROUP* List items known to bpftool. These can be BPF program types (**prog_types**), BPF map types (**map_types**), attach types (**attach_types**), link types (**link_types**), or BPF helper functions @@ -75,7 +75,7 @@ DESCRIPTION (for all object types) or from the BPF UAPI header (list of helpers). This can be used in scripts to iterate over BPF types or helpers. -**bpftool feature help** +bpftool feature help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 68104347d592..e9589c21e9c3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -31,7 +31,7 @@ GEN COMMANDS DESCRIPTION =========== -**bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +bpftool gen object *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] Statically link (combine) together one or more *INPUT_FILE*'s into a single resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files. @@ -48,7 +48,7 @@ DESCRIPTION command) or passed directly into **libbpf** (using **bpf_object__open()** family of APIs). -**bpftool gen skeleton** *FILE* +bpftool gen skeleton *FILE* Generate BPF skeleton C header file for a given *FILE*. BPF skeleton is an alternative interface to existing libbpf APIs for @@ -132,7 +132,7 @@ DESCRIPTION used to fetch and update (non-read-only) data from userspace, with same simplicity as for BPF side. -**bpftool gen subskeleton** *FILE* +bpftool gen subskeleton *FILE* Generate BPF subskeleton C header file for a given *FILE*. Subskeletons are similar to skeletons, except they do not own the @@ -154,7 +154,7 @@ DESCRIPTION Frees the storage for the subskeleton but *does not* unload any BPF programs or maps. -**bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF file, containing all needed BTF types so one, or more, given eBPF objects CO-RE relocations may be satisfied. @@ -174,7 +174,7 @@ DESCRIPTION Check examples bellow for more information how to use it. -**bpftool gen help** +bpftool gen help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index af17cc0201d4..7bcb4e993d7d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -31,7 +31,7 @@ ITER COMMANDS DESCRIPTION =========== -**bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] +bpftool iter pin *OBJ* *PATH* [map *MAP*] A bpf iterator combines a kernel iterating of particular kernel data (e.g., tasks, bpf_maps, etc.) and a bpf program called for each kernel data object (e.g., one task, one bpf_map, etc.). User space can *read* kernel iterator @@ -48,7 +48,7 @@ DESCRIPTION User can then *cat PATH* to see the bpf iterator output. -**bpftool iter help** +bpftool iter help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 6d39ad890fea..6f09d4405ed8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -33,7 +33,7 @@ LINK COMMANDS DESCRIPTION =========== -**bpftool link { show | list }** [*LINK*] +bpftool link { show | list } [*LINK*] Show information about active links. If *LINK* is specified show information only about given link, otherwise list all links currently active on the system. @@ -45,19 +45,19 @@ DESCRIPTION that hold open file descriptors (FDs) against BPF links. On such kernels bpftool will automatically emit this information as well. -**bpftool link pin** *LINK* *FILE* +bpftool link pin *LINK* *FILE* Pin link *LINK* as *FILE*. Note: *FILE* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool link detach** *LINK* +bpftool link detach *LINK* Force-detach link *LINK*. BPF link and its underlying BPF program will stay valid, but they will be detached from the respective BPF hook and BPF link will transition into a defunct state until last open file descriptor for that link is closed. -**bpftool link help** +bpftool link help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 864257169942..252e4c538edb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -59,7 +59,7 @@ MAP COMMANDS DESCRIPTION =========== -**bpftool map { show | list }** [*MAP*] +bpftool map { show | list } [*MAP*] Show information about loaded maps. If *MAP* is specified show information only about given maps, otherwise list all maps currently loaded on the system. In case of **name**, *MAP* may match several maps which will all @@ -72,7 +72,7 @@ DESCRIPTION that hold open file descriptors (FDs) against BPF maps. On such kernels bpftool will automatically emit this information as well. -**bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] +bpftool map create *FILE* type *TYPE* key *KEY_SIZE* value *VALUE_SIZE* entries *MAX_ENTRIES* name *NAME* [flags *FLAGS*] [inner_map *MAP*] [offload_dev *NAME*] Create a new map with given parameters and pin it to *bpffs* as *FILE*. *FLAGS* should be an integer which is the combination of desired flags, @@ -86,11 +86,11 @@ DESCRIPTION Keyword **offload_dev** expects a network interface name, and is used to request hardware offload for the map. -**bpftool map dump** *MAP* +bpftool map dump *MAP* Dump all entries in a given *MAP*. In case of **name**, *MAP* may match several maps which will all be dumped. -**bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +bpftool map update *MAP* [key *DATA*] [value *VALUE*] [*UPDATE_FLAGS*] Update map entry for a given *KEY*. *UPDATE_FLAGS* can be one of: **any** update existing entry or add if @@ -103,22 +103,22 @@ DESCRIPTION unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is provided. -**bpftool map lookup** *MAP* [**key** *DATA*] +bpftool map lookup *MAP* [key *DATA*] Lookup **key** in the map. -**bpftool map getnext** *MAP* [**key** *DATA*] +bpftool map getnext *MAP* [key *DATA*] Get next key. If *key* is not specified, get first key. -**bpftool map delete** *MAP* **key** *DATA* +bpftool map delete *MAP* key *DATA* Remove entry from the map. -**bpftool map pin** *MAP* *FILE* +bpftool map pin *MAP* *FILE* Pin map *MAP* as *FILE*. Note: *FILE* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +bpftool map event_pipe *MAP* [cpu *N* index *M*] Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. Install perf rings into a perf event array map and dump output of any @@ -133,29 +133,29 @@ DESCRIPTION existing ring. Any other application will stop receiving events if it installed its rings earlier. -**bpftool map peek** *MAP* +bpftool map peek *MAP* Peek next value in the queue or stack. -**bpftool map push** *MAP* **value** *VALUE* +bpftool map push *MAP* value *VALUE* Push *VALUE* onto the stack. -**bpftool map pop** *MAP* +bpftool map pop *MAP* Pop and print value from the stack. -**bpftool map enqueue** *MAP* **value** *VALUE* +bpftool map enqueue *MAP* value *VALUE* Enqueue *VALUE* into the queue. -**bpftool map dequeue** *MAP* +bpftool map dequeue *MAP* Dequeue and print value from the queue. -**bpftool map freeze** *MAP* +bpftool map freeze *MAP* Freeze the map as read-only from user space. Entries from a frozen map can not longer be updated or deleted with the **bpf**\ () system call. This operation is not reversible, and the map remains immutable from user space until its destruction. However, read and write permissions for BPF programs to the map remain unchanged. -**bpftool map help** +bpftool map help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 43d2c858fdf8..f8e65869f8b4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -33,7 +33,7 @@ NET COMMANDS DESCRIPTION =========== -**bpftool net { show | list }** [ **dev** *NAME* ] +bpftool net { show | list } [ dev *NAME* ] List bpf program attachments in the kernel networking subsystem. Currently, device driver xdp attachments, tcx, netkit and old-style tc @@ -58,7 +58,7 @@ DESCRIPTION then all bpf programs attached to non clsact qdiscs, and finally all bpf programs attached to root and clsact qdisc. -**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] Attach bpf program *PROG* to network interface *NAME* with type specified by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the command used with **overwrite** option. Currently, only XDP-related modes @@ -70,13 +70,13 @@ DESCRIPTION **xdpdrv** - Native XDP. runs earliest point in driver's receive path; **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; -**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used for attach must be specified. Currently, only XDP-related modes are supported for *ATTACH_TYPE*. -**bpftool net help** +bpftool net help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 841789e30acd..8c1ae55be596 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -29,7 +29,7 @@ PERF COMMANDS DESCRIPTION =========== -**bpftool perf { show | list }** +bpftool perf { show | list } List all raw_tracepoint, tracepoint, kprobe attachment in the system. Output will start with process id and file descriptor in that process, @@ -39,7 +39,7 @@ DESCRIPTION kernel virtual address. The attachment point for u[ret]probe is the file name and the file offset. -**bpftool perf help** +bpftool perf help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index cc456d357db3..8e730cfb2589 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -67,7 +67,7 @@ PROG COMMANDS DESCRIPTION =========== -**bpftool prog { show | list }** [*PROG*] +bpftool prog { show | list } [*PROG*] Show information about loaded programs. If *PROG* is specified show information only about given programs, otherwise list all programs currently loaded on the system. In case of **tag** or **name**, *PROG* may @@ -87,7 +87,7 @@ DESCRIPTION that hold open file descriptors (FDs) against BPF programs. On such kernels bpftool will automatically emit this information as well. -**bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +bpftool prog dump xlated *PROG* [{ file *FILE* | [opcodes] [linum] [visual] }] Dump eBPF instructions of the programs from the kernel. By default, eBPF will be disassembled and printed to standard output in human-readable format. In this case, **opcodes** controls if raw opcodes should be printed @@ -108,7 +108,7 @@ DESCRIPTION displayed. If **linum** is specified, the filename, line number and line column will also be displayed. -**bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +bpftool prog dump jited *PROG* [{ file *FILE* | [opcodes] [linum] }] Dump jited image (host machine code) of the program. If *FILE* is specified image will be written to a file, otherwise it will @@ -121,13 +121,13 @@ DESCRIPTION **linum** is specified, the filename, line number and line column will also be displayed. -**bpftool prog pin** *PROG* *FILE* +bpftool prog pin *PROG* *FILE* Pin program *PROG* as *FILE*. Note: *FILE* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +bpftool prog { load | loadall } *OBJ* *PATH* [type *TYPE*] [map { idx *IDX* | name *NAME* } *MAP*] [{ offload_dev | xdpmeta_dev } *NAME*] [pinmaps *MAP_DIR*] [autoattach] Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog loadall** pins all programs from the *OBJ* under *PATH* directory. **type** @@ -156,24 +156,24 @@ DESCRIPTION Note: *PATH* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +bpftool prog attach *PROG* *ATTACH_TYPE* [*MAP*] Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* parameter, with the exception of *flow_dissector* which is attached to current networking name space. -**bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +bpftool prog detach *PROG* *ATTACH_TYPE* [*MAP*] Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* parameter, with the exception of *flow_dissector* which is detached from the current networking name space. -**bpftool prog tracelog** +bpftool prog tracelog Dump the trace pipe of the system to the console (stdout). Hit to stop printing. BPF programs can write to this trace pipe at runtime with the **bpf_trace_printk**\ () helper. This should be used only for debugging purposes. For streaming data from BPF programs to user space, one can use perf events (see also **bpftool-map**\ (8)). -**bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*] Run BPF program *PROG* in the kernel testing infrastructure for BPF, meaning that the program works on the data and context provided by the user, and not on actual packets or monitored functions etc. Return value @@ -201,12 +201,12 @@ DESCRIPTION them can take the **ctx_in**/**ctx_out** arguments. bpftool does not perform checks on program types. -**bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* +bpftool prog profile *PROG* [duration *DURATION*] *METRICs* Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until user hits . *DURATION* is optional. If *DURATION* is not specified, the profiling will run up to **UINT_MAX** seconds. -**bpftool prog help** +bpftool prog help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 61ec1286446c..e871b9539ac7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -36,7 +36,7 @@ STRUCT_OPS COMMANDS DESCRIPTION =========== -**bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] +bpftool struct_ops { show | list } [*STRUCT_OPS_MAP*] Show brief information about the struct_ops in the system. If *STRUCT_OPS_MAP* is specified, it shows information only for the given struct_ops. Otherwise, it lists all struct_ops currently existing in the @@ -45,13 +45,13 @@ DESCRIPTION Output will start with struct_ops map ID, followed by its map name and its struct_ops's kernel type. -**bpftool struct_ops dump** [*STRUCT_OPS_MAP*] +bpftool struct_ops dump [*STRUCT_OPS_MAP*] Dump details information about the struct_ops in the system. If *STRUCT_OPS_MAP* is specified, it dumps information only for the given struct_ops. Otherwise, it dumps all struct_ops currently existing in the system. -**bpftool struct_ops register** *OBJ* [*LINK_DIR*] +bpftool struct_ops register *OBJ* [*LINK_DIR*] Register bpf struct_ops from *OBJ*. All struct_ops under the ELF section ".struct_ops" and ".struct_ops.link" will be registered to its kernel subsystem. For each struct_ops in the ".struct_ops.link" section, a link @@ -59,10 +59,10 @@ DESCRIPTION these links will be pinned with the same name as their corresponding map name. -**bpftool struct_ops unregister** *STRUCT_OPS_MAP* +bpftool struct_ops unregister *STRUCT_OPS_MAP* Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. -**bpftool struct_ops help** +bpftool struct_ops help Print short help message. OPTIONS -- cgit v1.2.3 From a70f5d840a56a82c576385ca79ee55c1598f1bc3 Mon Sep 17 00:00:00 2001 From: Rameez Rehman Date: Sun, 31 Mar 2024 21:03:46 +0100 Subject: bpftool: Clean-up typos, punctuation, list formatting in docs Improve the formatting of the attach flags for cgroup programs in the relevant man page, and fix typos ("can be on of", "an userspace inet socket") when introducing that list. Also fix a couple of other trivial issues in docs. [ Quentin: Fixed trival issues in bpftool-gen.rst and bpftool-iter.rst ] Signed-off-by: Rameez Rehman Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240331200346.29118-4-qmo@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 65 ++++++++++------------ tools/bpf/bpftool/Documentation/bpftool-gen.rst | 8 +-- tools/bpf/bpftool/Documentation/bpftool-iter.rst | 2 +- 3 files changed, 35 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 1acf9f58fca2..b2610d169e60 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -90,41 +90,36 @@ bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later. - *ATTACH_TYPE* can be on of: - **ingress** ingress path of the inet socket (since 4.10); - **egress** egress path of the inet socket (since 4.10); - **sock_create** opening of an inet socket (since 4.10); - **sock_ops** various socket operations (since 4.12); - **device** device access (since 4.15); - **bind4** call to bind(2) for an inet4 socket (since 4.17); - **bind6** call to bind(2) for an inet6 socket (since 4.17); - **post_bind4** return from bind(2) for an inet4 socket (since 4.17); - **post_bind6** return from bind(2) for an inet6 socket (since 4.17); - **connect4** call to connect(2) for an inet4 socket (since 4.17); - **connect6** call to connect(2) for an inet6 socket (since 4.17); - **connect_unix** call to connect(2) for a unix socket (since 6.7); - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected - udp4 socket (since 4.18); - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected - udp6 socket (since 4.18); - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected unix socket (since 6.7); - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an - unconnected udp4 socket (since 5.2); - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an - unconnected udp6 socket (since 5.2); - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an - unconnected unix socket (since 6.7); - **sysctl** sysctl access (since 5.2); - **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3); - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); - **sock_release** closing an userspace inet socket (since 5.9). + *ATTACH_TYPE* can be one of: + + - **ingress** ingress path of the inet socket (since 4.10) + - **egress** egress path of the inet socket (since 4.10) + - **sock_create** opening of an inet socket (since 4.10) + - **sock_ops** various socket operations (since 4.12) + - **device** device access (since 4.15) + - **bind4** call to bind(2) for an inet4 socket (since 4.17) + - **bind6** call to bind(2) for an inet6 socket (since 4.17) + - **post_bind4** return from bind(2) for an inet4 socket (since 4.17) + - **post_bind6** return from bind(2) for an inet6 socket (since 4.17) + - **connect4** call to connect(2) for an inet4 socket (since 4.17) + - **connect6** call to connect(2) for an inet6 socket (since 4.17) + - **connect_unix** call to connect(2) for a unix socket (since 6.7) + - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp4 socket (since 4.18) + - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp6 socket (since 4.18) + - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected unix socket (since 6.7) + - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp4 socket (since 5.2) + - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp6 socket (since 5.2) + - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected unix socket (since 6.7) + - **sysctl** sysctl access (since 5.2) + - **getsockopt** call to getsockopt (since 5.3) + - **setsockopt** call to setsockopt (since 5.3) + - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8) + - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8) + - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7) + - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8) + - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8) + - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7) + - **sock_release** closing a userspace inet socket (since 5.9) bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index e9589c21e9c3..c768e6d4ae09 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -110,7 +110,7 @@ bpftool gen skeleton *FILE* - **example__open_and_load** combines **example__open** and **example__load** invocations in one commonly used operation. - - **example__attach** and **example__detach** + - **example__attach** and **example__detach**. This pair of functions allow to attach and detach, correspondingly, already loaded BPF object. Only BPF programs of types supported by libbpf for auto-attachment will be auto-attached and their corresponding BPF @@ -119,7 +119,7 @@ bpftool gen skeleton *FILE* **example__detach** will detach both links created automatically, as well as those populated by user manually. - - **example__destroy** + - **example__destroy**. Detach and unload BPF programs, free up all the resources used by skeleton and BPF object. @@ -146,11 +146,11 @@ bpftool gen subskeleton *FILE* Consequently, there are only two functions defined for subskeletons: - - **example__open(bpf_object\*)** + - **example__open(bpf_object\*)**. Instantiates a subskeleton from an already opened (but not necessarily loaded) **bpf_object**. - - **example__destroy()** + - **example__destroy()**. Frees the storage for the subskeleton but *does not* unload any BPF programs or maps. diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 7bcb4e993d7d..2e5d81c906dc 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -21,7 +21,7 @@ SYNOPSIS *COMMANDS* := { **pin** | **help** } ITER COMMANDS -=================== +============= | **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] | **bpftool** **iter help** -- cgit v1.2.3 From ca4ddc26f8acaa9cb451fcb20f7ab0f02e4970cb Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 29 Mar 2024 10:28:46 -0500 Subject: bpf: Fix typo in uapi doc comments In a few places in the bpf uapi headers, EOPNOTSUPP is missing a "P" in the doc comments. This adds the missing "P". Signed-off-by: David Lechner Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240329152900.398260-2-dlechner@baylibre.com --- include/uapi/linux/bpf.h | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 96d57e483133..79c548276b6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5026,7 +5026,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) @@ -5512,7 +5512,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 96d57e483133..79c548276b6b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5026,7 +5026,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) @@ -5512,7 +5512,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) -- cgit v1.2.3 From 965c6167c93f3fac53e25807f83c07e87b3c085a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 1 Apr 2024 19:54:46 -0700 Subject: selftests/bpf: Using llvm may_goto inline asm for cond_break macro Currently, cond_break macro uses bytes to encode the may_goto insn. Patch [1] in llvm implemented may_goto insn in BPF backend. Replace byte-level encoding with llvm inline asm for better usability. Using llvm may_goto insn is controlled by macro __BPF_FEATURE_MAY_GOTO. [1] https://github.com/llvm/llvm-project/commit/0e0bfacff71859d1f9212205f8f873d47029d3fb Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240402025446.3215182-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/bpf_experimental.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a5b9df38c162..3329ea080865 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -326,6 +326,16 @@ l_true: \ }) #endif +#ifdef __BPF_FEATURE_MAY_GOTO +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else #define cond_break \ ({ __label__ l_break, l_continue; \ asm volatile goto("1:.byte 0xe5; \ @@ -337,6 +347,7 @@ l_true: \ l_break: break; \ l_continue:; \ }) +#endif #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ -- cgit v1.2.3 From 2a24e2485722b0e12e17a2bd473bd15c9e420bdb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 10:07:13 -0700 Subject: bpftool: Use __typeof__() instead of typeof() in BPF skeleton When generated BPF skeleton header is included in C++ code base, some compiler setups will emit warning about using language extensions due to typeof() usage, resulting in something like: error: extension used [-Werror,-Wlanguage-extension-token] obj->struct_ops.empty_tcp_ca = (typeof(obj->struct_ops.empty_tcp_ca)) ^ It looks like __typeof__() is a preferred way to do typeof() with better C++ compatibility behavior, so switch to that. With __typeof__() we get no such warning. Fixes: c2a0257c1edf ("bpftool: Cast pointers for shadow types explicitly.") Fixes: 00389c58ffe9 ("bpftool: Add support for subskeletons") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Kui-Feng Lee Acked-by: Quentin Monnet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240401170713.2081368-1-andrii@kernel.org --- tools/bpf/bpftool/gen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 786268f1a483..b3979ddc0189 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -386,7 +386,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name */ needs_typeof = btf_is_array(var) || btf_is_ptr_to_func_proto(btf, var); if (needs_typeof) - printf("typeof("); + printf("__typeof__("); err = btf_dump__emit_type_decl(d, var_type_id, &opts); if (err) @@ -1131,7 +1131,7 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = (typeof(obj->struct_ops.%1$s))\n\ + obj->struct_ops.%1$s = (__typeof__(obj->struct_ops.%1$s))\n\ bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ \n\ ", ident); -- cgit v1.2.3 From c186ed12a8ec498532d13de43094bdec9ac6f121 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 2 Apr 2024 07:30:29 +0000 Subject: selftests/bpf: Skip test when perf_event_open returns EOPNOTSUPP When testing send_signal and stacktrace_build_id_nmi using the riscv sbi pmu driver without the sscofpmf extension or the riscv legacy pmu driver, then failures as follows are encountered: test_send_signal_common:FAIL:perf_event_open unexpected perf_event_open: actual -1 < expected 0 #272/3 send_signal/send_signal_nmi:FAIL test_stacktrace_build_id_nmi:FAIL:perf_event_open err -1 errno 95 #304 stacktrace_build_id_nmi:FAIL The reason is that the above pmu driver or hardware does not support sampling events, that is, PERF_PMU_CAP_NO_INTERRUPT is set to pmu capabilities, and then perf_event_open returns EOPNOTSUPP. Since PERF_PMU_CAP_NO_INTERRUPT is not only set in the riscv-related pmu driver, it is better to skip testing when this capability is set. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402073029.1299085-1-pulehui@huaweicloud.com --- tools/testing/selftests/bpf/prog_tests/send_signal.c | 2 +- tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index b15b343ebb6b..920aee41bd58 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -179,7 +179,7 @@ static void test_send_signal_nmi(bool signal_thread) pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */, -1 /* group_fd */, 0 /* flags */); if (pmu_fd == -1) { - if (errno == ENOENT) { + if (errno == ENOENT || errno == EOPNOTSUPP) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 5db9eec24b5b..0832fd787457 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -35,7 +35,7 @@ retry: pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, 0 /* flags */); - if (pmu_fd < 0 && errno == ENOENT) { + if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); goto cleanup; -- cgit v1.2.3 From 15ea39ad7e83af16480bbf20144fcc6edf4757f9 Mon Sep 17 00:00:00 2001 From: Tobias Böhm Date: Tue, 2 Apr 2024 17:10:52 +0200 Subject: libbpf: Use local bpf_helpers.h include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 20d59ee55172fdf6 ("libbpf: add bpf_core_cast() macro") added a bpf_helpers include in bpf_core_read.h as a system include. Usually, the includes are local, though, like in bpf_tracing.h. This commit adjusts the include to be local as well. Signed-off-by: Tobias Böhm Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/q5d5bgc6vty2fmaazd5e73efd6f5bhiru2le6fxn43vkw45bls@fhlw2s5ootdb --- tools/lib/bpf/bpf_core_read.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 1ce738d91685..b5c7ce5c243a 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -2,7 +2,7 @@ #ifndef __BPF_CORE_READ_H__ #define __BPF_CORE_READ_H__ -#include +#include "bpf_helpers.h" /* * enum bpf_field_info_kind is passed as a second argument into -- cgit v1.2.3 From c07b4bcd5163c2929d8bfc55140325fc15afb4eb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 2 Apr 2024 18:50:21 +0800 Subject: selftests/bpf: Add pid limit for mptcpify prog In order to prevent mptcpify prog from affecting the running results of other BPF tests, a pid limit was added to restrict it from only modifying its own program. Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8987e2938e15e8ec390b85b5dcbee704751359dc.1712054986.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/prog_tests/mptcp.c | 2 ++ tools/testing/selftests/bpf/progs/mptcpify.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 8f8d792307c1..4e0f69295872 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -273,6 +273,8 @@ static int run_mptcpify(int cgroup_fd) if (!ASSERT_OK_PTR(mptcpify_skel, "skel_open_load")) return libbpf_get_error(mptcpify_skel); + mptcpify_skel->bss->pid = getpid(); + err = mptcpify__attach(mptcpify_skel); if (!ASSERT_OK(err, "skel_attach")) goto out; diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c index 53301ae8a8f7..cbdc730c3a47 100644 --- a/tools/testing/selftests/bpf/progs/mptcpify.c +++ b/tools/testing/selftests/bpf/progs/mptcpify.c @@ -6,10 +6,14 @@ #include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; +int pid; SEC("fmod_ret/update_socket_protocol") int BPF_PROG(mptcpify, int family, int type, int protocol) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return protocol; + if ((family == AF_INET || family == AF_INET6) && type == SOCK_STREAM && (!protocol || protocol == IPPROTO_TCP)) { -- cgit v1.2.3 From d6d647d7ba6413148c3db2d48640986c8c1d7d08 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 11:16:51 -0700 Subject: tools: ynl: add ynl_dump_empty() helper Checking if dump is empty requires a couple of casts. Add a convenient wrapper. Add an example use in the netdev sample, loopback is always present so an empty dump is an error. Reviewed-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20240329181651.319326-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.h | 12 ++++++++++++ tools/net/ynl/samples/netdev.c | 2 ++ 2 files changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 9842e85a8c57..eef7c6324ed4 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -91,6 +91,18 @@ void ynl_sock_destroy(struct ynl_sock *ys); !ynl_dump_obj_is_last(iter); \ iter = ynl_dump_obj_next(iter)) +/** + * ynl_dump_empty() - does the dump have no entries + * @dump: pointer to the dump list, as returned by a dump call + * + * Check if the dump is empty, i.e. contains no objects. + * Dump calls return NULL on error, and terminator element if empty. + */ +static inline bool ynl_dump_empty(void *dump) +{ + return dump == (void *)YNL_LIST_END; +} + int ynl_subscribe(struct ynl_sock *ys, const char *grp_name); int ynl_socket_get_fd(struct ynl_sock *ys); int ynl_ntf_check(struct ynl_sock *ys); diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index 591b90e21890..3e7b29bd55d5 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -100,6 +100,8 @@ int main(int argc, char **argv) if (!devs) goto err_close; + if (ynl_dump_empty(devs)) + fprintf(stderr, "Error: no devices reported\n"); ynl_dump_foreach(devs, d) netdev_print_device(d, 0); netdev_dev_get_list_free(devs); -- cgit v1.2.3 From 4dd31243e30843d5f63bccfb0369146e4de1a130 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 25 Mar 2024 15:07:16 +0000 Subject: bpf: Add arm64 JIT support for bpf_addr_space_cast instruction. LLVM generates bpf_addr_space_cast instruction while translating pointers between native (zero) address space and __attribute__((address_space(N))). The addr_space=0 is reserved as bpf_arena address space. rY = addr_space_cast(rX, 0, 1) is processed by the verifier and converted to normal 32-bit move: wX = wY. rY = addr_space_cast(rX, 1, 0) : used to convert a bpf arena pointer to a pointer in the userspace vma. This has to be converted by the JIT. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240325150716.4387-3-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 16 ++++++++++++++++ tools/testing/selftests/bpf/DENYLIST.aarch64 | 2 -- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c6210b150b6a..76b91f36c729 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -82,6 +82,7 @@ struct jit_ctx { __le32 *ro_image; u32 stack_size; int fpb_offset; + u64 user_vm_start; }; struct bpf_plt { @@ -868,6 +869,15 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + emit(A64_MOV(0, tmp, src), ctx); // 32-bit mov clears the upper 32 bits + emit_a64_mov_i(0, dst, ctx->user_vm_start >> 32, ctx); + emit(A64_LSL(1, dst, dst, 32), ctx); + emit(A64_CBZ(1, tmp, 2), ctx); + emit(A64_ORR(1, tmp, dst, tmp), ctx); + emit(A64_MOV(1, dst, tmp), ctx); + break; + } switch (insn->off) { case 0: emit(A64_MOV(is64, dst, src), ctx); @@ -1690,6 +1700,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.fpb_offset = find_fpb_offset(prog); + ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. @@ -2511,6 +2522,11 @@ bool bpf_jit_supports_exceptions(void) return true; } +bool bpf_jit_supports_arena(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index d8ade15e2789..0445ac38bc07 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,5 +10,3 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) -verifier_arena # JIT does not support arena -arena_htab # JIT does not support arena -- cgit v1.2.3 From 7effe3fdc049a34c56a68671100b5570e53e8f0a Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:23 +0000 Subject: tools: Add ethtool.h header to tooling infra This commit duplicates the ethtool.h file from the include/uapi/linux directory in the kernel source to the tools/include/uapi/linux directory. This action ensures that the ethtool.h file used in the tools directory is in sync with the kernel's version, maintaining consistency across the codebase. There are some checkpatch warnings in this file that could be cleaned up, but I preferred to move it over as-is for now to avoid disrupting the code. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-2-tushar.vyavahare@intel.com --- tools/include/uapi/linux/ethtool.h | 2229 +++++++++++++++++++++++++++++++++++- 1 file changed, 2198 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h index 47afae3895ec..11fc18988bc2 100644 --- a/tools/include/uapi/linux/ethtool.h +++ b/tools/include/uapi/linux/ethtool.h @@ -14,40 +14,140 @@ #ifndef _UAPI_LINUX_ETHTOOL_H #define _UAPI_LINUX_ETHTOOL_H -#include +#include #include #include -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#ifndef __KERNEL__ +#include /* for INT_MAX */ +#endif + +/* All structures exposed to userland should be defined such that they + * have the same layout for 32-bit and 64-bit userland. + */ + +/* Note on reserved space. + * Reserved fields must not be accessed directly by user space because + * they may be replaced by a different field in the future. They must + * be initialized to zero before making the request, e.g. via memset + * of the entire structure or implicitly by not being set in a structure + * initializer. + */ /** - * struct ethtool_channels - configuring number of network channel - * @cmd: ETHTOOL_{G,S}CHANNELS - * @max_rx: Read only. Maximum number of receive channel the driver support. - * @max_tx: Read only. Maximum number of transmit channel the driver support. - * @max_other: Read only. Maximum number of other channel the driver support. - * @max_combined: Read only. Maximum number of combined channel the driver - * support. Set of queues RX, TX or other. - * @rx_count: Valid values are in the range 1 to the max_rx. - * @tx_count: Valid values are in the range 1 to the max_tx. - * @other_count: Valid values are in the range 1 to the max_other. - * @combined_count: Valid values are in the range 1 to the max_combined. + * struct ethtool_cmd - DEPRECATED, link control and status + * This structure is DEPRECATED, please use struct ethtool_link_settings. + * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET + * @supported: Bitmask of %SUPPORTED_* flags for the link modes, + * physical connectors and other link features for which the + * interface supports autonegotiation or auto-detection. + * Read-only. + * @advertising: Bitmask of %ADVERTISED_* flags for the link modes, + * physical connectors and other link features that are + * advertised through autonegotiation or enabled for + * auto-detection. + * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @transceiver: Historically used to distinguish different possible + * PHY types, but not in a consistent way. Deprecated. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @maxtxpkt: Historically used to report TX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @maxrxpkt: Historically used to report RX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes + * and other link features that the link partner advertised + * through autonegotiation; 0 if unknown or not applicable. + * Read-only. + * @reserved: Reserved for future use; see the note on reserved space. * - * This can be used to configure RX, TX and other channels. + * The link speed in Mbps is split between @speed and @speed_hi. Use + * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to + * access it. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GSET to get the current values before making specific + * changes and then applying them with %ETHTOOL_SSET. + * + * Deprecated fields should be ignored by both users and drivers. */ - -struct ethtool_channels { +struct ethtool_cmd { __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; + __u32 supported; + __u32 advertising; + __u16 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 transceiver; + __u8 autoneg; + __u8 mdio_support; + __u32 maxtxpkt; + __u32 maxrxpkt; + __u16 speed_hi; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __u32 lp_advertising; + __u32 reserved[2]; }; +static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, + __u32 speed) +{ + ep->speed = (__u16)(speed & 0xFFFF); + ep->speed_hi = (__u16)(speed >> 16); +} + +static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) +{ + return (ep->speed_hi << 16) | ep->speed; +} + +/* Device supports clause 22 register access to PHY or peripherals + * using the interface defined in . This should not be + * set if there are known to be no such peripherals present or if + * the driver only emulates clause 22 registers for compatibility. + */ +#define ETH_MDIO_SUPPORTS_C22 1 + +/* Device supports clause 45 register access to PHY or peripherals + * using the interface defined in and . + * This should not be set if there are known to be no such peripherals + * present. + */ +#define ETH_MDIO_SUPPORTS_C45 2 + #define ETHTOOL_FWVERS_LEN 32 #define ETHTOOL_BUSINFO_LEN 32 #define ETHTOOL_EROMVERS_LEN 32 @@ -59,8 +159,10 @@ struct ethtool_channels { * in its bus driver structure (e.g. pci_driver::name). Must * not be an empty string. * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string + * @fw_version: Firmware version string; driver defined; may be an + * empty string + * @erom_version: Expansion ROM version string; driver defined; may be + * an empty string * @bus_info: Device bus address. This should match the dev_name() * string for the underlying bus device, if there is one. May be * an empty string. @@ -79,10 +181,6 @@ struct ethtool_channels { * * Users can use the %ETHTOOL_GSSET_INFO command to get the number of * strings in any string set (from Linux 2.6.34). - * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. */ struct ethtool_drvinfo { __u32 cmd; @@ -99,6 +197,2075 @@ struct ethtool_drvinfo { __u32 regdump_len; }; -#define ETHTOOL_GDRVINFO 0x00000003 +#define SOPASS_MAX 6 + +/** + * struct ethtool_wolinfo - Wake-On-Lan configuration + * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL + * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes. + * Read-only. + * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. + * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE + * is set in @wolopts. + */ +struct ethtool_wolinfo { + __u32 cmd; + __u32 supported; + __u32 wolopts; + __u8 sopass[SOPASS_MAX]; +}; + +/* for passing single values */ +struct ethtool_value { + __u32 cmd; + __u32 data; +}; + +#define PFC_STORM_PREVENTION_AUTO 0xffff +#define PFC_STORM_PREVENTION_DISABLE 0 + +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, + ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ + ETHTOOL_TX_COPYBREAK_BUF_SIZE, + /* + * Add your fresh new tunable attribute above and remember to update + * tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_TUNABLE_COUNT, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[]; +}; + +#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff +#define DOWNSHIFT_DEV_DISABLE 0 + +/* Time in msecs after which link is reported as down + * 0 = lowest time supported by the PHY + * 0xff = off, link down detection according to standard + */ +#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 +#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff + +/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where + * the PHY's RX & TX blocks are put into a low-power mode when there is no + * link detected (typically cable is un-plugged). For RX, only a minimal + * link-detection is available, and for TX the PHY wakes up to send link pulses + * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode. + * + * Some PHYs may support configuration of the wake-up interval for TX pulses, + * and some PHYs may support only disabling TX pulses entirely. For the latter + * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be + * configured from userspace (should the user want it). + * + * The interval units for TX wake-up are in milliseconds, since this should + * cover a reasonable range of intervals: + * - from 1 millisecond, which does not sound like much of a power-saver + * - to ~65 seconds which is quite a lot to wait for a link to come up when + * plugging a cable + */ +#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff +#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe +#define ETHTOOL_PHY_EDPD_DISABLE 0 + +enum phy_tunable_id { + ETHTOOL_PHY_ID_UNSPEC, + ETHTOOL_PHY_DOWNSHIFT, + ETHTOOL_PHY_FAST_LINK_DOWN, + ETHTOOL_PHY_EDPD, + /* + * Add your fresh new phy tunable attribute above and remember to update + * phy_tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_PHY_TUNABLE_COUNT, +}; + +/** + * struct ethtool_regs - hardware register dump + * @cmd: Command number = %ETHTOOL_GREGS + * @version: Dump format version. This is driver-specific and may + * distinguish different chips/revisions. Drivers must use new + * version numbers whenever the dump format changes in an + * incompatible way. + * @len: On entry, the real length of @data. On return, the number of + * bytes used. + * @data: Buffer for the register dump + * + * Users should use %ETHTOOL_GDRVINFO to find the maximum length of + * a register dump for the interface. They must allocate the buffer + * immediately following this structure. + */ +struct ethtool_regs { + __u32 cmd; + __u32 version; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eeprom - EEPROM dump + * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or + * %ETHTOOL_SEEPROM + * @magic: A 'magic cookie' value to guard against accidental changes. + * The value passed in to %ETHTOOL_SEEPROM must match the value + * returned by %ETHTOOL_GEEPROM for the same device. This is + * unused when @cmd is %ETHTOOL_GMODULEEEPROM. + * @offset: Offset within the EEPROM to begin reading/writing, in bytes + * @len: On entry, number of bytes to read/write. On successful + * return, number of bytes actually read/written. In case of + * error, this may indicate at what point the error occurred. + * @data: Buffer to read/write from + * + * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find + * the length of an on-board or module EEPROM, respectively. They + * must allocate the buffer immediately following this structure. + */ +struct ethtool_eeprom { + __u32 cmd; + __u32 magic; + __u32 offset; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eee - Energy Efficient Ethernet information + * @cmd: ETHTOOL_{G,S}EEE + * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations + * for which there is EEE support. + * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations + * advertised as eee capable. + * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex + * combinations advertised by the link partner as eee capable. + * @eee_active: Result of the eee auto negotiation. + * @eee_enabled: EEE configured mode (enabled/disabled). + * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given + * that eee was negotiated. + * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting + * its tx lpi (after reaching 'idle' state). Effective only when eee + * was negotiated and tx_lpi_enabled was set. + * @reserved: Reserved for future use; see the note on reserved space. + */ +struct ethtool_eee { + __u32 cmd; + __u32 supported; + __u32 advertised; + __u32 lp_advertised; + __u32 eee_active; + __u32 eee_enabled; + __u32 tx_lpi_enabled; + __u32 tx_lpi_timer; + __u32 reserved[2]; +}; + +/** + * struct ethtool_modinfo - plugin module eeprom information + * @cmd: %ETHTOOL_GMODULEINFO + * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx + * @eeprom_len: Length of the eeprom + * @reserved: Reserved for future use; see the note on reserved space. + * + * This structure is used to return the information to + * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM. + * The type code indicates the eeprom data format + */ +struct ethtool_modinfo { + __u32 cmd; + __u32 type; + __u32 eeprom_len; + __u32 reserved[8]; +}; + +/** + * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates + * @cmd: ETHTOOL_{G,S}COALESCE + * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after + * a packet arrives. + * @rx_max_coalesced_frames: Maximum number of packets to receive + * before an RX interrupt. + * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. + * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after + * a packet is sent. + * @tx_max_coalesced_frames: Maximum number of packets to be sent + * before a TX interrupt. + * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. + * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @stats_block_coalesce_usecs: How many usecs to delay in-memory + * statistics block updates. Some drivers do not have an + * in-memory statistic block, and in such cases this value is + * ignored. This value must not be zero. + * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing. + * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing. + * @pkt_rate_low: Threshold for low packet rate (packets per second). + * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is below @pkt_rate_low. + * @rx_max_coalesced_frames_low: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is below @pkt_rate_low. + * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is below @pkt_rate_low. + * @tx_max_coalesced_frames_low: Maximum nuumber of packets to be sent before + * a TX interrupt, when the packet rate is below @pkt_rate_low. + * @pkt_rate_high: Threshold for high packet rate (packets per second). + * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is above @pkt_rate_high. + * @rx_max_coalesced_frames_high: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is above @pkt_rate_high. + * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is above @pkt_rate_high. + * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before + * a TX interrupt, when the packet rate is above @pkt_rate_high. + * @rate_sample_interval: How often to do adaptive coalescing packet rate + * sampling, measured in seconds. Must not be zero. + * + * Each pair of (usecs, max_frames) fields specifies that interrupts + * should be coalesced until + * (usecs > 0 && time_since_first_completion >= usecs) || + * (max_frames > 0 && completed_frames >= max_frames) + * + * It is illegal to set both usecs and max_frames to zero as this + * would cause interrupts to never be generated. To disable + * coalescing, set usecs = 0 and max_frames = 1. + * + * Some implementations ignore the value of max_frames and use the + * condition time_since_first_completion >= usecs + * + * This is deprecated. Drivers for hardware that does not support + * counting completions should validate that max_frames == !rx_usecs. + * + * Adaptive RX/TX coalescing is an algorithm implemented by some + * drivers to improve latency under low packet rates and improve + * throughput under high packet rates. Some drivers only implement + * one of RX or TX adaptive coalescing. Anything not implemented by + * the driver causes these values to be silently ignored. + * + * When the packet rate is below @pkt_rate_high but above + * @pkt_rate_low (both measured in packets per second) the + * normal {rx,tx}_* coalescing parameters are used. + */ +struct ethtool_coalesce { + __u32 cmd; + __u32 rx_coalesce_usecs; + __u32 rx_max_coalesced_frames; + __u32 rx_coalesce_usecs_irq; + __u32 rx_max_coalesced_frames_irq; + __u32 tx_coalesce_usecs; + __u32 tx_max_coalesced_frames; + __u32 tx_coalesce_usecs_irq; + __u32 tx_max_coalesced_frames_irq; + __u32 stats_block_coalesce_usecs; + __u32 use_adaptive_rx_coalesce; + __u32 use_adaptive_tx_coalesce; + __u32 pkt_rate_low; + __u32 rx_coalesce_usecs_low; + __u32 rx_max_coalesced_frames_low; + __u32 tx_coalesce_usecs_low; + __u32 tx_max_coalesced_frames_low; + __u32 pkt_rate_high; + __u32 rx_coalesce_usecs_high; + __u32 rx_max_coalesced_frames_high; + __u32 tx_coalesce_usecs_high; + __u32 tx_max_coalesced_frames_high; + __u32 rate_sample_interval; +}; + +/** + * struct ethtool_ringparam - RX/TX ring parameters + * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM + * @rx_max_pending: Maximum supported number of pending entries per + * RX ring. Read-only. + * @rx_mini_max_pending: Maximum supported number of pending entries + * per RX mini ring. Read-only. + * @rx_jumbo_max_pending: Maximum supported number of pending entries + * per RX jumbo ring. Read-only. + * @tx_max_pending: Maximum supported number of pending entries per + * TX ring. Read-only. + * @rx_pending: Current maximum number of pending entries per RX ring + * @rx_mini_pending: Current maximum number of pending entries per RX + * mini ring + * @rx_jumbo_pending: Current maximum number of pending entries per RX + * jumbo ring + * @tx_pending: Current maximum supported number of pending entries + * per TX ring + * + * If the interface does not have separate RX mini and/or jumbo rings, + * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0. + * + * There may also be driver-dependent minimum values for the number + * of entries per ring. + */ +struct ethtool_ringparam { + __u32 cmd; + __u32 rx_max_pending; + __u32 rx_mini_max_pending; + __u32 rx_jumbo_max_pending; + __u32 tx_max_pending; + __u32 rx_pending; + __u32 rx_mini_pending; + __u32 rx_jumbo_pending; + __u32 tx_pending; +}; + +/** + * struct ethtool_channels - configuring number of network channel + * @cmd: ETHTOOL_{G,S}CHANNELS + * @max_rx: Read only. Maximum number of receive channel the driver support. + * @max_tx: Read only. Maximum number of transmit channel the driver support. + * @max_other: Read only. Maximum number of other channel the driver support. + * @max_combined: Read only. Maximum number of combined channel the driver + * support. Set of queues RX, TX or other. + * @rx_count: Valid values are in the range 1 to the max_rx. + * @tx_count: Valid values are in the range 1 to the max_tx. + * @other_count: Valid values are in the range 1 to the max_other. + * @combined_count: Valid values are in the range 1 to the max_combined. + * + * This can be used to configure RX, TX and other channels. + */ + +struct ethtool_channels { + __u32 cmd; + __u32 max_rx; + __u32 max_tx; + __u32 max_other; + __u32 max_combined; + __u32 rx_count; + __u32 tx_count; + __u32 other_count; + __u32 combined_count; +}; + +/** + * struct ethtool_pauseparam - Ethernet pause (flow control) parameters + * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM + * @autoneg: Flag to enable autonegotiation of pause frame use + * @rx_pause: Flag to enable reception of pause frames + * @tx_pause: Flag to enable transmission of pause frames + * + * Drivers should reject a non-zero setting of @autoneg when + * autoneogotiation is disabled (or not supported) for the link. + * + * If the link is autonegotiated, drivers should use + * mii_advertise_flowctrl() or similar code to set the advertised + * pause frame capabilities based on the @rx_pause and @tx_pause flags, + * even if @autoneg is zero. They should also allow the advertised + * pause frame capabilities to be controlled directly through the + * advertising field of &struct ethtool_cmd. + * + * If @autoneg is non-zero, the MAC is configured to send and/or + * receive pause frames according to the result of autonegotiation. + * Otherwise, it is configured directly based on the @rx_pause and + * @tx_pause flags. + */ +struct ethtool_pauseparam { + __u32 cmd; + __u32 autoneg; + __u32 rx_pause; + __u32 tx_pause; +}; + +/* Link extended state */ +enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_STATE_NO_CABLE, + ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, + ETHTOOL_LINK_EXT_STATE_MODULE, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ +enum ethtool_link_ext_substate_autoneg { + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. + */ +enum ethtool_link_ext_substate_link_training { + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. + */ +enum ethtool_link_ext_substate_link_logical_mismatch { + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. + */ +enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ +enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */ +enum ethtool_link_ext_substate_module { + ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, +}; + +#define ETH_GSTRING_LEN 32 + +/** + * enum ethtool_stringset - string set ID + * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST + * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS + * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with + * %ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS + * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE; + * now deprecated + * @ETH_SS_FEATURES: Device feature names + * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names + * @ETH_SS_TUNABLES: tunable names + * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS + * @ETH_SS_PHY_TUNABLES: PHY tunable names + * @ETH_SS_LINK_MODES: link mode names + * @ETH_SS_MSG_CLASSES: debug message class names + * @ETH_SS_WOL_MODES: wake-on-lan modes + * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags + * @ETH_SS_TS_TX_TYPES: timestamping Tx types + * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters + * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types + * @ETH_SS_STATS_STD: standardized stats + * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics + * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics + * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics + * @ETH_SS_STATS_RMON: names of RMON statistics + * + * @ETH_SS_COUNT: number of defined string sets + */ +enum ethtool_stringset { + ETH_SS_TEST = 0, + ETH_SS_STATS, + ETH_SS_PRIV_FLAGS, + ETH_SS_NTUPLE_FILTERS, + ETH_SS_FEATURES, + ETH_SS_RSS_HASH_FUNCS, + ETH_SS_TUNABLES, + ETH_SS_PHY_STATS, + ETH_SS_PHY_TUNABLES, + ETH_SS_LINK_MODES, + ETH_SS_MSG_CLASSES, + ETH_SS_WOL_MODES, + ETH_SS_SOF_TIMESTAMPING, + ETH_SS_TS_TX_TYPES, + ETH_SS_TS_RX_FILTERS, + ETH_SS_UDP_TUNNEL_TYPES, + ETH_SS_STATS_STD, + ETH_SS_STATS_ETH_PHY, + ETH_SS_STATS_ETH_MAC, + ETH_SS_STATS_ETH_CTRL, + ETH_SS_STATS_RMON, + + /* add new constants above here */ + ETH_SS_COUNT +}; + +/** + * enum ethtool_mac_stats_src - source of ethtool MAC statistics + * @ETHTOOL_MAC_STATS_SRC_AGGREGATE: + * if device supports a MAC merge layer, this retrieves the aggregate + * statistics of the eMAC and pMAC. Otherwise, it retrieves just the + * statistics of the single (express) MAC. + * @ETHTOOL_MAC_STATS_SRC_EMAC: + * if device supports a MM layer, this retrieves the eMAC statistics. + * Otherwise, it retrieves the statistics of the single (express) MAC. + * @ETHTOOL_MAC_STATS_SRC_PMAC: + * if device supports a MM layer, this retrieves the pMAC statistics. + */ +enum ethtool_mac_stats_src { + ETHTOOL_MAC_STATS_SRC_AGGREGATE, + ETHTOOL_MAC_STATS_SRC_EMAC, + ETHTOOL_MAC_STATS_SRC_PMAC, +}; + +/** + * enum ethtool_module_power_mode_policy - plug-in module power mode policy + * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode. + * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host + * to high power mode when the first port using it is put administratively + * up and to low power mode when the last port using it is put + * administratively down. + */ +enum ethtool_module_power_mode_policy { + ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, + ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, +}; + +/** + * enum ethtool_module_power_mode - plug-in module power mode + * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. + * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. + */ +enum ethtool_module_power_mode { + ETHTOOL_MODULE_POWER_MODE_LOW = 1, + ETHTOOL_MODULE_POWER_MODE_HIGH, +}; + +/** + * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are + * unknown + * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled + * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled + */ +enum ethtool_podl_pse_admin_state { + ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, + ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, +}; + +/** + * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is + * asserted true when the PoDL PSE state diagram variable mr_pse_enable is + * false" + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is + * asserted true when either of the PSE state diagram variables + * pi_detecting or pi_classifying is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” + * is asserted true when the PoDL PSE state diagram variable pi_powered is + * true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted + * true when the PoDL PSE state diagram variable pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true + * when the logical combination of the PoDL PSE state diagram variables + * pi_prebiased*!pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted + * true when the PoDL PSE state diagram variable overload_held is true." + */ +enum ethtool_podl_pse_pw_d_status { + ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, + ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, + ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, + ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, + ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, + ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, + ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, +}; + +/** + * enum ethtool_mm_verify_status - status of MAC Merge Verify function + * @ETHTOOL_MM_VERIFY_STATUS_UNKNOWN: + * verification status is unknown + * @ETHTOOL_MM_VERIFY_STATUS_INITIAL: + * the 802.3 Verify State diagram is in the state INIT_VERIFICATION + * @ETHTOOL_MM_VERIFY_STATUS_VERIFYING: + * the Verify State diagram is in the state VERIFICATION_IDLE, + * SEND_VERIFY or WAIT_FOR_RESPONSE + * @ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: + * indicates that the Verify State diagram is in the state VERIFIED + * @ETHTOOL_MM_VERIFY_STATUS_FAILED: + * the Verify State diagram is in the state VERIFY_FAIL + * @ETHTOOL_MM_VERIFY_STATUS_DISABLED: + * verification of preemption operation is disabled + */ +enum ethtool_mm_verify_status { + ETHTOOL_MM_VERIFY_STATUS_UNKNOWN, + ETHTOOL_MM_VERIFY_STATUS_INITIAL, + ETHTOOL_MM_VERIFY_STATUS_VERIFYING, + ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED, + ETHTOOL_MM_VERIFY_STATUS_FAILED, + ETHTOOL_MM_VERIFY_STATUS_DISABLED, +}; + +/** + * struct ethtool_gstrings - string set for data tagging + * @cmd: Command number = %ETHTOOL_GSTRINGS + * @string_set: String set ID; one of &enum ethtool_stringset + * @len: On return, the number of strings in the string set + * @data: Buffer for strings. Each string is null-padded to a size of + * %ETH_GSTRING_LEN. + * + * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in + * the string set. They must allocate a buffer of the appropriate + * size immediately following this structure. + */ +struct ethtool_gstrings { + __u32 cmd; + __u32 string_set; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_sset_info - string set information + * @cmd: Command number = %ETHTOOL_GSSET_INFO + * @reserved: Reserved for future use; see the note on reserved space. + * @sset_mask: On entry, a bitmask of string sets to query, with bits + * numbered according to &enum ethtool_stringset. On return, a + * bitmask of those string sets queried that are supported. + * @data: Buffer for string set sizes. On return, this contains the + * size of each string set that was queried and supported, in + * order of ID. + * + * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on + * return @sset_mask == 0x6 (sets 1, 2). Then @data[0] contains the + * size of set 1 and @data[1] contains the size of set 2. + * + * Users must allocate a buffer of the appropriate size (4 * number of + * sets queried) immediately following this structure. + */ +struct ethtool_sset_info { + __u32 cmd; + __u32 reserved; + __u64 sset_mask; + __u32 data[]; +}; + +/** + * enum ethtool_test_flags - flags definition of ethtool_test + * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise + * only online tests. + * @ETH_TEST_FL_FAILED: Driver set this flag if test fails. + * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback + * test. + * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test + */ + +enum ethtool_test_flags { + ETH_TEST_FL_OFFLINE = (1 << 0), + ETH_TEST_FL_FAILED = (1 << 1), + ETH_TEST_FL_EXTERNAL_LB = (1 << 2), + ETH_TEST_FL_EXTERNAL_LB_DONE = (1 << 3), +}; + +/** + * struct ethtool_test - device self-test invocation + * @cmd: Command number = %ETHTOOL_TEST + * @flags: A bitmask of flags from &enum ethtool_test_flags. Some + * flags may be set by the user on entry; others may be set by + * the driver on return. + * @reserved: Reserved for future use; see the note on reserved space. + * @len: On return, the number of test results + * @data: Array of test results + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of test results that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of results) immediately + * following this structure. + */ +struct ethtool_test { + __u32 cmd; + __u32 flags; + __u32 reserved; + __u32 len; + __u64 data[]; +}; + +/** + * struct ethtool_stats - device-specific statistics + * @cmd: Command number = %ETHTOOL_GSTATS + * @n_stats: On return, the number of statistics + * @data: Array of statistics + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of statistics that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of statistics) + * immediately following this structure. + */ +struct ethtool_stats { + __u32 cmd; + __u32 n_stats; + __u64 data[]; +}; + +/** + * struct ethtool_perm_addr - permanent hardware address + * @cmd: Command number = %ETHTOOL_GPERMADDR + * @size: On entry, the size of the buffer. On return, the size of the + * address. The command fails if the buffer is too small. + * @data: Buffer for the address + * + * Users must allocate the buffer immediately following this structure. + * A buffer size of %MAX_ADDR_LEN should be sufficient for any address + * type. + */ +struct ethtool_perm_addr { + __u32 cmd; + __u32 size; + __u8 data[]; +}; +/* boolean flags controlling per-interface behavior characteristics. + * When reading, the flag indicates whether or not a certain behavior + * is enabled/present. When writing, the flag indicates whether + * or not the driver should turn on (set) or off (clear) a behavior. + * + * Some behaviors may read-only (unconditionally absent or present). + * If such is the case, return EINVAL in the set-flags operation if the + * flag differs from the read-only value. + */ +enum ethtool_flags { + ETH_FLAG_TXVLAN = (1 << 7), /* TX VLAN offload enabled */ + ETH_FLAG_RXVLAN = (1 << 8), /* RX VLAN offload enabled */ + ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */ + ETH_FLAG_NTUPLE = (1 << 27), /* N-tuple filters enabled */ + ETH_FLAG_RXHASH = (1 << 28), +}; + +/* The following structures are for supporting RX network flow + * classification and RX n-tuple configuration. Note, all multibyte + * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc. are expected to + * be in network byte order. + */ + +/** + * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc. + * @ip4src: Source host + * @ip4dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tos: Type-of-service + * + * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow. + */ +struct ethtool_tcpip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be16 psrc; + __be16 pdst; + __u8 tos; +}; + +/** + * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @spi: Security parameters index + * @tos: Type-of-service + * + * This can be used to specify an IPsec transport or tunnel over IPv4. + */ +struct ethtool_ah_espip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 spi; + __u8 tos; +}; + +#define ETH_RX_NFC_IP4 1 + +/** + * struct ethtool_usrip4_spec - general flow specification for IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tos: Type-of-service + * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0 + * @proto: Transport protocol number; mask must be 0 + */ +struct ethtool_usrip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 l4_4_bytes; + __u8 tos; + __u8 ip_ver; + __u8 proto; +}; + +/** + * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. + * @ip6src: Source host + * @ip6dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tclass: Traffic Class + * + * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. + */ +struct ethtool_tcpip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be16 psrc; + __be16 pdst; + __u8 tclass; +}; + +/** + * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @spi: Security parameters index + * @tclass: Traffic Class + * + * This can be used to specify an IPsec transport or tunnel over IPv6. + */ +struct ethtool_ah_espip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 spi; + __u8 tclass; +}; + +/** + * struct ethtool_usrip6_spec - general flow specification for IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tclass: Traffic Class + * @l4_proto: Transport protocol number (nexthdr after any Extension Headers) + */ +struct ethtool_usrip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 l4_4_bytes; + __u8 tclass; + __u8 l4_proto; +}; + +union ethtool_flow_union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethtool_tcpip6_spec tcp_ip6_spec; + struct ethtool_tcpip6_spec udp_ip6_spec; + struct ethtool_tcpip6_spec sctp_ip6_spec; + struct ethtool_ah_espip6_spec ah_ip6_spec; + struct ethtool_ah_espip6_spec esp_ip6_spec; + struct ethtool_usrip6_spec usr_ip6_spec; + struct ethhdr ether_spec; + __u8 hdata[52]; +}; + +/** + * struct ethtool_flow_ext - additional RX flow fields + * @h_dest: destination MAC address + * @vlan_etype: VLAN EtherType + * @vlan_tci: VLAN tag control information + * @data: user defined data + * @padding: Reserved for future use; see the note on reserved space. + * + * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT + * is set in &struct ethtool_rx_flow_spec @flow_type. + * @h_dest is valid if %FLOW_MAC_EXT is set. + */ +struct ethtool_flow_ext { + __u8 padding[2]; + unsigned char h_dest[ETH_ALEN]; + __be16 vlan_etype; + __be16 vlan_tci; + __be32 data[2]; +}; + +/** + * struct ethtool_rx_flow_spec - classification rule for RX flows + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow fields to match (dependent on @flow_type) + * @h_ext: Additional fields to match + * @m_u: Masks for flow field bits to be matched + * @m_ext: Masks for additional field bits to be matched + * Note, all additional fields must be ignored unless @flow_type + * includes the %FLOW_EXT or %FLOW_MAC_EXT flag + * (see &struct ethtool_flow_ext description). + * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC + * if packets should be discarded, or %RX_CLS_FLOW_WAKE if the + * packets should be used for Wake-on-LAN with %WAKE_FILTER + * @location: Location of rule in the table. Locations must be + * numbered such that a flow matching multiple rules will be + * classified according to the first (lowest numbered) rule. + */ +struct ethtool_rx_flow_spec { + __u32 flow_type; + union ethtool_flow_union h_u; + struct ethtool_flow_ext h_ext; + union ethtool_flow_union m_u; + struct ethtool_flow_ext m_ext; + __u64 ring_cookie; + __u32 location; +}; + +/* How rings are laid out when accessing virtual functions or + * offloaded queues is device specific. To allow users to do flow + * steering and specify these queues the ring cookie is partitioned + * into a 32bit queue index with an 8 bit virtual function id. + * This also leaves the 3bytes for further specifiers. It is possible + * future devices may support more than 256 virtual functions if + * devices start supporting PCIe w/ARI. However at the moment I + * do not know of any devices that support this so I do not reserve + * space for this at this time. If a future patch consumes the next + * byte it should be aware of this possibility. + */ +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +} + +static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +} + +/** + * struct ethtool_rxnfc - command to get or set RX flow classification rules + * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, + * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE, + * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS + * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW + * @data: Command-dependent value + * @fs: Flow classification rule + * @rss_context: RSS context to be affected + * @rule_cnt: Number of rules to be affected + * @rule_locs: Array of used rule locations + * + * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating + * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following + * structure fields must not be used, except that if @flow_type includes + * the %FLOW_RSS flag, then @rss_context determines which RSS context to + * act on. + * + * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues + * on return. + * + * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined + * rules on return. If @data is non-zero on return then it is the + * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the + * driver supports any special location values. If that flag is not + * set in @data then special location values should not be used. + * + * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an + * existing rule on entry and @fs contains the rule on return; if + * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is + * filled with the RSS context ID associated with the rule. + * + * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the + * user buffer for @rule_locs on entry. On return, @data is the size + * of the rule table, @rule_cnt is the number of defined rules, and + * @rule_locs contains the locations of the defined rules. Drivers + * must use the second parameter to get_rxnfc() instead of @rule_locs. + * + * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. + * @fs.@location either specifies the location to use or is a special + * location value with %RX_CLS_LOC_SPECIAL flag set. On return, + * @fs.@location is the actual rule location. If @fs.@flow_type + * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to + * use for flow spreading traffic which matches this rule. The value + * from the rxfh indirection table will be added to @fs.@ring_cookie + * to choose which ring to deliver to. + * + * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an + * existing rule on entry. + * + * A driver supporting the special location values for + * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused + * location, and may remove a rule at a later location (lower + * priority) that matches exactly the same set of flows. The special + * values are %RX_CLS_LOC_ANY, selecting any location; + * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum + * priority); and %RX_CLS_LOC_LAST, selecting the last suitable + * location (minimum priority). Additional special values may be + * defined in future and drivers must return -%EINVAL for any + * unrecognised value. + */ +struct ethtool_rxnfc { + __u32 cmd; + __u32 flow_type; + __u64 data; + struct ethtool_rx_flow_spec fs; + union { + __u32 rule_cnt; + __u32 rss_context; + }; + __u32 rule_locs[]; +}; + + +/** + * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection + * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR + * @size: On entry, the array size of the user buffer, which may be zero. + * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware + * indirection table. + * @ring_index: RX ring/queue index for each hash value + * + * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size + * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means + * the table should be reset to default values. This last feature + * is not supported by the original implementations. + */ +struct ethtool_rxfh_indir { + __u32 cmd; + __u32 size; + __u32 ring_index[]; +}; + +/** + * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. + * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command + * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. + * @indir_size: On entry, the array size of the user buffer for the + * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), + * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, + * the array size of the hardware indirection table. + * @key_size: On entry, the array size of the user buffer for the hash key, + * which may be zero. On return from %ETHTOOL_GRSSH, the size of the + * hardware hash key. + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. + * @rsvd8: Reserved for future use; see the note on reserved space. + * @rsvd32: Reserved for future use; see the note on reserved space. + * @rss_config: RX ring/queue index for each hash value i.e., indirection table + * of @indir_size __u32 elements, followed by hash key of @key_size + * bytes. + * + * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the + * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of + * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested + * and a @indir_size of zero means the indir table should be reset to default + * values (if @rss_context == 0) or that the RSS context should be deleted. + * An hfunc of zero means that hash function setting is not requested. + */ +struct ethtool_rxfh { + __u32 cmd; + __u32 rss_context; + __u32 indir_size; + __u32 key_size; + __u8 hfunc; + __u8 input_xfrm; + __u8 rsvd8[2]; + __u32 rsvd32; + __u32 rss_config[]; +}; +#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff +#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff + +/** + * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow field values to match (dependent on @flow_type) + * @m_u: Masks for flow field value bits to be ignored + * @vlan_tag: VLAN tag to match + * @vlan_tag_mask: Mask for VLAN tag bits to be ignored + * @data: Driver-dependent data to match + * @data_mask: Mask for driver-dependent data bits to be ignored + * @action: RX ring/queue index to deliver to (non-negative) or other action + * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP) + * + * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where + * a field value and mask are both zero this is treated as if all mask + * bits are set i.e. the field is ignored. + */ +struct ethtool_rx_ntuple_flow_spec { + __u32 flow_type; + union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethhdr ether_spec; + __u8 hdata[72]; + } h_u, m_u; + + __u16 vlan_tag; + __u16 vlan_tag_mask; + __u64 data; + __u64 data_mask; + + __s32 action; +#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */ +#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */ +}; + +/** + * struct ethtool_rx_ntuple - command to set or clear RX flow filter + * @cmd: Command number - %ETHTOOL_SRXNTUPLE + * @fs: Flow filter specification + */ +struct ethtool_rx_ntuple { + __u32 cmd; + struct ethtool_rx_ntuple_flow_spec fs; +}; + +#define ETHTOOL_FLASH_MAX_FILENAME 128 +enum ethtool_flash_op_type { + ETHTOOL_FLASH_ALL_REGIONS = 0, +}; + +/* for passing firmware flashing related parameters */ +struct ethtool_flash { + __u32 cmd; + __u32 region; + char data[ETHTOOL_FLASH_MAX_FILENAME]; +}; + +/** + * struct ethtool_dump - used for retrieving, setting device dump + * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or + * %ETHTOOL_SET_DUMP + * @version: FW version of the dump, filled in by driver + * @flag: driver dependent flag for dump setting, filled in by driver during + * get and filled in by ethtool for set operation. + * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when + * firmware dump is disabled. + * @len: length of dump data, used as the length of the user buffer on entry to + * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver + * for %ETHTOOL_GET_DUMP_FLAG command + * @data: data collected for get dump data operation + */ +struct ethtool_dump { + __u32 cmd; + __u32 version; + __u32 flag; + __u32 len; + __u8 data[]; +}; + +#define ETH_FW_DUMP_DISABLE 0 + +/* for returning and changing feature sets */ + +/** + * struct ethtool_get_features_block - block with state of 32 features + * @available: mask of changeable features + * @requested: mask of features requested to be enabled if possible + * @active: mask of currently enabled features + * @never_changed: mask of features not changeable for any device + */ +struct ethtool_get_features_block { + __u32 available; + __u32 requested; + __u32 active; + __u32 never_changed; +}; + +/** + * struct ethtool_gfeatures - command to get state of device's features + * @cmd: command number = %ETHTOOL_GFEATURES + * @size: On entry, the number of elements in the features[] array; + * on return, the number of elements in features[] needed to hold + * all features + * @features: state of features + */ +struct ethtool_gfeatures { + __u32 cmd; + __u32 size; + struct ethtool_get_features_block features[]; +}; + +/** + * struct ethtool_set_features_block - block with request for 32 features + * @valid: mask of features to be changed + * @requested: values of features to be changed + */ +struct ethtool_set_features_block { + __u32 valid; + __u32 requested; +}; + +/** + * struct ethtool_sfeatures - command to request change in device's features + * @cmd: command number = %ETHTOOL_SFEATURES + * @size: array size of the features[] array + * @features: feature change masks + */ +struct ethtool_sfeatures { + __u32 cmd; + __u32 size; + struct ethtool_set_features_block features[]; +}; + +/** + * struct ethtool_ts_info - holds a device's timestamping and PHC association + * @cmd: command number = %ETHTOOL_GET_TS_INFO + * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags + * @phc_index: device index of the associated PHC, or -1 if there is none + * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values + * @tx_reserved: Reserved for future use; see the note on reserved space. + * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values + * @rx_reserved: Reserved for future use; see the note on reserved space. + * + * The bits in the 'tx_types' and 'rx_filters' fields correspond to + * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values, + * respectively. For example, if the device supports HWTSTAMP_TX_ON, + * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set. + * + * Drivers should only report the filters they actually support without + * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWSTAMP request for + * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the + * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op. + */ +struct ethtool_ts_info { + __u32 cmd; + __u32 so_timestamping; + __s32 phc_index; + __u32 tx_types; + __u32 tx_reserved[3]; + __u32 rx_filters; + __u32 rx_reserved[3]; +}; + +/* + * %ETHTOOL_SFEATURES changes features present in features[].valid to the + * values of corresponding bits in features[].requested. Bits in .requested + * not set in .valid or not changeable are ignored. + * + * Returns %EINVAL when .valid contains undefined or never-changeable bits + * or size is not equal to required number of features words (32-bit blocks). + * Returns >= 0 if request was completed; bits set in the value mean: + * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not + * changeable (not present in %ETHTOOL_GFEATURES' features[].available) + * those bits were ignored. + * %ETHTOOL_F_WISH - some or all changes requested were recorded but the + * resulting state of bits masked by .valid is not equal to .requested. + * Probably there are other device-specific constraints on some features + * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered + * here as though ignored bits were cleared. + * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling + * compatibility functions. Requested offload state cannot be properly + * managed by kernel. + * + * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of + * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands + * for ETH_SS_FEATURES string set. First entry in the table corresponds to least + * significant bit in features[0] fields. Empty strings mark undefined features. + */ +enum ethtool_sfeatures_retval_bits { + ETHTOOL_F_UNSUPPORTED__BIT, + ETHTOOL_F_WISH__BIT, + ETHTOOL_F_COMPAT__BIT, +}; + +#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) +#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) +#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) + +#define MAX_NUM_QUEUE 4096 + +/** + * struct ethtool_per_queue_op - apply sub command to the queues in mask. + * @cmd: ETHTOOL_PERQUEUE + * @sub_command: the sub command which apply to each queues + * @queue_mask: Bitmap of the queues which sub command apply to + * @data: A complete command structure following for each of the queues addressed + */ +struct ethtool_per_queue_op { + __u32 cmd; + __u32 sub_command; + __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; + char data[]; +}; + +/** + * struct ethtool_fecparam - Ethernet Forward Error Correction parameters + * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM + * @active_fec: FEC mode which is active on the port, single bit set, GET only. + * @fec: Bitmask of configured FEC modes. + * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. + * + * Note that @reserved was never validated on input and ethtool user space + * left it uninitialized when calling SET. Hence going forward it can only be + * used to return a value to userspace with GET. + * + * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. + * FEC settings are configured by link autonegotiation whenever it's enabled. + * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. + * + * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. + * It is recommended that drivers only accept a single bit set in @fec. + * When multiple bits are set in @fec drivers may pick mode in an implementation + * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other + * FEC modes, because it's unclear whether in this case other modes constrain + * AUTO or are independent choices. + * Drivers must reject SET requests if they support none of the requested modes. + * + * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead + * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. + * + * See enum ethtool_fec_config_bits for definition of valid bits for both + * @fec and @active_fec. + */ +struct ethtool_fecparam { + __u32 cmd; + /* bitmask of FEC modes */ + __u32 active_fec; + __u32 fec; + __u32 reserved; +}; + +/** + * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration + * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. + * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually + * based link mode and SFP parameters read from module's + * EEPROM. This bit does _not_ mean autonegotiation. + * @ETHTOOL_FEC_OFF_BIT: No FEC Mode + * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) + */ +enum ethtool_fec_config_bits { + ETHTOOL_FEC_NONE_BIT, + ETHTOOL_FEC_AUTO_BIT, + ETHTOOL_FEC_OFF_BIT, + ETHTOOL_FEC_RS_BIT, + ETHTOOL_FEC_BASER_BIT, + ETHTOOL_FEC_LLRS_BIT, +}; + +#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) +#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) +#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) +#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) +#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) +#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) + +/* CMDs currently supported */ +#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. + * Please use ETHTOOL_GLINKSETTINGS + */ +#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings. + * Please use ETHTOOL_SLINKSETTINGS + */ +#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */ +#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ +#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ +#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */ +#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ +#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ +#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ +/* Get link status for host, i.e. whether the interface *and* the + * physical port (if there is one) are up (ethtool_value). */ +#define ETHTOOL_GLINK 0x0000000a +#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ +#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ +#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ +#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ +#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ +#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */ +#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ +#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable + * (ethtool_value) */ +#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable + * (ethtool_value). */ +#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ +#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ +#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ +#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ +#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ +#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ +#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ +#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ +#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ +#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ + +#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ +#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ +#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ +#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ +#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */ +#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ +#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ +#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ +#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ +#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ +#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ +#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ +#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */ +#define ETHTOOL_GRXNTUPLE 0x00000036 /* deprecated */ +#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */ +#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ +#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ + +#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ +#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */ +#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */ +#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */ +#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */ +#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ +#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ +#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ +#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ +#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ + +#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ +#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ +#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ + +#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ + +#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ +#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ +#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ +#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ + +/* compatibility with older code */ +#define SPARC_ETH_GSET ETHTOOL_GSET +#define SPARC_ETH_SSET ETHTOOL_SSET + +/* Link mode bit indices */ +enum ethtool_link_mode_bit_indices { + ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, + ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, + ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, + ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, + ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, + ETHTOOL_LINK_MODE_Autoneg_BIT = 6, + ETHTOOL_LINK_MODE_TP_BIT = 7, + ETHTOOL_LINK_MODE_AUI_BIT = 8, + ETHTOOL_LINK_MODE_MII_BIT = 9, + ETHTOOL_LINK_MODE_FIBRE_BIT = 10, + ETHTOOL_LINK_MODE_BNC_BIT = 11, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, + ETHTOOL_LINK_MODE_Pause_BIT = 13, + ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, + ETHTOOL_LINK_MODE_Backplane_BIT = 16, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, + ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, + ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, + ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, + ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, + ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, + + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit + * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* + * macro for bits > 31. The only way to use indices > 31 is to + * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. + */ + + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, + ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, + ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, + ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, + ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, + ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61, + ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62, + ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63, + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, + ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, + ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, + ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, + ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, + ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, + ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, + ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, + ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, + ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, + ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, + ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, + ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, + ETHTOOL_LINK_MODE_10baseT1S_Full_BIT = 99, + ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, + ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, + + /* must be last entry */ + __ETHTOOL_LINK_MODE_MASK_NBITS +}; + +#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ + (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) + +/* DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new SUPPORTED_* macro for bits > 31. + */ +#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new SUPPORTED_* macro for bits > 31, see + * notice above. + */ + +/* + * DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new ADERTISE_* macro for bits > 31. + */ +#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new ADVERTISED_* macro for bits > 31, see + * notice above. + */ + +/* The following are all involved in forcing a particular link + * mode for the device for setting things. When getting the + * devices settings, these indicate the current mode and whether + * it was forced up into this mode or autonegotiated. + */ + +/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. + * Update drivers/net/phy/phy.c:phy_speed_to_str() and + * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values. + */ +#define SPEED_10 10 +#define SPEED_100 100 +#define SPEED_1000 1000 +#define SPEED_2500 2500 +#define SPEED_5000 5000 +#define SPEED_10000 10000 +#define SPEED_14000 14000 +#define SPEED_20000 20000 +#define SPEED_25000 25000 +#define SPEED_40000 40000 +#define SPEED_50000 50000 +#define SPEED_56000 56000 +#define SPEED_100000 100000 +#define SPEED_200000 200000 +#define SPEED_400000 400000 +#define SPEED_800000 800000 + +#define SPEED_UNKNOWN -1 + +static inline int ethtool_validate_speed(__u32 speed) +{ + return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN; +} + +/* Duplex, half or full. */ +#define DUPLEX_HALF 0x00 +#define DUPLEX_FULL 0x01 +#define DUPLEX_UNKNOWN 0xff + +static inline int ethtool_validate_duplex(__u8 duplex) +{ + switch (duplex) { + case DUPLEX_HALF: + case DUPLEX_FULL: + case DUPLEX_UNKNOWN: + return 1; + } + + return 0; +} + +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + +/* These are used to throttle the rate of data on the phy interface when the + * native speed of the interface is higher than the link speed. These should + * not be used for phy interfaces which natively support multiple speeds (e.g. + * MII or SGMII). + */ +/* No rate matching performed. */ +#define RATE_MATCH_NONE 0 +/* The phy sends pause frames to throttle the MAC. */ +#define RATE_MATCH_PAUSE 1 +/* The phy asserts CRS to prevent the MAC from transmitting. */ +#define RATE_MATCH_CRS 2 +/* The MAC is programmed with a sufficiently-large IPG. */ +#define RATE_MATCH_OPEN_LOOP 3 + +/* Which connector port. */ +#define PORT_TP 0x00 +#define PORT_AUI 0x01 +#define PORT_MII 0x02 +#define PORT_FIBRE 0x03 +#define PORT_BNC 0x04 +#define PORT_DA 0x05 +#define PORT_NONE 0xef +#define PORT_OTHER 0xff + +/* Which transceiver to use. */ +#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ +#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ +#define XCVR_DUMMY1 0x02 +#define XCVR_DUMMY2 0x03 +#define XCVR_DUMMY3 0x04 + +/* Enable or disable autonegotiation. */ +#define AUTONEG_DISABLE 0x00 +#define AUTONEG_ENABLE 0x01 + +/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then + * the driver is required to renegotiate link + */ +#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */ +#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */ +#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */ +#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */ + +/* Wake-On-Lan options. */ +#define WAKE_PHY (1 << 0) +#define WAKE_UCAST (1 << 1) +#define WAKE_MCAST (1 << 2) +#define WAKE_BCAST (1 << 3) +#define WAKE_ARP (1 << 4) +#define WAKE_MAGIC (1 << 5) +#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ +#define WAKE_FILTER (1 << 7) + +#define WOL_MODE_COUNT 8 + +/* RSS hash function data + * XOR the corresponding source and destination fields of each specified + * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH + * calculation. Note that this XORing reduces the input set entropy and could + * be exploited to reduce the RSS queue spread. + */ +#define RXH_XFRM_SYM_XOR (1 << 0) +#define RXH_XFRM_NO_CHANGE 0xff + +/* L2-L4 network traffic flow types */ +#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ +#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ +#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ +#define AH_ESP_V4_FLOW 0x04 /* hash only */ +#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ +#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ +#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ +#define AH_ESP_V6_FLOW 0x08 /* hash only */ +#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ +#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ +#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ +#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#define IP_USER_FLOW IPV4_USER_FLOW +#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ +#define IPV4_FLOW 0x10 /* hash only */ +#define IPV6_FLOW 0x11 /* hash only */ +#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ + +/* Used for GTP-U IPv4 and IPv6. + * The format of GTP packets only includes + * elements such as TEID and GTP version. + * It is primarily intended for data communication of the UE. + */ +#define GTPU_V4_FLOW 0x13 /* hash only */ +#define GTPU_V6_FLOW 0x14 /* hash only */ + +/* Use for GTP-C IPv4 and v6. + * The format of these GTP packets does not include TEID. + * Primarily expected to be used for communication + * to create sessions for UE data communication, + * commonly referred to as CSR (Create Session Request). + */ +#define GTPC_V4_FLOW 0x15 /* hash only */ +#define GTPC_V6_FLOW 0x16 /* hash only */ + +/* Use for GTP-C IPv4 and v6. + * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. + * After session creation, it becomes this packet. + * This is mainly used for requests to realize UE handover. + */ +#define GTPC_TEID_V4_FLOW 0x17 /* hash only */ +#define GTPC_TEID_V6_FLOW 0x18 /* hash only */ + +/* Use for GTP-U and extended headers for the PSC (PDU Session Container). + * The format of these GTP packets includes TEID and QFI. + * In 5G communication using UPF (User Plane Function), + * data communication with this extended header is performed. + */ +#define GTPU_EH_V4_FLOW 0x19 /* hash only */ +#define GTPU_EH_V6_FLOW 0x1a /* hash only */ + +/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. + * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by + * UL/DL included in the PSC. + * There are differences in the data included based on Downlink/Uplink, + * and can be used to distinguish packets. + * The functions described so far are useful when you want to + * handle communication from the mobile network in UPF, PGW, etc. + */ +#define GTPU_UL_V4_FLOW 0x1b /* hash only */ +#define GTPU_UL_V6_FLOW 0x1c /* hash only */ +#define GTPU_DL_V4_FLOW 0x1d /* hash only */ +#define GTPU_DL_V6_FLOW 0x1e /* hash only */ + +/* Flag to enable additional fields in struct ethtool_rx_flow_spec */ +#define FLOW_EXT 0x80000000 +#define FLOW_MAC_EXT 0x40000000 +/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ +#define FLOW_RSS 0x20000000 + +/* L3-L4 network traffic flow hash options */ +#define RXH_L2DA (1 << 1) +#define RXH_VLAN (1 << 2) +#define RXH_L3_PROTO (1 << 3) +#define RXH_IP_SRC (1 << 4) +#define RXH_IP_DST (1 << 5) +#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ +#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ +#define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ +#define RXH_DISCARD (1 << 31) + +#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL +#define RX_CLS_FLOW_WAKE 0xfffffffffffffffeULL + +/* Special RX classification rule insert location values */ +#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */ +#define RX_CLS_LOC_ANY 0xffffffff +#define RX_CLS_LOC_FIRST 0xfffffffe +#define RX_CLS_LOC_LAST 0xfffffffd + +/* EEPROM Standards for plug in modules */ +#define ETH_MODULE_SFF_8079 0x1 +#define ETH_MODULE_SFF_8079_LEN 256 +#define ETH_MODULE_SFF_8472 0x2 +#define ETH_MODULE_SFF_8472_LEN 512 +#define ETH_MODULE_SFF_8636 0x3 +#define ETH_MODULE_SFF_8636_LEN 256 +#define ETH_MODULE_SFF_8436 0x4 +#define ETH_MODULE_SFF_8436_LEN 256 + +#define ETH_MODULE_SFF_8636_MAX_LEN 640 +#define ETH_MODULE_SFF_8436_MAX_LEN 640 + +/* Reset flags */ +/* The reset() operation must clear the flags for the components which + * were actually reset. On successful return, the flags indicate the + * components which were not reset, either because they do not exist + * in the hardware or because they cannot be reset independently. The + * driver must never reset any components that were not requested. + */ +enum ethtool_reset_flags { + /* These flags represent components dedicated to the interface + * the command is addressed to. Shift any flag left by + * ETH_RESET_SHARED_SHIFT to reset a shared component of the + * same type. + */ + ETH_RESET_MGMT = 1 << 0, /* Management processor */ + ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ + ETH_RESET_DMA = 1 << 2, /* DMA engine */ + ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ + ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ + ETH_RESET_MAC = 1 << 5, /* Media access controller */ + ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ + ETH_RESET_RAM = 1 << 7, /* RAM shared between + * multiple components */ + ETH_RESET_AP = 1 << 8, /* Application processor */ + + ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to + * this interface */ + ETH_RESET_ALL = 0xffffffff, /* All components used by this + * interface, even if shared */ +}; +#define ETH_RESET_SHARED_SHIFT 16 + + +/** + * struct ethtool_link_settings - link control and status + * + * IMPORTANT, Backward compatibility notice: When implementing new + * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and + * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link + * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS + * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in + * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use + * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link + * settings; do not use %ETHTOOL_SLINKSETTINGS if + * %ETHTOOL_GLINKSETTINGS failed: stick to + * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. + * + * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS + * @speed: Link speed (Mbps) + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @link_mode_masks_nwords: Number of 32-bit words for each of the + * supported, advertising, lp_advertising link mode bitmaps. For + * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user + * (>= 0); on return, if handshake in progress, negative if + * request size unsupported by kernel: absolute value indicates + * kernel expected size and all the other fields but cmd + * are 0; otherwise (handshake completed), strictly positive + * to indicate size used by kernel and cmd field stays + * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For + * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive + * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise + * refused. For drivers: ignore this field (use kernel's + * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will + * be overwritten by kernel. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. + * @master_slave_cfg: Master/slave port mode. + * @master_slave_state: Master/slave port state. + * @rate_matching: Rate adaptation performed by the PHY + * @reserved: Reserved for future use; see the note on reserved space. + * @link_mode_masks: Variable length bitmaps. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt + * are not available in %ethtool_link_settings. These fields will be + * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will + * fail if any of them is set to non-zero value. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GLINKSETTINGS to get the current values before making specific + * changes and then applying them with %ETHTOOL_SLINKSETTINGS. + * + * Drivers that implement %get_link_ksettings and/or + * %set_link_ksettings should ignore the @cmd + * and @link_mode_masks_nwords fields (any change to them overwritten + * by kernel), and rely only on kernel's internal + * %__ETHTOOL_LINK_MODE_MASK_NBITS and + * %ethtool_link_mode_mask_t. Drivers that implement + * %set_link_ksettings() should validate all fields other than @cmd + * and @link_mode_masks_nwords that are not described as read-only or + * deprecated, and must ignore all fields described as read-only. + * + * @link_mode_masks is divided into three bitfields, each of length + * @link_mode_masks_nwords: + * - supported: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features for which the interface + * supports autonegotiation or auto-detection. Read-only. + * - advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features that are advertised through + * autonegotiation or enabled for auto-detection. + * - lp_advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, and other + * link features that the link partner advertised through + * autonegotiation; 0 if unknown or not applicable. Read-only. + */ +struct ethtool_link_settings { + __u32 cmd; + __u32 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 autoneg; + __u8 mdio_support; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __s8 link_mode_masks_nwords; + __u8 transceiver; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 rate_matching; + __u32 reserved[7]; + __u32 link_mode_masks[]; + /* layout of link_mode_masks fields: + * __u32 map_supported[link_mode_masks_nwords]; + * __u32 map_advertising[link_mode_masks_nwords]; + * __u32 map_lp_advertising[link_mode_masks_nwords]; + */ +}; #endif /* _UAPI_LINUX_ETHTOOL_H */ -- cgit v1.2.3 From c3bd015090f24dcd2e839db1401e948ad95ce803 Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:24 +0000 Subject: selftests/xsk: Make batch size variable Convert the constant BATCH_SIZE into a variable named batch_size to allow dynamic modification at runtime. This is required for the forthcoming changes to support testing different hardware ring sizes. While running these tests, a bug was identified when the batch size is roughly the same as the NIC ring size. This has now been addressed by Maciej's fix in commit 913eda2b08cc ("i40e: xsk: remove count_mask"). Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-3-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 20 +++++++++++--------- tools/testing/selftests/bpf/xskxceiver.h | 3 ++- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index b1102ee13faa..eaa102c8098b 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -239,7 +239,7 @@ static void enable_busy_poll(struct xsk_socket_info *xsk) (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); - sock_opt = BATCH_SIZE; + sock_opt = xsk->batch_size; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); @@ -439,6 +439,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, for (j = 0; j < MAX_SOCKETS; j++) { memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; if (i == 0) ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; else @@ -1087,7 +1088,7 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) return TEST_CONTINUE; } - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); if (!rcvd) return TEST_CONTINUE; @@ -1239,7 +1240,8 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) { + if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / + buffer_len)) { ret = kick_tx(xsk); if (ret) return TEST_FAILURE; @@ -1249,7 +1251,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b fds.fd = xsk_socket__fd(xsk->xsk); fds.events = POLLOUT; - while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) { + while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { if (use_poll) { ret = poll(&fds, 1, POLL_TMOUT); if (timeout) { @@ -1269,10 +1271,10 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b } } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } - for (i = 0; i < BATCH_SIZE; i++) { + for (i = 0; i < xsk->batch_size; i++) { struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); u32 nb_frags_left, nb_frags, bytes_written = 0; @@ -1280,9 +1282,9 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b break; nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); - if (nb_frags > BATCH_SIZE - i) { + if (nb_frags > xsk->batch_size - i) { pkt_stream_cancel(pkt_stream); - xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i); + xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); break; } nb_frags_left = nb_frags; @@ -1370,7 +1372,7 @@ static int wait_for_tx_completion(struct xsk_socket_info *xsk) return TEST_FAILURE; } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } return TEST_PASS; diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index f174df2d693f..425304e52f35 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -44,7 +44,7 @@ #define MAX_ETH_JUMBO_SIZE 9000 #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 +#define DEFAULT_BATCH_SIZE 64 #define POLL_TMOUT 1000 #define THREAD_TMOUT 3 #define DEFAULT_PKT_CNT (4 * 1024) @@ -91,6 +91,7 @@ struct xsk_socket_info { struct pkt_stream *pkt_stream; u32 outstanding_tx; u32 rxqsize; + u32 batch_size; u8 dst_mac[ETH_ALEN]; u8 src_mac[ETH_ALEN]; }; -- cgit v1.2.3 From 90a695c3d31e1c9f0adb8c4c80028ed4ea7ed5ab Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:25 +0000 Subject: selftests/bpf: Implement get_hw_ring_size function to retrieve current and max interface size Introduce a new function called get_hw_size that retrieves both the current and maximum size of the interface and stores this information in the 'ethtool_ringparam' structure. Remove ethtool_channels struct from xdp_hw_metadata.c due to redefinition error. Remove unused linux/if.h include from flow_dissector BPF test to address CI pipeline failure. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-4-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/network_helpers.c | 24 ++++++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 4 ++++ .../selftests/bpf/prog_tests/flow_dissector.c | 1 - tools/testing/selftests/bpf/xdp_hw_metadata.c | 14 ------------- 4 files changed, 28 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 6db27a9088e9..1cab20020f94 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -497,3 +497,27 @@ int get_socket_local_port(int sock_fd) return -1; } + +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_GRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 94b9be24e39b..de4d27d201cb 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -9,8 +9,11 @@ typedef __u16 __sum16; #include #include #include +#include +#include #include #include +#include #define MAGIC_VAL 0x1234 #define NUM_ITER 100000 @@ -61,6 +64,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, struct sockaddr_storage *addr, socklen_t *len); char *ping_command(int family); int get_socket_local_port(int sock_fd); +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); struct nstoken; /** diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index c4773173a4e4..9e5f38739104 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index bdf5d8180067..0859fe727da7 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -495,20 +495,6 @@ peek: return 0; } -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ - static int rxq_num(const char *ifname) { struct ethtool_channels ch = { -- cgit v1.2.3 From bee3a7b07624223526c1fea465557068546d3b3c Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:26 +0000 Subject: selftests/bpf: Implement set_hw_ring_size function to configure interface ring size Introduce a new function called set_hw_ring_size that allows for the dynamic configuration of the ring size within the interface. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-5-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/network_helpers.c | 24 ++++++++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 1 + 2 files changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 1cab20020f94..04175e16195a 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -521,3 +521,27 @@ int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) close(sockfd); return 0; } + +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_SRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index de4d27d201cb..6457445cc6e2 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -65,6 +65,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, char *ping_command(int family); int get_socket_local_port(int sock_fd); int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); struct nstoken; /** -- cgit v1.2.3 From 776021e07fd0d7592c767e60929b954e82676186 Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:27 +0000 Subject: selftests/xsk: Introduce set_ring_size function with a retry mechanism for handling AF_XDP socket closures Introduce a new function, set_ring_size(), to manage asynchronous AF_XDP socket closure. Retry set_hw_ring_size up to SOCK_RECONF_CTR times if it fails due to an active AF_XDP socket. Return an error immediately for non-EBUSY errors. This enhances robustness against asynchronous AF_XDP socket closures during ring size changes. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-6-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/xskxceiver.c | 57 +++++++++++++++++++++++++++++++- tools/testing/selftests/bpf/xskxceiver.h | 9 +++++ 3 files changed, 66 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b0ac3dd80acf..eea5b8deaaf0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -695,7 +695,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Include find_bit.c to compile xskxceiver. EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c -$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) +$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index eaa102c8098b..8c26868e17cf 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -105,11 +106,15 @@ #include "../kselftest.h" #include "xsk_xdp_common.h" +#include + static bool opt_verbose; static bool opt_print_tests; static enum test_mode opt_mode = TEST_MODE_ALL; static u32 opt_run_test = RUN_ALL_TESTS; +void test__fail(void) { /* for network_helpers.c */ } + static void __exit_with_error(int error, const char *file, const char *func, int line) { ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, @@ -409,6 +414,33 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj } } +static int set_ring_size(struct ifobject *ifobj) +{ + int ret; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); + if (!ret) + break; + + /* Retry if it fails */ + if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) + return -errno; + + usleep(USLEEP_MAX); + } + + return ret; +} + +static int hw_ring_size_reset(struct ifobject *ifobj) +{ + ifobj->ring.tx_pending = ifobj->set_ring.default_tx; + ifobj->ring.rx_pending = ifobj->set_ring.default_rx; + return set_ring_size(ifobj); +} + static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, struct ifobject *ifobj_rx) { @@ -452,12 +484,16 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + test->ifobj_tx = ifobj_tx; test->ifobj_rx = ifobj_rx; test->current_step = 0; test->total_steps = 1; test->nb_sockets = 1; test->fail = false; + test->set_ring = false; test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; @@ -1862,6 +1898,14 @@ static int testapp_validate_traffic(struct test_spec *test) return TEST_SKIP; } + if (test->set_ring) { + if (ifobj_tx->hw_ring_size_supp) + return set_ring_size(ifobj_tx); + + ksft_test_result_skip("Changing HW ring size not supported.\n"); + return TEST_SKIP; + } + xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx); return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); } @@ -2479,7 +2523,7 @@ static const struct test_spec tests[] = { {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, -}; + }; static void print_tests(void) { @@ -2499,6 +2543,7 @@ int main(int argc, char **argv) int modes = TEST_MODE_SKB + 1; struct test_spec test; bool shared_netdev; + int ret; /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); @@ -2536,6 +2581,13 @@ int main(int argc, char **argv) modes++; } + ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring); + if (!ret) { + ifobj_tx->hw_ring_size_supp = true; + ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending; + ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; + } + init_iface(ifobj_rx, worker_testapp_validate_rx); init_iface(ifobj_tx, worker_testapp_validate_tx); @@ -2583,6 +2635,9 @@ int main(int argc, char **argv) } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + pkt_stream_delete(tx_pkt_stream_default); pkt_stream_delete(rx_pkt_stream_default); xsk_unload_xdp_programs(ifobj_tx); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index 425304e52f35..906de5fab7a3 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -114,6 +114,11 @@ struct pkt_stream { bool verbatim; }; +struct set_hw_ring { + u32 default_tx; + u32 default_rx; +}; + struct ifobject; struct test_spec; typedef int (*validation_func_t)(struct ifobject *ifobj); @@ -130,6 +135,8 @@ struct ifobject { struct xsk_xdp_progs *xdp_progs; struct bpf_map *xskmap; struct bpf_program *xdp_prog; + struct ethtool_ringparam ring; + struct set_hw_ring set_ring; enum test_mode mode; int ifindex; int mtu; @@ -146,6 +153,7 @@ struct ifobject { bool unaligned_supp; bool multi_buff_supp; bool multi_buff_zc_supp; + bool hw_ring_size_supp; }; struct test_spec { @@ -163,6 +171,7 @@ struct test_spec { u16 current_step; u16 nb_sockets; bool fail; + bool set_ring; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; -- cgit v1.2.3 From c4f960539fae6f04617d3909cc0dfdb88e7d197b Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:28 +0000 Subject: selftests/xsk: Test AF_XDP functionality under minimal ring configurations Add a new test case that stresses AF_XDP and the driver by configuring small hardware and software ring sizes. This verifies that AF_XDP continues to function properly even with insufficient ring space that could lead to frequent producer/consumer throttling. The test procedure involves: 1. Set the minimum possible ring configuration(tx 64 and rx 128). 2. Run tests with various batch sizes(1 and 63) to validate the system's behavior under different configurations. Update Makefile to include network_helpers.o in the build process for xskxceiver. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-7-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 8c26868e17cf..dac2dcf39bb8 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -2419,6 +2419,26 @@ static int testapp_xdp_metadata_mb(struct test_spec *test) return testapp_xdp_metadata_copy(test); } +static int testapp_hw_sw_min_ring_size(struct test_spec *test) +{ + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; + test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; + test->ifobj_tx->xsk->batch_size = 1; + test->ifobj_rx->xsk->batch_size = 1; + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch size to hw_ring_size - 1 */ + test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + return testapp_validate_traffic(test); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2523,6 +2543,7 @@ static const struct test_spec tests[] = { {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, + {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, }; static void print_tests(void) -- cgit v1.2.3 From c53908b254fcf25b05bcdf6634adb36eaccac111 Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:29 +0000 Subject: selftests/xsk: Add new test case for AF_XDP under max ring sizes Introduce a test case to evaluate AF_XDP's robustness by pushing hardware and software ring sizes to their limits. This test ensures AF_XDP's reliability amidst potential producer/consumer throttling due to maximum ring utilization. The testing strategy includes: 1. Configuring rings to their maximum allowable sizes. 2. Executing a series of tests across diverse batch sizes to assess system's behavior under different configurations. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-8-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index dac2dcf39bb8..2eac0895b0a1 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -2439,6 +2439,30 @@ static int testapp_hw_sw_min_ring_size(struct test_spec *test) return testapp_validate_traffic(test); } +static int testapp_hw_sw_max_ring_size(struct test_spec *test) +{ + u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; + test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; + test->ifobj_rx->umem->num_frames = max_descs; + test->ifobj_rx->xsk->rxqsize = max_descs; + test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch_size to 4095 */ + test->ifobj_tx->xsk->batch_size = max_descs - 1; + test->ifobj_rx->xsk->batch_size = max_descs - 1; + return testapp_validate_traffic(test); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2544,6 +2568,7 @@ static const struct test_spec tests[] = { {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, + {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, }; static void print_tests(void) -- cgit v1.2.3 From 374af9f1f06b5e991c810d2e4983d6f58df32136 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Mar 2024 15:43:12 -0700 Subject: perf annotate: Get rid of duplicate --group option item The options array in cmd_annotate() has duplicate --group options. It only needs one and let's get rid of the other. $ perf annotate -h 2>&1 | grep group --group Show event group information together --group Show event group information together Fixes: 7ebaf4890f63eb90 ("perf annotate: Support '--group' option") Reviewed-by: Kan Liang Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jin Yao Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240322224313.423181-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index f677671409b1..3e9f7e0596e8 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -810,8 +810,6 @@ int cmd_annotate(int argc, const char **argv) "Enable symbol demangling"), OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, "Enable kernel symbol demangling"), - OPT_BOOLEAN(0, "group", &symbol_conf.event_group, - "Show event group information together"), OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, "Show a column with the sum of periods"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, -- cgit v1.2.3 From bdeaf6ffec8b3590740973712c6be673c28a54a4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Mar 2024 15:43:13 -0700 Subject: perf annotate: Honor output options with --data-type For data type profiling output, it should be in sync with normal output so make it display percentage for each field. Also use coloring scheme for users to identify fields with big overhead easily. Users can use --show-total-period or --show-nr-samples to change the output style like in the normal perf annotate output. Before: $ perf annotate --data-type Annotate type: 'struct task_struct' in [kernel.kallsyms] (34 samples): ============================================================================ samples offset size field 34 0 9792 struct task_struct { 2 0 24 struct thread_info thread_info { 0 0 8 long unsigned int flags; 1 8 8 long unsigned int syscall_work; 0 16 4 u32 status; 1 20 4 u32 cpu; }; After: $ perf annotate --data-type Annotate type: 'struct task_struct' in [kernel.kallsyms] (34 samples): ============================================================================ Percent offset size field 100.00 0 9792 struct task_struct { 3.55 0 24 struct thread_info thread_info { 0.00 0 8 long unsigned int flags; 1.63 8 8 long unsigned int syscall_work; 0.00 16 4 u32 status; 1.91 20 4 u32 cpu; }; Committer testing: First collect a suitable perf.data file for use with 'perf annotate --data-type': root@number:~# perf mem record -a sleep 1s [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 11.047 MB perf.data (3466 samples) ] root@number:~# Then, before: root@number:~# perf annotate --data-type Annotate type: 'union ' in /usr/lib64/libc.so.6 (6 samples): ============================================================================ samples offset size field 6 0 40 union { 6 0 40 struct __pthread_mutex_s __data { 2 0 4 int __lock; 0 4 4 unsigned int __count; 0 8 4 int __owner; 1 12 4 unsigned int __nusers; 2 16 4 int __kind; 1 20 2 short int __spins; 0 22 2 short int __elision; 0 24 16 __pthread_list_t __list { 0 24 8 struct __pthread_internal_list* __prev; 0 32 8 struct __pthread_internal_list* __next; }; }; 0 0 0 char* __size; 2 0 8 long int __align; }; And after: Annotate type: 'union ' in /usr/lib64/libc.so.6 (6 samples): ============================================================================ Percent offset size field 100.00 0 40 union { 100.00 0 40 struct __pthread_mutex_s __data { 31.27 0 4 int __lock; 0.00 4 4 unsigned int __count; 0.00 8 4 int __owner; 7.67 12 4 unsigned int __nusers; 53.10 16 4 int __kind; 7.96 20 2 short int __spins; 0.00 22 2 short int __elision; 0.00 24 16 __pthread_list_t __list { 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 32 8 struct __pthread_internal_list* __next; }; }; 0.00 0 0 char* __size; 31.27 0 8 long int __align; }; The lines with percentages >= 7.67 have its percentages red colored. Reviewed-by: Kan Liang Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240322224313.423181-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 44 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 3e9f7e0596e8..16e1581207c9 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -42,6 +42,7 @@ #include #include #include +#include struct perf_annotate { struct perf_tool tool; @@ -332,6 +333,8 @@ static void print_annotated_data_header(struct hist_entry *he, struct evsel *evs struct dso *dso = map__dso(he->ms.map); int nr_members = 1; int nr_samples = he->stat.nr_events; + int width = 7; + const char *val_hdr = "Percent"; if (evsel__is_group_event(evsel)) { struct hist_entry *pair; @@ -353,8 +356,30 @@ static void print_annotated_data_header(struct hist_entry *he, struct evsel *evs nr_members = evsel->core.nr_members; } + if (symbol_conf.show_total_period) { + width = 11; + val_hdr = "Period"; + } else if (symbol_conf.show_nr_samples) { + width = 7; + val_hdr = "Samples"; + } + printf("============================================================================\n"); - printf("%*s %10s %10s %s\n", 11 * nr_members, "samples", "offset", "size", "field"); + printf("%*s %10s %10s %s\n", (width + 1) * nr_members, val_hdr, + "offset", "size", "field"); +} + +static void print_annotated_data_value(struct type_hist *h, u64 period, int nr_samples) +{ + double percent = h->period ? (100.0 * period / h->period) : 0; + const char *color = get_percent_color(percent); + + if (symbol_conf.show_total_period) + color_fprintf(stdout, color, " %11" PRIu64, period); + else if (symbol_conf.show_nr_samples) + color_fprintf(stdout, color, " %7d", nr_samples); + else + color_fprintf(stdout, color, " %7.2f", percent); } static void print_annotated_data_type(struct annotated_data_type *mem_type, @@ -364,10 +389,14 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, struct annotated_member *child; struct type_hist *h = mem_type->histograms[evsel->core.idx]; int i, nr_events = 1, samples = 0; + u64 period = 0; + int width = symbol_conf.show_total_period ? 11 : 7; - for (i = 0; i < member->size; i++) + for (i = 0; i < member->size; i++) { samples += h->addr[member->offset + i].nr_samples; - printf(" %10d", samples); + period += h->addr[member->offset + i].period; + } + print_annotated_data_value(h, period, samples); if (evsel__is_group_event(evsel)) { struct evsel *pos; @@ -376,9 +405,12 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, h = mem_type->histograms[pos->core.idx]; samples = 0; - for (i = 0; i < member->size; i++) + period = 0; + for (i = 0; i < member->size; i++) { samples += h->addr[member->offset + i].nr_samples; - printf(" %10d", samples); + period += h->addr[member->offset + i].period; + } + print_annotated_data_value(h, period, samples); } nr_events = evsel->core.nr_members; } @@ -394,7 +426,7 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, print_annotated_data_type(mem_type, child, evsel, indent + 4); if (!list_empty(&member->children)) - printf("%*s}", 11 * nr_events + 24 + indent, ""); + printf("%*s}", (width + 1) * nr_events + 24 + indent, ""); printf(";\n"); } -- cgit v1.2.3 From 6e4b398770d5023eb6383da9360a23bd537c155b Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 1 Apr 2024 14:27:23 +0800 Subject: perf sched timehist: Fix -g/--call-graph option failure When 'perf sched' enables the call-graph recording, sample_type of dummy event does not have PERF_SAMPLE_CALLCHAIN, timehist_check_attr() checks that the evsel does not have a callchain, and set show_callchain to 0. Currently 'perf sched timehist' only saves callchain when processing the 'sched:sched_switch event', timehist_check_attr() only needs to determine whether the event has PERF_SAMPLE_CALLCHAIN. Before: # perf sched record -g true [ perf record: Woken up 0 times to write data ] [ perf record: Captured and wrote 4.153 MB perf.data (7536 samples) ] # perf sched timehist Samples do not have callchains. time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 147851.826019 [0000] perf[285035] 0.000 0.000 0.000 147851.826029 [0000] migration/0[15] 0.000 0.003 0.009 147851.826063 [0001] perf[285035] 0.000 0.000 0.000 147851.826069 [0001] migration/1[21] 0.000 0.003 0.006 After: # perf sched record -g true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 2.572 MB perf.data (822 samples) ] # perf sched timehist time cpu task name waittime sch delay runtime [tid/pid] (msec) (msec) (msec) ----------- --- --------------- -------- -------- ----- 4193.035164 [0] perf[277062] 0.000 0.000 0.000 __traceiter_sched_switch <- __traceiter_sched_switch <- __sched_text_start <- preempt_schedule_common <- __cond_resched <- __wait_for_common <- wait_for_completion 4193.035174 [0] migration/0[15] 0.000 0.003 0.009 __traceiter_sched_switch <- __traceiter_sched_switch <- __sched_text_start <- smpboot_thread_fn <- kthread <- ret_from_fork 4193.035207 [1] perf[277062] 0.000 0.000 0.000 __traceiter_sched_switch <- __traceiter_sched_switch <- __sched_text_start <- preempt_schedule_common <- __cond_resched <- __wait_for_common <- wait_for_completion 4193.035214 [1] migration/1[21] 0.000 0.003 0.007 __traceiter_sched_switch <- __traceiter_sched_switch <- __sched_text_start <- smpboot_thread_fn <- kthread <- ret_from_fork Fixes: 9c95e4ef06572349 ("perf evlist: Add evlist__findnew_tracking_event() helper") Reviewed-by: Ian Rogers Signed-off-by: Yang Jihong Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240401062724.1006010-2-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index b248c433529a..1bfb22347371 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2963,8 +2963,11 @@ static int timehist_check_attr(struct perf_sched *sched, return -1; } - if (sched->show_callchain && !evsel__has_callchain(evsel)) { - pr_info("Samples do not have callchains.\n"); + /* only need to save callchain related to sched_switch event */ + if (sched->show_callchain && + evsel__name_is(evsel, "sched:sched_switch") && + !evsel__has_callchain(evsel)) { + pr_info("Samples of sched_switch event do not have callchains.\n"); sched->show_callchain = 0; symbol_conf.use_callchain = 0; } -- cgit v1.2.3 From 09d2056efe0c02b7a589915c004c6e925735d081 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 1 Apr 2024 14:27:24 +0800 Subject: perf evsel: Use evsel__name_is() helper Code cleanup, replace strcmp(evsel__name(evsel, {NAME})) with evsel__name_is() helper. No functional change. Committer notes: Fix this build error: trace.syscalls.events.bpf_output = evlist__last(trace.evlist); - assert(evsel__name_is(trace.syscalls.events.bpf_output), "__augmented_syscalls__"); + assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__")); Reviewed-by: Ian Rogers Signed-off-by: Yang Jihong Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240401062724.1006010-3-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-sched.c | 4 ++-- tools/perf/builtin-script.c | 2 +- tools/perf/builtin-trace.c | 4 ++-- tools/perf/tests/evsel-roundtrip-name.c | 4 ++-- tools/perf/tests/parse-events.c | 39 ++++++++++++--------------------- 6 files changed, 22 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 9714327fd0ea..6fd95be5032b 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -1408,7 +1408,7 @@ static int __cmd_kmem(struct perf_session *session) } evlist__for_each_entry(session->evlist, evsel) { - if (!strcmp(evsel__name(evsel), "kmem:mm_page_alloc") && + if (evsel__name_is(evsel, "kmem:mm_page_alloc") && evsel__field(evsel, "pfn")) { use_pfn = true; break; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 1bfb22347371..0fce7d8986c0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2148,7 +2148,7 @@ static bool is_idle_sample(struct perf_sample *sample, struct evsel *evsel) { /* pid 0 == swapper == idle task */ - if (strcmp(evsel__name(evsel), "sched:sched_switch") == 0) + if (evsel__name_is(evsel, "sched:sched_switch")) return evsel__intval(evsel, sample, "prev_pid") == 0; return sample->pid == 0; @@ -2375,7 +2375,7 @@ static bool timehist_skip_sample(struct perf_sched *sched, } if (sched->idle_hist) { - if (strcmp(evsel__name(evsel), "sched:sched_switch")) + if (!evsel__name_is(evsel, "sched:sched_switch")) rc = true; else if (evsel__intval(evsel, sample, "prev_pid") != 0 && evsel__intval(evsel, sample, "next_pid") != 0) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 2e7148d667bd..6a274e27b108 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3471,7 +3471,7 @@ static int check_ev_match(char *dir_name, char *scriptname, match = 0; evlist__for_each_entry(session->evlist, pos) { - if (!strcmp(evsel__name(pos), evname)) { + if (evsel__name_is(pos, evname)) { match = 1; break; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d3ec244e692a..e5fef39c34bf 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4916,7 +4916,7 @@ int cmd_trace(int argc, const char **argv) goto out; } trace.syscalls.events.bpf_output = evlist__last(trace.evlist); - assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__")); + assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__")); skip_augmentation: #endif err = -1; @@ -4973,7 +4973,7 @@ skip_augmentation: */ if (trace.syscalls.events.bpf_output) { evlist__for_each_entry(trace.evlist, evsel) { - bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0; + bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit"); if (raw_syscalls_sys_exit) { trace.raw_augmented_syscalls = true; diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index 15ff86f9da0b..1922cac13a24 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -37,7 +37,7 @@ static int perf_evsel__roundtrip_cache_name_test(void) continue; } evlist__for_each_entry(evlist, evsel) { - if (strcmp(evsel__name(evsel), name)) { + if (!evsel__name_is(evsel, name)) { pr_debug("%s != %s\n", evsel__name(evsel), name); ret = TEST_FAIL; } @@ -71,7 +71,7 @@ static int perf_evsel__name_array_test(const char *const names[], int nr_names) continue; } evlist__for_each_entry(evlist, evsel) { - if (strcmp(evsel__name(evsel), names[i])) { + if (!evsel__name_is(evsel, names[i])) { pr_debug("%s != %s\n", evsel__name(evsel), names[i]); ret = TEST_FAIL; } diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index feb5727584d1..0b70451451b3 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -470,8 +470,7 @@ static int test__checkevent_breakpoint_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:u")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:u")); return test__checkevent_breakpoint(evlist); } @@ -484,8 +483,7 @@ static int test__checkevent_breakpoint_x_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:x:k")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:x:k")); return test__checkevent_breakpoint_x(evlist); } @@ -498,8 +496,7 @@ static int test__checkevent_breakpoint_r_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:r:hp")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:r:hp")); return test__checkevent_breakpoint_r(evlist); } @@ -512,8 +509,7 @@ static int test__checkevent_breakpoint_w_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:w:up")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:w:up")); return test__checkevent_breakpoint_w(evlist); } @@ -526,8 +522,7 @@ static int test__checkevent_breakpoint_rw_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "mem:0:rw:kp")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "mem:0:rw:kp")); return test__checkevent_breakpoint_rw(evlist); } @@ -540,8 +535,7 @@ static int test__checkevent_breakpoint_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint(evlist); } @@ -554,8 +548,7 @@ static int test__checkevent_breakpoint_x_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_x(evlist); } @@ -568,8 +561,7 @@ static int test__checkevent_breakpoint_r_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_r(evlist); } @@ -582,8 +574,7 @@ static int test__checkevent_breakpoint_w_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_w(evlist); } @@ -596,8 +587,7 @@ static int test__checkevent_breakpoint_rw_modifier_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "breakpoint")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint")); return test__checkevent_breakpoint_rw(evlist); } @@ -609,12 +599,12 @@ static int test__checkevent_breakpoint_2_events(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "breakpoint1")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint1")); evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "breakpoint2")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "breakpoint2")); return TEST_OK; } @@ -691,15 +681,14 @@ static int test__checkevent_pmu_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); - TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "krava")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "krava")); /* cpu/config=2/u" */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", test_config(evsel, 2)); - TEST_ASSERT_VAL("wrong name", - !strcmp(evsel__name(evsel), "cpu/config=2/u")); + TEST_ASSERT_VAL("wrong name", evsel__name_is(evsel, "cpu/config=2/u")); return TEST_OK; } -- cgit v1.2.3 From ad399baa06931a62d1166d215eaad6f3b0dcd3d5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 29 Mar 2024 14:58:08 -0700 Subject: perf annotate: Use ins__is_xxx() if possible This is to prepare separation of disasm related code. Use the public ins API instead of checking the internal data structure. Signed-off-by: Namhyung Kim Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240329215812.537846-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 64e54ff1aa1d..986c499150ef 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3665,7 +3665,7 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, struct annotated_op_loc *op_loc; int i; - if (!strcmp(dl->ins.name, "lock")) + if (ins__is_lock(&dl->ins)) ops = dl->ops.locked.ops; else ops = &dl->ops; @@ -3763,7 +3763,7 @@ static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip, * llvm-objdump places "lock" in a separate line and * in that case, we want to get the next line. */ - if (!strcmp(dl->ins.name, "lock") && + if (ins__is_lock(&dl->ins) && *dl->ops.raw == '\0' && allow_update) { ip++; continue; @@ -4093,10 +4093,10 @@ static bool process_basic_block(struct basic_block_data *bb_data, if (dl == last_dl) break; /* 'return' instruction finishes the block */ - if (dl->ins.ops == &ret_ops) + if (ins__is_ret(&dl->ins)) break; /* normal instructions are part of the basic block */ - if (dl->ins.ops != &jump_ops) + if (!ins__is_jump(&dl->ins)) continue; /* jump to a different function, tail call or return */ if (dl->ops.target.outside) -- cgit v1.2.3 From 10adbf777622e22323abbf9f7861c26deb373199 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 29 Mar 2024 14:58:09 -0700 Subject: perf annotate: Add and use ins__is_nop() Likewise, add ins__is_nop() to check if the current instruction is NOP. Signed-off-by: Namhyung Kim Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240329215812.537846-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 7 ++++++- tools/perf/util/annotate.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 986c499150ef..5d0ca004dcfb 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -757,6 +757,11 @@ static struct ins_ops ret_ops = { .scnprintf = ins__raw_scnprintf, }; +bool ins__is_nop(const struct ins *ins) +{ + return ins->ops == &nop_ops; +} + bool ins__is_ret(const struct ins *ins) { return ins->ops == &ret_ops; @@ -1785,7 +1790,7 @@ static void delete_last_nop(struct symbol *sym) dl = list_entry(list->prev, struct disasm_line, al.node); if (dl->ins.ops) { - if (dl->ins.ops != &nop_ops) + if (!ins__is_nop(&dl->ins)) return; } else { if (!strstr(dl->al.line, " nop ") && diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 14980b65f812..98f556af637c 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -74,6 +74,7 @@ struct ins_ops { bool ins__is_jump(const struct ins *ins); bool ins__is_call(const struct ins *ins); +bool ins__is_nop(const struct ins *ins); bool ins__is_ret(const struct ins *ins); bool ins__is_lock(const struct ins *ins); int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -- cgit v1.2.3 From 98f69a573c668a18cfda41b3976118e04521a196 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 29 Mar 2024 14:58:10 -0700 Subject: perf annotate: Split out util/disasm.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The util/annotate.c code has both disassembly and sample annotation related codes. Factor out the disasm part so that it can be handled more easily. No functional changes intended. Committer notes: Add missing include env.h, util.h, bpf-event.h and bpf-util.h to disasm.c, to fix things like: util/disasm.c: In function ‘symbol__disassemble_bpf’: util/disasm.c:1203:9: error: implicit declaration of function ‘perf_exe’ [-Werror=implicit-function-declaration] 1203 | perf_exe(tpath, sizeof(tpath)); | ^~~~~~~~ Signed-off-by: Namhyung Kim Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240329215812.537846-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/annotate.c | 1708 ++------------------------------------------ tools/perf/util/annotate.h | 60 +- tools/perf/util/disasm.c | 1591 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/disasm.h | 112 +++ 5 files changed, 1762 insertions(+), 1710 deletions(-) create mode 100644 tools/perf/util/disasm.c create mode 100644 tools/perf/util/disasm.h (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e0a723e24503..aec5a590e349 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -12,6 +12,7 @@ perf-y += config.o perf-y += copyfile.o perf-y += ctype.o perf-y += db-export.o +perf-y += disasm.o perf-y += env.o perf-y += event.o perf-y += evlist.o diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 5d0ca004dcfb..b795f27f2602 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -16,6 +16,7 @@ #include "build-id.h" #include "color.h" #include "config.h" +#include "disasm.h" #include "dso.h" #include "env.h" #include "map.h" @@ -64,48 +65,6 @@ /* global annotation options */ struct annotation_options annotate_opts; -static regex_t file_lineno; - -static struct ins_ops *ins__find(struct arch *arch, const char *name); -static void ins__sort(struct arch *arch); -static int disasm_line__parse(char *line, const char **namep, char **rawp); -static int call__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); -static int jump__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); - -struct arch { - const char *name; - struct ins *instructions; - size_t nr_instructions; - size_t nr_instructions_allocated; - struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); - bool sorted_instructions; - bool initialized; - const char *insn_suffix; - void *priv; - unsigned int model; - unsigned int family; - int (*init)(struct arch *arch, char *cpuid); - bool (*ins_is_fused)(struct arch *arch, const char *ins1, - const char *ins2); - struct { - char comment_char; - char skip_functions_char; - char register_char; - char memory_ref_char; - char imm_char; - } objdump; -}; - -static struct ins_ops call_ops; -static struct ins_ops dec_ops; -static struct ins_ops jump_ops; -static struct ins_ops mov_ops; -static struct ins_ops nop_ops; -static struct ins_ops lock_ops; -static struct ins_ops ret_ops; - /* Data type collection debug statistics */ struct annotated_data_stat ann_data_stat; LIST_HEAD(ann_insn_stat); @@ -125,759 +84,6 @@ struct annotated_data_type canary_type = { }, }; -static int arch__grow_instructions(struct arch *arch) -{ - struct ins *new_instructions; - size_t new_nr_allocated; - - if (arch->nr_instructions_allocated == 0 && arch->instructions) - goto grow_from_non_allocated_table; - - new_nr_allocated = arch->nr_instructions_allocated + 128; - new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins)); - if (new_instructions == NULL) - return -1; - -out_update_instructions: - arch->instructions = new_instructions; - arch->nr_instructions_allocated = new_nr_allocated; - return 0; - -grow_from_non_allocated_table: - new_nr_allocated = arch->nr_instructions + 128; - new_instructions = calloc(new_nr_allocated, sizeof(struct ins)); - if (new_instructions == NULL) - return -1; - - memcpy(new_instructions, arch->instructions, arch->nr_instructions); - goto out_update_instructions; -} - -static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops) -{ - struct ins *ins; - - if (arch->nr_instructions == arch->nr_instructions_allocated && - arch__grow_instructions(arch)) - return -1; - - ins = &arch->instructions[arch->nr_instructions]; - ins->name = strdup(name); - if (!ins->name) - return -1; - - ins->ops = ops; - arch->nr_instructions++; - - ins__sort(arch); - return 0; -} - -#include "arch/arc/annotate/instructions.c" -#include "arch/arm/annotate/instructions.c" -#include "arch/arm64/annotate/instructions.c" -#include "arch/csky/annotate/instructions.c" -#include "arch/loongarch/annotate/instructions.c" -#include "arch/mips/annotate/instructions.c" -#include "arch/x86/annotate/instructions.c" -#include "arch/powerpc/annotate/instructions.c" -#include "arch/riscv64/annotate/instructions.c" -#include "arch/s390/annotate/instructions.c" -#include "arch/sparc/annotate/instructions.c" - -static struct arch architectures[] = { - { - .name = "arc", - .init = arc__annotate_init, - }, - { - .name = "arm", - .init = arm__annotate_init, - }, - { - .name = "arm64", - .init = arm64__annotate_init, - }, - { - .name = "csky", - .init = csky__annotate_init, - }, - { - .name = "mips", - .init = mips__annotate_init, - .objdump = { - .comment_char = '#', - }, - }, - { - .name = "x86", - .init = x86__annotate_init, - .instructions = x86__instructions, - .nr_instructions = ARRAY_SIZE(x86__instructions), - .insn_suffix = "bwlq", - .objdump = { - .comment_char = '#', - .register_char = '%', - .memory_ref_char = '(', - .imm_char = '$', - }, - }, - { - .name = "powerpc", - .init = powerpc__annotate_init, - }, - { - .name = "riscv64", - .init = riscv64__annotate_init, - }, - { - .name = "s390", - .init = s390__annotate_init, - .objdump = { - .comment_char = '#', - }, - }, - { - .name = "sparc", - .init = sparc__annotate_init, - .objdump = { - .comment_char = '#', - }, - }, - { - .name = "loongarch", - .init = loongarch__annotate_init, - .objdump = { - .comment_char = '#', - }, - }, -}; - -static void ins__delete(struct ins_operands *ops) -{ - if (ops == NULL) - return; - zfree(&ops->source.raw); - zfree(&ops->source.name); - zfree(&ops->target.raw); - zfree(&ops->target.name); -} - -static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw); -} - -int ins__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - if (ins->ops->scnprintf) - return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name); - - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); -} - -bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) -{ - if (!arch || !arch->ins_is_fused) - return false; - - return arch->ins_is_fused(arch, ins1, ins2); -} - -static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) -{ - char *endptr, *tok, *name; - struct map *map = ms->map; - struct addr_map_symbol target = { - .ms = { .map = map, }, - }; - - ops->target.addr = strtoull(ops->raw, &endptr, 16); - - name = strchr(endptr, '<'); - if (name == NULL) - goto indirect_call; - - name++; - - if (arch->objdump.skip_functions_char && - strchr(name, arch->objdump.skip_functions_char)) - return -1; - - tok = strchr(name, '>'); - if (tok == NULL) - return -1; - - *tok = '\0'; - ops->target.name = strdup(name); - *tok = '>'; - - if (ops->target.name == NULL) - return -1; -find_target: - target.addr = map__objdump_2mem(map, ops->target.addr); - - if (maps__find_ams(ms->maps, &target) == 0 && - map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) - ops->target.sym = target.ms.sym; - - return 0; - -indirect_call: - tok = strchr(endptr, '*'); - if (tok != NULL) { - endptr++; - - /* Indirect call can use a non-rip register and offset: callq *0x8(%rbx). - * Do not parse such instruction. */ - if (strstr(endptr, "(%r") == NULL) - ops->target.addr = strtoull(endptr, NULL, 16); - } - goto find_target; -} - -static int call__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - if (ops->target.sym) - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); - - if (ops->target.addr == 0) - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); - - if (ops->target.name) - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.name); - - return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr); -} - -static struct ins_ops call_ops = { - .parse = call__parse, - .scnprintf = call__scnprintf, -}; - -bool ins__is_call(const struct ins *ins) -{ - return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops; -} - -/* - * Prevents from matching commas in the comment section, e.g.: - * ffff200008446e70: b.cs ffff2000084470f4 // b.hs, b.nlast - * - * and skip comma as part of function arguments, e.g.: - * 1d8b4ac - */ -static inline const char *validate_comma(const char *c, struct ins_operands *ops) -{ - if (ops->jump.raw_comment && c > ops->jump.raw_comment) - return NULL; - - if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) - return NULL; - - return c; -} - -static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) -{ - struct map *map = ms->map; - struct symbol *sym = ms->sym; - struct addr_map_symbol target = { - .ms = { .map = map, }, - }; - const char *c = strchr(ops->raw, ','); - u64 start, end; - - ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); - ops->jump.raw_func_start = strchr(ops->raw, '<'); - - c = validate_comma(c, ops); - - /* - * Examples of lines to parse for the _cpp_lex_token@@Base - * function: - * - * 1159e6c: jne 115aa32 <_cpp_lex_token@@Base+0xf92> - * 1159e8b: jne c469be - * - * The first is a jump to an offset inside the same function, - * the second is to another function, i.e. that 0xa72 is an - * offset in the cpp_named_operator2name@@base function. - */ - /* - * skip over possible up to 2 operands to get to address, e.g.: - * tbnz w0, #26, ffff0000083cd190 - */ - if (c++ != NULL) { - ops->target.addr = strtoull(c, NULL, 16); - if (!ops->target.addr) { - c = strchr(c, ','); - c = validate_comma(c, ops); - if (c++ != NULL) - ops->target.addr = strtoull(c, NULL, 16); - } - } else { - ops->target.addr = strtoull(ops->raw, NULL, 16); - } - - target.addr = map__objdump_2mem(map, ops->target.addr); - start = map__unmap_ip(map, sym->start); - end = map__unmap_ip(map, sym->end); - - ops->target.outside = target.addr < start || target.addr > end; - - /* - * FIXME: things like this in _cpp_lex_token (gcc's cc1 program): - - cpp_named_operator2name@@Base+0xa72 - - * Point to a place that is after the cpp_named_operator2name - * boundaries, i.e. in the ELF symbol table for cc1 - * cpp_named_operator2name is marked as being 32-bytes long, but it in - * fact is much larger than that, so we seem to need a symbols__find() - * routine that looks for >= current->start and < next_symbol->start, - * possibly just for C++ objects? - * - * For now lets just make some progress by marking jumps to outside the - * current function as call like. - * - * Actual navigation will come next, with further understanding of how - * the symbol searching and disassembly should be done. - */ - if (maps__find_ams(ms->maps, &target) == 0 && - map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) - ops->target.sym = target.ms.sym; - - if (!ops->target.outside) { - ops->target.offset = target.addr - start; - ops->target.offset_avail = true; - } else { - ops->target.offset_avail = false; - } - - return 0; -} - -static int jump__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - const char *c; - - if (!ops->target.addr || ops->target.offset < 0) - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); - - if (ops->target.outside && ops->target.sym != NULL) - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); - - c = strchr(ops->raw, ','); - c = validate_comma(c, ops); - - if (c != NULL) { - const char *c2 = strchr(c + 1, ','); - - c2 = validate_comma(c2, ops); - /* check for 3-op insn */ - if (c2 != NULL) - c = c2; - c++; - - /* mirror arch objdump's space-after-comma style */ - if (*c == ' ') - c++; - } - - return scnprintf(bf, size, "%-*s %.*s%" PRIx64, max_ins_name, - ins->name, c ? c - ops->raw : 0, ops->raw, - ops->target.offset); -} - -static void jump__delete(struct ins_operands *ops __maybe_unused) -{ - /* - * The ops->jump.raw_comment and ops->jump.raw_func_start belong to the - * raw string, don't free them. - */ -} - -static struct ins_ops jump_ops = { - .free = jump__delete, - .parse = jump__parse, - .scnprintf = jump__scnprintf, -}; - -bool ins__is_jump(const struct ins *ins) -{ - return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops; -} - -static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) -{ - char *endptr, *name, *t; - - if (strstr(raw, "(%rip)") == NULL) - return 0; - - *addrp = strtoull(comment, &endptr, 16); - if (endptr == comment) - return 0; - name = strchr(endptr, '<'); - if (name == NULL) - return -1; - - name++; - - t = strchr(name, '>'); - if (t == NULL) - return 0; - - *t = '\0'; - *namep = strdup(name); - *t = '>'; - - return 0; -} - -static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) -{ - ops->locked.ops = zalloc(sizeof(*ops->locked.ops)); - if (ops->locked.ops == NULL) - return 0; - - if (disasm_line__parse(ops->raw, &ops->locked.ins.name, &ops->locked.ops->raw) < 0) - goto out_free_ops; - - ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name); - - if (ops->locked.ins.ops == NULL) - goto out_free_ops; - - if (ops->locked.ins.ops->parse && - ops->locked.ins.ops->parse(arch, ops->locked.ops, ms) < 0) - goto out_free_ops; - - return 0; - -out_free_ops: - zfree(&ops->locked.ops); - return 0; -} - -static int lock__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - int printed; - - if (ops->locked.ins.ops == NULL) - return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); - - printed = scnprintf(bf, size, "%-*s ", max_ins_name, ins->name); - return printed + ins__scnprintf(&ops->locked.ins, bf + printed, - size - printed, ops->locked.ops, max_ins_name); -} - -static void lock__delete(struct ins_operands *ops) -{ - struct ins *ins = &ops->locked.ins; - - if (ins->ops && ins->ops->free) - ins->ops->free(ops->locked.ops); - else - ins__delete(ops->locked.ops); - - zfree(&ops->locked.ops); - zfree(&ops->target.raw); - zfree(&ops->target.name); -} - -static struct ins_ops lock_ops = { - .free = lock__delete, - .parse = lock__parse, - .scnprintf = lock__scnprintf, -}; - -/* - * Check if the operand has more than one registers like x86 SIB addressing: - * 0x1234(%rax, %rbx, 8) - * - * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check - * the input string after 'memory_ref_char' if exists. - */ -static bool check_multi_regs(struct arch *arch, const char *op) -{ - int count = 0; - - if (arch->objdump.register_char == 0) - return false; - - if (arch->objdump.memory_ref_char) { - op = strchr(op, arch->objdump.memory_ref_char); - if (op == NULL) - return false; - } - - while ((op = strchr(op, arch->objdump.register_char)) != NULL) { - count++; - op++; - } - - return count > 1; -} - -static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) -{ - char *s = strchr(ops->raw, ','), *target, *comment, prev; - - if (s == NULL) - return -1; - - *s = '\0'; - - /* - * x86 SIB addressing has something like 0x8(%rax, %rcx, 1) - * then it needs to have the closing parenthesis. - */ - if (strchr(ops->raw, '(')) { - *s = ','; - s = strchr(ops->raw, ')'); - if (s == NULL || s[1] != ',') - return -1; - *++s = '\0'; - } - - ops->source.raw = strdup(ops->raw); - *s = ','; - - if (ops->source.raw == NULL) - return -1; - - ops->source.multi_regs = check_multi_regs(arch, ops->source.raw); - - target = skip_spaces(++s); - comment = strchr(s, arch->objdump.comment_char); - - if (comment != NULL) - s = comment - 1; - else - s = strchr(s, '\0') - 1; - - while (s > target && isspace(s[0])) - --s; - s++; - prev = *s; - *s = '\0'; - - ops->target.raw = strdup(target); - *s = prev; - - if (ops->target.raw == NULL) - goto out_free_source; - - ops->target.multi_regs = check_multi_regs(arch, ops->target.raw); - - if (comment == NULL) - return 0; - - comment = skip_spaces(comment); - comment__symbol(ops->source.raw, comment + 1, &ops->source.addr, &ops->source.name); - comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); - - return 0; - -out_free_source: - zfree(&ops->source.raw); - return -1; -} - -static int mov__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name, - ops->source.name ?: ops->source.raw, - ops->target.name ?: ops->target.raw); -} - -static struct ins_ops mov_ops = { - .parse = mov__parse, - .scnprintf = mov__scnprintf, -}; - -static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) -{ - char *target, *comment, *s, prev; - - target = s = ops->raw; - - while (s[0] != '\0' && !isspace(s[0])) - ++s; - prev = *s; - *s = '\0'; - - ops->target.raw = strdup(target); - *s = prev; - - if (ops->target.raw == NULL) - return -1; - - comment = strchr(s, arch->objdump.comment_char); - if (comment == NULL) - return 0; - - comment = skip_spaces(comment); - comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); - - return 0; -} - -static int dec__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, - ops->target.name ?: ops->target.raw); -} - -static struct ins_ops dec_ops = { - .parse = dec__parse, - .scnprintf = dec__scnprintf, -}; - -static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size, - struct ins_operands *ops __maybe_unused, int max_ins_name) -{ - return scnprintf(bf, size, "%-*s", max_ins_name, "nop"); -} - -static struct ins_ops nop_ops = { - .scnprintf = nop__scnprintf, -}; - -static struct ins_ops ret_ops = { - .scnprintf = ins__raw_scnprintf, -}; - -bool ins__is_nop(const struct ins *ins) -{ - return ins->ops == &nop_ops; -} - -bool ins__is_ret(const struct ins *ins) -{ - return ins->ops == &ret_ops; -} - -bool ins__is_lock(const struct ins *ins) -{ - return ins->ops == &lock_ops; -} - -static int ins__key_cmp(const void *name, const void *insp) -{ - const struct ins *ins = insp; - - return strcmp(name, ins->name); -} - -static int ins__cmp(const void *a, const void *b) -{ - const struct ins *ia = a; - const struct ins *ib = b; - - return strcmp(ia->name, ib->name); -} - -static void ins__sort(struct arch *arch) -{ - const int nmemb = arch->nr_instructions; - - qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); -} - -static struct ins_ops *__ins__find(struct arch *arch, const char *name) -{ - struct ins *ins; - const int nmemb = arch->nr_instructions; - - if (!arch->sorted_instructions) { - ins__sort(arch); - arch->sorted_instructions = true; - } - - ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); - if (ins) - return ins->ops; - - if (arch->insn_suffix) { - char tmp[32]; - char suffix; - size_t len = strlen(name); - - if (len == 0 || len >= sizeof(tmp)) - return NULL; - - suffix = name[len - 1]; - if (strchr(arch->insn_suffix, suffix) == NULL) - return NULL; - - strcpy(tmp, name); - tmp[len - 1] = '\0'; /* remove the suffix and check again */ - - ins = bsearch(tmp, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); - } - return ins ? ins->ops : NULL; -} - -static struct ins_ops *ins__find(struct arch *arch, const char *name) -{ - struct ins_ops *ops = __ins__find(arch, name); - - if (!ops && arch->associate_instruction_ops) - ops = arch->associate_instruction_ops(arch, name); - - return ops; -} - -static int arch__key_cmp(const void *name, const void *archp) -{ - const struct arch *arch = archp; - - return strcmp(name, arch->name); -} - -static int arch__cmp(const void *a, const void *b) -{ - const struct arch *aa = a; - const struct arch *ab = b; - - return strcmp(aa->name, ab->name); -} - -static void arch__sort(void) -{ - const int nmemb = ARRAY_SIZE(architectures); - - qsort(architectures, nmemb, sizeof(struct arch), arch__cmp); -} - -static struct arch *arch__find(const char *name) -{ - const int nmemb = ARRAY_SIZE(architectures); - static bool sorted; - - if (!sorted) { - arch__sort(); - sorted = true; - } - - return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); -} - -bool arch__is(struct arch *arch, const char *name) -{ - return !strcmp(arch->name, name); -} - /* symbol histogram: key = offset << 16 | evsel->core.idx */ static size_t sym_hist_hash(long key, void *ctx __maybe_unused) { @@ -1214,212 +420,76 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 branch->cover_insn += cover_insn; } } -} - -static int annotation__compute_ipc(struct annotation *notes, size_t size) -{ - int err = 0; - s64 offset; - - if (!notes->branch || !notes->branch->cycles_hist) - return 0; - - notes->branch->total_insn = annotation__count_insn(notes, 0, size - 1); - notes->branch->hit_cycles = 0; - notes->branch->hit_insn = 0; - notes->branch->cover_insn = 0; - - annotation__lock(notes); - for (offset = size - 1; offset >= 0; --offset) { - struct cyc_hist *ch; - - ch = ¬es->branch->cycles_hist[offset]; - if (ch && ch->cycles) { - struct annotation_line *al; - - al = notes->src->offsets[offset]; - if (al && al->cycles == NULL) { - al->cycles = zalloc(sizeof(*al->cycles)); - if (al->cycles == NULL) { - err = ENOMEM; - break; - } - } - if (ch->have_start) - annotation__count_and_fill(notes, ch->start, offset, ch); - if (al && ch->num_aggr) { - al->cycles->avg = ch->cycles_aggr / ch->num_aggr; - al->cycles->max = ch->cycles_max; - al->cycles->min = ch->cycles_min; - } - } - } - - if (err) { - while (++offset < (s64)size) { - struct cyc_hist *ch = ¬es->branch->cycles_hist[offset]; - - if (ch && ch->cycles) { - struct annotation_line *al = notes->src->offsets[offset]; - if (al) - zfree(&al->cycles); - } - } - } - - annotation__unlock(notes); - return 0; -} - -int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, - struct evsel *evsel) -{ - return symbol__inc_addr_samples(&ams->ms, evsel, ams->al_addr, sample); -} - -int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample, - struct evsel *evsel, u64 ip) -{ - return symbol__inc_addr_samples(&he->ms, evsel, ip, sample); -} - -static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) -{ - dl->ins.ops = ins__find(arch, dl->ins.name); - - if (!dl->ins.ops) - return; - - if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, ms) < 0) - dl->ins.ops = NULL; -} - -static int disasm_line__parse(char *line, const char **namep, char **rawp) -{ - char tmp, *name = skip_spaces(line); - - if (name[0] == '\0') - return -1; - - *rawp = name + 1; - - while ((*rawp)[0] != '\0' && !isspace((*rawp)[0])) - ++*rawp; - - tmp = (*rawp)[0]; - (*rawp)[0] = '\0'; - *namep = strdup(name); - - if (*namep == NULL) - goto out; - - (*rawp)[0] = tmp; - *rawp = strim(*rawp); - - return 0; - -out: - return -1; -} - -struct annotate_args { - struct arch *arch; - struct map_symbol ms; - struct evsel *evsel; - struct annotation_options *options; - s64 offset; - char *line; - int line_nr; - char *fileloc; -}; - -static void annotation_line__init(struct annotation_line *al, - struct annotate_args *args, - int nr) -{ - al->offset = args->offset; - al->line = strdup(args->line); - al->line_nr = args->line_nr; - al->fileloc = args->fileloc; - al->data_nr = nr; -} - -static void annotation_line__exit(struct annotation_line *al) -{ - zfree_srcline(&al->path); - zfree(&al->line); - zfree(&al->cycles); -} - -static size_t disasm_line_size(int nr) -{ - struct annotation_line *al; - - return (sizeof(struct disasm_line) + (sizeof(al->data[0]) * nr)); -} - -/* - * Allocating the disasm annotation line data with - * following structure: - * - * ------------------------------------------- - * struct disasm_line | struct annotation_line - * ------------------------------------------- - * - * We have 'struct annotation_line' member as last member - * of 'struct disasm_line' to have an easy access. - */ -static struct disasm_line *disasm_line__new(struct annotate_args *args) +} + +static int annotation__compute_ipc(struct annotation *notes, size_t size) { - struct disasm_line *dl = NULL; - int nr = 1; + int err = 0; + s64 offset; - if (evsel__is_group_event(args->evsel)) - nr = args->evsel->core.nr_members; + if (!notes->branch || !notes->branch->cycles_hist) + return 0; - dl = zalloc(disasm_line_size(nr)); - if (!dl) - return NULL; + notes->branch->total_insn = annotation__count_insn(notes, 0, size - 1); + notes->branch->hit_cycles = 0; + notes->branch->hit_insn = 0; + notes->branch->cover_insn = 0; - annotation_line__init(&dl->al, args, nr); - if (dl->al.line == NULL) - goto out_delete; + annotation__lock(notes); + for (offset = size - 1; offset >= 0; --offset) { + struct cyc_hist *ch; - if (args->offset != -1) { - if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) - goto out_free_line; + ch = ¬es->branch->cycles_hist[offset]; + if (ch && ch->cycles) { + struct annotation_line *al; - disasm_line__init_ins(dl, args->arch, &args->ms); + al = notes->src->offsets[offset]; + if (al && al->cycles == NULL) { + al->cycles = zalloc(sizeof(*al->cycles)); + if (al->cycles == NULL) { + err = ENOMEM; + break; + } + } + if (ch->have_start) + annotation__count_and_fill(notes, ch->start, offset, ch); + if (al && ch->num_aggr) { + al->cycles->avg = ch->cycles_aggr / ch->num_aggr; + al->cycles->max = ch->cycles_max; + al->cycles->min = ch->cycles_min; + } + } } - return dl; + if (err) { + while (++offset < (s64)size) { + struct cyc_hist *ch = ¬es->branch->cycles_hist[offset]; -out_free_line: - zfree(&dl->al.line); -out_delete: - free(dl); - return NULL; + if (ch && ch->cycles) { + struct annotation_line *al = notes->src->offsets[offset]; + if (al) + zfree(&al->cycles); + } + } + } + + annotation__unlock(notes); + return 0; } -void disasm_line__free(struct disasm_line *dl) +int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, + struct evsel *evsel) { - if (dl->ins.ops && dl->ins.ops->free) - dl->ins.ops->free(&dl->ops); - else - ins__delete(&dl->ops); - zfree(&dl->ins.name); - annotation_line__exit(&dl->al); - free(dl); + return symbol__inc_addr_samples(&ams->ms, evsel, ams->al_addr, sample); } -int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw, int max_ins_name) +int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample, + struct evsel *evsel, u64 ip) { - if (raw || !dl->ins.ops) - return scnprintf(bf, size, "%-*s %s", max_ins_name, dl->ins.name, dl->ops.raw); - - return ins__scnprintf(&dl->ins, bf, size, &dl->ops, max_ins_name); + return symbol__inc_addr_samples(&he->ms, evsel, ip, sample); } + void annotation__exit(struct annotation *notes) { annotated_source__delete(notes->src); @@ -1478,8 +548,7 @@ bool annotation__trylock(struct annotation *notes) return mutex_trylock(mutex); } - -static void annotation_line__add(struct annotation_line *al, struct list_head *head) +void annotation_line__add(struct annotation_line *al, struct list_head *head) { list_add_tail(&al->node, head); } @@ -1689,673 +758,6 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start return 0; } -/* - * symbol__parse_objdump_line() parses objdump output (with -d --no-show-raw) - * which looks like following - * - * 0000000000415500 <_init>: - * 415500: sub $0x8,%rsp - * 415504: mov 0x2f5ad5(%rip),%rax # 70afe0 <_DYNAMIC+0x2f8> - * 41550b: test %rax,%rax - * 41550e: je 415515 <_init+0x15> - * 415510: callq 416e70 <__gmon_start__@plt> - * 415515: add $0x8,%rsp - * 415519: retq - * - * it will be parsed and saved into struct disasm_line as - * - * - * The offset will be a relative offset from the start of the symbol and -1 - * means that it's not a disassembly line so should be treated differently. - * The ops.raw part will be parsed further according to type of the instruction. - */ -static int symbol__parse_objdump_line(struct symbol *sym, - struct annotate_args *args, - char *parsed_line, int *line_nr, char **fileloc) -{ - struct map *map = args->ms.map; - struct annotation *notes = symbol__annotation(sym); - struct disasm_line *dl; - char *tmp; - s64 line_ip, offset = -1; - regmatch_t match[2]; - - /* /filename:linenr ? Save line number and ignore. */ - if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) { - *line_nr = atoi(parsed_line + match[1].rm_so); - free(*fileloc); - *fileloc = strdup(parsed_line); - return 0; - } - - /* Process hex address followed by ':'. */ - line_ip = strtoull(parsed_line, &tmp, 16); - if (parsed_line != tmp && tmp[0] == ':' && tmp[1] != '\0') { - u64 start = map__rip_2objdump(map, sym->start), - end = map__rip_2objdump(map, sym->end); - - offset = line_ip - start; - if ((u64)line_ip < start || (u64)line_ip >= end) - offset = -1; - else - parsed_line = tmp + 1; - } - - args->offset = offset; - args->line = parsed_line; - args->line_nr = *line_nr; - args->fileloc = *fileloc; - args->ms.sym = sym; - - dl = disasm_line__new(args); - (*line_nr)++; - - if (dl == NULL) - return -1; - - if (!disasm_line__has_local_offset(dl)) { - dl->ops.target.offset = dl->ops.target.addr - - map__rip_2objdump(map, sym->start); - dl->ops.target.offset_avail = true; - } - - /* kcore has no symbols, so add the call target symbol */ - if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.sym) { - struct addr_map_symbol target = { - .addr = dl->ops.target.addr, - .ms = { .map = map, }, - }; - - if (!maps__find_ams(args->ms.maps, &target) && - target.ms.sym->start == target.al_addr) - dl->ops.target.sym = target.ms.sym; - } - - annotation_line__add(&dl->al, ¬es->src->source); - return 0; -} - -static __attribute__((constructor)) void symbol__init_regexpr(void) -{ - regcomp(&file_lineno, "^/[^:]+:([0-9]+)", REG_EXTENDED); -} - -static void delete_last_nop(struct symbol *sym) -{ - struct annotation *notes = symbol__annotation(sym); - struct list_head *list = ¬es->src->source; - struct disasm_line *dl; - - while (!list_empty(list)) { - dl = list_entry(list->prev, struct disasm_line, al.node); - - if (dl->ins.ops) { - if (!ins__is_nop(&dl->ins)) - return; - } else { - if (!strstr(dl->al.line, " nop ") && - !strstr(dl->al.line, " nopl ") && - !strstr(dl->al.line, " nopw ")) - return; - } - - list_del_init(&dl->al.node); - disasm_line__free(dl); - } -} - -int symbol__strerror_disassemble(struct map_symbol *ms, int errnum, char *buf, size_t buflen) -{ - struct dso *dso = map__dso(ms->map); - - BUG_ON(buflen == 0); - - if (errnum >= 0) { - str_error_r(errnum, buf, buflen); - return 0; - } - - switch (errnum) { - case SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX: { - char bf[SBUILD_ID_SIZE + 15] = " with build id "; - char *build_id_msg = NULL; - - if (dso->has_build_id) { - build_id__sprintf(&dso->bid, bf + 15); - build_id_msg = bf; - } - scnprintf(buf, buflen, - "No vmlinux file%s\nwas found in the path.\n\n" - "Note that annotation using /proc/kcore requires CAP_SYS_RAWIO capability.\n\n" - "Please use:\n\n" - " perf buildid-cache -vu vmlinux\n\n" - "or:\n\n" - " --vmlinux vmlinux\n", build_id_msg ?: ""); - } - break; - case SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF: - scnprintf(buf, buflen, "Please link with binutils's libopcode to enable BPF annotation"); - break; - case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP: - scnprintf(buf, buflen, "Problems with arch specific instruction name regular expressions."); - break; - case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING: - scnprintf(buf, buflen, "Problems while parsing the CPUID in the arch specific initialization."); - break; - case SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE: - scnprintf(buf, buflen, "Invalid BPF file: %s.", dso->long_name); - break; - case SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF: - scnprintf(buf, buflen, "The %s BPF file has no BTF section, compile with -g or use pahole -J.", - dso->long_name); - break; - default: - scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum); - break; - } - - return 0; -} - -static int dso__disassemble_filename(struct dso *dso, char *filename, size_t filename_size) -{ - char linkname[PATH_MAX]; - char *build_id_filename; - char *build_id_path = NULL; - char *pos; - int len; - - if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && - !dso__is_kcore(dso)) - return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX; - - build_id_filename = dso__build_id_filename(dso, NULL, 0, false); - if (build_id_filename) { - __symbol__join_symfs(filename, filename_size, build_id_filename); - free(build_id_filename); - } else { - if (dso->has_build_id) - return ENOMEM; - goto fallback; - } - - build_id_path = strdup(filename); - if (!build_id_path) - return ENOMEM; - - /* - * old style build-id cache has name of XX/XXXXXXX.. while - * new style has XX/XXXXXXX../{elf,kallsyms,vdso}. - * extract the build-id part of dirname in the new style only. - */ - pos = strrchr(build_id_path, '/'); - if (pos && strlen(pos) < SBUILD_ID_SIZE - 2) - dirname(build_id_path); - - if (dso__is_kcore(dso)) - goto fallback; - - len = readlink(build_id_path, linkname, sizeof(linkname) - 1); - if (len < 0) - goto fallback; - - linkname[len] = '\0'; - if (strstr(linkname, DSO__NAME_KALLSYMS) || - access(filename, R_OK)) { -fallback: - /* - * If we don't have build-ids or the build-id file isn't in the - * cache, or is just a kallsyms file, well, lets hope that this - * DSO is the same as when 'perf record' ran. - */ - if (dso->kernel && dso->long_name[0] == '/') - snprintf(filename, filename_size, "%s", dso->long_name); - else - __symbol__join_symfs(filename, filename_size, dso->long_name); - - mutex_lock(&dso->lock); - if (access(filename, R_OK) && errno == ENOENT && dso->nsinfo) { - char *new_name = dso__filename_with_chroot(dso, filename); - if (new_name) { - strlcpy(filename, new_name, filename_size); - free(new_name); - } - } - mutex_unlock(&dso->lock); - } - - free(build_id_path); - return 0; -} - -#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -#define PACKAGE "perf" -#include -#include -#include -#include -#include -#include -#include - -static int symbol__disassemble_bpf(struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct bpf_prog_linfo *prog_linfo = NULL; - struct bpf_prog_info_node *info_node; - int len = sym->end - sym->start; - disassembler_ftype disassemble; - struct map *map = args->ms.map; - struct perf_bpil *info_linear; - struct disassemble_info info; - struct dso *dso = map__dso(map); - int pc = 0, count, sub_id; - struct btf *btf = NULL; - char tpath[PATH_MAX]; - size_t buf_size; - int nr_skip = 0; - char *buf; - bfd *bfdf; - int ret; - FILE *s; - - if (dso->binary_type != DSO_BINARY_TYPE__BPF_PROG_INFO) - return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; - - pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, - sym->name, sym->start, sym->end - sym->start); - - memset(tpath, 0, sizeof(tpath)); - perf_exe(tpath, sizeof(tpath)); - - bfdf = bfd_openr(tpath, NULL); - if (bfdf == NULL) - abort(); - - if (!bfd_check_format(bfdf, bfd_object)) - abort(); - - s = open_memstream(&buf, &buf_size); - if (!s) { - ret = errno; - goto out; - } - init_disassemble_info_compat(&info, s, - (fprintf_ftype) fprintf, - fprintf_styled); - info.arch = bfd_get_arch(bfdf); - info.mach = bfd_get_mach(bfdf); - - info_node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, - dso->bpf_prog.id); - if (!info_node) { - ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; - goto out; - } - info_linear = info_node->info_linear; - sub_id = dso->bpf_prog.sub_id; - - info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); - info.buffer_length = info_linear->info.jited_prog_len; - - if (info_linear->info.nr_line_info) - prog_linfo = bpf_prog_linfo__new(&info_linear->info); - - if (info_linear->info.btf_id) { - struct btf_node *node; - - node = perf_env__find_btf(dso->bpf_prog.env, - info_linear->info.btf_id); - if (node) - btf = btf__new((__u8 *)(node->data), - node->data_size); - } - - disassemble_init_for_target(&info); - -#ifdef DISASM_FOUR_ARGS_SIGNATURE - disassemble = disassembler(info.arch, - bfd_big_endian(bfdf), - info.mach, - bfdf); -#else - disassemble = disassembler(bfdf); -#endif - if (disassemble == NULL) - abort(); - - fflush(s); - do { - const struct bpf_line_info *linfo = NULL; - struct disasm_line *dl; - size_t prev_buf_size; - const char *srcline; - u64 addr; - - addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; - count = disassemble(pc, &info); - - if (prog_linfo) - linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, - addr, sub_id, - nr_skip); - - if (linfo && btf) { - srcline = btf__name_by_offset(btf, linfo->line_off); - nr_skip++; - } else - srcline = NULL; - - fprintf(s, "\n"); - prev_buf_size = buf_size; - fflush(s); - - if (!annotate_opts.hide_src_code && srcline) { - args->offset = -1; - args->line = strdup(srcline); - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) { - annotation_line__add(&dl->al, - ¬es->src->source); - } - } - - args->offset = pc; - args->line = buf + prev_buf_size; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - pc += count; - } while (count > 0 && pc < len); - - ret = 0; -out: - free(prog_linfo); - btf__free(btf); - fclose(s); - bfd_close(bfdf); - return ret; -} -#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, - struct annotate_args *args __maybe_unused) -{ - return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; -} -#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) - -static int -symbol__disassemble_bpf_image(struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct disasm_line *dl; - - args->offset = -1; - args->line = strdup("to be implemented"); - args->line_nr = 0; - args->fileloc = NULL; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - zfree(&args->line); - return 0; -} - -/* - * Possibly create a new version of line with tabs expanded. Returns the - * existing or new line, storage is updated if a new line is allocated. If - * allocation fails then NULL is returned. - */ -static char *expand_tabs(char *line, char **storage, size_t *storage_len) -{ - size_t i, src, dst, len, new_storage_len, num_tabs; - char *new_line; - size_t line_len = strlen(line); - - for (num_tabs = 0, i = 0; i < line_len; i++) - if (line[i] == '\t') - num_tabs++; - - if (num_tabs == 0) - return line; - - /* - * Space for the line and '\0', less the leading and trailing - * spaces. Each tab may introduce 7 additional spaces. - */ - new_storage_len = line_len + 1 + (num_tabs * 7); - - new_line = malloc(new_storage_len); - if (new_line == NULL) { - pr_err("Failure allocating memory for tab expansion\n"); - return NULL; - } - - /* - * Copy regions starting at src and expand tabs. If there are two - * adjacent tabs then 'src == i', the memcpy is of size 0 and the spaces - * are inserted. - */ - for (i = 0, src = 0, dst = 0; i < line_len && num_tabs; i++) { - if (line[i] == '\t') { - len = i - src; - memcpy(&new_line[dst], &line[src], len); - dst += len; - new_line[dst++] = ' '; - while (dst % 8 != 0) - new_line[dst++] = ' '; - src = i + 1; - num_tabs--; - } - } - - /* Expand the last region. */ - len = line_len - src; - memcpy(&new_line[dst], &line[src], len); - dst += len; - new_line[dst] = '\0'; - - free(*storage); - *storage = new_line; - *storage_len = new_storage_len; - return new_line; - -} - -static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) -{ - struct annotation_options *opts = &annotate_opts; - struct map *map = args->ms.map; - struct dso *dso = map__dso(map); - char *command; - FILE *file; - char symfs_filename[PATH_MAX]; - struct kcore_extract kce; - bool delete_extract = false; - bool decomp = false; - int lineno = 0; - char *fileloc = NULL; - int nline; - char *line; - size_t line_len; - const char *objdump_argv[] = { - "/bin/sh", - "-c", - NULL, /* Will be the objdump command to run. */ - "--", - NULL, /* Will be the symfs path. */ - NULL, - }; - struct child_process objdump_process; - int err = dso__disassemble_filename(dso, symfs_filename, sizeof(symfs_filename)); - - if (err) - return err; - - pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, - symfs_filename, sym->name, map__unmap_ip(map, sym->start), - map__unmap_ip(map, sym->end)); - - pr_debug("annotating [%p] %30s : [%p] %30s\n", - dso, dso->long_name, sym, sym->name); - - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { - return symbol__disassemble_bpf(sym, args); - } else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) { - return symbol__disassemble_bpf_image(sym, args); - } else if (dso__is_kcore(dso)) { - kce.kcore_filename = symfs_filename; - kce.addr = map__rip_2objdump(map, sym->start); - kce.offs = sym->start; - kce.len = sym->end - sym->start; - if (!kcore_extract__create(&kce)) { - delete_extract = true; - strlcpy(symfs_filename, kce.extract_filename, - sizeof(symfs_filename)); - } - } else if (dso__needs_decompress(dso)) { - char tmp[KMOD_DECOMP_LEN]; - - if (dso__decompress_kmodule_path(dso, symfs_filename, - tmp, sizeof(tmp)) < 0) - return -1; - - decomp = true; - strcpy(symfs_filename, tmp); - } - - err = asprintf(&command, - "%s %s%s --start-address=0x%016" PRIx64 - " --stop-address=0x%016" PRIx64 - " %s -d %s %s %s %c%s%c %s%s -C \"$1\"", - opts->objdump_path ?: "objdump", - opts->disassembler_style ? "-M " : "", - opts->disassembler_style ?: "", - map__rip_2objdump(map, sym->start), - map__rip_2objdump(map, sym->end), - opts->show_linenr ? "-l" : "", - opts->show_asm_raw ? "" : "--no-show-raw-insn", - opts->annotate_src ? "-S" : "", - opts->prefix ? "--prefix " : "", - opts->prefix ? '"' : ' ', - opts->prefix ?: "", - opts->prefix ? '"' : ' ', - opts->prefix_strip ? "--prefix-strip=" : "", - opts->prefix_strip ?: ""); - - if (err < 0) { - pr_err("Failure allocating memory for the command to run\n"); - goto out_remove_tmp; - } - - pr_debug("Executing: %s\n", command); - - objdump_argv[2] = command; - objdump_argv[4] = symfs_filename; - - /* Create a pipe to read from for stdout */ - memset(&objdump_process, 0, sizeof(objdump_process)); - objdump_process.argv = objdump_argv; - objdump_process.out = -1; - objdump_process.err = -1; - objdump_process.no_stderr = 1; - if (start_command(&objdump_process)) { - pr_err("Failure starting to run %s\n", command); - err = -1; - goto out_free_command; - } - - file = fdopen(objdump_process.out, "r"); - if (!file) { - pr_err("Failure creating FILE stream for %s\n", command); - /* - * If we were using debug info should retry with - * original binary. - */ - err = -1; - goto out_close_stdout; - } - - /* Storage for getline. */ - line = NULL; - line_len = 0; - - nline = 0; - while (!feof(file)) { - const char *match; - char *expanded_line; - - if (getline(&line, &line_len, file) < 0 || !line) - break; - - /* Skip lines containing "filename:" */ - match = strstr(line, symfs_filename); - if (match && match[strlen(symfs_filename)] == ':') - continue; - - expanded_line = strim(line); - expanded_line = expand_tabs(expanded_line, &line, &line_len); - if (!expanded_line) - break; - - /* - * The source code line number (lineno) needs to be kept in - * across calls to symbol__parse_objdump_line(), so that it - * can associate it with the instructions till the next one. - * See disasm_line__new() and struct disasm_line::line_nr. - */ - if (symbol__parse_objdump_line(sym, args, expanded_line, - &lineno, &fileloc) < 0) - break; - nline++; - } - free(line); - free(fileloc); - - err = finish_command(&objdump_process); - if (err) - pr_err("Error running %s\n", command); - - if (nline == 0) { - err = -1; - pr_err("No output from %s\n", command); - } - - /* - * kallsyms does not have symbol sizes so there may a nop at the end. - * Remove it. - */ - if (dso__is_kcore(dso)) - delete_last_nop(sym); - - fclose(file); - -out_close_stdout: - close(objdump_process.out); - -out_free_command: - free(command); - -out_remove_tmp: - if (decomp) - unlink(symfs_filename); - - if (delete_extract) - kcore_extract__delete(&kce); - - return err; -} - static void calc_percent(struct annotation *notes, struct evsel *evsel, struct annotation_data *data, diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 98f556af637c..b3007c9966fd 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -13,10 +13,10 @@ #include "mutex.h" #include "spark.h" #include "hashmap.h" +#include "disasm.h" struct hist_browser_timer; struct hist_entry; -struct ins_ops; struct map; struct map_symbol; struct addr_map_symbol; @@ -26,60 +26,6 @@ struct evsel; struct symbol; struct annotated_data_type; -struct ins { - const char *name; - struct ins_ops *ops; -}; - -struct ins_operands { - char *raw; - struct { - char *raw; - char *name; - struct symbol *sym; - u64 addr; - s64 offset; - bool offset_avail; - bool outside; - bool multi_regs; - } target; - union { - struct { - char *raw; - char *name; - u64 addr; - bool multi_regs; - } source; - struct { - struct ins ins; - struct ins_operands *ops; - } locked; - struct { - char *raw_comment; - char *raw_func_start; - } jump; - }; -}; - -struct arch; - -bool arch__is(struct arch *arch, const char *name); - -struct ins_ops { - void (*free)(struct ins_operands *ops); - int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms); - int (*scnprintf)(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); -}; - -bool ins__is_jump(const struct ins *ins); -bool ins__is_call(const struct ins *ins); -bool ins__is_nop(const struct ins *ins); -bool ins__is_ret(const struct ins *ins); -bool ins__is_lock(const struct ins *ins); -int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2); - #define ANNOTATION__IPC_WIDTH 6 #define ANNOTATION__CYCLES_WIDTH 6 #define ANNOTATION__MINMAX_CYCLES_WIDTH 19 @@ -172,6 +118,8 @@ struct disasm_line { struct annotation_line al; }; +void annotation_line__add(struct annotation_line *al, struct list_head *head); + static inline double annotation_data__percent(struct annotation_data *data, unsigned int which) { @@ -213,7 +161,6 @@ static inline bool disasm_line__has_local_offset(const struct disasm_line *dl) */ bool disasm_line__is_valid_local_jump(struct disasm_line *dl, struct symbol *sym); -void disasm_line__free(struct disasm_line *dl); struct annotation_line * annotation_line__next(struct annotation_line *pos, struct list_head *head); @@ -236,7 +183,6 @@ int __annotation__scnprintf_samples_period(struct annotation *notes, struct evsel *evsel, bool show_freq); -int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw, int max_ins_name); size_t disasm__fprintf(struct list_head *head, FILE *fp); void symbol__calc_percent(struct symbol *sym, struct evsel *evsel); diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c new file mode 100644 index 000000000000..21a43b07e8b5 --- /dev/null +++ b/tools/perf/util/disasm.c @@ -0,0 +1,1591 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "annotate.h" +#include "build-id.h" +#include "debug.h" +#include "disasm.h" +#include "dso.h" +#include "env.h" +#include "evsel.h" +#include "map.h" +#include "maps.h" +#include "srcline.h" +#include "symbol.h" +#include "util.h" + +static regex_t file_lineno; + +/* These can be referred from the arch-dependent code */ +static struct ins_ops call_ops; +static struct ins_ops dec_ops; +static struct ins_ops jump_ops; +static struct ins_ops mov_ops; +static struct ins_ops nop_ops; +static struct ins_ops lock_ops; +static struct ins_ops ret_ops; + +static int jump__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +static int call__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); + +static void ins__sort(struct arch *arch); +static int disasm_line__parse(char *line, const char **namep, char **rawp); + +static __attribute__((constructor)) void symbol__init_regexpr(void) +{ + regcomp(&file_lineno, "^/[^:]+:([0-9]+)", REG_EXTENDED); +} + +static int arch__grow_instructions(struct arch *arch) +{ + struct ins *new_instructions; + size_t new_nr_allocated; + + if (arch->nr_instructions_allocated == 0 && arch->instructions) + goto grow_from_non_allocated_table; + + new_nr_allocated = arch->nr_instructions_allocated + 128; + new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins)); + if (new_instructions == NULL) + return -1; + +out_update_instructions: + arch->instructions = new_instructions; + arch->nr_instructions_allocated = new_nr_allocated; + return 0; + +grow_from_non_allocated_table: + new_nr_allocated = arch->nr_instructions + 128; + new_instructions = calloc(new_nr_allocated, sizeof(struct ins)); + if (new_instructions == NULL) + return -1; + + memcpy(new_instructions, arch->instructions, arch->nr_instructions); + goto out_update_instructions; +} + +static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops) +{ + struct ins *ins; + + if (arch->nr_instructions == arch->nr_instructions_allocated && + arch__grow_instructions(arch)) + return -1; + + ins = &arch->instructions[arch->nr_instructions]; + ins->name = strdup(name); + if (!ins->name) + return -1; + + ins->ops = ops; + arch->nr_instructions++; + + ins__sort(arch); + return 0; +} + +#include "arch/arc/annotate/instructions.c" +#include "arch/arm/annotate/instructions.c" +#include "arch/arm64/annotate/instructions.c" +#include "arch/csky/annotate/instructions.c" +#include "arch/loongarch/annotate/instructions.c" +#include "arch/mips/annotate/instructions.c" +#include "arch/x86/annotate/instructions.c" +#include "arch/powerpc/annotate/instructions.c" +#include "arch/riscv64/annotate/instructions.c" +#include "arch/s390/annotate/instructions.c" +#include "arch/sparc/annotate/instructions.c" + +static struct arch architectures[] = { + { + .name = "arc", + .init = arc__annotate_init, + }, + { + .name = "arm", + .init = arm__annotate_init, + }, + { + .name = "arm64", + .init = arm64__annotate_init, + }, + { + .name = "csky", + .init = csky__annotate_init, + }, + { + .name = "mips", + .init = mips__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, + { + .name = "x86", + .init = x86__annotate_init, + .instructions = x86__instructions, + .nr_instructions = ARRAY_SIZE(x86__instructions), + .insn_suffix = "bwlq", + .objdump = { + .comment_char = '#', + .register_char = '%', + .memory_ref_char = '(', + .imm_char = '$', + }, + }, + { + .name = "powerpc", + .init = powerpc__annotate_init, + }, + { + .name = "riscv64", + .init = riscv64__annotate_init, + }, + { + .name = "s390", + .init = s390__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, + { + .name = "sparc", + .init = sparc__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, + { + .name = "loongarch", + .init = loongarch__annotate_init, + .objdump = { + .comment_char = '#', + }, + }, +}; + +static int arch__key_cmp(const void *name, const void *archp) +{ + const struct arch *arch = archp; + + return strcmp(name, arch->name); +} + +static int arch__cmp(const void *a, const void *b) +{ + const struct arch *aa = a; + const struct arch *ab = b; + + return strcmp(aa->name, ab->name); +} + +static void arch__sort(void) +{ + const int nmemb = ARRAY_SIZE(architectures); + + qsort(architectures, nmemb, sizeof(struct arch), arch__cmp); +} + +struct arch *arch__find(const char *name) +{ + const int nmemb = ARRAY_SIZE(architectures); + static bool sorted; + + if (!sorted) { + arch__sort(); + sorted = true; + } + + return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); +} + +bool arch__is(struct arch *arch, const char *name) +{ + return !strcmp(arch->name, name); +} + +static void ins_ops__delete(struct ins_operands *ops) +{ + if (ops == NULL) + return; + zfree(&ops->source.raw); + zfree(&ops->source.name); + zfree(&ops->target.raw); + zfree(&ops->target.name); +} + +static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw); +} + +int ins__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + if (ins->ops->scnprintf) + return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name); + + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); +} + +bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) +{ + if (!arch || !arch->ins_is_fused) + return false; + + return arch->ins_is_fused(arch, ins1, ins2); +} + +static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + char *endptr, *tok, *name; + struct map *map = ms->map; + struct addr_map_symbol target = { + .ms = { .map = map, }, + }; + + ops->target.addr = strtoull(ops->raw, &endptr, 16); + + name = strchr(endptr, '<'); + if (name == NULL) + goto indirect_call; + + name++; + + if (arch->objdump.skip_functions_char && + strchr(name, arch->objdump.skip_functions_char)) + return -1; + + tok = strchr(name, '>'); + if (tok == NULL) + return -1; + + *tok = '\0'; + ops->target.name = strdup(name); + *tok = '>'; + + if (ops->target.name == NULL) + return -1; +find_target: + target.addr = map__objdump_2mem(map, ops->target.addr); + + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + return 0; + +indirect_call: + tok = strchr(endptr, '*'); + if (tok != NULL) { + endptr++; + + /* Indirect call can use a non-rip register and offset: callq *0x8(%rbx). + * Do not parse such instruction. */ + if (strstr(endptr, "(%r") == NULL) + ops->target.addr = strtoull(endptr, NULL, 16); + } + goto find_target; +} + +static int call__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + if (ops->target.sym) + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); + + if (ops->target.addr == 0) + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); + + if (ops->target.name) + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.name); + + return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr); +} + +static struct ins_ops call_ops = { + .parse = call__parse, + .scnprintf = call__scnprintf, +}; + +bool ins__is_call(const struct ins *ins) +{ + return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops; +} + +/* + * Prevents from matching commas in the comment section, e.g.: + * ffff200008446e70: b.cs ffff2000084470f4 // b.hs, b.nlast + * + * and skip comma as part of function arguments, e.g.: + * 1d8b4ac + */ +static inline const char *validate_comma(const char *c, struct ins_operands *ops) +{ + if (ops->jump.raw_comment && c > ops->jump.raw_comment) + return NULL; + + if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) + return NULL; + + return c; +} + +static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + struct map *map = ms->map; + struct symbol *sym = ms->sym; + struct addr_map_symbol target = { + .ms = { .map = map, }, + }; + const char *c = strchr(ops->raw, ','); + u64 start, end; + + ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); + ops->jump.raw_func_start = strchr(ops->raw, '<'); + + c = validate_comma(c, ops); + + /* + * Examples of lines to parse for the _cpp_lex_token@@Base + * function: + * + * 1159e6c: jne 115aa32 <_cpp_lex_token@@Base+0xf92> + * 1159e8b: jne c469be + * + * The first is a jump to an offset inside the same function, + * the second is to another function, i.e. that 0xa72 is an + * offset in the cpp_named_operator2name@@base function. + */ + /* + * skip over possible up to 2 operands to get to address, e.g.: + * tbnz w0, #26, ffff0000083cd190 + */ + if (c++ != NULL) { + ops->target.addr = strtoull(c, NULL, 16); + if (!ops->target.addr) { + c = strchr(c, ','); + c = validate_comma(c, ops); + if (c++ != NULL) + ops->target.addr = strtoull(c, NULL, 16); + } + } else { + ops->target.addr = strtoull(ops->raw, NULL, 16); + } + + target.addr = map__objdump_2mem(map, ops->target.addr); + start = map__unmap_ip(map, sym->start); + end = map__unmap_ip(map, sym->end); + + ops->target.outside = target.addr < start || target.addr > end; + + /* + * FIXME: things like this in _cpp_lex_token (gcc's cc1 program): + + cpp_named_operator2name@@Base+0xa72 + + * Point to a place that is after the cpp_named_operator2name + * boundaries, i.e. in the ELF symbol table for cc1 + * cpp_named_operator2name is marked as being 32-bytes long, but it in + * fact is much larger than that, so we seem to need a symbols__find() + * routine that looks for >= current->start and < next_symbol->start, + * possibly just for C++ objects? + * + * For now lets just make some progress by marking jumps to outside the + * current function as call like. + * + * Actual navigation will come next, with further understanding of how + * the symbol searching and disassembly should be done. + */ + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + if (!ops->target.outside) { + ops->target.offset = target.addr - start; + ops->target.offset_avail = true; + } else { + ops->target.offset_avail = false; + } + + return 0; +} + +static int jump__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + const char *c; + + if (!ops->target.addr || ops->target.offset < 0) + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); + + if (ops->target.outside && ops->target.sym != NULL) + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name); + + c = strchr(ops->raw, ','); + c = validate_comma(c, ops); + + if (c != NULL) { + const char *c2 = strchr(c + 1, ','); + + c2 = validate_comma(c2, ops); + /* check for 3-op insn */ + if (c2 != NULL) + c = c2; + c++; + + /* mirror arch objdump's space-after-comma style */ + if (*c == ' ') + c++; + } + + return scnprintf(bf, size, "%-*s %.*s%" PRIx64, max_ins_name, + ins->name, c ? c - ops->raw : 0, ops->raw, + ops->target.offset); +} + +static void jump__delete(struct ins_operands *ops __maybe_unused) +{ + /* + * The ops->jump.raw_comment and ops->jump.raw_func_start belong to the + * raw string, don't free them. + */ +} + +static struct ins_ops jump_ops = { + .free = jump__delete, + .parse = jump__parse, + .scnprintf = jump__scnprintf, +}; + +bool ins__is_jump(const struct ins *ins) +{ + return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops; +} + +static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) +{ + char *endptr, *name, *t; + + if (strstr(raw, "(%rip)") == NULL) + return 0; + + *addrp = strtoull(comment, &endptr, 16); + if (endptr == comment) + return 0; + name = strchr(endptr, '<'); + if (name == NULL) + return -1; + + name++; + + t = strchr(name, '>'); + if (t == NULL) + return 0; + + *t = '\0'; + *namep = strdup(name); + *t = '>'; + + return 0; +} + +static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + ops->locked.ops = zalloc(sizeof(*ops->locked.ops)); + if (ops->locked.ops == NULL) + return 0; + + if (disasm_line__parse(ops->raw, &ops->locked.ins.name, &ops->locked.ops->raw) < 0) + goto out_free_ops; + + ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name); + + if (ops->locked.ins.ops == NULL) + goto out_free_ops; + + if (ops->locked.ins.ops->parse && + ops->locked.ins.ops->parse(arch, ops->locked.ops, ms) < 0) + goto out_free_ops; + + return 0; + +out_free_ops: + zfree(&ops->locked.ops); + return 0; +} + +static int lock__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + int printed; + + if (ops->locked.ins.ops == NULL) + return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name); + + printed = scnprintf(bf, size, "%-*s ", max_ins_name, ins->name); + return printed + ins__scnprintf(&ops->locked.ins, bf + printed, + size - printed, ops->locked.ops, max_ins_name); +} + +static void lock__delete(struct ins_operands *ops) +{ + struct ins *ins = &ops->locked.ins; + + if (ins->ops && ins->ops->free) + ins->ops->free(ops->locked.ops); + else + ins_ops__delete(ops->locked.ops); + + zfree(&ops->locked.ops); + zfree(&ops->target.raw); + zfree(&ops->target.name); +} + +static struct ins_ops lock_ops = { + .free = lock__delete, + .parse = lock__parse, + .scnprintf = lock__scnprintf, +}; + +/* + * Check if the operand has more than one registers like x86 SIB addressing: + * 0x1234(%rax, %rbx, 8) + * + * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check + * the input string after 'memory_ref_char' if exists. + */ +static bool check_multi_regs(struct arch *arch, const char *op) +{ + int count = 0; + + if (arch->objdump.register_char == 0) + return false; + + if (arch->objdump.memory_ref_char) { + op = strchr(op, arch->objdump.memory_ref_char); + if (op == NULL) + return false; + } + + while ((op = strchr(op, arch->objdump.register_char)) != NULL) { + count++; + op++; + } + + return count > 1; +} + +static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +{ + char *s = strchr(ops->raw, ','), *target, *comment, prev; + + if (s == NULL) + return -1; + + *s = '\0'; + + /* + * x86 SIB addressing has something like 0x8(%rax, %rcx, 1) + * then it needs to have the closing parenthesis. + */ + if (strchr(ops->raw, '(')) { + *s = ','; + s = strchr(ops->raw, ')'); + if (s == NULL || s[1] != ',') + return -1; + *++s = '\0'; + } + + ops->source.raw = strdup(ops->raw); + *s = ','; + + if (ops->source.raw == NULL) + return -1; + + ops->source.multi_regs = check_multi_regs(arch, ops->source.raw); + + target = skip_spaces(++s); + comment = strchr(s, arch->objdump.comment_char); + + if (comment != NULL) + s = comment - 1; + else + s = strchr(s, '\0') - 1; + + while (s > target && isspace(s[0])) + --s; + s++; + prev = *s; + *s = '\0'; + + ops->target.raw = strdup(target); + *s = prev; + + if (ops->target.raw == NULL) + goto out_free_source; + + ops->target.multi_regs = check_multi_regs(arch, ops->target.raw); + + if (comment == NULL) + return 0; + + comment = skip_spaces(comment); + comment__symbol(ops->source.raw, comment + 1, &ops->source.addr, &ops->source.name); + comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); + + return 0; + +out_free_source: + zfree(&ops->source.raw); + return -1; +} + +static int mov__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name, + ops->source.name ?: ops->source.raw, + ops->target.name ?: ops->target.raw); +} + +static struct ins_ops mov_ops = { + .parse = mov__parse, + .scnprintf = mov__scnprintf, +}; + +static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +{ + char *target, *comment, *s, prev; + + target = s = ops->raw; + + while (s[0] != '\0' && !isspace(s[0])) + ++s; + prev = *s; + *s = '\0'; + + ops->target.raw = strdup(target); + *s = prev; + + if (ops->target.raw == NULL) + return -1; + + comment = strchr(s, arch->objdump.comment_char); + if (comment == NULL) + return 0; + + comment = skip_spaces(comment); + comment__symbol(ops->target.raw, comment + 1, &ops->target.addr, &ops->target.name); + + return 0; +} + +static int dec__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->target.name ?: ops->target.raw); +} + +static struct ins_ops dec_ops = { + .parse = dec__parse, + .scnprintf = dec__scnprintf, +}; + +static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size, + struct ins_operands *ops __maybe_unused, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s", max_ins_name, "nop"); +} + +static struct ins_ops nop_ops = { + .scnprintf = nop__scnprintf, +}; + +static struct ins_ops ret_ops = { + .scnprintf = ins__raw_scnprintf, +}; + +bool ins__is_nop(const struct ins *ins) +{ + return ins->ops == &nop_ops; +} + +bool ins__is_ret(const struct ins *ins) +{ + return ins->ops == &ret_ops; +} + +bool ins__is_lock(const struct ins *ins) +{ + return ins->ops == &lock_ops; +} + +static int ins__key_cmp(const void *name, const void *insp) +{ + const struct ins *ins = insp; + + return strcmp(name, ins->name); +} + +static int ins__cmp(const void *a, const void *b) +{ + const struct ins *ia = a; + const struct ins *ib = b; + + return strcmp(ia->name, ib->name); +} + +static void ins__sort(struct arch *arch) +{ + const int nmemb = arch->nr_instructions; + + qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); +} + +static struct ins_ops *__ins__find(struct arch *arch, const char *name) +{ + struct ins *ins; + const int nmemb = arch->nr_instructions; + + if (!arch->sorted_instructions) { + ins__sort(arch); + arch->sorted_instructions = true; + } + + ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); + if (ins) + return ins->ops; + + if (arch->insn_suffix) { + char tmp[32]; + char suffix; + size_t len = strlen(name); + + if (len == 0 || len >= sizeof(tmp)) + return NULL; + + suffix = name[len - 1]; + if (strchr(arch->insn_suffix, suffix) == NULL) + return NULL; + + strcpy(tmp, name); + tmp[len - 1] = '\0'; /* remove the suffix and check again */ + + ins = bsearch(tmp, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); + } + return ins ? ins->ops : NULL; +} + +struct ins_ops *ins__find(struct arch *arch, const char *name) +{ + struct ins_ops *ops = __ins__find(arch, name); + + if (!ops && arch->associate_instruction_ops) + ops = arch->associate_instruction_ops(arch, name); + + return ops; +} + +static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) +{ + dl->ins.ops = ins__find(arch, dl->ins.name); + + if (!dl->ins.ops) + return; + + if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, ms) < 0) + dl->ins.ops = NULL; +} + +static int disasm_line__parse(char *line, const char **namep, char **rawp) +{ + char tmp, *name = skip_spaces(line); + + if (name[0] == '\0') + return -1; + + *rawp = name + 1; + + while ((*rawp)[0] != '\0' && !isspace((*rawp)[0])) + ++*rawp; + + tmp = (*rawp)[0]; + (*rawp)[0] = '\0'; + *namep = strdup(name); + + if (*namep == NULL) + goto out; + + (*rawp)[0] = tmp; + *rawp = strim(*rawp); + + return 0; + +out: + return -1; +} + +static void annotation_line__init(struct annotation_line *al, + struct annotate_args *args, + int nr) +{ + al->offset = args->offset; + al->line = strdup(args->line); + al->line_nr = args->line_nr; + al->fileloc = args->fileloc; + al->data_nr = nr; +} + +static void annotation_line__exit(struct annotation_line *al) +{ + zfree_srcline(&al->path); + zfree(&al->line); + zfree(&al->cycles); +} + +static size_t disasm_line_size(int nr) +{ + struct annotation_line *al; + + return (sizeof(struct disasm_line) + (sizeof(al->data[0]) * nr)); +} + +/* + * Allocating the disasm annotation line data with + * following structure: + * + * ------------------------------------------- + * struct disasm_line | struct annotation_line + * ------------------------------------------- + * + * We have 'struct annotation_line' member as last member + * of 'struct disasm_line' to have an easy access. + */ +struct disasm_line *disasm_line__new(struct annotate_args *args) +{ + struct disasm_line *dl = NULL; + int nr = 1; + + if (evsel__is_group_event(args->evsel)) + nr = args->evsel->core.nr_members; + + dl = zalloc(disasm_line_size(nr)); + if (!dl) + return NULL; + + annotation_line__init(&dl->al, args, nr); + if (dl->al.line == NULL) + goto out_delete; + + if (args->offset != -1) { + if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) + goto out_free_line; + + disasm_line__init_ins(dl, args->arch, &args->ms); + } + + return dl; + +out_free_line: + zfree(&dl->al.line); +out_delete: + free(dl); + return NULL; +} + +void disasm_line__free(struct disasm_line *dl) +{ + if (dl->ins.ops && dl->ins.ops->free) + dl->ins.ops->free(&dl->ops); + else + ins_ops__delete(&dl->ops); + zfree(&dl->ins.name); + annotation_line__exit(&dl->al); + free(dl); +} + +int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw, int max_ins_name) +{ + if (raw || !dl->ins.ops) + return scnprintf(bf, size, "%-*s %s", max_ins_name, dl->ins.name, dl->ops.raw); + + return ins__scnprintf(&dl->ins, bf, size, &dl->ops, max_ins_name); +} + +/* + * symbol__parse_objdump_line() parses objdump output (with -d --no-show-raw) + * which looks like following + * + * 0000000000415500 <_init>: + * 415500: sub $0x8,%rsp + * 415504: mov 0x2f5ad5(%rip),%rax # 70afe0 <_DYNAMIC+0x2f8> + * 41550b: test %rax,%rax + * 41550e: je 415515 <_init+0x15> + * 415510: callq 416e70 <__gmon_start__@plt> + * 415515: add $0x8,%rsp + * 415519: retq + * + * it will be parsed and saved into struct disasm_line as + * + * + * The offset will be a relative offset from the start of the symbol and -1 + * means that it's not a disassembly line so should be treated differently. + * The ops.raw part will be parsed further according to type of the instruction. + */ +static int symbol__parse_objdump_line(struct symbol *sym, + struct annotate_args *args, + char *parsed_line, int *line_nr, char **fileloc) +{ + struct map *map = args->ms.map; + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + char *tmp; + s64 line_ip, offset = -1; + regmatch_t match[2]; + + /* /filename:linenr ? Save line number and ignore. */ + if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) { + *line_nr = atoi(parsed_line + match[1].rm_so); + free(*fileloc); + *fileloc = strdup(parsed_line); + return 0; + } + + /* Process hex address followed by ':'. */ + line_ip = strtoull(parsed_line, &tmp, 16); + if (parsed_line != tmp && tmp[0] == ':' && tmp[1] != '\0') { + u64 start = map__rip_2objdump(map, sym->start), + end = map__rip_2objdump(map, sym->end); + + offset = line_ip - start; + if ((u64)line_ip < start || (u64)line_ip >= end) + offset = -1; + else + parsed_line = tmp + 1; + } + + args->offset = offset; + args->line = parsed_line; + args->line_nr = *line_nr; + args->fileloc = *fileloc; + args->ms.sym = sym; + + dl = disasm_line__new(args); + (*line_nr)++; + + if (dl == NULL) + return -1; + + if (!disasm_line__has_local_offset(dl)) { + dl->ops.target.offset = dl->ops.target.addr - + map__rip_2objdump(map, sym->start); + dl->ops.target.offset_avail = true; + } + + /* kcore has no symbols, so add the call target symbol */ + if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.sym) { + struct addr_map_symbol target = { + .addr = dl->ops.target.addr, + .ms = { .map = map, }, + }; + + if (!maps__find_ams(args->ms.maps, &target) && + target.ms.sym->start == target.al_addr) + dl->ops.target.sym = target.ms.sym; + } + + annotation_line__add(&dl->al, ¬es->src->source); + return 0; +} + +static void delete_last_nop(struct symbol *sym) +{ + struct annotation *notes = symbol__annotation(sym); + struct list_head *list = ¬es->src->source; + struct disasm_line *dl; + + while (!list_empty(list)) { + dl = list_entry(list->prev, struct disasm_line, al.node); + + if (dl->ins.ops) { + if (!ins__is_nop(&dl->ins)) + return; + } else { + if (!strstr(dl->al.line, " nop ") && + !strstr(dl->al.line, " nopl ") && + !strstr(dl->al.line, " nopw ")) + return; + } + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } +} + +int symbol__strerror_disassemble(struct map_symbol *ms, int errnum, char *buf, size_t buflen) +{ + struct dso *dso = map__dso(ms->map); + + BUG_ON(buflen == 0); + + if (errnum >= 0) { + str_error_r(errnum, buf, buflen); + return 0; + } + + switch (errnum) { + case SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX: { + char bf[SBUILD_ID_SIZE + 15] = " with build id "; + char *build_id_msg = NULL; + + if (dso->has_build_id) { + build_id__sprintf(&dso->bid, bf + 15); + build_id_msg = bf; + } + scnprintf(buf, buflen, + "No vmlinux file%s\nwas found in the path.\n\n" + "Note that annotation using /proc/kcore requires CAP_SYS_RAWIO capability.\n\n" + "Please use:\n\n" + " perf buildid-cache -vu vmlinux\n\n" + "or:\n\n" + " --vmlinux vmlinux\n", build_id_msg ?: ""); + } + break; + case SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF: + scnprintf(buf, buflen, "Please link with binutils's libopcode to enable BPF annotation"); + break; + case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP: + scnprintf(buf, buflen, "Problems with arch specific instruction name regular expressions."); + break; + case SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING: + scnprintf(buf, buflen, "Problems while parsing the CPUID in the arch specific initialization."); + break; + case SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE: + scnprintf(buf, buflen, "Invalid BPF file: %s.", dso->long_name); + break; + case SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF: + scnprintf(buf, buflen, "The %s BPF file has no BTF section, compile with -g or use pahole -J.", + dso->long_name); + break; + default: + scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum); + break; + } + + return 0; +} + +static int dso__disassemble_filename(struct dso *dso, char *filename, size_t filename_size) +{ + char linkname[PATH_MAX]; + char *build_id_filename; + char *build_id_path = NULL; + char *pos; + int len; + + if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && + !dso__is_kcore(dso)) + return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX; + + build_id_filename = dso__build_id_filename(dso, NULL, 0, false); + if (build_id_filename) { + __symbol__join_symfs(filename, filename_size, build_id_filename); + free(build_id_filename); + } else { + if (dso->has_build_id) + return ENOMEM; + goto fallback; + } + + build_id_path = strdup(filename); + if (!build_id_path) + return ENOMEM; + + /* + * old style build-id cache has name of XX/XXXXXXX.. while + * new style has XX/XXXXXXX../{elf,kallsyms,vdso}. + * extract the build-id part of dirname in the new style only. + */ + pos = strrchr(build_id_path, '/'); + if (pos && strlen(pos) < SBUILD_ID_SIZE - 2) + dirname(build_id_path); + + if (dso__is_kcore(dso)) + goto fallback; + + len = readlink(build_id_path, linkname, sizeof(linkname) - 1); + if (len < 0) + goto fallback; + + linkname[len] = '\0'; + if (strstr(linkname, DSO__NAME_KALLSYMS) || + access(filename, R_OK)) { +fallback: + /* + * If we don't have build-ids or the build-id file isn't in the + * cache, or is just a kallsyms file, well, lets hope that this + * DSO is the same as when 'perf record' ran. + */ + if (dso->kernel && dso->long_name[0] == '/') + snprintf(filename, filename_size, "%s", dso->long_name); + else + __symbol__join_symfs(filename, filename_size, dso->long_name); + + mutex_lock(&dso->lock); + if (access(filename, R_OK) && errno == ENOENT && dso->nsinfo) { + char *new_name = dso__filename_with_chroot(dso, filename); + if (new_name) { + strlcpy(filename, new_name, filename_size); + free(new_name); + } + } + mutex_unlock(&dso->lock); + } + + free(build_id_path); + return 0; +} + +#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +#define PACKAGE "perf" +#include +#include +#include +#include +#include +#include +#include + +#include "bpf-event.h" +#include "bpf-utils.h" + +static int symbol__disassemble_bpf(struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct bpf_prog_linfo *prog_linfo = NULL; + struct bpf_prog_info_node *info_node; + int len = sym->end - sym->start; + disassembler_ftype disassemble; + struct map *map = args->ms.map; + struct perf_bpil *info_linear; + struct disassemble_info info; + struct dso *dso = map__dso(map); + int pc = 0, count, sub_id; + struct btf *btf = NULL; + char tpath[PATH_MAX]; + size_t buf_size; + int nr_skip = 0; + char *buf; + bfd *bfdf; + int ret; + FILE *s; + + if (dso->binary_type != DSO_BINARY_TYPE__BPF_PROG_INFO) + return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; + + pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, + sym->name, sym->start, sym->end - sym->start); + + memset(tpath, 0, sizeof(tpath)); + perf_exe(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + if (bfdf == NULL) + abort(); + + if (!bfd_check_format(bfdf, bfd_object)) + abort(); + + s = open_memstream(&buf, &buf_size); + if (!s) { + ret = errno; + goto out; + } + init_disassemble_info_compat(&info, s, + (fprintf_ftype) fprintf, + fprintf_styled); + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + + info_node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, + dso->bpf_prog.id); + if (!info_node) { + ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; + goto out; + } + info_linear = info_node->info_linear; + sub_id = dso->bpf_prog.sub_id; + + info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); + info.buffer_length = info_linear->info.jited_prog_len; + + if (info_linear->info.nr_line_info) + prog_linfo = bpf_prog_linfo__new(&info_linear->info); + + if (info_linear->info.btf_id) { + struct btf_node *node; + + node = perf_env__find_btf(dso->bpf_prog.env, + info_linear->info.btf_id); + if (node) + btf = btf__new((__u8 *)(node->data), + node->data_size); + } + + disassemble_init_for_target(&info); + +#ifdef DISASM_FOUR_ARGS_SIGNATURE + disassemble = disassembler(info.arch, + bfd_big_endian(bfdf), + info.mach, + bfdf); +#else + disassemble = disassembler(bfdf); +#endif + if (disassemble == NULL) + abort(); + + fflush(s); + do { + const struct bpf_line_info *linfo = NULL; + struct disasm_line *dl; + size_t prev_buf_size; + const char *srcline; + u64 addr; + + addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; + count = disassemble(pc, &info); + + if (prog_linfo) + linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, + addr, sub_id, + nr_skip); + + if (linfo && btf) { + srcline = btf__name_by_offset(btf, linfo->line_off); + nr_skip++; + } else + srcline = NULL; + + fprintf(s, "\n"); + prev_buf_size = buf_size; + fflush(s); + + if (!annotate_opts.hide_src_code && srcline) { + args->offset = -1; + args->line = strdup(srcline); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) { + annotation_line__add(&dl->al, + ¬es->src->source); + } + } + + args->offset = pc; + args->line = buf + prev_buf_size; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + pc += count; + } while (count > 0 && pc < len); + + ret = 0; +out: + free(prog_linfo); + btf__free(btf); + fclose(s); + bfd_close(bfdf); + return ret; +} +#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, + struct annotate_args *args __maybe_unused) +{ + return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; +} +#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) + +static int +symbol__disassemble_bpf_image(struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + + args->offset = -1; + args->line = strdup("to be implemented"); + args->line_nr = 0; + args->fileloc = NULL; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + zfree(&args->line); + return 0; +} + +/* + * Possibly create a new version of line with tabs expanded. Returns the + * existing or new line, storage is updated if a new line is allocated. If + * allocation fails then NULL is returned. + */ +static char *expand_tabs(char *line, char **storage, size_t *storage_len) +{ + size_t i, src, dst, len, new_storage_len, num_tabs; + char *new_line; + size_t line_len = strlen(line); + + for (num_tabs = 0, i = 0; i < line_len; i++) + if (line[i] == '\t') + num_tabs++; + + if (num_tabs == 0) + return line; + + /* + * Space for the line and '\0', less the leading and trailing + * spaces. Each tab may introduce 7 additional spaces. + */ + new_storage_len = line_len + 1 + (num_tabs * 7); + + new_line = malloc(new_storage_len); + if (new_line == NULL) { + pr_err("Failure allocating memory for tab expansion\n"); + return NULL; + } + + /* + * Copy regions starting at src and expand tabs. If there are two + * adjacent tabs then 'src == i', the memcpy is of size 0 and the spaces + * are inserted. + */ + for (i = 0, src = 0, dst = 0; i < line_len && num_tabs; i++) { + if (line[i] == '\t') { + len = i - src; + memcpy(&new_line[dst], &line[src], len); + dst += len; + new_line[dst++] = ' '; + while (dst % 8 != 0) + new_line[dst++] = ' '; + src = i + 1; + num_tabs--; + } + } + + /* Expand the last region. */ + len = line_len - src; + memcpy(&new_line[dst], &line[src], len); + dst += len; + new_line[dst] = '\0'; + + free(*storage); + *storage = new_line; + *storage_len = new_storage_len; + return new_line; +} + +int symbol__disassemble(struct symbol *sym, struct annotate_args *args) +{ + struct annotation_options *opts = &annotate_opts; + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + char *command; + FILE *file; + char symfs_filename[PATH_MAX]; + struct kcore_extract kce; + bool delete_extract = false; + bool decomp = false; + int lineno = 0; + char *fileloc = NULL; + int nline; + char *line; + size_t line_len; + const char *objdump_argv[] = { + "/bin/sh", + "-c", + NULL, /* Will be the objdump command to run. */ + "--", + NULL, /* Will be the symfs path. */ + NULL, + }; + struct child_process objdump_process; + int err = dso__disassemble_filename(dso, symfs_filename, sizeof(symfs_filename)); + + if (err) + return err; + + pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, + symfs_filename, sym->name, map__unmap_ip(map, sym->start), + map__unmap_ip(map, sym->end)); + + pr_debug("annotating [%p] %30s : [%p] %30s\n", + dso, dso->long_name, sym, sym->name); + + if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { + return symbol__disassemble_bpf(sym, args); + } else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) { + return symbol__disassemble_bpf_image(sym, args); + } else if (dso__is_kcore(dso)) { + kce.kcore_filename = symfs_filename; + kce.addr = map__rip_2objdump(map, sym->start); + kce.offs = sym->start; + kce.len = sym->end - sym->start; + if (!kcore_extract__create(&kce)) { + delete_extract = true; + strlcpy(symfs_filename, kce.extract_filename, + sizeof(symfs_filename)); + } + } else if (dso__needs_decompress(dso)) { + char tmp[KMOD_DECOMP_LEN]; + + if (dso__decompress_kmodule_path(dso, symfs_filename, + tmp, sizeof(tmp)) < 0) + return -1; + + decomp = true; + strcpy(symfs_filename, tmp); + } + + err = asprintf(&command, + "%s %s%s --start-address=0x%016" PRIx64 + " --stop-address=0x%016" PRIx64 + " %s -d %s %s %s %c%s%c %s%s -C \"$1\"", + opts->objdump_path ?: "objdump", + opts->disassembler_style ? "-M " : "", + opts->disassembler_style ?: "", + map__rip_2objdump(map, sym->start), + map__rip_2objdump(map, sym->end), + opts->show_linenr ? "-l" : "", + opts->show_asm_raw ? "" : "--no-show-raw-insn", + opts->annotate_src ? "-S" : "", + opts->prefix ? "--prefix " : "", + opts->prefix ? '"' : ' ', + opts->prefix ?: "", + opts->prefix ? '"' : ' ', + opts->prefix_strip ? "--prefix-strip=" : "", + opts->prefix_strip ?: ""); + + if (err < 0) { + pr_err("Failure allocating memory for the command to run\n"); + goto out_remove_tmp; + } + + pr_debug("Executing: %s\n", command); + + objdump_argv[2] = command; + objdump_argv[4] = symfs_filename; + + /* Create a pipe to read from for stdout */ + memset(&objdump_process, 0, sizeof(objdump_process)); + objdump_process.argv = objdump_argv; + objdump_process.out = -1; + objdump_process.err = -1; + objdump_process.no_stderr = 1; + if (start_command(&objdump_process)) { + pr_err("Failure starting to run %s\n", command); + err = -1; + goto out_free_command; + } + + file = fdopen(objdump_process.out, "r"); + if (!file) { + pr_err("Failure creating FILE stream for %s\n", command); + /* + * If we were using debug info should retry with + * original binary. + */ + err = -1; + goto out_close_stdout; + } + + /* Storage for getline. */ + line = NULL; + line_len = 0; + + nline = 0; + while (!feof(file)) { + const char *match; + char *expanded_line; + + if (getline(&line, &line_len, file) < 0 || !line) + break; + + /* Skip lines containing "filename:" */ + match = strstr(line, symfs_filename); + if (match && match[strlen(symfs_filename)] == ':') + continue; + + expanded_line = strim(line); + expanded_line = expand_tabs(expanded_line, &line, &line_len); + if (!expanded_line) + break; + + /* + * The source code line number (lineno) needs to be kept in + * across calls to symbol__parse_objdump_line(), so that it + * can associate it with the instructions till the next one. + * See disasm_line__new() and struct disasm_line::line_nr. + */ + if (symbol__parse_objdump_line(sym, args, expanded_line, + &lineno, &fileloc) < 0) + break; + nline++; + } + free(line); + free(fileloc); + + err = finish_command(&objdump_process); + if (err) + pr_err("Error running %s\n", command); + + if (nline == 0) { + err = -1; + pr_err("No output from %s\n", command); + } + + /* + * kallsyms does not have symbol sizes so there may a nop at the end. + * Remove it. + */ + if (dso__is_kcore(dso)) + delete_last_nop(sym); + + fclose(file); + +out_close_stdout: + close(objdump_process.out); + +out_free_command: + free(command); + +out_remove_tmp: + if (decomp) + unlink(symfs_filename); + + if (delete_extract) + kcore_extract__delete(&kce); + + return err; +} diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h new file mode 100644 index 000000000000..3d381a043520 --- /dev/null +++ b/tools/perf/util/disasm.h @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __PERF_UTIL_DISASM_H +#define __PERF_UTIL_DISASM_H + +#include "map_symbol.h" + +struct annotation_options; +struct disasm_line; +struct ins; +struct evsel; +struct symbol; + +struct arch { + const char *name; + struct ins *instructions; + size_t nr_instructions; + size_t nr_instructions_allocated; + struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); + bool sorted_instructions; + bool initialized; + const char *insn_suffix; + void *priv; + unsigned int model; + unsigned int family; + int (*init)(struct arch *arch, char *cpuid); + bool (*ins_is_fused)(struct arch *arch, const char *ins1, + const char *ins2); + struct { + char comment_char; + char skip_functions_char; + char register_char; + char memory_ref_char; + char imm_char; + } objdump; +}; + +struct ins { + const char *name; + struct ins_ops *ops; +}; + +struct ins_operands { + char *raw; + struct { + char *raw; + char *name; + struct symbol *sym; + u64 addr; + s64 offset; + bool offset_avail; + bool outside; + bool multi_regs; + } target; + union { + struct { + char *raw; + char *name; + u64 addr; + bool multi_regs; + } source; + struct { + struct ins ins; + struct ins_operands *ops; + } locked; + struct { + char *raw_comment; + char *raw_func_start; + } jump; + }; +}; + +struct ins_ops { + void (*free)(struct ins_operands *ops); + int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms); + int (*scnprintf)(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +}; + +struct annotate_args { + struct arch *arch; + struct map_symbol ms; + struct evsel *evsel; + struct annotation_options *options; + s64 offset; + char *line; + int line_nr; + char *fileloc; +}; + +struct arch *arch__find(const char *name); +bool arch__is(struct arch *arch, const char *name); + +struct ins_ops *ins__find(struct arch *arch, const char *name); +int ins__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); + +bool ins__is_call(const struct ins *ins); +bool ins__is_jump(const struct ins *ins); +bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2); +bool ins__is_nop(const struct ins *ins); +bool ins__is_ret(const struct ins *ins); +bool ins__is_lock(const struct ins *ins); + +struct disasm_line *disasm_line__new(struct annotate_args *args); +void disasm_line__free(struct disasm_line *dl); + +int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, + bool raw, int max_ins_name); + +int symbol__disassemble(struct symbol *sym, struct annotate_args *args); + +#endif /* __PERF_UTIL_DISASM_H */ -- cgit v1.2.3 From 6d17edc113de1e21fc66afa76be475a4f7c91826 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 29 Mar 2024 14:58:11 -0700 Subject: perf annotate: Use libcapstone to disassemble Now it can use the capstone library to disassemble the instructions. Let's use that (if available) for perf annotate to speed up. Currently it only supports x86 architecture. With this change I can see ~3x speed up in data type profiling. But note that capstone cannot give the source file and line number info. For now, users should use the external objdump for that by specifying the --objdump option explicitly. Signed-off-by: Namhyung Kim Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240329215812.537846-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 21a43b07e8b5..6552b45e37ec 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include #include +#include #include #include #include @@ -19,6 +20,7 @@ #include "evsel.h" #include "map.h" #include "maps.h" +#include "namespaces.h" #include "srcline.h" #include "symbol.h" #include "util.h" @@ -1346,6 +1348,158 @@ symbol__disassemble_bpf_image(struct symbol *sym, return 0; } +#ifdef HAVE_LIBCAPSTONE_SUPPORT +#include + +static int open_capstone_handle(struct annotate_args *args, bool is_64bit, + csh *handle) +{ + struct annotation_options *opt = args->options; + cs_mode mode = is_64bit ? CS_MODE_64 : CS_MODE_32; + + /* TODO: support more architectures */ + if (!arch__is(args->arch, "x86")) + return -1; + + if (cs_open(CS_ARCH_X86, mode, handle) != CS_ERR_OK) + return -1; + + if (!opt->disassembler_style || + !strcmp(opt->disassembler_style, "att")) + cs_option(*handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); + + return 0; +} + +struct find_file_offset_data { + u64 ip; + u64 offset; +}; + +/* This will be called for each PHDR in an ELF binary */ +static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) +{ + struct find_file_offset_data *data = arg; + + if (start <= data->ip && data->ip < start + len) { + data->offset = pgoff + data->ip - start; + return 1; + } + return 0; +} + +static int symbol__disassemble_capstone(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + struct nscookie nsc; + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + u64 len = end - start; + u64 offset; + int i, fd, count; + bool is_64bit = false; + bool needs_cs_close = false; + u8 *buf = NULL; + struct find_file_offset_data data = { + .ip = start, + }; + csh handle; + cs_insn *insn; + char disasm_buf[512]; + struct disasm_line *dl; + + if (args->options->objdump_path) + return -1; + + nsinfo__mountns_enter(dso->nsinfo, &nsc); + fd = open(filename, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fd < 0) + return -1; + + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, + &is_64bit) == 0) + goto err; + + if (open_capstone_handle(args, is_64bit, &handle) < 0) + goto err; + + needs_cs_close = true; + + buf = malloc(len); + if (buf == NULL) + goto err; + + count = pread(fd, buf, len, data.offset); + close(fd); + fd = -1; + + if ((u64)count != len) + goto err; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + count = cs_disasm(handle, buf, len, start, len, &insn); + for (i = 0, offset = 0; i < count; i++) { + scnprintf(disasm_buf, sizeof(disasm_buf), + " %-7s %s", + insn[i].mnemonic, insn[i].op_str); + + args->offset = offset; + args->line = disasm_buf; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + offset += insn[i].size; + } + +out: + if (needs_cs_close) + cs_close(&handle); + free(buf); + return count < 0 ? count : 0; + +err: + if (fd >= 0) + close(fd); + if (needs_cs_close) { + struct disasm_line *tmp; + + /* + * It probably failed in the middle of the above loop. + * Release any resources it might add. + */ + list_for_each_entry_safe(dl, tmp, ¬es->src->source, al.node) { + list_del(&dl->al.node); + free(dl); + } + } + count = -1; + goto out; +} +#endif + /* * Possibly create a new version of line with tabs expanded. Returns the * existing or new line, storage is updated if a new line is allocated. If @@ -1468,6 +1622,12 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) strcpy(symfs_filename, tmp); } +#ifdef HAVE_LIBCAPSTONE_SUPPORT + err = symbol__disassemble_capstone(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; +#endif + err = asprintf(&command, "%s %s%s --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 -- cgit v1.2.3 From 92dfc59463d55b9abb47429591fd8a21b27930cf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 29 Mar 2024 14:58:12 -0700 Subject: perf annotate: Add symbol name when using capstone This is to keep the existing behavior with objdump. It needs to show symbol information of global variables like below: Percent | Source code & Disassembly of elf for cycles:P (1 samples, percent: local period) ------------------------------------------------------------------------------------------------ : 0 0xffffffff81338f70 : 0.00 : ffffffff81338f70: endbr64 0.00 : ffffffff81338f74: callq 0xffffffff81083a40 0.00 : ffffffff81338f79: movq %rdi, %r8 0.00 : ffffffff81338f7c: movq %rdx, %rdi 0.00 : ffffffff81338f7f: callq *0x17021c3(%rip) # ffffffff82a3b148 0.00 : ffffffff81338f85: movq 0xffbf3c(%rip), %rdx # ffffffff82334ec8 0.00 : ffffffff81338f8c: testq %rax, %rax ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 0.00 : ffffffff81338f8f: je 0xffffffff81338fd0 here 0.00 : ffffffff81338f91: movq %rax, %rcx 0.00 : ffffffff81338f94: andl $1, %ecx Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Changbin Du Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240329215812.537846-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 6552b45e37ec..a1219eb930aa 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1368,6 +1368,12 @@ static int open_capstone_handle(struct annotate_args *args, bool is_64bit, !strcmp(opt->disassembler_style, "att")) cs_option(*handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); + /* + * Resolving address operands to symbols is implemented + * on x86 by investigating instruction details. + */ + cs_option(*handle, CS_OPT_DETAIL, CS_OPT_ON); + return 0; } @@ -1388,6 +1394,63 @@ static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) return 0; } +static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, + struct annotate_args *args, u64 addr) +{ + int i; + struct map *map = args->ms.map; + struct symbol *sym; + + /* TODO: support more architectures */ + if (!arch__is(args->arch, "x86")) + return; + + if (insn->detail == NULL) + return; + + for (i = 0; i < insn->detail->x86.op_count; i++) { + cs_x86_op *op = &insn->detail->x86.operands[i]; + u64 orig_addr; + + if (op->type != X86_OP_MEM) + continue; + + /* only print RIP-based global symbols for now */ + if (op->mem.base != X86_REG_RIP) + continue; + + /* get the target address */ + orig_addr = addr + insn->size + op->mem.disp; + addr = map__objdump_2mem(map, orig_addr); + + if (map__dso(map)->kernel) { + /* + * The kernel maps can be splitted into sections, + * let's find the map first and the search the symbol. + */ + map = maps__find(map__kmaps(map), addr); + if (map == NULL) + continue; + } + + /* convert it to map-relative address for search */ + addr = map__map_ip(map, addr); + + sym = map__find_symbol(map, addr); + if (sym == NULL) + continue; + + if (addr == sym->start) { + scnprintf(buf, len, "\t# %"PRIx64" <%s>", + orig_addr, sym->name); + } else { + scnprintf(buf, len, "\t# %"PRIx64" <%s+%#"PRIx64">", + orig_addr, sym->name, addr - sym->start); + } + break; + } +} + static int symbol__disassemble_capstone(char *filename, struct symbol *sym, struct annotate_args *args) { @@ -1458,9 +1521,14 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, count = cs_disasm(handle, buf, len, start, len, &insn); for (i = 0, offset = 0; i < count; i++) { - scnprintf(disasm_buf, sizeof(disasm_buf), - " %-7s %s", - insn[i].mnemonic, insn[i].op_str); + int printed; + + printed = scnprintf(disasm_buf, sizeof(disasm_buf), + " %-7s %s", + insn[i].mnemonic, insn[i].op_str); + print_capstone_detail(&insn[i], disasm_buf + printed, + sizeof(disasm_buf) - printed, args, + start + offset); args->offset = offset; args->line = disasm_buf; -- cgit v1.2.3 From 089ef2f4c8b9fa609b99c96592a60c8c025dca3f Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Wed, 3 Apr 2024 20:25:58 +0800 Subject: perf beauty: Fix AT_EACCESS undeclared build error for system with kernel versions lower than v5.8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the environment of ubuntu 20.04 (the version of kernel headers is 5.4), there is an error in building perf: CC trace/beauty/fs_at_flags.o trace/beauty/fs_at_flags.c: In function ‘faccessat2__scnprintf_flags’: trace/beauty/fs_at_flags.c:35:14: error: ‘AT_EACCESS’ undeclared (first use in this function); did you mean ‘DN_ACCESS’? 35 | if (flags & AT_EACCESS) { | ^~~~~~~~~~ | DN_ACCESS trace/beauty/fs_at_flags.c:35:14: note: each undeclared identifier is reported only once for each function it appears in commit 8a1ad4413519b10f ("tools headers: Remove now unused copies of uapi/{fcntl,openat2}.h and asm/fcntl.h") removes fcntl.h from tools headers directory, and fs_at_flags.c uses the 'AT_EACCESS' macro. This macro was introduced in the kernel version v5.8. For system with a kernel version older than this version, it will cause compilation to fail. Fixes: 8a1ad4413519b10f ("tools headers: Remove now unused copies of uapi/{fcntl,openat2}.h and asm/fcntl.h") Signed-off-by: Yang Jihong Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240403122558.1438841-1-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/fs_at_flags.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/perf/trace/beauty/fs_at_flags.c b/tools/perf/trace/beauty/fs_at_flags.c index c1365e8f0b96..c200669cb944 100644 --- a/tools/perf/trace/beauty/fs_at_flags.c +++ b/tools/perf/trace/beauty/fs_at_flags.c @@ -10,6 +10,14 @@ #include #include +/* + * uapi/linux/fcntl.h does not keep a copy in tools headers directory, + * for system with kernel versions before v5.8, need to sync AT_EACCESS macro. + */ +#ifndef AT_EACCESS +#define AT_EACCESS 0x200 +#endif + #include "trace/beauty/generated/fs_at_flags_array.c" static DEFINE_STRARRAY(fs_at_flags, "AT_"); -- cgit v1.2.3 From 4793cb599b1bdc3d356f0374c2c99ffe890ae876 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Wed, 27 Mar 2024 10:44:37 +0800 Subject: selftests: cgroup: skip test_cgcore_lesser_ns_open when cgroup2 mounted without nsdelegate The test case test_cgcore_lesser_ns_open only tasks effect when cgroup2 is mounted with "nsdelegate" mount option. If it misses this option, or is remounted without "nsdelegate", the test case will fail. For example, running bpf/test_cgroup_storage first, and then run cgroup/test_core will fail on test_cgcore_lesser_ns_open. Skip it if "nsdelegate" is not detected in cgroup2 mount options. Fixes: bf35a7879f1d ("selftests: cgroup: Test open-time cgroup namespace usage for migration checks") Signed-off-by: Tianchen Ding Reviewed-by: Muhammad Usama Anjum Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/cgroup_util.c | 8 +++++--- tools/testing/selftests/cgroup/cgroup_util.h | 2 +- tools/testing/selftests/cgroup/test_core.c | 7 ++++++- tools/testing/selftests/cgroup/test_cpu.c | 2 +- tools/testing/selftests/cgroup/test_cpuset.c | 2 +- tools/testing/selftests/cgroup/test_freezer.c | 2 +- tools/testing/selftests/cgroup/test_hugetlb_memcg.c | 2 +- tools/testing/selftests/cgroup/test_kill.c | 2 +- tools/testing/selftests/cgroup/test_kmem.c | 2 +- tools/testing/selftests/cgroup/test_memcontrol.c | 2 +- tools/testing/selftests/cgroup/test_zswap.c | 2 +- 11 files changed, 20 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 0340d4ca8f51..432db923bced 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -195,10 +195,10 @@ int cg_write_numeric(const char *cgroup, const char *control, long value) return cg_write(cgroup, control, buf); } -int cg_find_unified_root(char *root, size_t len) +int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) { char buf[10 * PAGE_SIZE]; - char *fs, *mount, *type; + char *fs, *mount, *type, *options; const char delim[] = "\n\t "; if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0) @@ -211,12 +211,14 @@ int cg_find_unified_root(char *root, size_t len) for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) { mount = strtok(NULL, delim); type = strtok(NULL, delim); - strtok(NULL, delim); + options = strtok(NULL, delim); strtok(NULL, delim); strtok(NULL, delim); if (strcmp(type, "cgroup2") == 0) { strncpy(root, mount, len); + if (nsdelegate) + *nsdelegate = !!strstr(options, "nsdelegate"); return 0; } } diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 1df7f202214a..89e8519fb271 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -21,7 +21,7 @@ static inline int values_close(long a, long b, int err) return abs(a - b) <= (a + b) / 100 * err; } -extern int cg_find_unified_root(char *root, size_t len); +extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate); extern char *cg_name(const char *root, const char *name); extern char *cg_name_indexed(const char *root, const char *name, int index); extern char *cg_control(const char *cgroup, const char *control); diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 80aa6b2373b9..a5672a91d273 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -18,6 +18,8 @@ #include "../kselftest.h" #include "cgroup_util.h" +static bool nsdelegate; + static int touch_anon(char *buf, size_t size) { int fd; @@ -775,6 +777,9 @@ static int test_cgcore_lesser_ns_open(const char *root) pid_t pid; int status; + if (!nsdelegate) + return KSFT_SKIP; + cg_test_a = cg_name(root, "cg_test_a"); cg_test_b = cg_name(root, "cg_test_b"); @@ -862,7 +867,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index 24020a2c68dc..186bf96f6a28 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -700,7 +700,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "cpu")) diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c index b061ed1e05b4..4034d14ba69a 100644 --- a/tools/testing/selftests/cgroup/test_cpuset.c +++ b/tools/testing/selftests/cgroup/test_cpuset.c @@ -249,7 +249,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset")) diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 8845353aca53..8730645d363a 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -827,7 +827,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c index f0fefeb4cc24..856f9508ea56 100644 --- a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -214,7 +214,7 @@ int main(int argc, char **argv) return ret; } - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); switch (test_hugetlb_memcg(root)) { diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c index 6153690319c9..0e5bb6c7307a 100644 --- a/tools/testing/selftests/cgroup/test_kill.c +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -276,7 +276,7 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index c82f974b85c9..137506db0312 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -420,7 +420,7 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); /* diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index c7c9572003a8..b462416b3806 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1314,7 +1314,7 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, proc_status, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); /* diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index f0e488ed90d8..ef7f39545317 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -440,7 +440,7 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root))) + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (!zswap_configured()) -- cgit v1.2.3 From baa2ca59ec1e31ccbe3f24ff0368152b36f68720 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Thu, 14 Mar 2024 14:30:00 +0800 Subject: perf build: Add LIBTRACEEVENT_DIR build option Currently, when libtraceevent is not linked, perf does not support tracepoint: # ./perf record -e sched:sched_switch -a sleep 10 event syntax error: 'sched:sched_switch' \___ unsupported tracepoint libtraceevent is necessary for tracepoint support Run 'perf list' for a list of valid events Usage: perf record [] [] or: perf record [] -- [] -e, --event event selector. use 'perf list' to list available events For cross-compilation scenario, library may not be installed in the default system path. Based on the above requirements, add LIBTRACEEVENT_DIR build option to support specifying path of libtraceevent. Example: 1. Cross compile libtraceevent # cd /opt/libtraceevent # CROSS_COMPILE=aarch64-linux-gnu- make 2. Cross compile perf # cd tool/perf # make VF=1 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- NO_LIBELF=1 LDFLAGS=--static LIBTRACEEVENT_DIR=/opt/libtraceevent Auto-detecting system features: ... LIBTRACEEVENT_DIR: /opt/libtraceevent Reviewed-by: Ian Rogers Signed-off-by: Yang Jihong Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240314063000.2139877-1-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 1fe8df97fe88..7783479de691 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -182,6 +182,16 @@ endif FEATURE_CHECK_CFLAGS-libzstd := $(LIBZSTD_CFLAGS) FEATURE_CHECK_LDFLAGS-libzstd := $(LIBZSTD_LDFLAGS) +# for linking with debug library, run like: +# make DEBUG=1 LIBTRACEEVENT_DIR=/opt/libtraceevent/ +TRACEEVENTLIBS := -ltraceevent +ifdef LIBTRACEEVENT_DIR + LIBTRACEEVENT_CFLAGS := -I$(LIBTRACEEVENT_DIR)/include + LIBTRACEEVENT_LDFLAGS := -L$(LIBTRACEEVENT_DIR)/lib +endif +FEATURE_CHECK_CFLAGS-libtraceevent := $(LIBTRACEEVENT_CFLAGS) +FEATURE_CHECK_LDFLAGS-libtraceevent := $(LIBTRACEEVENT_LDFLAGS) $(TRACEEVENTLIBS) + FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi # include ARCH specific config -include $(src-perf)/arch/$(SRCARCH)/Makefile @@ -1165,9 +1175,10 @@ endif ifneq ($(NO_LIBTRACEEVENT),1) $(call feature_check,libtraceevent) ifeq ($(feature-libtraceevent), 1) - CFLAGS += -DHAVE_LIBTRACEEVENT - EXTLIBS += -ltraceevent - LIBTRACEEVENT_VERSION := $(shell $(PKG_CONFIG) --modversion libtraceevent) + CFLAGS += -DHAVE_LIBTRACEEVENT $(LIBTRACEEVENT_CFLAGS) + LDFLAGS += $(LIBTRACEEVENT_LDFLAGS) + EXTLIBS += ${TRACEEVENTLIBS} + LIBTRACEEVENT_VERSION := $(shell PKG_CONFIG_PATH=$(LIBTRACEEVENT_DIR) $(PKG_CONFIG) --modversion libtraceevent) LIBTRACEEVENT_VERSION_1 := $(word 1, $(subst ., ,$(LIBTRACEEVENT_VERSION))) LIBTRACEEVENT_VERSION_2 := $(word 2, $(subst ., ,$(LIBTRACEEVENT_VERSION))) LIBTRACEEVENT_VERSION_3 := $(word 3, $(subst ., ,$(LIBTRACEEVENT_VERSION))) @@ -1175,7 +1186,7 @@ ifneq ($(NO_LIBTRACEEVENT),1) CFLAGS += -DLIBTRACEEVENT_VERSION=$(LIBTRACEEVENT_VERSION_CPP) $(call detected,CONFIG_LIBTRACEEVENT) else - $(error ERROR: libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel or build with NO_LIBTRACEEVENT=1) + $(error ERROR: libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel and/or set LIBTRACEEVENT_DIR or build with NO_LIBTRACEEVENT=1) endif $(call feature_check,libtracefs) @@ -1301,6 +1312,7 @@ ifeq ($(VF),1) $(call print_var,LIBUNWIND_DIR) $(call print_var,LIBDW_DIR) $(call print_var,JDIR) + $(call print_var,LIBTRACEEVENT_DIR) ifeq ($(dwarf-post-unwind),1) $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) $(info $(MSG)) -- cgit v1.2.3 From b6347cb5e04e9c1d17342ab46e2ace2d448de727 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 3 Apr 2024 11:44:19 -0300 Subject: perf annotate: Initialize 'arch' variable not to trip some -Werror=maybe-uninitialized In some older distros the build is failing due to -Werror=maybe-uninitialized, in this case we know that this isn't the case because 'arch' gets initialized by evsel__get_arch(), so make sure it is initialized to NULL before returning from evsel__get_arch(), as suggested by Ian Rogers. E.g.: 32 17.12 opensuse:15.5 : FAIL gcc version 7.5.0 (SUSE Linux) util/annotate.c: In function 'hist_entry__get_data_type': util/annotate.c:2269:15: error: 'arch' may be used uninitialized in this function [-Werror=maybe-uninitialized] struct arch *arch; ^~~~ cc1: all warnings being treated as errors 43 7.30 ubuntu:18.04-x-powerpc64el : FAIL gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04) util/annotate.c: In function 'hist_entry__get_data_type': util/annotate.c:2351:36: error: 'arch' may be used uninitialized in this function [-Werror=maybe-uninitialized] if (map__dso(ms->map)->kernel && arch__is(arch, "x86") && ^~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fUqtjxAsmdGrnkjhUTLHs-JvV10TtxyocpYDJK_+LYTiQ@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index b795f27f2602..35235147b111 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -838,8 +838,10 @@ static int evsel__get_arch(struct evsel *evsel, struct arch **parch) struct arch *arch; int err; - if (!arch_name) + if (!arch_name) { + *parch = NULL; return errno; + } *parch = arch = arch__find(arch_name); if (arch == NULL) { -- cgit v1.2.3 From ca3e10c4d83ad6bb611d07857aade7078c87ad99 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 2 Apr 2024 13:39:52 -0700 Subject: tools: ynl: ethtool.py: Make tool invokable from any CWD ethtool.py depends on yml files in a specific location of the linux kernel tree. Using relative lookup for those files means that ethtool.py would need to be run under tools/net/ynl/. Lookup needed yml files without depending on the current working directory that ethtool.py is invoked from. Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Link: https://lore.kernel.org/r/20240402204000.115081-1-rrameshbabu@nvidia.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ethtool.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/ethtool.py index 6c9f7e31250c..44ba3ba58ed9 100755 --- a/tools/net/ynl/ethtool.py +++ b/tools/net/ynl/ethtool.py @@ -6,6 +6,7 @@ import json import pprint import sys import re +import os from lib import YnlFamily @@ -152,8 +153,11 @@ def main(): global args args = parser.parse_args() - spec = '../../../Documentation/netlink/specs/ethtool.yaml' - schema = '../../../Documentation/netlink/genetlink-legacy.yaml' + script_abs_dir = os.path.dirname(os.path.abspath(sys.argv[0])) + spec = os.path.join(script_abs_dir, + '../../../Documentation/netlink/specs/ethtool.yaml') + schema = os.path.join(script_abs_dir, + '../../../Documentation/netlink/genetlink-legacy.yaml') ynl = YnlFamily(spec, schema) -- cgit v1.2.3 From fe09d2e314e5501e4673018270e6e1ed92e6e446 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Jan 2024 04:15:42 -0800 Subject: scftorture: Increase memory provided to guest OS The tradition, extending back almost a full year, has been 2GB plus an additional number of GBs equal to the number of CPUs divided by sixteen. This tradition has served scftorture well, even the CONFIG_PREEMPT=y version running KASAN within guest OSes having 40 CPUs. However, this test recently started OOMing on larger systems, and this commit therefore gives this test an additional GB of memory. It is quite possible that further testing on larger systems will show a need to decrease the divisor from 16 to (say) 8, but that is a change to make once it has been demonstrated to be required. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index bbac5f4b03d0..42f0aee09e51 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -425,7 +425,7 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. - scfmem=$((2+HALF_ALLOTED_CPUS/16)) + scfmem=$((3+HALF_ALLOTED_CPUS/16)) torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi -- cgit v1.2.3 From 9e97ea7796412e23b8ecc946fd0488a2de39ea76 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Feb 2024 03:11:25 -0800 Subject: rcutorture: Disable tracing to permit Tasks Rude RCU testing Now that the KPROBES, TRACING, BLK_DEV_IO_TRACE, and UPROBE_EVENTS Kconfig options select the TASKS_TRACE_RCU option, the torture.sh tests of enabling exactly one of the RCU Tasks flavors fail. This commit therefore disables these options to allow this testing to succeed. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 42f0aee09e51..13875ee7b050 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -391,7 +391,7 @@ __EOF__ forceflavor="`echo $flavor | sed -e 's/^CONFIG/CONFIG_FORCE/'`" deselectedflavors="`grep -v $flavor $T/rcutasksflavors | tr '\012' ' ' | tr -s ' ' | sed -e 's/ *$//'`" echo " --- Running RCU Tasks Trace flavor $flavor `date`" >> $rtfdir/log - tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y CONFIG_KPROBES=n CONFIG_RCU_TRACE=n CONFIG_TRACING=n CONFIG_BLK_DEV_IO_TRACE=n CONFIG_UPROBE_EVENTS=n $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 retcode=$? if test "$retcode" -ne 0 then -- cgit v1.2.3 From b269d2b4a5232c6130b5df83fb112f4029df4544 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Apr 2024 19:34:21 -0700 Subject: tools: ynl: copy netlink error to NlError Typing e.nl_msg.error when processing exception is a bit tedious and counter-intuitive. Set a local .error member to the positive value of the netlink level error. Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20240403023426.1762996-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 82d3c98067aa..b30210f537f7 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -100,9 +100,10 @@ class Netlink: class NlError(Exception): def __init__(self, nl_msg): self.nl_msg = nl_msg + self.error = -nl_msg.error def __str__(self): - return f"Netlink error: {os.strerror(-self.nl_msg.error)}\n{self.nl_msg}" + return f"Netlink error: {os.strerror(self.error)}\n{self.nl_msg}" class ConfigError(Exception): -- cgit v1.2.3 From b74bc5a633a7d72f89141d481d835e73bda3c3ae Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 4 Apr 2024 08:48:05 +0200 Subject: perf report: Fix PAI counter names for s390 virtual machines s390 introduced the Processor Activity Instrumentation (PAI) counter facility on LPAR and virtual machines z/VM for models 3931 and 3932. These counters are stored as raw data in the perf.data file and are displayed with: # perf report -i /tmp//perfout-635468 -D | grep Counter Counter:007 Value:0x00000000000186a0 Counter:032 Value:0x0000000000000001 Counter:032 Value:0x0000000000000001 Counter:032 Value:0x0000000000000001 # However on z/VM virtual machines, the counter names are not retrieved from the PMU and are shown as ''. This is caused by the CPU string saved in the mapfile.csv for this machine: ^IBM.393[12].*3\.7.[[:xdigit:]]+$,3,cf_z16,core This string contains the CPU Measurement facility first and second version number and authorization level (3\.7.[[:xdigit:]]+). These numbers do not apply to the PAI counter facility. In fact they can be omitted. Shorten the CPU identification string for this machine to manufacturer and model. This is sufficient for all PMU devices. Output after: # perf report -i /tmp//perfout-635468 -D | grep Counter Counter:007 km_aes_128 Value:0x00000000000186a0 Counter:032 kma_gcm_aes_256 Value:0x0000000000000001 Counter:032 kma_gcm_aes_256 Value:0x0000000000000001 Counter:032 kma_gcm_aes_256 Value:0x0000000000000001 # Fixes: b539deafbadb2fc6 ("perf report: Add s390 raw data interpretation for PAI counters") Reviewed-by: Ian Rogers Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Cc: Heiko Carstens Cc: Namhyung Kim Cc: Sven Schnelle Cc: Thomas Richter Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20240404064806.1362876-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/s390/mapfile.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/s390/mapfile.csv b/tools/perf/pmu-events/arch/s390/mapfile.csv index a918e1af77a5..b22648d12751 100644 --- a/tools/perf/pmu-events/arch/s390/mapfile.csv +++ b/tools/perf/pmu-events/arch/s390/mapfile.csv @@ -5,4 +5,4 @@ Family-model,Version,Filename,EventType ^IBM.296[45].*[13]\.[1-5].[[:xdigit:]]+$,1,cf_z13,core ^IBM.390[67].*[13]\.[1-5].[[:xdigit:]]+$,3,cf_z14,core ^IBM.856[12].*3\.6.[[:xdigit:]]+$,3,cf_z15,core -^IBM.393[12].*3\.7.[[:xdigit:]]+$,3,cf_z16,core +^IBM.393[12].*$,3,cf_z16,core -- cgit v1.2.3 From c2f3d7dfc7373d53286f2a5c882d3397a5070adc Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 4 Apr 2024 08:48:06 +0200 Subject: perf stat: Do not fail on metrics on s390 z/VM systems On s390 z/VM virtual machines command 'perf list' also displays metrics: # perf list | grep -A 20 'Metric Groups:' Metric Groups: No_group: cpi [Cycles per Instruction] est_cpi [Estimated Instruction Complexity CPI infinite Level 1] finite_cpi [Cycles per Instructions from Finite cache/memory] l1mp [Level One Miss per 100 Instructions] l2p [Percentage sourced from Level 2 cache] l3p [Percentage sourced from Level 3 on same chip cache] l4lp [Percentage sourced from Level 4 Local cache on same book] l4rp [Percentage sourced from Level 4 Remote cache on different book] memp [Percentage sourced from memory] .... # The command # perf stat -M cpi -- true event syntax error: '{CPU_CYCLES/metric-id=CPU_CYCLES/.....' \___ Bad event or PMU Unable to find PMU or event on a PMU of 'CPU_CYCLES' event syntax error: '{CPU_CYCLES/metric-id=CPU_CYCLES/...' \___ Cannot find PMU `CPU_CYCLES'. Missing kernel support? # fails. 'perf stat' should not fail on metrics when the referenced CPU Counter Measurement PMU is not available. Output after: # perf stat -M est_cpi -- sleep 1 Performance counter stats for 'sleep 1': 1,000,887,494 ns duration_time # 0.00 est_cpi 1.000887494 seconds time elapsed 0.000143000 seconds user 0.000662000 seconds sys # Fixes: 7f76b31130680fb3 ("perf list: Add IBM z16 event description for s390") Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Namhyung Kim Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20240404064806.1362876-2-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/s390/cf_z16/transaction.json | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/s390/cf_z16/transaction.json b/tools/perf/pmu-events/arch/s390/cf_z16/transaction.json index ec2ff78e2b5f..3ab1d3a6638c 100644 --- a/tools/perf/pmu-events/arch/s390/cf_z16/transaction.json +++ b/tools/perf/pmu-events/arch/s390/cf_z16/transaction.json @@ -2,71 +2,71 @@ { "BriefDescription": "Transaction count", "MetricName": "transaction", - "MetricExpr": "TX_C_TEND + TX_NC_TEND + TX_NC_TABORT + TX_C_TABORT_SPECIAL + TX_C_TABORT_NO_SPECIAL" + "MetricExpr": "TX_C_TEND + TX_NC_TEND + TX_NC_TABORT + TX_C_TABORT_SPECIAL + TX_C_TABORT_NO_SPECIAL if has_event(TX_C_TEND) else 0" }, { "BriefDescription": "Cycles per Instruction", "MetricName": "cpi", - "MetricExpr": "CPU_CYCLES / INSTRUCTIONS" + "MetricExpr": "CPU_CYCLES / INSTRUCTIONS if has_event(INSTRUCTIONS) else 0" }, { "BriefDescription": "Problem State Instruction Ratio", "MetricName": "prbstate", - "MetricExpr": "(PROBLEM_STATE_INSTRUCTIONS / INSTRUCTIONS) * 100" + "MetricExpr": "(PROBLEM_STATE_INSTRUCTIONS / INSTRUCTIONS) * 100 if has_event(INSTRUCTIONS) else 0" }, { "BriefDescription": "Level One Miss per 100 Instructions", "MetricName": "l1mp", - "MetricExpr": "((L1I_DIR_WRITES + L1D_DIR_WRITES) / INSTRUCTIONS) * 100" + "MetricExpr": "((L1I_DIR_WRITES + L1D_DIR_WRITES) / INSTRUCTIONS) * 100 if has_event(INSTRUCTIONS) else 0" }, { "BriefDescription": "Percentage sourced from Level 2 cache", "MetricName": "l2p", - "MetricExpr": "((DCW_REQ + DCW_REQ_IV + ICW_REQ + ICW_REQ_IV) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100" + "MetricExpr": "((DCW_REQ + DCW_REQ_IV + ICW_REQ + ICW_REQ_IV) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100 if has_event(DCW_REQ) else 0" }, { "BriefDescription": "Percentage sourced from Level 3 on same chip cache", "MetricName": "l3p", - "MetricExpr": "((DCW_REQ_CHIP_HIT + DCW_ON_CHIP + DCW_ON_CHIP_IV + DCW_ON_CHIP_CHIP_HIT + ICW_REQ_CHIP_HIT + ICW_ON_CHIP + ICW_ON_CHIP_IV + ICW_ON_CHIP_CHIP_HIT) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100" + "MetricExpr": "((DCW_REQ_CHIP_HIT + DCW_ON_CHIP + DCW_ON_CHIP_IV + DCW_ON_CHIP_CHIP_HIT + ICW_REQ_CHIP_HIT + ICW_ON_CHIP + ICW_ON_CHIP_IV + ICW_ON_CHIP_CHIP_HIT) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100 if has_event(DCW_REQ_CHIP_HIT) else 0" }, { "BriefDescription": "Percentage sourced from Level 4 Local cache on same book", "MetricName": "l4lp", - "MetricExpr": "((DCW_REQ_DRAWER_HIT + DCW_ON_CHIP_DRAWER_HIT + DCW_ON_MODULE + DCW_ON_DRAWER + IDCW_ON_MODULE_IV + IDCW_ON_MODULE_CHIP_HIT + IDCW_ON_MODULE_DRAWER_HIT + IDCW_ON_DRAWER_IV + IDCW_ON_DRAWER_CHIP_HIT + IDCW_ON_DRAWER_DRAWER_HIT + ICW_REQ_DRAWER_HIT + ICW_ON_CHIP_DRAWER_HIT + ICW_ON_MODULE + ICW_ON_DRAWER) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100" + "MetricExpr": "((DCW_REQ_DRAWER_HIT + DCW_ON_CHIP_DRAWER_HIT + DCW_ON_MODULE + DCW_ON_DRAWER + IDCW_ON_MODULE_IV + IDCW_ON_MODULE_CHIP_HIT + IDCW_ON_MODULE_DRAWER_HIT + IDCW_ON_DRAWER_IV + IDCW_ON_DRAWER_CHIP_HIT + IDCW_ON_DRAWER_DRAWER_HIT + ICW_REQ_DRAWER_HIT + ICW_ON_CHIP_DRAWER_HIT + ICW_ON_MODULE + ICW_ON_DRAWER) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100 if has_event(DCW_REQ_DRAWER_HIT) else 0" }, { "BriefDescription": "Percentage sourced from Level 4 Remote cache on different book", "MetricName": "l4rp", - "MetricExpr": "((DCW_OFF_DRAWER + IDCW_OFF_DRAWER_IV + IDCW_OFF_DRAWER_CHIP_HIT + IDCW_OFF_DRAWER_DRAWER_HIT + ICW_OFF_DRAWER) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100" + "MetricExpr": "((DCW_OFF_DRAWER + IDCW_OFF_DRAWER_IV + IDCW_OFF_DRAWER_CHIP_HIT + IDCW_OFF_DRAWER_DRAWER_HIT + ICW_OFF_DRAWER) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100 if has_event(DCW_OFF_DRAWER) else 0" }, { "BriefDescription": "Percentage sourced from memory", "MetricName": "memp", - "MetricExpr": "((DCW_ON_CHIP_MEMORY + DCW_ON_MODULE_MEMORY + DCW_ON_DRAWER_MEMORY + DCW_OFF_DRAWER_MEMORY + ICW_ON_CHIP_MEMORY + ICW_ON_MODULE_MEMORY + ICW_ON_DRAWER_MEMORY + ICW_OFF_DRAWER_MEMORY) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100" + "MetricExpr": "((DCW_ON_CHIP_MEMORY + DCW_ON_MODULE_MEMORY + DCW_ON_DRAWER_MEMORY + DCW_OFF_DRAWER_MEMORY + ICW_ON_CHIP_MEMORY + ICW_ON_MODULE_MEMORY + ICW_ON_DRAWER_MEMORY + ICW_OFF_DRAWER_MEMORY) / (L1I_DIR_WRITES + L1D_DIR_WRITES)) * 100 if has_event(DCW_ON_CHIP_MEMORY) else 0" }, { "BriefDescription": "Cycles per Instructions from Finite cache/memory", "MetricName": "finite_cpi", - "MetricExpr": "L1C_TLB2_MISSES / INSTRUCTIONS" + "MetricExpr": "L1C_TLB2_MISSES / INSTRUCTIONS if has_event(L1C_TLB2_MISSES) else 0" }, { "BriefDescription": "Estimated Instruction Complexity CPI infinite Level 1", "MetricName": "est_cpi", - "MetricExpr": "(CPU_CYCLES / INSTRUCTIONS) - (L1C_TLB2_MISSES / INSTRUCTIONS)" + "MetricExpr": "(CPU_CYCLES / INSTRUCTIONS) - (L1C_TLB2_MISSES / INSTRUCTIONS) if has_event(INSTRUCTIONS) else 0" }, { "BriefDescription": "Estimated Sourcing Cycles per Level 1 Miss", "MetricName": "scpl1m", - "MetricExpr": "L1C_TLB2_MISSES / (L1I_DIR_WRITES + L1D_DIR_WRITES)" + "MetricExpr": "L1C_TLB2_MISSES / (L1I_DIR_WRITES + L1D_DIR_WRITES) if has_event(L1C_TLB2_MISSES) else 0" }, { "BriefDescription": "Estimated TLB CPU percentage of Total CPU", "MetricName": "tlb_percent", - "MetricExpr": "((DTLB2_MISSES + ITLB2_MISSES) / CPU_CYCLES) * (L1C_TLB2_MISSES / (L1I_PENALTY_CYCLES + L1D_PENALTY_CYCLES)) * 100" + "MetricExpr": "((DTLB2_MISSES + ITLB2_MISSES) / CPU_CYCLES) * (L1C_TLB2_MISSES / (L1I_PENALTY_CYCLES + L1D_PENALTY_CYCLES)) * 100 if has_event(CPU_CYCLES) else 0" }, { "BriefDescription": "Estimated Cycles per TLB Miss", "MetricName": "tlb_miss", - "MetricExpr": "((DTLB2_MISSES + ITLB2_MISSES) / (DTLB2_WRITES + ITLB2_WRITES)) * (L1C_TLB2_MISSES / (L1I_PENALTY_CYCLES + L1D_PENALTY_CYCLES))" + "MetricExpr": "((DTLB2_MISSES + ITLB2_MISSES) / (DTLB2_WRITES + ITLB2_WRITES)) * (L1C_TLB2_MISSES / (L1I_PENALTY_CYCLES + L1D_PENALTY_CYCLES)) if has_event(DTLB2_MISSES) else 0" } ] -- cgit v1.2.3 From 478a535ae54ad3831371904d93b5dfc403222e17 Mon Sep 17 00:00:00 2001 From: Sahil Siddiq Date: Fri, 5 Apr 2024 00:52:19 +0530 Subject: bpftool: Mount bpffs on provided dir instead of parent dir When pinning programs/objects under PATH (eg: during "bpftool prog loadall") the bpffs is mounted on the parent dir of PATH in the following situations: - the given dir exists but it is not bpffs. - the given dir doesn't exist and the parent dir is not bpffs. Mounting on the parent dir can also have the unintentional side- effect of hiding other files located under the parent dir. If the given dir exists but is not bpffs, then the bpffs should be mounted on the given dir and not its parent dir. Similarly, if the given dir doesn't exist and its parent dir is not bpffs, then the given dir should be created and the bpffs should be mounted on this new dir. Fixes: 2a36c26fe3b8 ("bpftool: Support bpffs mountpoint as pin path for prog loadall") Signed-off-by: Sahil Siddiq Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/2da44d24-74ae-a564-1764-afccf395eeec@isovalent.com/T/#t Link: https://lore.kernel.org/bpf/20240404192219.52373-1-icegambit91@gmail.com Closes: https://github.com/libbpf/bpftool/issues/100 Changes since v1: - Split "mount_bpffs_for_pin" into two functions. This is done to improve maintainability and readability. Changes since v2: - mount_bpffs_for_pin: rename to "create_and_mount_bpffs_dir". - mount_bpffs_given_file: rename to "mount_bpffs_given_file". - create_and_mount_bpffs_dir: - introduce "dir_exists" boolean. - remove new dir if "mnt_fs" fails. - improve error handling and error messages. Changes since v3: - Rectify function name. - Improve error messages and formatting. - mount_bpffs_for_file: - Check if dir exists before block_mount check. Changes since v4: - Use strdup instead of strcpy. - create_and_mount_bpffs_dir: - Use S_IRWXU instead of 0700. - Improve error handling and formatting. --- tools/bpf/bpftool/common.c | 96 ++++++++++++++++++++++++++++++++++++------ tools/bpf/bpftool/iter.c | 2 +- tools/bpf/bpftool/main.h | 3 +- tools/bpf/bpftool/prog.c | 5 ++- tools/bpf/bpftool/struct_ops.c | 2 +- 5 files changed, 92 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index cc6e6aae2447..958e92acca8e 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -244,29 +244,101 @@ int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type) return fd; } -int mount_bpffs_for_pin(const char *name, bool is_dir) +int create_and_mount_bpffs_dir(const char *dir_name) { char err_str[ERR_MAX_LEN]; - char *file; - char *dir; + bool dir_exists; int err = 0; - if (is_dir && is_bpffs(name)) + if (is_bpffs(dir_name)) return err; - file = malloc(strlen(name) + 1); - if (!file) { + dir_exists = access(dir_name, F_OK) == 0; + + if (!dir_exists) { + char *temp_name; + char *parent_name; + + temp_name = strdup(dir_name); + if (!temp_name) { + p_err("mem alloc failed"); + return -1; + } + + parent_name = dirname(temp_name); + + if (is_bpffs(parent_name)) { + /* nothing to do if already mounted */ + free(temp_name); + return err; + } + + if (access(parent_name, F_OK) == -1) { + p_err("can't create dir '%s' to pin BPF object: parent dir '%s' doesn't exist", + dir_name, parent_name); + free(temp_name); + return -1; + } + + free(temp_name); + } + + if (block_mount) { + p_err("no BPF file system found, not mounting it due to --nomount option"); + return -1; + } + + if (!dir_exists) { + err = mkdir(dir_name, S_IRWXU); + if (err) { + p_err("failed to create dir '%s': %s", dir_name, strerror(errno)); + return err; + } + } + + err = mnt_fs(dir_name, "bpf", err_str, ERR_MAX_LEN); + if (err) { + err_str[ERR_MAX_LEN - 1] = '\0'; + p_err("can't mount BPF file system on given dir '%s': %s", + dir_name, err_str); + + if (!dir_exists) + rmdir(dir_name); + } + + return err; +} + +int mount_bpffs_for_file(const char *file_name) +{ + char err_str[ERR_MAX_LEN]; + char *temp_name; + char *dir; + int err = 0; + + if (access(file_name, F_OK) != -1) { + p_err("can't pin BPF object: path '%s' already exists", file_name); + return -1; + } + + temp_name = strdup(file_name); + if (!temp_name) { p_err("mem alloc failed"); return -1; } - strcpy(file, name); - dir = dirname(file); + dir = dirname(temp_name); if (is_bpffs(dir)) /* nothing to do if already mounted */ goto out_free; + if (access(dir, F_OK) == -1) { + p_err("can't pin BPF object: dir '%s' doesn't exist", dir); + err = -1; + goto out_free; + } + if (block_mount) { p_err("no BPF file system found, not mounting it due to --nomount option"); err = -1; @@ -276,12 +348,12 @@ int mount_bpffs_for_pin(const char *name, bool is_dir) err = mnt_fs(dir, "bpf", err_str, ERR_MAX_LEN); if (err) { err_str[ERR_MAX_LEN - 1] = '\0'; - p_err("can't mount BPF file system to pin the object (%s): %s", - name, err_str); + p_err("can't mount BPF file system to pin the object '%s': %s", + file_name, err_str); } out_free: - free(file); + free(temp_name); return err; } @@ -289,7 +361,7 @@ int do_pin_fd(int fd, const char *name) { int err; - err = mount_bpffs_for_pin(name, false); + err = mount_bpffs_for_file(name); if (err) return err; diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index 6b0e5202ca7a..5c39c2ed36a2 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -76,7 +76,7 @@ static int do_pin(int argc, char **argv) goto close_obj; } - err = mount_bpffs_for_pin(path, false); + err = mount_bpffs_for_file(path); if (err) goto close_link; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index b8bb08d10dec..9eb764fe4cc8 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -142,7 +142,8 @@ const char *get_fd_type_name(enum bpf_obj_type type); char *get_fdinfo(int fd, const char *key); int open_obj_pinned(const char *path, bool quiet); int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type); -int mount_bpffs_for_pin(const char *name, bool is_dir); +int mount_bpffs_for_file(const char *file_name); +int create_and_mount_bpffs_dir(const char *dir_name); int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***)); int do_pin_fd(int fd, const char *name); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9cb42a3366c0..4c4cf16a40ba 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1778,7 +1778,10 @@ offload_dev: goto err_close_obj; } - err = mount_bpffs_for_pin(pinfile, !first_prog_only); + if (first_prog_only) + err = mount_bpffs_for_file(pinfile); + else + err = create_and_mount_bpffs_dir(pinfile); if (err) goto err_close_obj; diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index d573f2640d8e..aa43dead249c 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -515,7 +515,7 @@ static int do_register(int argc, char **argv) if (argc == 1) linkdir = GET_ARG(); - if (linkdir && mount_bpffs_for_pin(linkdir, true)) { + if (linkdir && create_and_mount_bpffs_dir(linkdir)) { p_err("can't mount bpffs for pinning"); return -1; } -- cgit v1.2.3 From f91717007217d975aa975ddabd91ae1a107b9bff Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 3 Apr 2024 14:33:03 +0200 Subject: bpf: Pack struct bpf_fib_lookup The struct bpf_fib_lookup is supposed to be of size 64. A recent commit 59b418c7063d ("bpf: Add a check for struct bpf_fib_lookup size") added a static assertion to check this property so that future changes to the structure will not accidentally break this assumption. As it immediately turned out, on some 32-bit arm systems, when AEABI=n, the total size of the structure was equal to 68, see [1]. This happened because the bpf_fib_lookup structure contains a union of two 16-bit fields: union { __u16 tot_len; __u16 mtu_result; }; which was supposed to compile to a 16-bit-aligned 16-bit field. On the aforementioned setups it was instead both aligned and padded to 32-bits. Declare this inner union as __attribute__((packed, aligned(2))) such that it always is of size 2 and is aligned to 16 bits. [1] https://lore.kernel.org/all/CA+G9fYtsoP51f-oP_Sp5MOq-Ffv8La2RztNpwvE6+R1VtFiLrw@mail.gmail.com/#t Reported-by: Naresh Kamboju Fixes: e1850ea9bd9e ("bpf: bpf_fib_lookup return MTU value as output when looked up") Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Reviewed-by: Alexander Lobakin Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240403123303.1452184-1-aspsk@isovalent.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c548276b6b..6fe9f11c8abe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7157,7 +7157,7 @@ struct bpf_fib_lookup { /* output: MTU value */ __u16 mtu_result; - }; + } __attribute__((packed, aligned(2))); /* input: L3 device index for lookup * output: device index from FIB lookup */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c548276b6b..6fe9f11c8abe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7157,7 +7157,7 @@ struct bpf_fib_lookup { /* output: MTU value */ __u16 mtu_result; - }; + } __attribute__((packed, aligned(2))); /* input: L3 device index for lookup * output: device index from FIB lookup */ -- cgit v1.2.3 From 343ca8131c35ba132d200fd9752b60e65357924d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 Apr 2024 14:45:36 -0700 Subject: selftests/bpf: add fp-leaking precise subprog result tests Add selftests validating that BPF verifier handles precision marking for SCALAR registers derived from r10 (fp) register correctly. Given `r0 = (s8)r10;` syntax is not supported by older Clang compilers, use the raw BPF instruction syntax to maximize compatibility. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404214536.3551295-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/include/linux/filter.h | 18 +++++ .../bpf/progs/verifier_subprog_precision.c | 89 ++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 736bdeccdfe4..65aa8ce142e5 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -111,6 +111,24 @@ .off = 0, \ .imm = IMM }) +/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ + +#define BPF_MOVSX64_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_MOVSX32_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6f5d19665cf6..4a58e0398e72 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -6,6 +6,7 @@ #include #include #include "bpf_misc.h" +#include <../../../tools/include/linux/filter.h> #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) @@ -76,6 +77,94 @@ __naked int subprog_result_precise(void) ); } +__naked __noinline __used +static unsigned long fp_leaking_subprog() +{ + asm volatile ( + ".8byte %[r0_eq_r10_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r10_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_10, 8)) + ); +} + +__naked __noinline __used +static unsigned long sneaky_fp_leaking_subprog() +{ + asm volatile ( + "r1 = r10;" + ".8byte %[r0_eq_r1_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r1_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_1, 8)) + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") +__msg("7: R0_w=scalar") +__naked int fp_precise_subprog_result(void) +{ + asm volatile ( + "call fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 11: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") +/* here r1 is marked precise, even though it's fp register, but that's fine + * because by the time we get out of subprogram it has to be derived from r10 + * anyways, at which point we'll break precision chain + */ +__msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") +__msg("7: R0_w=scalar") +__naked int sneaky_fp_precise_subprog_result(void) +{ + asm volatile ( + "call sneaky_fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r0") -- cgit v1.2.3 From 38ab60132b0d477e19d841daed701b491c20b465 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Apr 2024 14:08:03 -0700 Subject: perf script: Support 32bit code under 64bit OS with capstone Use the DSO to resolve whether an IP is 32bit or 64bit and use that to configure capstone to the correct mode. This allows to correctly disassemble 32bit code under a 64bit OS. % cat > loop.c volatile int var; int main(void) { int i; for (i = 0; i < 100000; i++) var++; } % gcc -m32 -o loop loop.c % perf record -e cycles:u ./loop % perf script -F +disasm loop 82665 1833176.618023: 1 cycles:u: f7eed500 _start+0x0 (/usr/lib/ld-linux.so.2) movl %esp, %eax loop 82665 1833176.618029: 1 cycles:u: f7eed500 _start+0x0 (/usr/lib/ld-linux.so.2) movl %esp, %eax loop 82665 1833176.618031: 7 cycles:u: f7eed500 _start+0x0 (/usr/lib/ld-linux.so.2) movl %esp, %eax loop 82665 1833176.618034: 91 cycles:u: f7eed500 _start+0x0 (/usr/lib/ld-linux.so.2) movl %esp, %eax loop 82665 1833176.618036: 1242 cycles:u: f7eed500 _start+0x0 (/usr/lib/ld-linux.so.2) movl %esp, %eax Reviewed-by: Adrian Hunter Acked-by: Thomas Richter Signed-off-by: Andi Kleen Link: https://lore.kernel.org/r/20240401210925.209671-2-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 9 +++++---- tools/perf/util/print_insn.c | 27 ++++++++++++++++++++++----- tools/perf/util/print_insn.h | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 6a274e27b108..a711bedace47 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1517,7 +1517,8 @@ void script_fetch_insn(struct perf_sample *sample, struct thread *thread, static int perf_sample__fprintf_insn(struct perf_sample *sample, struct perf_event_attr *attr, struct thread *thread, - struct machine *machine, FILE *fp) + struct machine *machine, FILE *fp, + struct addr_location *al) { int printed = 0; @@ -1531,7 +1532,7 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample, } if (PRINT_FIELD(DISASM) && sample->insn_len) { printed += fprintf(fp, "\t\t"); - printed += sample__fprintf_insn_asm(sample, thread, machine, fp); + printed += sample__fprintf_insn_asm(sample, thread, machine, fp, al); } if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN)) printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp); @@ -1606,7 +1607,7 @@ static int perf_sample__fprintf_bts(struct perf_sample *sample, if (print_srcline_last) printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); - printed += perf_sample__fprintf_insn(sample, attr, thread, machine, fp); + printed += perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); printed += fprintf(fp, "\n"); if (PRINT_FIELD(SRCCODE)) { int ret = map__fprintf_srccode(al->map, al->addr, stdout, @@ -2259,7 +2260,7 @@ static void process_event(struct perf_script *script, if (evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) perf_sample__fprintf_bpf_output(sample, fp); - perf_sample__fprintf_insn(sample, attr, thread, machine, fp); + perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); if (PRINT_FIELD(PHYS_ADDR)) fprintf(fp, "%16" PRIx64, sample->phys_addr); diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index 459e0e93d7b1..32dc9dad9cf2 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -12,6 +12,8 @@ #include "machine.h" #include "thread.h" #include "print_insn.h" +#include "map.h" +#include "dso.h" size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) { @@ -28,12 +30,12 @@ size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) #ifdef HAVE_LIBCAPSTONE_SUPPORT #include -static int capstone_init(struct machine *machine, csh *cs_handle) +static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) { cs_arch arch; cs_mode mode; - if (machine__is(machine, "x86_64")) { + if (machine__is(machine, "x86_64") && is64) { arch = CS_ARCH_X86; mode = CS_MODE_64; } else if (machine__normalized_is(machine, "x86")) { @@ -93,17 +95,31 @@ static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread, return printed; } +static bool is64bitip(struct machine *machine, struct addr_location *al) +{ + const struct dso *dso = al->map ? map__dso(al->map) : NULL; + + if (dso) + return dso->is_64_bit; + + return machine__is(machine, "x86_64") || + machine__normalized_is(machine, "arm64") || + machine__normalized_is(machine, "s390"); +} + size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, - struct machine *machine, FILE *fp) + struct machine *machine, FILE *fp, + struct addr_location *al) { csh cs_handle; cs_insn *insn; size_t count; size_t printed = 0; int ret; + bool is64bit = is64bitip(machine, al); /* TODO: Try to initiate capstone only once but need a proper place. */ - ret = capstone_init(machine, &cs_handle); + ret = capstone_init(machine, &cs_handle, is64bit); if (ret < 0) { /* fallback */ return sample__fprintf_insn_raw(sample, fp); @@ -128,7 +144,8 @@ size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *threa size_t sample__fprintf_insn_asm(struct perf_sample *sample __maybe_unused, struct thread *thread __maybe_unused, struct machine *machine __maybe_unused, - FILE *fp __maybe_unused) + FILE *fp __maybe_unused, + struct addr_location *al __maybe_unused) { return 0; } diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index 465bdcfcc2fd..6447dd41b543 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -10,7 +10,7 @@ struct thread; struct machine; size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, - struct machine *machine, FILE *fp); + struct machine *machine, FILE *fp, struct addr_location *al); size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp); #endif /* PERF_PRINT_INSN_H */ -- cgit v1.2.3 From d812044688dfe73e1b309689d58d06f50a15e618 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 Apr 2024 14:08:04 -0700 Subject: perf script: Add capstone support for '-F +brstackdisasm' Support capstone output for the '-F +brstackinsn' branch dump. The new output is enabled with the new field 'brstackdisasm'. This was possible before with --xed, but now also allow it for users that don't have xed using the builtin capstone support. Before: perf record -b emacs -Q --batch '()' perf script -F +brstackinsn ... emacs 55778 1814366.755945: 151564 cycles:P: 7f0ab2d17192 intel_check_word.constprop.0+0x162 (/usr/lib64/ld-linux-x86-64.s> intel_check_word.constprop.0+237: 00007f0ab2d1711d insn: 75 e6 # PRED 3 cycles [3] 00007f0ab2d17105 insn: 73 51 00007f0ab2d17107 insn: 48 89 c1 00007f0ab2d1710a insn: 48 39 ca 00007f0ab2d1710d insn: 73 96 00007f0ab2d1710f insn: 48 8d 04 11 00007f0ab2d17113 insn: 48 d1 e8 00007f0ab2d17116 insn: 49 8d 34 c1 00007f0ab2d1711a insn: 44 3a 06 00007f0ab2d1711d insn: 75 e6 # PRED 3 cycles [6] 3.00 IPC 00007f0ab2d17105 insn: 73 51 # PRED 1 cycles [7] 1.00 IPC 00007f0ab2d17158 insn: 48 8d 50 01 00007f0ab2d1715c insn: eb 92 # PRED 1 cycles [8] 2.00 IPC 00007f0ab2d170f0 insn: 48 39 ca 00007f0ab2d170f3 insn: 73 b0 # PRED 1 cycles [9] 2.00 IPC After (perf must be compiled with capstone): perf script -F +brstackdisasm ... emacs 55778 1814366.755945: 151564 cycles:P: 7f0ab2d17192 intel_check_word.constprop.0+0x162 (/usr/lib64/ld-linux-x86-64.s> intel_check_word.constprop.0+237: 00007f0ab2d1711d jne intel_check_word.constprop.0+0xd5 # PRED 3 cycles [3] 00007f0ab2d17105 jae intel_check_word.constprop.0+0x128 00007f0ab2d17107 movq %rax, %rcx 00007f0ab2d1710a cmpq %rcx, %rdx 00007f0ab2d1710d jae intel_check_word.constprop.0+0x75 00007f0ab2d1710f leaq (%rcx, %rdx), %rax 00007f0ab2d17113 shrq $1, %rax 00007f0ab2d17116 leaq (%r9, %rax, 8), %rsi 00007f0ab2d1711a cmpb (%rsi), %r8b 00007f0ab2d1711d jne intel_check_word.constprop.0+0xd5 # PRED 3 cycles [6] 3.00 IPC 00007f0ab2d17105 jae intel_check_word.constprop.0+0x128 # PRED 1 cycles [7] 1.00 IPC 00007f0ab2d17158 leaq 1(%rax), %rdx 00007f0ab2d1715c jmp intel_check_word.constprop.0+0xc0 # PRED 1 cycles [8] 2.00 IPC 00007f0ab2d170f0 cmpq %rcx, %rdx 00007f0ab2d170f3 jae intel_check_word.constprop.0+0x75 # PRED 1 cycles [9] 2.00 IPC Reviewed-by: Adrian Hunter Signed-off-by: Andi Kleen Link: https://lore.kernel.org/r/20240401210925.209671-3-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 7 +++-- tools/perf/builtin-script.c | 32 +++++++++++++++----- tools/perf/util/dump-insn.h | 1 + tools/perf/util/print_insn.c | 52 ++++++++++++++++++++++++++++++++ tools/perf/util/print_insn.h | 3 ++ 5 files changed, 86 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 005e51df855e..ff086ef05a0c 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -132,9 +132,9 @@ OPTIONS Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, dsoff, addr, symoff, srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, - brstackinsn, brstackinsnlen, brstackoff, callindent, insn, disasm, + brstackinsn, brstackinsnlen, brstackdisasm, brstackoff, callindent, insn, disasm, insnlen, synth, phys_addr, metric, misc, srccode, ipc, data_page_size, - code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat. + code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat, Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. @@ -257,6 +257,9 @@ OPTIONS can’t know the next sequential instruction after an unconditional branch unless you calculate that based on its length. + brstackdisasm acts like brstackinsn, but will print disassembled instructions if + perf is built with the capstone library. + The brstackoff field will print an offset into a specific dso/binary. With the metric option perf script can compute metrics for diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index a711bedace47..dd10f158ed0c 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -136,6 +136,7 @@ enum perf_output_field { PERF_OUTPUT_RETIRE_LAT = 1ULL << 40, PERF_OUTPUT_DSOFF = 1ULL << 41, PERF_OUTPUT_DISASM = 1ULL << 42, + PERF_OUTPUT_BRSTACKDISASM = 1ULL << 43, }; struct perf_script { @@ -210,6 +211,7 @@ struct output_option { {.str = "vcpu", .field = PERF_OUTPUT_VCPU}, {.str = "cgroup", .field = PERF_OUTPUT_CGROUP}, {.str = "retire_lat", .field = PERF_OUTPUT_RETIRE_LAT}, + {.str = "brstackdisasm", .field = PERF_OUTPUT_BRSTACKDISASM}, }; enum { @@ -510,7 +512,8 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) "selected. Hence, no address to lookup the source line number.\n"); return -EINVAL; } - if ((PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN)) && !allow_user_set && + if ((PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN) || PRINT_FIELD(BRSTACKDISASM)) + && !allow_user_set && !(evlist__combined_branch_type(session->evlist) & PERF_SAMPLE_BRANCH_ANY)) { pr_err("Display of branch stack assembler requested, but non all-branch filter set\n" "Hint: run 'perf record -b ...'\n"); @@ -1162,6 +1165,20 @@ out: return ret; } +static const char *any_dump_insn(struct perf_event_attr *attr __maybe_unused, + struct perf_insn *x, uint64_t ip, + u8 *inbuf, int inlen, int *lenp) +{ +#ifdef HAVE_LIBCAPSTONE_SUPPORT + if (PRINT_FIELD(BRSTACKDISASM)) { + const char *p = cs_dump_insn(x, ip, inbuf, inlen, lenp); + if (p) + return p; + } +#endif + return dump_insn(x, ip, inbuf, inlen, lenp); +} + static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct perf_insn *x, u8 *inbuf, int len, int insn, FILE *fp, int *total_cycles, @@ -1170,7 +1187,7 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, { int ilen = 0; int printed = fprintf(fp, "\t%016" PRIx64 "\t%-30s\t", ip, - dump_insn(x, ip, inbuf, len, &ilen)); + any_dump_insn(attr, x, ip, inbuf, len, &ilen)); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "ilen: %d\t", ilen); @@ -1262,6 +1279,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, nr = max_blocks + 1; x.thread = thread; + x.machine = machine; x.cpu = sample->cpu; printed += fprintf(fp, "%c", '\n'); @@ -1313,7 +1331,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, } else { ilen = 0; printed += fprintf(fp, "\t%016" PRIx64 "\t%s", ip, - dump_insn(&x, ip, buffer + off, len - off, &ilen)); + any_dump_insn(attr, &x, ip, buffer + off, len - off, &ilen)); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1361,7 +1379,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, goto out; ilen = 0; printed += fprintf(fp, "\t%016" PRIx64 "\t%s", sample->ip, - dump_insn(&x, sample->ip, buffer, len, &ilen)); + any_dump_insn(attr, &x, sample->ip, buffer, len, &ilen)); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1372,7 +1390,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, for (off = 0; off <= end - start; off += ilen) { ilen = 0; printed += fprintf(fp, "\t%016" PRIx64 "\t%s", start + off, - dump_insn(&x, start + off, buffer + off, len - off, &ilen)); + any_dump_insn(attr, &x, start + off, buffer + off, len - off, &ilen)); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1534,7 +1552,7 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample, printed += fprintf(fp, "\t\t"); printed += sample__fprintf_insn_asm(sample, thread, machine, fp, al); } - if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN)) + if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN) || PRINT_FIELD(BRSTACKDISASM)) printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp); return printed; @@ -3940,7 +3958,7 @@ int cmd_script(int argc, const char **argv) "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,dsoff," "addr,symoff,srcline,period,iregs,uregs,brstack," "brstacksym,flags,data_src,weight,bpf-output,brstackinsn," - "brstackinsnlen,brstackoff,callindent,insn,disasm,insnlen,synth," + "brstackinsnlen,brstackdisasm,brstackoff,callindent,insn,disasm,insnlen,synth," "phys_addr,metric,misc,srccode,ipc,tod,data_page_size," "code_page_size,ins_lat,machine_pid,vcpu,cgroup,retire_lat", parse_output_fields), diff --git a/tools/perf/util/dump-insn.h b/tools/perf/util/dump-insn.h index 650125061530..4a7797dd6d09 100644 --- a/tools/perf/util/dump-insn.h +++ b/tools/perf/util/dump-insn.h @@ -11,6 +11,7 @@ struct thread; struct perf_insn { /* Initialized by callers: */ struct thread *thread; + struct machine *machine; u8 cpumode; bool is64bit; int cpu; diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index 32dc9dad9cf2..8825330d435f 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -12,6 +12,7 @@ #include "machine.h" #include "thread.h" #include "print_insn.h" +#include "dump-insn.h" #include "map.h" #include "dso.h" @@ -71,6 +72,57 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) return 0; } +static void dump_insn_x86(struct thread *thread, cs_insn *insn, struct perf_insn *x) +{ + struct addr_location al; + bool printed = false; + + if (insn->detail && insn->detail->x86.op_count == 1) { + cs_x86_op *op = &insn->detail->x86.operands[0]; + + addr_location__init(&al); + if (op->type == X86_OP_IMM && + thread__find_symbol(thread, x->cpumode, op->imm, &al) && + al.sym && + al.addr < al.sym->end) { + snprintf(x->out, sizeof(x->out), "%s %s+%#" PRIx64 " [%#" PRIx64 "]", insn[0].mnemonic, + al.sym->name, al.addr - al.sym->start, op->imm); + printed = true; + } + addr_location__exit(&al); + } + + if (!printed) + snprintf(x->out, sizeof(x->out), "%s %s", insn[0].mnemonic, insn[0].op_str); +} + +const char *cs_dump_insn(struct perf_insn *x, uint64_t ip, + u8 *inbuf, int inlen, int *lenp) +{ + int ret; + int count; + cs_insn *insn; + csh cs_handle; + + ret = capstone_init(x->machine, &cs_handle, x->is64bit); + if (ret < 0) + return NULL; + + count = cs_disasm(cs_handle, (uint8_t *)inbuf, inlen, ip, 1, &insn); + if (count > 0) { + if (machine__normalized_is(x->machine, "x86")) + dump_insn_x86(x->thread, &insn[0], x); + else + snprintf(x->out, sizeof(x->out), "%s %s", + insn[0].mnemonic, insn[0].op_str); + *lenp = insn->size; + cs_free(insn, count); + } else { + return NULL; + } + return x->out; +} + static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread, cs_insn *insn, FILE *fp) { diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index 6447dd41b543..c2a6391a45ce 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -8,9 +8,12 @@ struct perf_sample; struct thread; struct machine; +struct perf_insn; size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, struct machine *machine, FILE *fp, struct addr_location *al); size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp); +const char *cs_dump_insn(struct perf_insn *x, uint64_t ip, + u8 *inbuf, int inlen, int *lenp); #endif /* PERF_PRINT_INSN_H */ -- cgit v1.2.3 From fecb1597cc11a23f32faa90d70a199533871686a Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:36 +0800 Subject: selftests/bpf: add test for bpf_for_each_map_elem() with different maps A test is added for bpf_for_each_map_elem() with either an arraymap or a hashmap. $ tools/testing/selftests/bpf/test_progs -t for_each #93/1 for_each/hash_map:OK #93/2 for_each/array_map:OK #93/3 for_each/write_map_key:OK #93/4 for_each/multi_maps:OK #93 for_each:OK Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-4-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/for_each.c | 62 ++++++++++++++++++++++ .../selftests/bpf/progs/for_each_multi_maps.c | 49 +++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/for_each_multi_maps.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 8963f8a549f2..09f6487f58b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -5,6 +5,7 @@ #include "for_each_hash_map_elem.skel.h" #include "for_each_array_map_elem.skel.h" #include "for_each_map_elem_write_key.skel.h" +#include "for_each_multi_maps.skel.h" static unsigned int duration; @@ -143,6 +144,65 @@ static void test_write_map_key(void) for_each_map_elem_write_key__destroy(skel); } +static void test_multi_maps(void) +{ + struct for_each_multi_maps *skel; + __u64 val, array_total, hash_total; + __u32 key, max_entries; + int i, err; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + skel = for_each_multi_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_multi_maps__open_and_load")) + return; + + array_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + array_total += val; + err = bpf_map__update_elem(skel->maps.arraymap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "array_map_update")) + goto out; + } + + hash_total = 0; + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i + 100; + val = i + 1; + hash_total += val; + err = bpf_map__update_elem(skel->maps.hashmap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "hash_map_update")) + goto out; + } + + skel->bss->data_output = 0; + skel->bss->use_array = 1; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, array_total, "array output"); + + skel->bss->data_output = 0; + skel->bss->use_array = 0; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, hash_total, "hash output"); + +out: + for_each_multi_maps__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) @@ -151,4 +211,6 @@ void test_for_each(void) test_array_map(); if (test__start_subtest("write_map_key")) test_write_map_key(); + if (test__start_subtest("multi_maps")) + test_multi_maps(); } diff --git a/tools/testing/selftests/bpf/progs/for_each_multi_maps.c b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c new file mode 100644 index 000000000000..ff0bed7d4459 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 5); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct callback_ctx { + int output; +}; + +u32 data_output = 0; +int use_array = 0; + +static __u64 +check_map_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + return 0; +} + +SEC("tc") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + if (use_array) + bpf_for_each_map_elem(&arraymap, check_map_elem, &data, 0); + else + bpf_for_each_map_elem(&hashmap, check_map_elem, &data, 0); + data_output = data.output; + + return 0; +} -- cgit v1.2.3 From ba0cbe2bb4ab8aa266e48c6399bebf6e1217828a Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 4 Apr 2024 16:23:42 -0700 Subject: selftests/bpf: Make sure libbpf doesn't enforce the signature of a func pointer. The verifier in the kernel ensures that the struct_ops operators behave correctly by checking that they access parameters and context appropriately. The verifier will approve a program as long as it correctly accesses the context/parameters, regardless of its function signature. In contrast, libbpf should not verify the signature of function pointers and functions to enable flexibility in loading various implementations of an operator even if the signature of the function pointer does not match those in the implementations or the kernel. With this flexibility, user space applications can adapt to different kernel versions by loading a specific implementation of an operator based on feature detection. This is a follow-up of the commit c911fc61a7ce ("libbpf: Skip zeroed or null fields if not found in the kernel type.") Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240404232342.991414-1-thinker.li@gmail.com --- .../bpf/prog_tests/test_struct_ops_module.c | 24 ++++++++++++++++++++++ .../selftests/bpf/progs/struct_ops_module.c | 13 ++++++++++++ 2 files changed, 37 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index 098776d00ab4..7cf2b9ddd3e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -138,11 +138,35 @@ static void test_struct_ops_not_zeroed(void) struct_ops_module__destroy(skel); } +/* The signature of an implementation might not match the signature of the + * function pointer prototype defined in the BPF program. This mismatch + * should be allowed as long as the behavior of the operator program + * adheres to the signature in the kernel. Libbpf should not enforce the + * signature; rather, let the kernel verifier handle the enforcement. + */ +static void test_struct_ops_incompatible(void) +{ + struct struct_ops_module *skel; + struct bpf_link *link; + + skel = struct_ops_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible); + if (ASSERT_OK_PTR(link, "attach_struct_ops")) + bpf_link__destroy(link); + + struct_ops_module__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) test_struct_ops_load(); if (test__start_subtest("test_struct_ops_not_zeroed")) test_struct_ops_not_zeroed(); + if (test__start_subtest("test_struct_ops_incompatible")) + test_struct_ops_incompatible(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 86e1e50c5531..63b065dae002 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -68,3 +68,16 @@ struct bpf_testmod_ops___zeroed testmod_zeroed = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, }; + +struct bpf_testmod_ops___incompatible { + int (*test_1)(void); + void (*test_2)(int *a); + int data; +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops___incompatible testmod_incompatible = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2, + .data = 3, +}; -- cgit v1.2.3 From 1bc724af00cc48ef03e3fa6d7a2f6731ac915c37 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Apr 2024 09:30:41 -0500 Subject: selftests/bpf: Verify calling core kfuncs from BPF_PROG_TYPE_SYCALL Now that we can call some kfuncs from BPF_PROG_TYPE_SYSCALL progs, let's add some selftests that verify as much. As a bonus, let's also verify that we can't call the progs from raw tracepoints. Do do this, we add a new selftest suite called verifier_kfunc_prog_types. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240405143041.632519-3-void@manifault.com --- .../bpf/prog_tests/verifier_kfunc_prog_types.c | 11 ++ .../selftests/bpf/progs/cgrp_kfunc_common.h | 2 +- .../selftests/bpf/progs/task_kfunc_common.h | 2 +- .../bpf/progs/verifier_kfunc_prog_types.c | 122 +++++++++++++++++++++ 4 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..3918ecc2ee91 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include + +#include "verifier_kfunc_prog_types.skel.h" + +void test_verifier_kfunc_prog_types(void) +{ + RUN_TESTS(verifier_kfunc_prog_types); +} diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index 22914a70db54..73ba32e9a693 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -13,7 +13,7 @@ struct __cgrps_kfunc_map_value { struct cgroup __kptr * cgrp; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __cgrps_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index 41f2d44f49cb..6720c4b5be41 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -13,7 +13,7 @@ struct __tasks_kfunc_map_value { struct task_struct __kptr * task; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __tasks_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..cb32b0cfc84b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "cgrp_kfunc_common.h" +#include "cpumask_common.h" +#include "task_kfunc_common.h" + +char _license[] SEC("license") = "GPL"; + +/*************** + * Task kfuncs * + ***************/ + +static void task_kfunc_load_test(void) +{ + struct task_struct *current, *ref_1, *ref_2; + + current = bpf_get_current_task_btf(); + ref_1 = bpf_task_from_pid(current->pid); + if (!ref_1) + return; + + ref_2 = bpf_task_acquire(ref_1); + if (ref_2) + bpf_task_release(ref_2); + bpf_task_release(ref_1); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(task_kfunc_raw_tp) +{ + task_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(task_kfunc_syscall) +{ + task_kfunc_load_test(); + return 0; +} + +/***************** + * cgroup kfuncs * + *****************/ + +static void cgrp_kfunc_load_test(void) +{ + struct cgroup *cgrp, *ref; + + cgrp = bpf_cgroup_from_id(0); + if (!cgrp) + return; + + ref = bpf_cgroup_acquire(cgrp); + if (!ref) { + bpf_cgroup_release(cgrp); + return; + } + + bpf_cgroup_release(ref); + bpf_cgroup_release(cgrp); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cgrp_kfunc_raw_tp) +{ + cgrp_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cgrp_kfunc_syscall) +{ + cgrp_kfunc_load_test(); + return 0; +} + +/****************** + * cpumask kfuncs * + ******************/ + +static void cpumask_kfunc_load_test(void) +{ + struct bpf_cpumask *alloc, *ref; + + alloc = bpf_cpumask_create(); + if (!alloc) + return; + + ref = bpf_cpumask_acquire(alloc); + bpf_cpumask_set_cpu(0, alloc); + bpf_cpumask_test_cpu(0, (const struct cpumask *)ref); + + bpf_cpumask_release(ref); + bpf_cpumask_release(alloc); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cpumask_kfunc_raw_tp) +{ + cpumask_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cpumask_kfunc_syscall) +{ + cpumask_kfunc_load_test(); + return 0; +} -- cgit v1.2.3 From 2e0e148c727061009d3db5f436f51890bbb49a80 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Wed, 3 Apr 2024 14:28:45 -0700 Subject: tools: ynl: ethtool.py: Output timestamping statistics from tsinfo-get operation Print the nested stats attribute containing timestamping statistics when the --show-time-stamping flag is used. [root@binary-eater-vm-01 linux-ethtool-ts]# ./tools/net/ynl/ethtool.py --show-time-stamping mlx5_1 Time stamping parameters for mlx5_1: Capabilities: hardware-transmit hardware-receive hardware-raw-clock PTP Hardware Clock: 0 Hardware Transmit Timestamp Modes: off on Hardware Receive Filter Modes: none all Statistics: tx-pkts: 8 tx-lost: 0 tx-err: 0 Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Link: https://lore.kernel.org/r/20240403212931.128541-8-rrameshbabu@nvidia.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ethtool.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/ethtool.py index 44ba3ba58ed9..63c471f075ab 100755 --- a/tools/net/ynl/ethtool.py +++ b/tools/net/ynl/ethtool.py @@ -324,7 +324,13 @@ def main(): return if args.show_time_stamping: - tsinfo = dumpit(ynl, args, 'tsinfo-get') + req = { + 'header': { + 'flags': 'stats', + }, + } + + tsinfo = dumpit(ynl, args, 'tsinfo-get', req) print(f'Time stamping parameters for {args.device}:') @@ -338,6 +344,9 @@ def main(): print('Hardware Receive Filter Modes:') [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + + print('Statistics:') + [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] return print(f'Settings for {args.device}:') -- cgit v1.2.3 From aa6485d813ad6d884d23e1e9cf3913be29a1f229 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 4 Apr 2024 14:31:12 +0800 Subject: ynl: rename array-nest to indexed-array Some implementations, like bonding, has nest array with same attr type. To support all kinds of entries under one nest array. As discussed[1], let's rename array-nest to indexed-array, and assuming the value is a nest by passing the type via sub-type. [1] https://lore.kernel.org/netdev/20240312100105.16a59086@kernel.org/ Suggested-by: Jakub Kicinski Signed-off-by: Hangbin Liu Link: https://lore.kernel.org/r/20240404063114.1221532-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/genetlink-c.yaml | 2 +- Documentation/netlink/genetlink-legacy.yaml | 2 +- Documentation/netlink/genetlink.yaml | 2 +- Documentation/netlink/netlink-raw.yaml | 2 +- Documentation/netlink/specs/nlctrl.yaml | 6 ++++-- Documentation/netlink/specs/rt_link.yaml | 3 ++- Documentation/netlink/specs/tc.yaml | 21 ++++++++++++++------- .../userspace-api/netlink/genetlink-legacy.rst | 12 +++++++++--- tools/net/ynl/lib/ynl.py | 13 ++++++++----- tools/net/ynl/ynl-gen-c.py | 18 ++++++++++++------ 10 files changed, 53 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 4dfd899a1661..4f803eaac6d8 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -158,7 +158,7 @@ properties: type: &attr-type enum: [ unused, pad, flag, binary, uint, sint, u8, u16, u32, u64, s32, s64, - string, nest, array-nest, nest-type-value ] + string, nest, indexed-array, nest-type-value ] doc: description: Documentation of the attribute. type: string diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index b48ad3b1cc32..8db0e22fa72c 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -201,7 +201,7 @@ properties: description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, uint, sint, u8, u16, u32, u64, s32, s64, - string, nest, array-nest, nest-type-value ] + string, nest, indexed-array, nest-type-value ] doc: description: Documentation of the attribute. type: string diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml index ebd6ee743fcc..b036227b46f1 100644 --- a/Documentation/netlink/genetlink.yaml +++ b/Documentation/netlink/genetlink.yaml @@ -124,7 +124,7 @@ properties: type: &attr-type enum: [ unused, pad, flag, binary, uint, sint, u8, u16, u32, u64, s32, s64, - string, nest, array-nest, nest-type-value ] + string, nest, indexed-array, nest-type-value ] doc: description: Documentation of the attribute. type: string diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index a76e54cbadbc..914aa1c0a273 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -222,7 +222,7 @@ properties: description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, u8, u16, u32, u64, s8, s16, s32, s64, - string, nest, array-nest, nest-type-value, + string, nest, indexed-array, nest-type-value, sub-message ] doc: description: Documentation of the attribute. diff --git a/Documentation/netlink/specs/nlctrl.yaml b/Documentation/netlink/specs/nlctrl.yaml index b1632b95f725..a36535350bdb 100644 --- a/Documentation/netlink/specs/nlctrl.yaml +++ b/Documentation/netlink/specs/nlctrl.yaml @@ -65,11 +65,13 @@ attribute-sets: type: u32 - name: ops - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: op-attrs - name: mcast-groups - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: mcast-group-attrs - name: policy diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 81a5a3d1b04d..e5dcb2cf1724 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -1690,7 +1690,8 @@ attribute-sets: type: binary - name: hw-s-info - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: hw-s-info-one - name: l3-stats diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml index 6068c105c5ee..8c01e4e13195 100644 --- a/Documentation/netlink/specs/tc.yaml +++ b/Documentation/netlink/specs/tc.yaml @@ -1988,7 +1988,8 @@ attribute-sets: nested-attributes: tc-ematch-attrs - name: act - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: police @@ -2128,7 +2129,8 @@ attribute-sets: type: u32 - name: tin-stats - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: tc-cake-tin-stats-attrs - name: deficit @@ -2348,7 +2350,8 @@ attribute-sets: type: string - name: act - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: key-eth-dst @@ -2849,7 +2852,8 @@ attribute-sets: type: string - name: act - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: mask @@ -3002,7 +3006,8 @@ attribute-sets: type: u32 - name: act - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: flags @@ -3375,7 +3380,8 @@ attribute-sets: nested-attributes: tc-police-attrs - name: act - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: tc-taprio-attrs @@ -3593,7 +3599,8 @@ attribute-sets: nested-attributes: tc-police-attrs - name: act - type: array-nest + type: indexed-array + sub-type: nest nested-attributes: tc-act-attrs - name: indev diff --git a/Documentation/userspace-api/netlink/genetlink-legacy.rst b/Documentation/userspace-api/netlink/genetlink-legacy.rst index 70a77387f6c4..54e8fb25e093 100644 --- a/Documentation/userspace-api/netlink/genetlink-legacy.rst +++ b/Documentation/userspace-api/netlink/genetlink-legacy.rst @@ -46,10 +46,16 @@ For reference the ``multi-attr`` array may look like this:: where ``ARRAY-ATTR`` is the array entry type. -array-nest -~~~~~~~~~~ +indexed-array +~~~~~~~~~~~~~ + +``indexed-array`` wraps the entire array in an extra attribute (hence +limiting its size to 64kB). The ``ENTRY`` nests are special and have the +index of the entry as their type instead of normal attribute type. -``array-nest`` creates the following structure:: +A ``sub-type`` is needed to describe what type in the ``ENTRY``. A ``nest`` +``sub-type`` means there are nest arrays in the ``ENTRY``, with the structure +looks like:: [SOME-OTHER-ATTR] [ARRAY-ATTR] diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index b30210f537f7..d671efea808a 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -631,15 +631,18 @@ class YnlFamily(SpecFamily): decoded = self._formatted_string(decoded, attr_spec.display_hint) return decoded - def _decode_array_nest(self, attr, attr_spec): + def _decode_array_attr(self, attr, attr_spec): decoded = [] offset = 0 while offset < len(attr.raw): item = NlAttr(attr.raw, offset) offset += item.full_len - subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) - decoded.append({ item.type: subattrs }) + if attr_spec["sub-type"] == 'nest': + subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) + decoded.append({ item.type: subattrs }) + else: + raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded def _decode_nest_type_value(self, attr, attr_spec): @@ -733,8 +736,8 @@ class YnlFamily(SpecFamily): decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order) if 'enum' in attr_spec: decoded = self._decode_enum(decoded, attr_spec) - elif attr_spec["type"] == 'array-nest': - decoded = self._decode_array_nest(attr, attr_spec) + elif attr_spec["type"] == 'indexed-array': + decoded = self._decode_array_attr(attr, attr_spec) elif attr_spec["type"] == 'bitfield32': value, selector = struct.unpack("II", attr.raw) if 'enum' in attr_spec: diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index a451cbfbd781..c0b90c104d92 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -841,8 +841,11 @@ class AttrSet(SpecAttrSet): t = TypeBitfield32(self.family, self, elem, value) elif elem['type'] == 'nest': t = TypeNest(self.family, self, elem, value) - elif elem['type'] == 'array-nest': - t = TypeArrayNest(self.family, self, elem, value) + elif elem['type'] == 'indexed-array' and 'sub-type' in elem: + if elem["sub-type"] == 'nest': + t = TypeArrayNest(self.family, self, elem, value) + else: + raise Exception(f'new_attr: unsupported sub-type {elem["sub-type"]}') elif elem['type'] == 'nest-type-value': t = TypeNestTypeValue(self.family, self, elem, value) else: @@ -1055,7 +1058,7 @@ class Family(SpecFamily): if nested in self.root_sets: raise Exception("Inheriting members to a space used as root not supported") inherit.update(set(spec['type-value'])) - elif spec['type'] == 'array-nest': + elif spec['type'] == 'indexed-array': inherit.add('idx') self.pure_nested_structs[nested].set_inherited(inherit) @@ -1619,9 +1622,12 @@ def _multi_parse(ri, struct, init_lines, local_vars): multi_attrs = set() needs_parg = False for arg, aspec in struct.member_list(): - if aspec['type'] == 'array-nest': - local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') - array_nests.add(arg) + if aspec['type'] == 'indexed-array' and 'sub-type' in aspec: + if aspec["sub-type"] == 'nest': + local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') + array_nests.add(arg) + else: + raise Exception(f'Not supported sub-type {aspec["sub-type"]}') if 'multi-attr' in aspec: multi_attrs.add(arg) needs_parg |= 'nested-attributes' in aspec -- cgit v1.2.3 From a7408b56e5f9e96c486cec9aaf7490452b713ebf Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 4 Apr 2024 14:31:13 +0800 Subject: ynl: support binary and integer sub-type for indexed-array Add binary and integer sub-type support for indexed-array to display bond arp and ns targets. Here is what the result looks like: # ip link add bond0 type bond mode 1 \ arp_ip_target 192.168.1.1,192.168.1.2 ns_ip6_target 2001::1,2001::2 # ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/rt_link.yaml \ --do getlink --json '{"ifname": "bond0"}' --output-json | jq '.linkinfo' "arp-ip-target": [ "192.168.1.1", "192.168.1.2" ], [...] "ns-ip6-target": [ "2001::1", "2001::2" ], Signed-off-by: Hangbin Liu Link: https://lore.kernel.org/r/20240404063114.1221532-3-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/userspace-api/netlink/genetlink-legacy.rst | 10 +++++++--- tools/net/ynl/lib/ynl.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/Documentation/userspace-api/netlink/genetlink-legacy.rst b/Documentation/userspace-api/netlink/genetlink-legacy.rst index 54e8fb25e093..fa005989193a 100644 --- a/Documentation/userspace-api/netlink/genetlink-legacy.rst +++ b/Documentation/userspace-api/netlink/genetlink-legacy.rst @@ -66,9 +66,13 @@ looks like:: [MEMBER1] [MEMBER2] -It wraps the entire array in an extra attribute (hence limiting its size -to 64kB). The ``ENTRY`` nests are special and have the index of the entry -as their type instead of normal attribute type. +Other ``sub-type`` like ``u32`` means there is only one member as described +in ``sub-type`` in the ``ENTRY``. The structure looks like:: + + [SOME-OTHER-ATTR] + [ARRAY-ATTR] + [ENTRY u32] + [ENTRY u32] type-value ~~~~~~~~~~ diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index d671efea808a..0ba5f6fb8747 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -641,6 +641,16 @@ class YnlFamily(SpecFamily): if attr_spec["sub-type"] == 'nest': subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) decoded.append({ item.type: subattrs }) + elif attr_spec["sub-type"] == 'binary': + subattrs = item.as_bin() + if attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) + elif attr_spec["sub-type"] in NlAttr.type_formats: + subattrs = item.as_scalar(attr_spec['sub-type'], attr_spec.byte_order) + if attr_spec.display_hint: + subattrs = self._formatted_string(subattrs, attr_spec.display_hint) + decoded.append(subattrs) else: raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded -- cgit v1.2.3 From 5bd2ed658231b0698211e94efb06393836a4539d Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 6 Apr 2024 11:15:41 +0200 Subject: libbpf: Start v1.5 development cycle Bump libbpf.map to v1.5.0 to start a new libbpf version cycle. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240406092005.92399-2-andrea.righi@canonical.com --- tools/lib/bpf/libbpf.map | 3 +++ tools/lib/bpf/libbpf_version.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 51732ecb1385..5dd81a7b96b5 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -416,3 +416,6 @@ LIBBPF_1.4.0 { btf__new_split; btf_ext__raw_data; } LIBBPF_1.3.0; + +LIBBPF_1.5.0 { +} LIBBPF_1.4.0; diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index e783a47da815..d6e5eff967cb 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 4 +#define LIBBPF_MINOR_VERSION 5 #endif /* __LIBBPF_VERSION_H */ -- cgit v1.2.3 From 13e8125a22763557d719db996f70c71f77c9509c Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 6 Apr 2024 11:15:42 +0200 Subject: libbpf: ringbuf: Allow to consume up to a certain amount of items In some cases, instead of always consuming all items from ring buffers in a greedy way, we may want to consume up to a certain amount of items, for example when we need to copy items from the BPF ring buffer to a limited user buffer. This change allows to set an upper limit to the amount of items consumed from one or more ring buffers. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240406092005.92399-3-andrea.righi@canonical.com --- tools/lib/bpf/ringbuf.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index aacb64278a01..db05e6f526de 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -231,7 +231,7 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int64_t ringbuf_process_ring(struct ring *r) +static int64_t ringbuf_process_ring(struct ring *r, size_t n) { int *len_ptr, len, err; /* 64-bit to avoid overflow in case of extreme application behavior */ @@ -268,6 +268,9 @@ static int64_t ringbuf_process_ring(struct ring *r) } smp_store_release(r->consumer_pos, cons_pos); + + if (cnt >= n) + goto done; } } while (got_new_data); done: @@ -287,13 +290,15 @@ int ring_buffer__consume(struct ring_buffer *rb) for (i = 0; i < rb->ring_cnt; i++) { struct ring *ring = rb->rings[i]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; + if (res > INT_MAX) { + res = INT_MAX; + break; + } } - if (res > INT_MAX) - return INT_MAX; return res; } @@ -314,13 +319,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) __u32 ring_id = rb->events[i].data.fd; struct ring *ring = rb->rings[ring_id]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; } if (res > INT_MAX) - return INT_MAX; + res = INT_MAX; return res; } @@ -375,7 +380,7 @@ int ring__consume(struct ring *r) { int64_t res; - res = ringbuf_process_ring(r); + res = ringbuf_process_ring(r, INT_MAX); if (res < 0) return libbpf_err(res); -- cgit v1.2.3 From 4d22ea94ea33550538b3b14429d52cb9f96ad2c3 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 6 Apr 2024 11:15:43 +0200 Subject: libbpf: Add ring__consume_n / ring_buffer__consume_n Introduce a new API to consume items from a ring buffer, limited to a specified amount, and return to the caller the actual number of items consumed. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/lkml/20240310154726.734289-1-andrea.righi@canonical.com/T Link: https://lore.kernel.org/bpf/20240406092005.92399-4-andrea.righi@canonical.com --- tools/lib/bpf/libbpf.h | 12 ++++++++++++ tools/lib/bpf/libbpf.map | 3 +++ tools/lib/bpf/ringbuf.c | 38 +++++++++++++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f88ab50c0229..4f775a6dcaa0 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1293,6 +1293,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); /** @@ -1367,6 +1368,17 @@ LIBBPF_API int ring__map_fd(const struct ring *r); */ LIBBPF_API int ring__consume(struct ring *r); +/** + * @brief **ring__consume_n()** consumes up to a requested amount of items from + * a ringbuffer without event polling. + * + * @param r A ringbuffer object. + * @param n Maximum amount of items to consume. + * @return The number of items consumed, or a negative number if any of the + * callbacks return an error. + */ +LIBBPF_API int ring__consume_n(struct ring *r, size_t n); + struct user_ring_buffer_opts { size_t sz; /* size of this struct, for forward/backward compatibility */ }; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 5dd81a7b96b5..23d82bba021a 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -418,4 +418,7 @@ LIBBPF_1.4.0 { } LIBBPF_1.3.0; LIBBPF_1.5.0 { + global: + ring__consume_n; + ring_buffer__consume_n; } LIBBPF_1.4.0; diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index db05e6f526de..99e44cf02321 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -277,6 +277,33 @@ done: return cnt; } +/* Consume available ring buffer(s) data without event polling, up to n + * records. + * + * Returns number of records consumed across all registered ring buffers (or + * n, whichever is less), or negative number if any of the callbacks return + * error. + */ +int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = rb->rings[i]; + + err = ringbuf_process_ring(ring, n); + if (err < 0) + return libbpf_err(err); + res += err; + n -= err; + + if (n == 0) + break; + } + return res; +} + /* Consume available ring buffer(s) data without event polling. * Returns number of records consumed across all registered ring buffers (or * INT_MAX, whichever is less), or negative number if any of the callbacks @@ -376,17 +403,22 @@ int ring__map_fd(const struct ring *r) return r->map_fd; } -int ring__consume(struct ring *r) +int ring__consume_n(struct ring *r, size_t n) { - int64_t res; + int res; - res = ringbuf_process_ring(r, INT_MAX); + res = ringbuf_process_ring(r, n); if (res < 0) return libbpf_err(res); return res > INT_MAX ? INT_MAX : res; } +int ring__consume(struct ring *r) +{ + return ring__consume_n(r, INT_MAX); +} + static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) { if (rb->consumer_pos) { -- cgit v1.2.3 From f1c48c1ec73538a8e49695445a0fbc52156aac42 Mon Sep 17 00:00:00 2001 From: Chao Du Date: Tue, 2 Apr 2024 06:26:28 +0000 Subject: RISC-V: KVM: selftests: Add ebreak test support Initial support for RISC-V KVM ebreak test. Check the exit reason and the PC when guest debug is enabled. Also to make sure the guest could handle the ebreak exception without exiting to the VMM when guest debug is not enabled. Signed-off-by: Chao Du Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20240402062628.5425-4-duchao@eswincomputing.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/riscv/ebreak_test.c | 82 +++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 tools/testing/selftests/kvm/riscv/ebreak_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 741c7dc16afc..7f4430242c9e 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -189,6 +189,7 @@ TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS_s390x += kvm_binary_stats_test +TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer TEST_GEN_PROGS_riscv += demand_paging_test TEST_GEN_PROGS_riscv += dirty_log_test diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c new file mode 100644 index 000000000000..823c132069b4 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V KVM ebreak test. + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. + * + */ +#include "kvm_util.h" + +#define LABEL_ADDRESS(v) ((uint64_t)&(v)) + +extern unsigned char sw_bp_1, sw_bp_2; +static uint64_t sw_bp_addr; + +static void guest_code(void) +{ + asm volatile( + ".option push\n" + ".option norvc\n" + "sw_bp_1: ebreak\n" + "sw_bp_2: ebreak\n" + ".option pop\n" + ); + GUEST_ASSERT_EQ(READ_ONCE(sw_bp_addr), LABEL_ADDRESS(sw_bp_2)); + + GUEST_DONE(); +} + +static void guest_breakpoint_handler(struct ex_regs *regs) +{ + WRITE_ONCE(sw_bp_addr, regs->epc); + regs->epc += 4; +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint64_t pc; + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE, + }; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_vector_tables(vm); + vcpu_init_vector_tables(vcpu); + vm_install_exception_handler(vm, EXC_BREAKPOINT, + guest_breakpoint_handler); + + /* + * Enable the guest debug. + * ebreak should exit to the VMM with KVM_EXIT_DEBUG reason. + */ + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &pc); + TEST_ASSERT_EQ(pc, LABEL_ADDRESS(sw_bp_1)); + + /* skip sw_bp_1 */ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), pc + 4); + + /* + * Disable all debug controls. + * Guest should handle the ebreak without exiting to the VMM. + */ + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + + return 0; +} -- cgit v1.2.3 From b86761ff6374813cdf64ffd6b95ddd1813c435d8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Apr 2024 19:45:22 -0700 Subject: selftests: net: add scaffolding for Netlink tests in Python Add glue code for accessing the YNL library which lives under tools/net and YAML spec files from under Documentation/. Automatically figure out if tests are run in tree or not. Since we'll want to use this library both from net and drivers/net test targets make the library a target as well, and automatically include it when net or drivers/net are included. Making net/lib a target ensures that we end up with only one copy of it, and saves us some path guessing. Add a tiny bit of formatting support to be able to output KTAP from the start. Reviewed-by: Petr Machata Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/Makefile | 9 ++- tools/testing/selftests/net/lib/Makefile | 8 +++ tools/testing/selftests/net/lib/py/__init__.py | 6 ++ tools/testing/selftests/net/lib/py/consts.py | 9 +++ tools/testing/selftests/net/lib/py/ksft.py | 96 ++++++++++++++++++++++++++ tools/testing/selftests/net/lib/py/utils.py | 47 +++++++++++++ tools/testing/selftests/net/lib/py/ynl.py | 49 +++++++++++++ 7 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/lib/Makefile create mode 100644 tools/testing/selftests/net/lib/py/__init__.py create mode 100644 tools/testing/selftests/net/lib/py/consts.py create mode 100644 tools/testing/selftests/net/lib/py/ksft.py create mode 100644 tools/testing/selftests/net/lib/py/utils.py create mode 100644 tools/testing/selftests/net/lib/py/ynl.py (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index e1504833654d..f533eb7054fe 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -116,6 +116,13 @@ TARGETS += zram TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug +# Networking tests want the net/lib target, include it automatically +ifneq ($(filter net,$(TARGETS)),) +ifeq ($(filter net/lib,$(TARGETS)),) + INSTALL_DEP_TARGETS := net/lib +endif +endif + # User can optionally provide a TARGETS skiplist. By default we skip # BPF since it has cutting edge build time dependencies which require # more effort to install. @@ -245,7 +252,7 @@ ifdef INSTALL_PATH install -m 744 run_kselftest.sh $(INSTALL_PATH)/ rm -f $(TEST_LIST) @ret=1; \ - for TARGET in $(TARGETS); do \ + for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install \ INSTALL_PATH=$(INSTALL_PATH)/$$TARGET \ diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile new file mode 100644 index 000000000000..48557e6250dd --- /dev/null +++ b/tools/testing/selftests/net/lib/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_FILES := ../../../../../Documentation/netlink/specs +TEST_FILES += ../../../../net/ynl + +TEST_INCLUDES := $(wildcard py/*.py) + +include ../../lib.mk diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py new file mode 100644 index 000000000000..7823b5c1f8d7 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .consts import KSRC +from .ksft import * +from .utils import * +from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily diff --git a/tools/testing/selftests/net/lib/py/consts.py b/tools/testing/selftests/net/lib/py/consts.py new file mode 100644 index 000000000000..f518ce79d82c --- /dev/null +++ b/tools/testing/selftests/net/lib/py/consts.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../..").resolve() +KSRC = (Path(__file__).parent / "../../../../../..").resolve() + +KSFT_MAIN_NAME = Path(sys.argv[0]).with_suffix("").name diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py new file mode 100644 index 000000000000..c7210525981c --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: GPL-2.0 + +import builtins +from .consts import KSFT_MAIN_NAME + +KSFT_RESULT = None + + +class KsftSkipEx(Exception): + pass + + +class KsftXfailEx(Exception): + pass + + +def ksft_pr(*objs, **kwargs): + print("#", *objs, **kwargs) + + +def ksft_eq(a, b, comment=""): + global KSFT_RESULT + if a != b: + KSFT_RESULT = False + ksft_pr("Check failed", a, "!=", b, comment) + + +def ksft_true(a, comment=""): + global KSFT_RESULT + if not a: + KSFT_RESULT = False + ksft_pr("Check failed", a, "does not eval to True", comment) + + +def ksft_in(a, b, comment=""): + global KSFT_RESULT + if a not in b: + KSFT_RESULT = False + ksft_pr("Check failed", a, "not in", b, comment) + + +def ksft_ge(a, b, comment=""): + global KSFT_RESULT + if a < b: + KSFT_RESULT = False + ksft_pr("Check failed", a, "<", b, comment) + + +def ktap_result(ok, cnt=1, case="", comment=""): + res = "" + if not ok: + res += "not " + res += "ok " + res += str(cnt) + " " + res += KSFT_MAIN_NAME + if case: + res += "." + str(case.__name__) + if comment: + res += " # " + comment + print(res) + + +def ksft_run(cases, args=()): + totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} + + print("KTAP version 1") + print("1.." + str(len(cases))) + + global KSFT_RESULT + cnt = 0 + for case in cases: + KSFT_RESULT = True + cnt += 1 + try: + case(*args) + except KsftSkipEx as e: + ktap_result(True, cnt, case, comment="SKIP " + str(e)) + totals['skip'] += 1 + continue + except KsftXfailEx as e: + ktap_result(True, cnt, case, comment="XFAIL " + str(e)) + totals['xfail'] += 1 + continue + except Exception as e: + for line in str(e).split('\n'): + ksft_pr("Exception|", line) + ktap_result(False, cnt, case) + totals['fail'] += 1 + continue + + ktap_result(KSFT_RESULT, cnt, case) + totals['pass'] += 1 + + print( + f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0" + ) diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py new file mode 100644 index 000000000000..f0d425731fd4 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json as _json +import subprocess + +class cmd: + def __init__(self, comm, shell=True, fail=True, ns=None, background=False): + if ns: + if isinstance(ns, NetNS): + ns = ns.name + comm = f'ip netns exec {ns} ' + comm + + self.stdout = None + self.stderr = None + self.ret = None + + self.comm = comm + self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if not background: + self.process(terminate=False, fail=fail) + + def process(self, terminate=True, fail=None): + if terminate: + self.proc.terminate() + stdout, stderr = self.proc.communicate() + self.stdout = stdout.decode("utf-8") + self.stderr = stderr.decode("utf-8") + self.proc.stdout.close() + self.proc.stderr.close() + self.ret = self.proc.returncode + + if self.proc.returncode != 0 and fail: + if len(stderr) > 0 and stderr[-1] == "\n": + stderr = stderr[:-1] + raise Exception("Command failed: %s\n%s" % (self.proc.args, stderr)) + + +def ip(args, json=None, ns=None): + cmd_str = "ip " + if json: + cmd_str += '-j ' + cmd_str += args + cmd_obj = cmd(cmd_str, ns=ns) + if json: + return _json.loads(cmd_obj.stdout) + return cmd_obj diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py new file mode 100644 index 000000000000..1ace58370c06 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path +from .consts import KSRC, KSFT_DIR +from .ksft import ksft_pr, ktap_result + +# Resolve paths +try: + if (KSFT_DIR / "kselftest-list.txt").exists(): + # Running in "installed" selftests + tools_full_path = KSFT_DIR + SPEC_PATH = KSFT_DIR / "net/lib/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.lib.ynl.lib import YnlFamily, NlError + else: + # Running in tree + tools_full_path = KSRC / "tools" + SPEC_PATH = KSRC / "Documentation/netlink/specs" + + sys.path.append(tools_full_path.as_posix()) + from net.ynl.lib import YnlFamily, NlError +except ModuleNotFoundError as e: + ksft_pr("Failed importing `ynl` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +# +# Wrapper classes, loading the right specs +# Set schema='' to avoid jsonschema validation, it's slow +# +class EthtoolFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), + schema='') + + +class RtnlFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), + schema='') + + +class NetdevFamily(YnlFamily): + def __init__(self): + super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), + schema='') -- cgit v1.2.3 From 796c8c7fd257abe5c1e1ee22d4344123cd614f9f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Apr 2024 19:45:23 -0700 Subject: selftests: nl_netdev: add a trivial Netlink netdev test Add a trivial test using YNL. $ ./tools/testing/selftests/net/nl_netdev.py KTAP version 1 1..2 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check Instantiate the family once, it takes longer than the test itself. Reviewed-by: Petr Machata Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/nl_netdev.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100755 tools/testing/selftests/net/nl_netdev.py (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index cb418a2346bc..5e34c93aa51b 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -34,6 +34,7 @@ TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh TEST_PROGS += netns-name.sh +TEST_PROGS += nl_netdev.py TEST_PROGS += srv6_end_dt46_l3vpn_test.sh TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py new file mode 100755 index 000000000000..2b8b488fb419 --- /dev/null +++ b/tools/testing/selftests/net/nl_netdev.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_pr, ksft_eq, ksft_ge, NetdevFamily + + +def empty_check(nf) -> None: + devs = nf.dev_get({}, dump=True) + ksft_ge(len(devs), 1) + + +def lo_check(nf) -> None: + lo_info = nf.dev_get({"ifindex": 1}) + ksft_eq(len(lo_info['xdp-features']), 0) + ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0) + + +def main() -> None: + nf = NetdevFamily() + ksft_run([empty_check, lo_check], args=(nf, )) + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From b4db9f840283caca0d904436f187ef56a9126eaa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Apr 2024 19:45:25 -0700 Subject: selftests: drivers: add scaffolding for Netlink tests in Python Add drivers/net as a target for mixed-use tests. The setup is expected to work similarly to the forwarding tests. Since we only need one interface (unlike forwarding tests) read the target device name from NETIF. If not present we'll try to run the test against netdevsim. Reviewed-by: Petr Machata Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/Makefile | 3 +- tools/testing/selftests/drivers/net/Makefile | 7 ++ tools/testing/selftests/drivers/net/README.rst | 30 ++++++ .../selftests/drivers/net/lib/py/__init__.py | 17 +++ tools/testing/selftests/drivers/net/lib/py/env.py | 52 ++++++++++ tools/testing/selftests/net/lib/py/__init__.py | 1 + tools/testing/selftests/net/lib/py/nsim.py | 115 +++++++++++++++++++++ 7 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/drivers/net/Makefile create mode 100644 tools/testing/selftests/drivers/net/README.rst create mode 100644 tools/testing/selftests/drivers/net/lib/py/__init__.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/env.py create mode 100644 tools/testing/selftests/net/lib/py/nsim.py (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index f533eb7054fe..6dab886d6f7a 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,6 +17,7 @@ TARGETS += devices TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice +TARGETS += drivers/net TARGETS += drivers/net/bonding TARGETS += drivers/net/team TARGETS += dt @@ -117,7 +118,7 @@ TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug # Networking tests want the net/lib target, include it automatically -ifneq ($(filter net,$(TARGETS)),) +ifneq ($(filter net drivers/net,$(TARGETS)),) ifeq ($(filter net/lib,$(TARGETS)),) INSTALL_DEP_TARGETS := net/lib endif diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile new file mode 100644 index 000000000000..379cdb1960a7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := $(wildcard lib/py/*.py) + +TEST_PROGS := stats.py + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst new file mode 100644 index 000000000000..5ef7c417d431 --- /dev/null +++ b/tools/testing/selftests/drivers/net/README.rst @@ -0,0 +1,30 @@ +Running tests +============= + +Tests are executed within kselftest framework like any other tests. +By default tests execute against software drivers such as netdevsim. +All tests must support running against a real device (SW-only tests +should instead be placed in net/ or drivers/net/netdevsim, HW-only +tests in drivers/net/hw). + +Set appropriate variables to point the tests at a real device. + +Variables +========= + +Variables can be set in the environment or by creating a net.config +file in the same directory as this README file. Example:: + + $ NETIF=eth0 ./some_test.sh + +or:: + + $ cat tools/testing/selftests/drivers/net/net.config + # Variable set in a file + NETIF=eth0 + +NETIF +~~~~~ + +Name of the netdevice against which the test should be executed. +When empty or not set software devices will be used. diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py new file mode 100644 index 000000000000..4653dffcd962 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../../..").resolve() + +try: + sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * +except ModuleNotFoundError as e: + ksft_pr("Failed importing `net` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) + +from .env import * diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py new file mode 100644 index 000000000000..e1abe9491daf --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import shlex +from pathlib import Path +from lib.py import ip +from lib.py import NetdevSimDev + +class NetDrvEnv: + def __init__(self, src_path): + self._ns = None + + self.env = os.environ.copy() + self._load_env_file(src_path) + + if 'NETIF' in self.env: + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + else: + self._ns = NetdevSimDev() + self.dev = self._ns.nsims[0].dev + self.ifindex = self.dev['ifindex'] + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + + def _load_env_file(self, src_path): + src_dir = Path(src_path).parent.resolve() + if not (src_dir / "net.config").exists(): + return + + lexer = shlex.shlex(open((src_dir / "net.config").as_posix(), 'r').read()) + k = None + for token in lexer: + if k is None: + k = token + self.env[k] = "" + elif token == "=": + pass + else: + self.env[k] = token + k = None diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py index 7823b5c1f8d7..ded7102df18a 100644 --- a/tools/testing/selftests/net/lib/py/__init__.py +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -2,5 +2,6 @@ from .consts import KSRC from .ksft import * +from .nsim import * from .utils import * from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py new file mode 100644 index 000000000000..b2d696e12805 --- /dev/null +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: GPL-2.0 + +import json +import os +import random +import re +import time +from .utils import cmd, ip + + +class NetdevSim: + """ + Class for netdevsim netdevice and its attributes. + """ + + def __init__(self, nsimdev, port_index, ifname, ns=None): + # In case udev renamed the netdev to according to new schema, + # check if the name matches the port_index. + nsimnamere = re.compile(r"eni\d+np(\d+)") + match = nsimnamere.match(ifname) + if match and int(match.groups()[0]) != port_index + 1: + raise Exception("netdevice name mismatches the expected one") + + self.nsimdev = nsimdev + self.port_index = port_index + ret = ip("-j link show dev %s" % ifname, ns=ns) + self.dev = json.loads(ret.stdout)[0] + + def dfs_write(self, path, val): + self.nsimdev.dfs_write(f'ports/{self.port_index}/' + path, val) + + +class NetdevSimDev: + """ + Class for netdevsim bus device and its attributes. + """ + @staticmethod + def ctrl_write(path, val): + fullpath = os.path.join("/sys/bus/netdevsim/", path) + with open(fullpath, "w") as f: + f.write(val) + + def dfs_write(self, path, val): + fullpath = os.path.join(f"/sys/kernel/debug/netdevsim/netdevsim{self.addr}/", path) + with open(fullpath, "w") as f: + f.write(val) + + def __init__(self, port_count=1, ns=None): + # nsim will spawn in init_net, we'll set to actual ns once we switch it there + self.ns = None + + if not os.path.exists("/sys/bus/netdevsim"): + cmd("modprobe netdevsim") + + addr = random.randrange(1 << 15) + while True: + try: + self.ctrl_write("new_device", "%u %u" % (addr, port_count)) + except OSError as e: + if e.errno == errno.ENOSPC: + addr = random.randrange(1 << 15) + continue + raise e + break + self.addr = addr + + # As probe of netdevsim device might happen from a workqueue, + # so wait here until all netdevs appear. + self.wait_for_netdevs(port_count) + + if ns: + cmd(f"devlink dev reload netdevsim/netdevsim{addr} netns {ns.name}") + self.ns = ns + + cmd("udevadm settle", ns=self.ns) + ifnames = self.get_ifnames() + + self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr + + self.nsims = [] + for port_index in range(port_count): + self.nsims.append(NetdevSim(self, port_index, ifnames[port_index], + ns=ns)) + + def get_ifnames(self): + ifnames = [] + listdir = cmd(f"ls /sys/bus/netdevsim/devices/netdevsim{self.addr}/net/", + ns=self.ns).stdout.split() + for ifname in listdir: + ifnames.append(ifname) + ifnames.sort() + return ifnames + + def wait_for_netdevs(self, port_count): + timeout = 5 + timeout_start = time.time() + + while True: + try: + ifnames = self.get_ifnames() + except FileNotFoundError as e: + ifnames = [] + if len(ifnames) == port_count: + break + if time.time() < timeout_start + timeout: + continue + raise Exception("netdevices did not appear within timeout") + + def remove(self): + self.ctrl_write("del_device", "%u" % (self.addr, )) + + def remove_nsim(self, nsim): + self.nsims.remove(nsim) + self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), + "%u" % (nsim.port_index, )) -- cgit v1.2.3 From f0e6c86e4bab228ca51b863af180ade0a970a393 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Apr 2024 19:45:26 -0700 Subject: testing: net-drv: add a driver test for stats reporting Add a very simple test to make sure drivers report expected stats. Drivers which implement FEC or pause configuration should report relevant stats. Qstats must be reported, at least packet and byte counts, and they must match total device stats. Tested with netdevsim, bnxt, in-tree and installed. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/stats.py | 86 ++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/stats.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py new file mode 100755 index 000000000000..5a9d4e56b28b --- /dev/null +++ b/tools/testing/selftests/drivers/net/stats.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_in, ksft_true, KsftSkipEx, KsftXfailEx +from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError +from lib.py import NetDrvEnv + +ethnl = EthtoolFamily() +netfam = NetdevFamily() +rtnl = RtnlFamily() + + +def check_pause(cfg) -> None: + global ethnl + + try: + ethnl.pause_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("pause not supported by the device") + raise + + data = ethnl.pause_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def check_fec(cfg) -> None: + global ethnl + + try: + ethnl.fec_get({"header": {"dev-index": cfg.ifindex}}) + except NlError as e: + if e.error == 95: + raise KsftXfailEx("FEC not supported by the device") + raise + + data = ethnl.fec_get({"header": {"dev-index": cfg.ifindex, + "flags": {'stats'}}}) + ksft_true(data['stats'], "driver does not report stats") + + +def pkt_byte_sum(cfg) -> None: + global netfam, rtnl + + def get_qstat(test): + global netfam + stats = netfam.qstats_get({}, dump=True) + if stats: + for qs in stats: + if qs["ifindex"]== test.ifindex: + return qs + + qstat = get_qstat(cfg) + if qstat is None: + raise KsftSkipEx("qstats not supported by the device") + + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + ksft_in(key, qstat, "Drivers should always report basic keys") + + # Compare stats, rtnl stats and qstats must match, + # but the interface may be up, so do a series of dumps + # each time the more "recent" stats must be higher or same. + def stat_cmp(rstat, qstat): + for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']: + if rstat[key] != qstat[key]: + return rstat[key] - qstat[key] + return 0 + + for _ in range(10): + rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats'] + if stat_cmp(rtstat, qstat) < 0: + raise Exception("RTNL stats are lower, fetched later") + qstat = get_qstat(cfg) + if stat_cmp(rtstat, qstat) > 0: + raise Exception("Qstats are lower, fetched later") + + +def main() -> None: + with NetDrvEnv(__file__) as cfg: + ksft_run([check_pause, check_fec, pkt_byte_sum], + args=(cfg, )) + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From f30b04cacd8586b582e0544150be7dd46bd0bbe6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:05 +0200 Subject: selftests: mptcp: add tc check for check_tools tc are used in some test scripts: mptcp_connect.sh, mptcp_join.sh and simult_flows.sh. It makes sense to check if tc is installed before running these scripts, just like other tools. So this patch add 'tc' check for mptcp_lib_check_tools(), and check it in these test scripts. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 6 ++++++ tools/testing/selftests/net/mptcp/simult_flows.sh | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 4131f3263a48..b77fb7065bfb 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -147,7 +147,7 @@ cleanup() mptcp_lib_check_mptcp mptcp_lib_check_kallsyms -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc sin=$(mktemp) sout=$(mktemp) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e4403236f655..5a95798eb40a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -142,7 +142,7 @@ init() { mptcp_lib_check_mptcp mptcp_lib_check_kallsyms - mptcp_lib_check_tools ip ss "${iptables}" "${ip6tables}" + mptcp_lib_check_tools ip tc ss "${iptables}" "${ip6tables}" sin=$(mktemp) sout=$(mktemp) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d529b4b37af8..1fa05405f65e 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -384,6 +384,12 @@ mptcp_lib_check_tools() { exit ${KSFT_SKIP} fi ;; + "tc") + if ! tc -help &> /dev/null; then + mptcp_lib_pr_skip "Could not run test without tc tool" + exit ${KSFT_SKIP} + fi + ;; "ss") if ! ss -h | grep -q MPTCP; then mptcp_lib_pr_skip "ss tool does not support MPTCP" diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 1b2366220388..497141c49ccd 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -45,7 +45,7 @@ cleanup() } mptcp_lib_check_mptcp -mptcp_lib_check_tools ip +mptcp_lib_check_tools ip tc # "$ns1" ns2 ns3 # ns1eth1 ns2eth1 ns2eth3 ns3eth1 -- cgit v1.2.3 From 9109853a388b7b2b934f56f4ddb250d72e486555 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:06 +0200 Subject: selftests: mptcp: add ms units for tc-netem delay 'delay 1' in tc-netem is confusing, not sure if it's a delay of 1 second or 1 millisecond. This patch explicitly adds millisecond units to make these commands clearer. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 6 +++--- tools/testing/selftests/net/mptcp/simult_flows.sh | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 5a95798eb40a..73a2131e6da2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -125,8 +125,8 @@ init_shapers() { local i for i in $(seq 1 4); do - tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1 - tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1 + tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1ms + tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1ms done } @@ -3212,7 +3212,7 @@ fail_tests() # multiple subflows if reset_with_fail "MP_FAIL MP_RST" 2; then - tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5 + tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 497141c49ccd..4e6d8fc56b38 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -216,8 +216,8 @@ run_test() shift 4 local msg=$* - [ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1="" - [ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2="" + [ $delay1 -gt 0 ] && delay1="delay ${delay1}ms" || delay1="" + [ $delay2 -gt 0 ] && delay2="delay ${delay2}ms" || delay2="" for dev in ns1eth1 ns1eth2; do tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1 -- cgit v1.2.3 From 29aa32fee7d0fdc71221158719137c1f4f5de3b4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:07 +0200 Subject: selftests: mptcp: export ip_mptcp to mptcp_lib This patch exports ip_mptcp into mptcp_lib.sh as a public variable, named MPTCP_LIB_IP_MPTCP. Add a helper mptcp_lib_set_ip_mptcp() to set it, and a helper mptcp_lib_is_ip_mptcp() to test whether it is set. Use these two helpers in mptcp_join.sh. This patch is prepared for coming commits. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 17 ++++++++--------- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 9 +++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 73a2131e6da2..62fcfeaa3430 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -31,7 +31,6 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) capture=false checksum=false -ip_mptcp=0 check_invert=0 validate_checksum=false init=0 @@ -610,7 +609,7 @@ pm_nl_set_limits() local addrs=$2 local subflows=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows else ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows @@ -650,7 +649,7 @@ pm_nl_add_endpoint() nr=$((nr + 1)) done - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port else ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port @@ -663,7 +662,7 @@ pm_nl_del_endpoint() local id=$2 local addr=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then [ $id -ne 0 ] && addr='' ip -n $ns mptcp endpoint delete id $id $addr else @@ -675,7 +674,7 @@ pm_nl_flush_endpoint() { local ns=$1 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint flush else ip netns exec $ns ./pm_nl_ctl flush @@ -686,7 +685,7 @@ pm_nl_show_endpoints() { local ns=$1 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint show else ip netns exec $ns ./pm_nl_ctl dump @@ -699,7 +698,7 @@ pm_nl_change_endpoint() local id=$2 local flags=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint change id $id ${flags//","/" "} else ip netns exec $ns ./pm_nl_ctl set id $id flags $flags @@ -749,7 +748,7 @@ pm_nl_check_endpoint() return fi - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then # get line and trim trailing whitespace line=$(ip -n $ns mptcp endpoint show $id) line="${line% }" @@ -3702,7 +3701,7 @@ while getopts "${all_tests_args}cCih" opt; do checksum=true ;; i) - ip_mptcp=1 + mptcp_lib_set_ip_mptcp ;; h) usage diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 1fa05405f65e..ca01d949fad8 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -23,6 +23,7 @@ MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" +MPTCP_LIB_IP_MPTCP=0 # only if supported (or forced) and not disabled, see no-color.org if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } && @@ -511,3 +512,11 @@ mptcp_lib_verify_listener_events() { mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}" return "${rc}" } + +mptcp_lib_set_ip_mptcp() { + MPTCP_LIB_IP_MPTCP=1 +} + +mptcp_lib_is_ip_mptcp() { + [ "${MPTCP_LIB_IP_MPTCP}" = "1" ] +} -- cgit v1.2.3 From 3188309c8ceb0611d0a6e9d1aca8464099695b8c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:08 +0200 Subject: selftests: mptcp: netlink: add 'limits' helpers The output format of 'ip mptcp limits' command is much different from that of 'pm_nl_ctl limits' command. This patch adds format_limits() helper to format the outputs of these two commands to hide the difference. get_limits() has been added to show the limits. Use these two helpers in pm_netlink.sh to replace all 'pm_nl_ctl limits' commands and outputs. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 32 +++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 6ab8c5d36340..d672d1e5d470 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -46,6 +46,26 @@ trap cleanup EXIT mptcp_lib_ns_init ns1 +format_limits() { + local accept="${1}" + local subflows="${2}" + + if mptcp_lib_is_ip_mptcp; then + # with a space at the end + printf "add_addr_accepted %d subflows %d \n" "${accept}" "${subflows}" + else + printf "accept %d\nsubflows %d\n" "${accept}" "${subflows}" + fi +} + +get_limits() { + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp limits + else + ip netns exec "${ns1}" ./pm_nl_ctl limits + fi +} + check() { local cmd="$1" @@ -69,10 +89,9 @@ check() check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" -default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)" +default_limits="$(get_limits)" if mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 -subflows 2" "defaults limits" + check "get_limits" "$(format_limits 0 2)" "defaults limits" fi ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 @@ -120,14 +139,13 @@ ip netns exec $ns1 ./pm_nl_ctl flush check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" ip netns exec $ns1 ./pm_nl_ctl limits 9 1 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit" +check "get_limits" "${default_limits}" "rcv addrs above hard limit" ip netns exec $ns1 ./pm_nl_ctl limits 1 9 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit" +check "get_limits" "${default_limits}" "subflows above hard limit" ip netns exec $ns1 ./pm_nl_ctl limits 8 8 -check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 -subflows 8" "set limits" +check "get_limits" "$(format_limits 8 8)" "set limits" ip netns exec $ns1 ./pm_nl_ctl flush ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -- cgit v1.2.3 From 0d16ed0c2e743a721e372c64be73b828a0cf7cb9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:09 +0200 Subject: selftests: mptcp: add {get,format}_endpoint(s) helpers The output formats of 'ip mptcp' commands are much different from that of 'pm_nl_ctl' commands. This patch adds a new helper format_endpoints() to format the outputs of 'ip mptcp' and 'pm_nl_ctl' with 'endpoints' arguments to hide these differences. A new helper named get_endpoint() has also been added to show a specific endpoint identified by the given address ID, similar to mptcp_join.sh's pm_nl_show_endpoints() helper, but showing all entries. Use these two helpers in mptcp_join.sh and pm_netlink.sh to replace all 'pm_nl_ctl get' commands and outputs of 'pm_nl_ctl dump/get'. Suggested-by: Matthieu Baerts (NGI0) Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 32 ++------ tools/testing/selftests/net/mptcp/mptcp_lib.sh | 35 ++++++++ tools/testing/selftests/net/mptcp/pm_netlink.sh | 102 ++++++++++++++---------- 3 files changed, 98 insertions(+), 71 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 62fcfeaa3430..d48da81daa06 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -712,12 +712,9 @@ pm_nl_check_endpoint() local ns=$2 local addr=$3 local _flags="" - local flags local _port - local port local dev local _id - local id print_check "${msg}" @@ -725,48 +722,29 @@ pm_nl_check_endpoint() while [ -n "$1" ]; do if [ $1 = "flags" ]; then _flags=$2 - [ -n "$_flags" ]; flags="flags $_flags" shift elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="dev $2" + [ -n "$2" ]; dev="$2" shift elif [ $1 = "id" ]; then _id=$2 - [ -n "$_id" ]; id="id $_id" shift elif [ $1 = "port" ]; then _port=$2 - [ -n "$_port" ]; port=" port $_port" shift fi shift done - if [ -z "$id" ]; then + if [ -z "${_id}" ]; then test_fail "bad test - missing endpoint id" return fi - if mptcp_lib_is_ip_mptcp; then - # get line and trim trailing whitespace - line=$(ip -n $ns mptcp endpoint show $id) - line="${line% }" - # the dump order is: address id flags port dev - [ -n "$addr" ] && expected_line="$addr" - expected_line+=" $id" - [ -n "$_flags" ] && expected_line+=" ${_flags//","/" "}" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$port" ] && expected_line+=" $port" - else - line=$(ip netns exec $ns ./pm_nl_ctl get $_id) - # the dump order is: id flags dev address port - expected_line="$id" - [ -n "$flags" ] && expected_line+=" $flags" - [ -n "$dev" ] && expected_line+=" $dev" - [ -n "$addr" ] && expected_line+=" $addr" - [ -n "$_port" ] && expected_line+=" $_port" - fi + line=$(mptcp_lib_pm_nl_get_endpoint "${ns}" "${_id}") + expected_line=$(mptcp_lib_pm_nl_format_endpoints \ + "${_id},${addr},${_flags//","/" "},${dev},${_port}") if [ "$line" = "$expected_line" ]; then print_ok else diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index ca01d949fad8..4fabb0091940 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -520,3 +520,38 @@ mptcp_lib_set_ip_mptcp() { mptcp_lib_is_ip_mptcp() { [ "${MPTCP_LIB_IP_MPTCP}" = "1" ] } + +# format: ,,, +mptcp_lib_pm_nl_format_endpoints() { + local entry id ip flags dev port + + for entry in "${@}"; do + IFS=, read -r id ip flags dev port <<< "${entry}" + if mptcp_lib_is_ip_mptcp; then + echo -n "${ip}" + [ -n "${port}" ] && echo -n " port ${port}" + echo -n " id ${id}" + [ -n "${flags}" ] && echo -n " ${flags}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo " " # always a space at the end + else + echo -n "id ${id}" + echo -n " flags ${flags//" "/","}" + [ -n "${dev}" ] && echo -n " dev ${dev}" + echo -n " ${ip}" + [ -n "${port}" ] && echo -n " ${port}" + echo "" + fi + done +} + +mptcp_lib_pm_nl_get_endpoint() { + local ns=${1} + local id=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show id "${id}" + else + ip netns exec "${ns}" ./pm_nl_ctl get "${id}" + fi +} diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index d672d1e5d470..234b88eba0cb 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -66,6 +66,15 @@ get_limits() { fi } +format_endpoints() { + mptcp_lib_pm_nl_format_endpoints "${@}" +} + +get_endpoint() { + # shellcheck disable=SC2317 # invoked indirectly + mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" +} + check() { local cmd="$1" @@ -97,43 +106,44 @@ fi ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup -check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr" +check "get_endpoint 1" "$(format_endpoints "1,10.0.1.1")" "simple add/get addr" check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 2 flags subflow dev lo 10.0.1.2 -id 3 flags signal,backup 10.0.1.3" "dump addrs" + "$(format_endpoints "1,10.0.1.1" \ + "2,10.0.1.2,subflow,lo" \ + "3,10.0.1.3,signal backup")" "dump addrs" ip netns exec $ns1 ./pm_nl_ctl del 2 -check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" +check "get_endpoint 2" "" "simple del addr" check "ip netns exec $ns1 ./pm_nl_ctl dump" \ -"id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3" "dump addrs after del" + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup")" "dump addrs after del" ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 2>/dev/null -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" +check "get_endpoint 4" "" "duplicate addr" ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 flags signal -check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment" +check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" for i in $(seq 5 9); do ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 done -check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" -check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" +check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" +check "get_endpoint 10" "" "above hard addr limit" ip netns exec $ns1 ./pm_nl_ctl del 9 for i in $(seq 10 255); do ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i ip netns exec $ns1 ./pm_nl_ctl del $i done -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 3 flags signal,backup 10.0.1.3 -id 4 flags signal 10.0.1.4 -id 5 flags signal 10.0.1.5 -id 6 flags signal 10.0.1.6 -id 7 flags signal 10.0.1.7 -id 8 flags signal 10.0.1.8" "id limit" +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ + "$(format_endpoints "1,10.0.1.1" \ + "3,10.0.1.3,signal backup" \ + "4,10.0.1.4,signal" \ + "5,10.0.1.5,signal" \ + "6,10.0.1.6,signal" \ + "7,10.0.1.7,signal" \ + "8,10.0.1.8,signal")" "id limit" ip netns exec $ns1 ./pm_nl_ctl flush check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" @@ -156,14 +166,15 @@ ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.5 id 254 ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.6 ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.7 ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 2 flags 10.0.1.2 -id 3 flags 10.0.1.7 -id 4 flags 10.0.1.8 -id 100 flags 10.0.1.3 -id 101 flags 10.0.1.4 -id 254 flags 10.0.1.5 -id 255 flags 10.0.1.6" "set ids" +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ + "$(format_endpoints "1,10.0.1.1" \ + "2,10.0.1.2" \ + "3,10.0.1.7" \ + "4,10.0.1.8" \ + "100,10.0.1.3" \ + "101,10.0.1.4" \ + "254,10.0.1.5" \ + "255,10.0.1.6")" "set ids" ip netns exec $ns1 ./pm_nl_ctl flush ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.1 @@ -174,36 +185,39 @@ ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.5 id 253 ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.6 ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.7 ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.0.1 -id 2 flags 10.0.0.4 -id 3 flags 10.0.0.6 -id 4 flags 10.0.0.7 -id 5 flags 10.0.0.8 -id 253 flags 10.0.0.5 -id 254 flags 10.0.0.2 -id 255 flags 10.0.0.3" "wrap-around ids" +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ + "$(format_endpoints "1,10.0.0.1" \ + "2,10.0.0.4" \ + "3,10.0.0.6" \ + "4,10.0.0.7" \ + "5,10.0.0.8" \ + "253,10.0.0.5" \ + "254,10.0.0.2" \ + "255,10.0.0.3")" "wrap-around ids" ip netns exec $ns1 ./pm_nl_ctl flush ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup 10.0.1.1" "set flags (backup)" +check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow backup")" \ + "set flags (backup)" ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nobackup)" +check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nobackup)" # fullmesh support has been added later ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" || mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,fullmesh 10.0.1.1" " (fullmesh)" + check "ip netns exec $ns1 ./pm_nl_ctl dump" \ + "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \ + " (fullmesh)" ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow 10.0.1.1" " (nofullmesh)" + check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow")" \ + " (nofullmesh)" ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \ -subflow,backup,fullmesh 10.0.1.1" " (backup,fullmesh)" + check "ip netns exec $ns1 ./pm_nl_ctl dump" \ + "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \ + " (backup,fullmesh)" else for st in fullmesh nofullmesh backup,fullmesh; do st=" (${st})" -- cgit v1.2.3 From b79e51c9994970699a0f488a7c121699d4595581 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:10 +0200 Subject: selftests: mptcp: netlink: add change_address helper The output formats of 'ip mptcp' commands are much different from that of 'pm_nl_ctl' commands. A new 'change_address' helper is added here, to change the flag of an address. This is a bit similar to mptcp_join.sh's pm_nl_change_endpoint(). Usage: Address ID - pm_nl_change_endpoint $ns id $id $flags IP address - change_address $ns $addr $flags Use this new helper in pm_netlink.sh to replace all 'pm_nl_ctl set' commands. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 234b88eba0cb..e27a731bd765 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -75,6 +75,17 @@ get_endpoint() { mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" } +change_address() { + local addr=${1} + local flags=${2} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns1}" mptcp endpoint change "${addr}" "${flags}" + else + ip netns exec "${ns1}" ./pm_nl_ctl set "${addr}" flags "${flags}" + fi +} + check() { local cmd="$1" @@ -197,10 +208,10 @@ check "ip netns exec $ns1 ./pm_nl_ctl dump" \ ip netns exec $ns1 ./pm_nl_ctl flush ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup +change_address 10.0.1.1 backup check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow backup")" \ "set flags (backup)" -ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup +change_address 10.0.1.1 nobackup check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow")" \ " (nobackup)" -- cgit v1.2.3 From 571d79664a4a37bd66fbd80f714e9d66d2c2de35 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:11 +0200 Subject: selftests: mptcp: join: update endpoint ops This patch uses 'case' statements to simplify pm_nl_add_endpoint() and pm_nl_check_endpoint(). And simplify pm_nl_check_endpoint() with check_output() helper. Also update pm_nl_del_endpoint() to avoid the 'double quote' shellcheck warning. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 72 ++++++++----------------- 1 file changed, 23 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index d48da81daa06..8862b6a5caa0 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -620,39 +620,27 @@ pm_nl_add_endpoint() { local ns=$1 local addr=$2 - local flags _flags - local port _port - local dev _dev - local id _id + local flags dev id port local nr=2 local p for p in "${@}" do - if [ $p = "flags" ]; then - eval _flags=\$"$nr" - [ -n "$_flags" ]; flags="flags $_flags" - fi - if [ $p = "dev" ]; then - eval _dev=\$"$nr" - [ -n "$_dev" ]; dev="dev $_dev" - fi - if [ $p = "id" ]; then - eval _id=\$"$nr" - [ -n "$_id" ]; id="id $_id" - fi - if [ $p = "port" ]; then - eval _port=\$"$nr" - [ -n "$_port" ]; port="port $_port" - fi + case "${p}" in + "flags" | "dev" | "id" | "port") + eval "${p}"=\$"${nr}" + ;; + esac nr=$((nr + 1)) done if mptcp_lib_is_ip_mptcp; then - ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port + ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} else - ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port + ip netns exec "${ns}" ./pm_nl_ctl add "${addr}" ${flags:+flags "${flags}"} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} fi } @@ -664,7 +652,7 @@ pm_nl_del_endpoint() if mptcp_lib_is_ip_mptcp; then [ $id -ne 0 ] && addr='' - ip -n $ns mptcp endpoint delete id $id $addr + ip -n $ns mptcp endpoint delete id $id ${addr:+"${addr}"} else ip netns exec $ns ./pm_nl_ctl del $id $addr fi @@ -707,49 +695,35 @@ pm_nl_change_endpoint() pm_nl_check_endpoint() { - local line expected_line local msg="$1" local ns=$2 local addr=$3 - local _flags="" - local _port - local dev - local _id + local flags dev id port print_check "${msg}" shift 3 while [ -n "$1" ]; do - if [ $1 = "flags" ]; then - _flags=$2 - shift - elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="$2" - shift - elif [ $1 = "id" ]; then - _id=$2 - shift - elif [ $1 = "port" ]; then - _port=$2 + case "${1}" in + "flags" | "dev" | "id" | "port") + eval "${1}"="${2}" shift - fi + ;; + *) + ;; + esac shift done - if [ -z "${_id}" ]; then + if [ -z "${id}" ]; then test_fail "bad test - missing endpoint id" return fi - line=$(mptcp_lib_pm_nl_get_endpoint "${ns}" "${_id}") - expected_line=$(mptcp_lib_pm_nl_format_endpoints \ - "${_id},${addr},${_flags//","/" "},${dev},${_port}") - if [ "$line" = "$expected_line" ]; then - print_ok - else - fail_test "expected '$expected_line' found '$line'" - fi + check_output "mptcp_lib_pm_nl_get_endpoint ${ns} ${id}" \ + "$(mptcp_lib_pm_nl_format_endpoints \ + "${id},${addr},${flags//","/" "},${dev},${port}")" } pm_nl_set_endpoint() -- cgit v1.2.3 From 441c6be9ae28e369d30fbcb27f2d6ae68d5ab529 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:12 +0200 Subject: selftests: mptcp: export pm_nl endpoint ops This patch exports six endpoint operation helpers with pm_nl_ prefix, pm_nl_set_limits(), pm_nl_add_endpoint(), pm_nl_del_endpoint(), pm_nl_flush_endpoint(), pm_nl_show_endpoints() and pm_nl_change_endpoint() into mptcp_lib.sh as public functions, and renamed each of them with a mptcp_lib_ prefix. Then these old pm_nl_ prefix helpers in mptcp_join.sh can be wrappers of mptcp_lib_ prefix ones. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 72 ++------------------- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 85 +++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 66 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 8862b6a5caa0..fefa9173bdaa 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -605,92 +605,32 @@ kill_events_pids() pm_nl_set_limits() { - local ns=$1 - local addrs=$2 - local subflows=$3 - - if mptcp_lib_is_ip_mptcp; then - ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows - else - ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows - fi + mptcp_lib_pm_nl_set_limits "${@}" } pm_nl_add_endpoint() { - local ns=$1 - local addr=$2 - local flags dev id port - local nr=2 - - local p - for p in "${@}" - do - case "${p}" in - "flags" | "dev" | "id" | "port") - eval "${p}"=\$"${nr}" - ;; - esac - - nr=$((nr + 1)) - done - - if mptcp_lib_is_ip_mptcp; then - ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \ - ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} - else - ip netns exec "${ns}" ./pm_nl_ctl add "${addr}" ${flags:+flags "${flags}"} \ - ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} - fi + mptcp_lib_pm_nl_add_endpoint "${@}" } pm_nl_del_endpoint() { - local ns=$1 - local id=$2 - local addr=$3 - - if mptcp_lib_is_ip_mptcp; then - [ $id -ne 0 ] && addr='' - ip -n $ns mptcp endpoint delete id $id ${addr:+"${addr}"} - else - ip netns exec $ns ./pm_nl_ctl del $id $addr - fi + mptcp_lib_pm_nl_del_endpoint "${@}" } pm_nl_flush_endpoint() { - local ns=$1 - - if mptcp_lib_is_ip_mptcp; then - ip -n $ns mptcp endpoint flush - else - ip netns exec $ns ./pm_nl_ctl flush - fi + mptcp_lib_pm_nl_flush_endpoint "${@}" } pm_nl_show_endpoints() { - local ns=$1 - - if mptcp_lib_is_ip_mptcp; then - ip -n $ns mptcp endpoint show - else - ip netns exec $ns ./pm_nl_ctl dump - fi + mptcp_lib_pm_nl_show_endpoints "${@}" } pm_nl_change_endpoint() { - local ns=$1 - local id=$2 - local flags=$3 - - if mptcp_lib_is_ip_mptcp; then - ip -n $ns mptcp endpoint change id $id ${flags//","/" "} - else - ip netns exec $ns ./pm_nl_ctl set id $id flags $flags - fi + mptcp_lib_pm_nl_change_endpoint "${@}" } pm_nl_check_endpoint() diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 4fabb0091940..ad2ebda5cb64 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -555,3 +555,88 @@ mptcp_lib_pm_nl_get_endpoint() { ip netns exec "${ns}" ./pm_nl_ctl get "${id}" fi } + +mptcp_lib_pm_nl_set_limits() { + local ns=${1} + local addrs=${2} + local subflows=${3} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp limits set add_addr_accepted "${addrs}" subflows "${subflows}" + else + ip netns exec "${ns}" ./pm_nl_ctl limits "${addrs}" "${subflows}" + fi +} + +mptcp_lib_pm_nl_add_endpoint() { + local ns=${1} + local addr=${2} + local flags dev id port + local nr=2 + + local p + for p in "${@}"; do + case "${p}" in + "flags" | "dev" | "id" | "port") + eval "${p}"=\$"${nr}" + ;; + esac + + nr=$((nr + 1)) + done + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + else + ip netns exec "${ns}" ./pm_nl_ctl add "${addr}" ${flags:+flags "${flags}"} \ + ${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"} + fi +} + +mptcp_lib_pm_nl_del_endpoint() { + local ns=${1} + local id=${2} + local addr=${3} + + if mptcp_lib_is_ip_mptcp; then + [ "${id}" -ne 0 ] && addr='' + ip -n "${ns}" mptcp endpoint delete id "${id}" ${addr:+"${addr}"} + else + ip netns exec "${ns}" ./pm_nl_ctl del "${id}" "${addr}" + fi +} + +mptcp_lib_pm_nl_flush_endpoint() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint flush + else + ip netns exec "${ns}" ./pm_nl_ctl flush + fi +} + +mptcp_lib_pm_nl_show_endpoints() { + local ns=${1} + + if mptcp_lib_is_ip_mptcp; then + ip -n "${ns}" mptcp endpoint show + else + ip netns exec "${ns}" ./pm_nl_ctl dump + fi +} + +mptcp_lib_pm_nl_change_endpoint() { + local ns=${1} + local id=${2} + local flags=${3} + + if mptcp_lib_is_ip_mptcp; then + # shellcheck disable=SC2086 # blanks in flags, no double quote + ip -n "${ns}" mptcp endpoint change id "${id}" ${flags//","/" "} + else + ip netns exec "${ns}" ./pm_nl_ctl set id "${id}" flags "${flags}" + fi +} -- cgit v1.2.3 From c99d57d0007a77c9d5b2ff0dfc6ab68d5d3fe911 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:13 +0200 Subject: selftests: mptcp: use pm_nl endpoint ops Use those newly added pm_nl endpoint ops helpers to replace all 'pm_nl_ctl' commands with 'limits', 'add', 'del', 'flush', 'show' and 'set' arguments in scripts mptcp_sockopt.sh and simult_flows.sh. In pm_netlink.sh, add wrappers of there helpers to make the function names shorter. Then use the wrappers to replace all 'pm_nl_ctl' commands. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 12 +- tools/testing/selftests/net/mptcp/pm_netlink.sh | 132 +++++++++++++-------- tools/testing/selftests/net/mptcp/simult_flows.sh | 6 +- 3 files changed, 89 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index e2d70c18786e..36e81383f0e2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -58,15 +58,15 @@ init() # let $ns2 reach any $ns1 address from any interface ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i - ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal - ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "10.0.${i}.1" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns1}" "dead:beef:${i}::1" flags signal - ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal - ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "10.0.${i}.2" flags signal + mptcp_lib_pm_nl_add_endpoint "${ns2}" "dead:beef:${i}::2" flags signal done - ip netns exec $ns1 ./pm_nl_ctl limits 8 8 - ip netns exec $ns2 ./pm_nl_ctl limits 8 8 + mptcp_lib_pm_nl_set_limits "${ns1}" 8 8 + mptcp_lib_pm_nl_set_limits "${ns2}" 8 8 add_mark_rules $ns1 1 add_mark_rules $ns2 2 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index e27a731bd765..b3adb39a7d3d 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -86,6 +86,36 @@ change_address() { fi } +set_limits() +{ + mptcp_lib_pm_nl_set_limits "${ns1}" "${@}" +} + +add_endpoint() +{ + mptcp_lib_pm_nl_add_endpoint "${ns1}" "${@}" +} + +del_endpoint() +{ + mptcp_lib_pm_nl_del_endpoint "${ns1}" "${@}" +} + +flush_endpoint() +{ + mptcp_lib_pm_nl_flush_endpoint "${ns1}" +} + +show_endpoints() +{ + mptcp_lib_pm_nl_show_endpoints "${ns1}" +} + +change_endpoint() +{ + mptcp_lib_pm_nl_change_endpoint "${ns1}" "${@}" +} + check() { local cmd="$1" @@ -107,47 +137,47 @@ check() fi } -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" +check "show_endpoints" "" "defaults addr list" default_limits="$(get_limits)" if mptcp_lib_expect_all_features; then check "get_limits" "$(format_limits 0 2)" "defaults limits" fi -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 flags subflow dev lo +add_endpoint 10.0.1.3 flags signal,backup check "get_endpoint 1" "$(format_endpoints "1,10.0.1.1")" "simple add/get addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +check "show_endpoints" \ "$(format_endpoints "1,10.0.1.1" \ "2,10.0.1.2,subflow,lo" \ "3,10.0.1.3,signal backup")" "dump addrs" -ip netns exec $ns1 ./pm_nl_ctl del 2 +del_endpoint 2 check "get_endpoint 2" "" "simple del addr" -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +check "show_endpoints" \ "$(format_endpoints "1,10.0.1.1" \ "3,10.0.1.3,signal backup")" "dump addrs after del" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 2>/dev/null +add_endpoint 10.0.1.3 2>/dev/null check "get_endpoint 4" "" "duplicate addr" -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 flags signal +add_endpoint 10.0.1.4 flags signal check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" for i in $(seq 5 9); do - ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 + add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1 done check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" check "get_endpoint 10" "" "above hard addr limit" -ip netns exec $ns1 ./pm_nl_ctl del 9 +del_endpoint 9 for i in $(seq 10 255); do - ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i - ip netns exec $ns1 ./pm_nl_ctl del $i + add_endpoint 10.0.0.9 id "${i}" + del_endpoint "${i}" done -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +check "show_endpoints" \ "$(format_endpoints "1,10.0.1.1" \ "3,10.0.1.3,signal backup" \ "4,10.0.1.4,signal" \ @@ -156,28 +186,28 @@ check "ip netns exec $ns1 ./pm_nl_ctl dump" \ "7,10.0.1.7,signal" \ "8,10.0.1.8,signal")" "id limit" -ip netns exec $ns1 ./pm_nl_ctl flush -check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" +flush_endpoint +check "show_endpoints" "" "flush addrs" -ip netns exec $ns1 ./pm_nl_ctl limits 9 1 2>/dev/null +set_limits 9 1 2>/dev/null check "get_limits" "${default_limits}" "rcv addrs above hard limit" -ip netns exec $ns1 ./pm_nl_ctl limits 1 9 2>/dev/null +set_limits 1 9 2>/dev/null check "get_limits" "${default_limits}" "subflows above hard limit" -ip netns exec $ns1 ./pm_nl_ctl limits 8 8 +set_limits 8 8 check "get_limits" "$(format_limits 8 8)" "set limits" -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 id 100 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.5 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +flush_endpoint +add_endpoint 10.0.1.1 +add_endpoint 10.0.1.2 +add_endpoint 10.0.1.3 id 100 +add_endpoint 10.0.1.4 +add_endpoint 10.0.1.5 id 254 +add_endpoint 10.0.1.6 +add_endpoint 10.0.1.7 +add_endpoint 10.0.1.8 +check "show_endpoints" \ "$(format_endpoints "1,10.0.1.1" \ "2,10.0.1.2" \ "3,10.0.1.7" \ @@ -187,16 +217,16 @@ check "ip netns exec $ns1 ./pm_nl_ctl dump" \ "254,10.0.1.5" \ "255,10.0.1.6")" "set ids" -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.1 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.2 id 254 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.3 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.4 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.5 id 253 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.6 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.7 -ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.8 -check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +flush_endpoint +add_endpoint 10.0.0.1 +add_endpoint 10.0.0.2 id 254 +add_endpoint 10.0.0.3 +add_endpoint 10.0.0.4 +add_endpoint 10.0.0.5 id 253 +add_endpoint 10.0.0.6 +add_endpoint 10.0.0.7 +add_endpoint 10.0.0.8 +check "show_endpoints" \ "$(format_endpoints "1,10.0.0.1" \ "2,10.0.0.4" \ "3,10.0.0.6" \ @@ -206,28 +236,26 @@ check "ip netns exec $ns1 ./pm_nl_ctl dump" \ "254,10.0.0.2" \ "255,10.0.0.3")" "wrap-around ids" -ip netns exec $ns1 ./pm_nl_ctl flush -ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow +flush_endpoint +add_endpoint 10.0.1.1 flags subflow change_address 10.0.1.1 backup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow backup")" \ +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup")" \ "set flags (backup)" change_address 10.0.1.1 nobackup -check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow")" \ +check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ " (nobackup)" # fullmesh support has been added later -ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null -if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" || +change_endpoint 1 fullmesh 2>/dev/null +if show_endpoints | grep -q "fullmesh" || mptcp_lib_expect_all_features; then - check "ip netns exec $ns1 ./pm_nl_ctl dump" \ - "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \ + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \ " (fullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" "$(format_endpoints "1,10.0.1.1,subflow")" \ + change_endpoint 1 nofullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \ " (nofullmesh)" - ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh - check "ip netns exec $ns1 ./pm_nl_ctl dump" \ - "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \ + change_endpoint 1 backup,fullmesh + check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \ " (backup,fullmesh)" else for st in fullmesh nofullmesh backup,fullmesh; do diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 4e6d8fc56b38..3ebb2fb12c8a 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -85,8 +85,8 @@ setup() ip -net "$ns1" route add default via 10.0.2.2 metric 101 ip -net "$ns1" route add default via dead:beef:2::2 metric 101 - ip netns exec "$ns1" ./pm_nl_ctl limits 1 1 - ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow + mptcp_lib_pm_nl_set_limits "${ns1}" 1 1 + mptcp_lib_pm_nl_add_endpoint "${ns1}" 10.0.2.1 dev ns1eth2 flags subflow ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad @@ -108,7 +108,7 @@ setup() ip -net "$ns3" route add default via 10.0.3.2 ip -net "$ns3" route add default via dead:beef:3::2 - ip netns exec "$ns3" ./pm_nl_ctl limits 1 1 + mptcp_lib_pm_nl_set_limits "${ns3}" 1 1 # debug build can slow down measurably the test program # we use quite tight time limit on the run-time, to ensure -- cgit v1.2.3 From 0cef6fcac24d4829e6249b7dc844bb0e6dc56d41 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:14 +0200 Subject: selftests: mptcp: ip_mptcp option for more scripts This patch adds '-i' option for mptcp_sockopt.sh, pm_netlink.sh, and simult_flows.sh, to use 'ip mptcp' command in the tests instead of 'pm_nl_ctl'. Update usage() correspondingly. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 22 ++++++++++++++++++++++ tools/testing/selftests/net/mptcp/pm_netlink.sh | 9 +++++++-- tools/testing/selftests/net/mptcp/simult_flows.sh | 8 ++++++-- 3 files changed, 35 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 36e81383f0e2..68899a303a1a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -22,6 +22,28 @@ ns1="" ns2="" ns_sbox="" +usage() { + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" +} + +while getopts "hi" option;do + case "$option" in + "h") + usage "$0" + exit ${KSFT_PASS} + ;; + "i") + mptcp_lib_set_ip_mptcp + ;; + "?") + usage "$0" + exit ${KSFT_FAIL} + ;; + esac +done + add_mark_rules() { local ns=$1 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index b3adb39a7d3d..4859fa85d9a0 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -11,16 +11,21 @@ ret=0 usage() { - echo "Usage: $0 [ -h ]" + echo "Usage: $0 [ -i ] [ -h ]" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" + echo -e "\t-h: help" } -optstring=h +optstring=hi while getopts "$optstring" option;do case "$option" in "h") usage $0 exit ${KSFT_PASS} ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") usage $0 exit ${KSFT_FAIL} diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 3ebb2fb12c8a..4b14b4412166 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -27,10 +27,11 @@ capout="" size=0 usage() { - echo "Usage: $0 [ -b ] [ -c ] [ -d ]" + echo "Usage: $0 [ -b ] [ -c ] [ -d ] [ -i]" echo -e "\t-b: bail out after first error, otherwise runs al testcases" echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" echo -e "\t-d: debug this script" + echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" } # This function is used in the cleanup trap @@ -259,7 +260,7 @@ run_test() fi } -while getopts "bcdh" option;do +while getopts "bcdhi" option;do case "$option" in "h") usage $0 @@ -274,6 +275,9 @@ while getopts "bcdh" option;do "d") set -x ;; + "i") + mptcp_lib_set_ip_mptcp + ;; "?") usage $0 exit ${KSFT_FAIL} -- cgit v1.2.3 From 6eaeda12dc77dee5c98b1d04ddb3e4134aed9ba0 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:15 +0200 Subject: selftests: mptcp: netlink: drop disable=SC2086 Now there are only a few of variables are not using double quotes. Modifying them, then "shellcheck disable=SC2086" can be dropped. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 4859fa85d9a0..2757378b1b13 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -1,11 +1,6 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Double quotes to prevent globbing and word splitting is recommended in new -# code but we accept it, especially because there were too many before having -# address all other issues detected by shellcheck. -#shellcheck disable=SC2086 - . "$(dirname "${0}")/mptcp_lib.sh" ret=0 @@ -20,14 +15,14 @@ optstring=hi while getopts "$optstring" option;do case "$option" in "h") - usage $0 + usage "$0" exit ${KSFT_PASS} ;; "i") mptcp_lib_set_ip_mptcp ;; "?") - usage $0 + usage "$0" exit ${KSFT_FAIL} ;; esac @@ -40,7 +35,7 @@ err=$(mktemp) #shellcheck disable=SC2317 cleanup() { - rm -f $err + rm -f "${err}" mptcp_lib_ns_exit "${ns1}" } -- cgit v1.2.3 From 812c5945bdb8fbdc7420e3467d005bac04e6fcf6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2024 09:47:49 -0400 Subject: cgroup/cpuset: Add test_cpuset_v1_hp.sh Add a simple test to verify that an empty v1 cpuset will force its tasks to be moved to an ancestor node. It is based on the test case documented in commit 76bb5ab8f6e3 ("cpuset: break kernfs active protection in cpuset_write_resmask()"). Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/Makefile | 2 +- .../testing/selftests/cgroup/test_cpuset_v1_hp.sh | 46 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 00b441928909..16461dc0ffdf 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wall -pthread all: ${HELPER_PROGS} TEST_FILES := with_stress.sh -TEST_PROGS := test_stress.sh test_cpuset_prs.sh +TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh TEST_GEN_FILES := wait_inotify TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh new file mode 100755 index 000000000000..3f45512fb512 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -0,0 +1,46 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Test the special cpuset v1 hotplug case where a cpuset become empty of +# CPUs will force migration of tasks out to an ancestor. +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk -e '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" + +# +# Create a test cpuset, put a CPU and a task there and offline that CPU +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR +echo 1 > $CPUSET/$TDIR/cpuset.cpus +echo 0 > $CPUSET/$TDIR/cpuset.mems +sleep 10& +TASK=$! +echo $TASK > $CPUSET/$TDIR/tasks +NEWCS=$(cat /proc/$TASK/cpuset) +[[ $NEWCS != "/$TDIR" ]] && { + echo "Unexpected cpuset $NEWCS, test FAILED!" + exit 1 +} + +echo 0 > /sys/devices/system/cpu/cpu1/online +sleep 0.5 +echo 1 > /sys/devices/system/cpu/cpu1/online +NEWCS=$(cat /proc/$TASK/cpuset) +rmdir $CPUSET/$TDIR +[[ $NEWCS != "/" ]] && { + echo "cpuset $NEWCS, test FAILED!" + exit 1 +} +echo "Test PASSED" +exit 0 -- cgit v1.2.3 From 218c200f677d8af46a8540319e4d26c52b3277a5 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 1 Apr 2024 14:08:05 -0700 Subject: perf script: Consolidate capstone print functions Consolidate capstone print functions, to reduce duplication. Amend call sites to use a file pointer for output, which is consistent with most perf tools print functions. Add print_opts with an option to print also the hex value of a resolved symbol+offset. Signed-off-by: Adrian Hunter Link: https://lore.kernel.org/r/20240401210925.209671-4-ak@linux.intel.com Signed-off-by: Andi Kleen [ Added missing inttypes.h include to use PRIx64 in util/print_insn.c ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 43 +++++++++++------- tools/perf/util/print_insn.c | 104 +++++++++++++++---------------------------- tools/perf/util/print_insn.h | 7 ++- 3 files changed, 68 insertions(+), 86 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index dd10f158ed0c..647cb31a47c8 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1165,18 +1165,29 @@ out: return ret; } -static const char *any_dump_insn(struct perf_event_attr *attr __maybe_unused, - struct perf_insn *x, uint64_t ip, - u8 *inbuf, int inlen, int *lenp) +static int any_dump_insn(struct perf_event_attr *attr __maybe_unused, + struct perf_insn *x, uint64_t ip, + u8 *inbuf, int inlen, int *lenp, + FILE *fp) { #ifdef HAVE_LIBCAPSTONE_SUPPORT if (PRINT_FIELD(BRSTACKDISASM)) { - const char *p = cs_dump_insn(x, ip, inbuf, inlen, lenp); - if (p) - return p; + int printed = fprintf_insn_asm(x->machine, x->thread, x->cpumode, x->is64bit, + (uint8_t *)inbuf, inlen, ip, lenp, + PRINT_INSN_IMM_HEX, fp); + + if (printed > 0) + return printed; } #endif - return dump_insn(x, ip, inbuf, inlen, lenp); + return fprintf(fp, "%s", dump_insn(x, ip, inbuf, inlen, lenp)); +} + +static int add_padding(FILE *fp, int printed, int padding) +{ + if (printed >= 0 && printed < padding) + printed += fprintf(fp, "%*s", padding - printed, ""); + return printed; } static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, @@ -1186,8 +1197,10 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct thread *thread) { int ilen = 0; - int printed = fprintf(fp, "\t%016" PRIx64 "\t%-30s\t", ip, - any_dump_insn(attr, x, ip, inbuf, len, &ilen)); + int printed = fprintf(fp, "\t%016" PRIx64 "\t", ip); + + printed += add_padding(fp, any_dump_insn(attr, x, ip, inbuf, len, &ilen, fp), 30); + printed += fprintf(fp, "\t"); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "ilen: %d\t", ilen); @@ -1330,8 +1343,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, break; } else { ilen = 0; - printed += fprintf(fp, "\t%016" PRIx64 "\t%s", ip, - any_dump_insn(attr, &x, ip, buffer + off, len - off, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t", ip); + printed += any_dump_insn(attr, &x, ip, buffer + off, len - off, &ilen, fp); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1378,8 +1391,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, if (len <= 0) goto out; ilen = 0; - printed += fprintf(fp, "\t%016" PRIx64 "\t%s", sample->ip, - any_dump_insn(attr, &x, sample->ip, buffer, len, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t", sample->ip); + printed += any_dump_insn(attr, &x, sample->ip, buffer, len, &ilen, fp); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); @@ -1389,8 +1402,8 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, } for (off = 0; off <= end - start; off += ilen) { ilen = 0; - printed += fprintf(fp, "\t%016" PRIx64 "\t%s", start + off, - any_dump_insn(attr, &x, start + off, buffer + off, len - off, &ilen)); + printed += fprintf(fp, "\t%016" PRIx64 "\t", start + off); + printed += any_dump_insn(attr, &x, start + off, buffer + off, len - off, &ilen, fp); if (PRINT_FIELD(BRSTACKINSNLEN)) printed += fprintf(fp, "\tilen: %d", ilen); printed += fprintf(fp, "\n"); diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index 8825330d435f..aab11d8e1b1d 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -4,6 +4,7 @@ * * Author(s): Changbin Du */ +#include #include #include #include "debug.h" @@ -72,59 +73,8 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) return 0; } -static void dump_insn_x86(struct thread *thread, cs_insn *insn, struct perf_insn *x) -{ - struct addr_location al; - bool printed = false; - - if (insn->detail && insn->detail->x86.op_count == 1) { - cs_x86_op *op = &insn->detail->x86.operands[0]; - - addr_location__init(&al); - if (op->type == X86_OP_IMM && - thread__find_symbol(thread, x->cpumode, op->imm, &al) && - al.sym && - al.addr < al.sym->end) { - snprintf(x->out, sizeof(x->out), "%s %s+%#" PRIx64 " [%#" PRIx64 "]", insn[0].mnemonic, - al.sym->name, al.addr - al.sym->start, op->imm); - printed = true; - } - addr_location__exit(&al); - } - - if (!printed) - snprintf(x->out, sizeof(x->out), "%s %s", insn[0].mnemonic, insn[0].op_str); -} - -const char *cs_dump_insn(struct perf_insn *x, uint64_t ip, - u8 *inbuf, int inlen, int *lenp) -{ - int ret; - int count; - cs_insn *insn; - csh cs_handle; - - ret = capstone_init(x->machine, &cs_handle, x->is64bit); - if (ret < 0) - return NULL; - - count = cs_disasm(cs_handle, (uint8_t *)inbuf, inlen, ip, 1, &insn); - if (count > 0) { - if (machine__normalized_is(x->machine, "x86")) - dump_insn_x86(x->thread, &insn[0], x); - else - snprintf(x->out, sizeof(x->out), "%s %s", - insn[0].mnemonic, insn[0].op_str); - *lenp = insn->size; - cs_free(insn, count); - } else { - return NULL; - } - return x->out; -} - -static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread, - cs_insn *insn, FILE *fp) +static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn, + int print_opts, FILE *fp) { struct addr_location al; size_t printed = 0; @@ -134,9 +84,11 @@ static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread, addr_location__init(&al); if (op->type == X86_OP_IMM && - thread__find_symbol(thread, sample->cpumode, op->imm, &al)) { + thread__find_symbol(thread, cpumode, op->imm, &al)) { printed += fprintf(fp, "%s ", insn[0].mnemonic); printed += symbol__fprintf_symname_offs(al.sym, &al, fp); + if (print_opts & PRINT_INSN_IMM_HEX) + printed += fprintf(fp, " [%#" PRIx64 "]", op->imm); addr_location__exit(&al); return printed; } @@ -159,39 +111,53 @@ static bool is64bitip(struct machine *machine, struct addr_location *al) machine__normalized_is(machine, "s390"); } -size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, - struct machine *machine, FILE *fp, - struct addr_location *al) +ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpumode, + bool is64bit, const uint8_t *code, size_t code_size, + uint64_t ip, int *lenp, int print_opts, FILE *fp) { - csh cs_handle; + size_t printed; cs_insn *insn; + csh cs_handle; size_t count; - size_t printed = 0; int ret; - bool is64bit = is64bitip(machine, al); /* TODO: Try to initiate capstone only once but need a proper place. */ ret = capstone_init(machine, &cs_handle, is64bit); - if (ret < 0) { - /* fallback */ - return sample__fprintf_insn_raw(sample, fp); - } + if (ret < 0) + return ret; - count = cs_disasm(cs_handle, (uint8_t *)sample->insn, sample->insn_len, - sample->ip, 1, &insn); + count = cs_disasm(cs_handle, code, code_size, ip, 1, &insn); if (count > 0) { if (machine__normalized_is(machine, "x86")) - printed += print_insn_x86(sample, thread, &insn[0], fp); + printed = print_insn_x86(thread, cpumode, &insn[0], print_opts, fp); else - printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); + printed = fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); + if (lenp) + *lenp = insn->size; cs_free(insn, count); } else { - printed += fprintf(fp, "illegal instruction"); + printed = -1; } cs_close(&cs_handle); return printed; } + +size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, + struct machine *machine, FILE *fp, + struct addr_location *al) +{ + bool is64bit = is64bitip(machine, al); + ssize_t printed; + + printed = fprintf_insn_asm(machine, thread, sample->cpumode, is64bit, + (uint8_t *)sample->insn, sample->insn_len, + sample->ip, NULL, 0, fp); + if (printed < 0) + return sample__fprintf_insn_raw(sample, fp); + + return printed; +} #else size_t sample__fprintf_insn_asm(struct perf_sample *sample __maybe_unused, struct thread *thread __maybe_unused, diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index c2a6391a45ce..07d11af3fc1c 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -10,10 +10,13 @@ struct thread; struct machine; struct perf_insn; +#define PRINT_INSN_IMM_HEX (1<<0) + size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, struct machine *machine, FILE *fp, struct addr_location *al); size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp); -const char *cs_dump_insn(struct perf_insn *x, uint64_t ip, - u8 *inbuf, int inlen, int *lenp); +ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpumode, + bool is64bit, const uint8_t *code, size_t code_size, + uint64_t ip, int *lenp, int print_opts, FILE *fp); #endif /* PERF_PRINT_INSN_H */ -- cgit v1.2.3 From aaf494cf483a1a835c44e942861429b30a00cab0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:08 -0700 Subject: perf annotate: Fix annotation_calc_lines() to pass correct address to get_srcline() It should pass a proper address (i.e. suitable for objdump or addr2line) to get_srcline() in order to work correctly. It used to pass an address with map__rip_2objdump() as the second argument but later it's changed to use notes->start. It's ok in normal cases but it can be changed when annotate_opts.full_addr is set. So let's convert the address directly instead of using the notes->start. Also the last argument is an IP to print symbol offset if requested. So it should pass symbol-relative address. Fixes: 7d18a824b5e57ddd ("perf annotate: Toggle full address <-> offset display") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 35235147b111..a330e92c2552 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1440,7 +1440,7 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m annotation__update_column_widths(notes); } -static void annotation__calc_lines(struct annotation *notes, struct map *map, +static void annotation__calc_lines(struct annotation *notes, struct map_symbol *ms, struct rb_root *root) { struct annotation_line *al; @@ -1448,6 +1448,7 @@ static void annotation__calc_lines(struct annotation *notes, struct map *map, list_for_each_entry(al, ¬es->src->source, node) { double percent_max = 0.0; + u64 addr; int i; for (i = 0; i < al->data_nr; i++) { @@ -1463,8 +1464,9 @@ static void annotation__calc_lines(struct annotation *notes, struct map *map, if (percent_max <= 0.5) continue; - al->path = get_srcline(map__dso(map), notes->start + al->offset, NULL, - false, true, notes->start + al->offset); + addr = map__rip_2objdump(ms->map, ms->sym->start); + al->path = get_srcline(map__dso(ms->map), addr + al->offset, NULL, + false, true, ms->sym->start + al->offset); insert_source_line(&tmp_root, al); } @@ -1475,7 +1477,7 @@ static void symbol__calc_lines(struct map_symbol *ms, struct rb_root *root) { struct annotation *notes = symbol__annotation(ms->sym); - annotation__calc_lines(notes, ms->map, root); + annotation__calc_lines(notes, ms, root); } int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel) -- cgit v1.2.3 From bfd98ceb6267712ae298f10144ba0576cc03c70f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:09 -0700 Subject: perf annotate: Staticize some local functions I found annotation__mark_jump_targets(), annotation__set_offsets() and annotation__init_column_widths() are only used in the same file. Let's make them static. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 8 +++++--- tools/perf/util/annotate.h | 3 --- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index a330e92c2552..bbf4894b1309 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1316,7 +1316,8 @@ bool disasm_line__is_valid_local_jump(struct disasm_line *dl, struct symbol *sym return true; } -void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) +static void +annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) { u64 offset, size = symbol__size(sym); @@ -1347,7 +1348,7 @@ void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) } } -void annotation__set_offsets(struct annotation *notes, s64 size) +static void annotation__set_offsets(struct annotation *notes, s64 size) { struct annotation_line *al; struct annotated_source *src = notes->src; @@ -1404,7 +1405,8 @@ static int annotation__max_ins_name(struct annotation *notes) return max_name; } -void annotation__init_column_widths(struct annotation *notes, struct symbol *sym) +static void +annotation__init_column_widths(struct annotation *notes, struct symbol *sym) { notes->widths.addr = notes->widths.target = notes->widths.min_addr = hex_width(symbol__size(sym)); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index b3007c9966fd..3f383f38f65f 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -340,10 +340,7 @@ static inline bool annotation_line__filter(struct annotation_line *al) return annotate_opts.hide_src_code && al->offset == -1; } -void annotation__set_offsets(struct annotation *notes, s64 size); -void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym); void annotation__update_column_widths(struct annotation *notes); -void annotation__init_column_widths(struct annotation *notes, struct symbol *sym); void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms); static inline struct sym_hist *annotated_source__histogram(struct annotated_source *src, int idx) -- cgit v1.2.3 From 6f157d9af1e42aafa469deb7c16203e39a2f9545 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:10 -0700 Subject: perf annotate: Introduce annotated_source__get_line() It's a helper function to get annotation_line at the given offset without using the offsets array. The goal is to get rid of the offsets array altogether. It just does the linear search but I think it's better to save memory as it won't be called in a hot path. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate.c | 26 +++++++++++++++++++++----- tools/perf/util/annotate.h | 3 +++ 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ec5e21932876..e72583f37972 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -186,7 +186,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) * name right after the '<' token and probably treating this like a * 'call' instruction. */ - target = notes->src->offsets[cursor->ops.target.offset]; + target = annotated_source__get_line(notes->src, cursor->ops.target.offset); if (target == NULL) { ui_helpline__printf("WARN: jump target inconsistency, press 'o', notes->offsets[%#x] = NULL\n", cursor->ops.target.offset); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index bbf4894b1309..2409d7424c71 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -369,13 +369,25 @@ int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, return err; } +struct annotation_line *annotated_source__get_line(struct annotated_source *src, + s64 offset) +{ + struct annotation_line *al; + + list_for_each_entry(al, &src->source, node) { + if (al->offset == offset) + return al; + } + return NULL; +} + static unsigned annotation__count_insn(struct annotation *notes, u64 start, u64 end) { unsigned n_insn = 0; u64 offset; for (offset = start; offset <= end; offset++) { - if (notes->src->offsets[offset]) + if (annotated_source__get_line(notes->src, offset)) n_insn++; } return n_insn; @@ -405,8 +417,9 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 return; for (offset = start; offset <= end; offset++) { - struct annotation_line *al = notes->src->offsets[offset]; + struct annotation_line *al; + al = annotated_source__get_line(notes->src, offset); if (al && al->cycles && al->cycles->ipc == 0.0) { al->cycles->ipc = ipc; cover_insn++; @@ -443,7 +456,7 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) if (ch && ch->cycles) { struct annotation_line *al; - al = notes->src->offsets[offset]; + al = annotated_source__get_line(notes->src, offset); if (al && al->cycles == NULL) { al->cycles = zalloc(sizeof(*al->cycles)); if (al->cycles == NULL) { @@ -466,7 +479,9 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) struct cyc_hist *ch = ¬es->branch->cycles_hist[offset]; if (ch && ch->cycles) { - struct annotation_line *al = notes->src->offsets[offset]; + struct annotation_line *al; + + al = annotated_source__get_line(notes->src, offset); if (al) zfree(&al->cycles); } @@ -1326,9 +1341,10 @@ annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) return; for (offset = 0; offset < size; ++offset) { - struct annotation_line *al = notes->src->offsets[offset]; + struct annotation_line *al; struct disasm_line *dl; + al = annotated_source__get_line(notes->src, offset); dl = disasm_line(al); if (!disasm_line__is_valid_local_jump(dl, sym)) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 3f383f38f65f..aa3298c20300 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -270,6 +270,9 @@ struct annotated_source { u16 max_line_len; }; +struct annotation_line *annotated_source__get_line(struct annotated_source *src, + s64 offset); + /** * struct annotated_branch - basic block and IPC information for a symbol. * -- cgit v1.2.3 From 0c053ed27303660140ee5e9a82c06f923d4f9c73 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:11 -0700 Subject: perf annotate: Check annotation lines more efficiently In some places, it checks annotated (disasm) lines for each byte. But as it already has a list of disasm lines, it'd be better to traverse the list entries instead of checking every offset with linear search (by annotated_source__get_line() helper). Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 56 +++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 2409d7424c71..d98fc248ba5b 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -383,12 +383,19 @@ struct annotation_line *annotated_source__get_line(struct annotated_source *src, static unsigned annotation__count_insn(struct annotation *notes, u64 start, u64 end) { + struct annotation_line *al; unsigned n_insn = 0; - u64 offset; - for (offset = start; offset <= end; offset++) { - if (annotated_source__get_line(notes->src, offset)) - n_insn++; + al = annotated_source__get_line(notes->src, start); + if (al == NULL) + return 0; + + list_for_each_entry_from(al, ¬es->src->source, node) { + if (al->offset == -1) + continue; + if ((u64)al->offset > end) + break; + n_insn++; } return n_insn; } @@ -405,10 +412,10 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 { unsigned n_insn; unsigned int cover_insn = 0; - u64 offset; n_insn = annotation__count_insn(notes, start, end); if (n_insn && ch->num && ch->cycles) { + struct annotation_line *al; struct annotated_branch *branch; float ipc = n_insn / ((double)ch->cycles / (double)ch->num); @@ -416,11 +423,16 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 if (ch->reset >= 0x7fff) return; - for (offset = start; offset <= end; offset++) { - struct annotation_line *al; + al = annotated_source__get_line(notes->src, start); + if (al == NULL) + return; - al = annotated_source__get_line(notes->src, offset); - if (al && al->cycles && al->cycles->ipc == 0.0) { + list_for_each_entry_from(al, ¬es->src->source, node) { + if (al->offset == -1) + continue; + if ((u64)al->offset > end) + break; + if (al->cycles && al->cycles->ipc == 0.0) { al->cycles->ipc = ipc; cover_insn++; } @@ -1268,13 +1280,16 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx) { struct annotation *notes = symbol__annotation(sym); struct sym_hist *h = annotation__histogram(notes, evidx); - int len = symbol__size(sym), offset; + struct annotation_line *al; h->nr_samples = 0; - for (offset = 0; offset < len; ++offset) { + list_for_each_entry(al, ¬es->src->source, node) { struct sym_hist_entry *entry; - entry = annotated_source__hist_entry(notes->src, evidx, offset); + if (al->offset == -1) + continue; + + entry = annotated_source__hist_entry(notes->src, evidx, al->offset); if (entry == NULL) continue; @@ -1334,33 +1349,32 @@ bool disasm_line__is_valid_local_jump(struct disasm_line *dl, struct symbol *sym static void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) { - u64 offset, size = symbol__size(sym); + struct annotation_line *al; /* PLT symbols contain external offsets */ if (strstr(sym->name, "@plt")) return; - for (offset = 0; offset < size; ++offset) { - struct annotation_line *al; + list_for_each_entry(al, ¬es->src->source, node) { struct disasm_line *dl; + struct annotation_line *target; - al = annotated_source__get_line(notes->src, offset); dl = disasm_line(al); if (!disasm_line__is_valid_local_jump(dl, sym)) continue; - al = notes->src->offsets[dl->ops.target.offset]; - + target = annotated_source__get_line(notes->src, + dl->ops.target.offset); /* * FIXME: Oops, no jump target? Buggy disassembler? Or do we * have to adjust to the previous offset? */ - if (al == NULL) + if (target == NULL) continue; - if (++al->jump_sources > notes->max_jump_sources) - notes->max_jump_sources = al->jump_sources; + if (++target->jump_sources > notes->max_jump_sources) + notes->max_jump_sources = target->jump_sources; } } -- cgit v1.2.3 From cee9b86043d3690bc9154dabe13e7d53ca44ef9b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:12 -0700 Subject: perf annotate: Get rid of offsets array The struct annotated_source.offsets[] is to save pointers to annotation_line at each offset. We can use annotated_source__get_line() helper instead so let's get rid of the array. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 5 +---- tools/perf/util/annotate.c | 29 ++++++----------------------- tools/perf/util/annotate.h | 2 -- 3 files changed, 7 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index e72583f37972..c93da2ce727f 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -977,7 +977,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, dso->annotate_warned = true; symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s:\n%s", sym->name, msg); - goto out_free_offsets; + return -1; } } @@ -996,8 +996,5 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, if(not_annotated) annotated_source__purge(notes->src); -out_free_offsets: - if(not_annotated) - zfree(¬es->src->offsets); return ret; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index d98fc248ba5b..0e8319835986 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1378,7 +1378,7 @@ annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) } } -static void annotation__set_offsets(struct annotation *notes, s64 size) +static void annotation__set_index(struct annotation *notes) { struct annotation_line *al; struct annotated_source *src = notes->src; @@ -1393,18 +1393,9 @@ static void annotation__set_offsets(struct annotation *notes, s64 size) if (src->max_line_len < line_len) src->max_line_len = line_len; al->idx = src->nr_entries++; - if (al->offset != -1) { + if (al->offset != -1) al->idx_asm = src->nr_asm_entries++; - /* - * FIXME: short term bandaid to cope with assembly - * routines that comes with labels in the same column - * as the address in objdump, sigh. - * - * E.g. copy_user_generic_unrolled - */ - if (al->offset < size) - notes->src->offsets[al->offset] = al; - } else + else al->idx_asm = -1; } } @@ -1835,25 +1826,21 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, size_t size = symbol__size(sym); int nr_pcnt = 1, err; - notes->src->offsets = zalloc(size * sizeof(struct annotation_line *)); - if (notes->src->offsets == NULL) - return ENOMEM; - if (evsel__is_group_event(evsel)) nr_pcnt = evsel->core.nr_members; err = symbol__annotate(ms, evsel, parch); if (err) - goto out_free_offsets; + return err; symbol__calc_percent(sym, evsel); - annotation__set_offsets(notes, size); + annotation__set_index(notes); annotation__mark_jump_targets(notes, sym); err = annotation__compute_ipc(notes, size); if (err) - goto out_free_offsets; + return err; annotation__init_column_widths(notes, sym); notes->nr_events = nr_pcnt; @@ -1862,10 +1849,6 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, sym->annotate2 = 1; return 0; - -out_free_offsets: - zfree(¬es->src->offsets); - return err; } static int annotation__config(const char *var, const char *value, void *data) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index aa3298c20300..d61184499bda 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -246,7 +246,6 @@ struct cyc_hist { * we have more than a group in a evlist, where we will want * to see each group separately, that is why symbol__annotate2() * sets src->nr_histograms to evsel->nr_members. - * @offsets: Array of annotation_line to be accessed by offset. * @samples: Hash map of sym_hist_entry. Keyed by event index and offset in symbol. * @nr_entries: Number of annotated_line in the source list. * @nr_asm_entries: Number of annotated_line with actual asm instruction in the @@ -262,7 +261,6 @@ struct cyc_hist { struct annotated_source { struct list_head source; struct sym_hist *histograms; - struct annotation_line **offsets; struct hashmap *samples; int nr_histograms; int nr_entries; -- cgit v1.2.3 From a46acc45673bc5537b0d9fc77b7a4edb5d068de6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:13 -0700 Subject: perf annotate: Move 'widths' struct to 'struct annotated_source' It's only used in 'perf annotate' output which means functions with actual samples. No need to consume memory for every symbol ('struct annotation'). Also move the 'max_line_len' field into it as it's related. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 6 +++--- tools/perf/util/annotate.c | 41 +++++++++++++++++++++------------------ tools/perf/util/annotate.h | 20 +++++++++---------- 3 files changed, 35 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index c93da2ce727f..032642a0b4b6 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -205,13 +205,13 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS); __ui_browser__line_arrow(browser, - pcnt_width + 2 + notes->widths.addr + width, + pcnt_width + 2 + notes->src->widths.addr + width, from, to); diff = is_fused(ab, cursor); if (diff > 0) { ui_browser__mark_fused(browser, - pcnt_width + 3 + notes->widths.addr + width, + pcnt_width + 3 + notes->src->widths.addr + width, from - diff, diff, to > from); } } @@ -983,7 +983,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, ui_helpline__push("Press ESC to exit"); - browser.b.width = notes->src->max_line_len; + browser.b.width = notes->src->widths.max_line_len; browser.b.nr_entries = notes->src->nr_entries; browser.b.entries = ¬es->src->source, browser.b.width += 18; /* Percentage */ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0e8319835986..0be744bb529c 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1383,15 +1383,15 @@ static void annotation__set_index(struct annotation *notes) struct annotation_line *al; struct annotated_source *src = notes->src; - src->max_line_len = 0; + src->widths.max_line_len = 0; src->nr_entries = 0; src->nr_asm_entries = 0; list_for_each_entry(al, &src->source, node) { size_t line_len = strlen(al->line); - if (src->max_line_len < line_len) - src->max_line_len = line_len; + if (src->widths.max_line_len < line_len) + src->widths.max_line_len = line_len; al->idx = src->nr_entries++; if (al->offset != -1) al->idx_asm = src->nr_asm_entries++; @@ -1429,26 +1429,26 @@ static int annotation__max_ins_name(struct annotation *notes) static void annotation__init_column_widths(struct annotation *notes, struct symbol *sym) { - notes->widths.addr = notes->widths.target = - notes->widths.min_addr = hex_width(symbol__size(sym)); - notes->widths.max_addr = hex_width(sym->end); - notes->widths.jumps = width_jumps(notes->max_jump_sources); - notes->widths.max_ins_name = annotation__max_ins_name(notes); + notes->src->widths.addr = notes->src->widths.target = + notes->src->widths.min_addr = hex_width(symbol__size(sym)); + notes->src->widths.max_addr = hex_width(sym->end); + notes->src->widths.jumps = width_jumps(notes->max_jump_sources); + notes->src->widths.max_ins_name = annotation__max_ins_name(notes); } void annotation__update_column_widths(struct annotation *notes) { if (annotate_opts.use_offset) - notes->widths.target = notes->widths.min_addr; + notes->src->widths.target = notes->src->widths.min_addr; else if (annotate_opts.full_addr) - notes->widths.target = BITS_PER_LONG / 4; + notes->src->widths.target = BITS_PER_LONG / 4; else - notes->widths.target = notes->widths.max_addr; + notes->src->widths.target = notes->src->widths.max_addr; - notes->widths.addr = notes->widths.target; + notes->src->widths.addr = notes->src->widths.target; if (annotate_opts.show_nr_jumps) - notes->widths.addr += notes->widths.jumps + 1; + notes->src->widths.addr += notes->src->widths.jumps + 1; } void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms) @@ -1625,7 +1625,8 @@ call_like: obj__printf(obj, " "); } - disasm_line__scnprintf(dl, bf, size, !annotate_opts.use_offset, notes->widths.max_ins_name); + disasm_line__scnprintf(dl, bf, size, !annotate_opts.use_offset, + notes->src->widths.max_ins_name); } static void ipc_coverage_string(char *bf, int size, struct annotation *notes) @@ -1753,9 +1754,11 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati obj__printf(obj, "%-*s", width - pcnt_width - cycles_width, " "); else if (al->offset == -1) { if (al->line_nr && annotate_opts.show_linenr) - printed = scnprintf(bf, sizeof(bf), "%-*d ", notes->widths.addr + 1, al->line_nr); + printed = scnprintf(bf, sizeof(bf), "%-*d ", + notes->src->widths.addr + 1, al->line_nr); else - printed = scnprintf(bf, sizeof(bf), "%-*s ", notes->widths.addr, " "); + printed = scnprintf(bf, sizeof(bf), "%-*s ", + notes->src->widths.addr, " "); obj__printf(obj, bf); obj__printf(obj, "%-*s", width - printed - pcnt_width - cycles_width + 1, al->line); } else { @@ -1773,7 +1776,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (annotate_opts.show_nr_jumps) { int prev; printed = scnprintf(bf, sizeof(bf), "%*d ", - notes->widths.jumps, + notes->src->widths.jumps, al->jump_sources); prev = obj__set_jumps_percent_color(obj, al->jump_sources, current_entry); @@ -1782,7 +1785,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati } print_addr: printed = scnprintf(bf, sizeof(bf), "%*" PRIx64 ": ", - notes->widths.target, addr); + notes->src->widths.target, addr); } else if (ins__is_call(&disasm_line(al)->ins) && annotate_opts.offset_level >= ANNOTATION__OFFSET_CALL) { goto print_addr; @@ -1790,7 +1793,7 @@ print_addr: goto print_addr; } else { printed = scnprintf(bf, sizeof(bf), "%-*s ", - notes->widths.addr, " "); + notes->src->widths.addr, " "); } } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d61184499bda..402ae774426b 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -250,7 +250,7 @@ struct cyc_hist { * @nr_entries: Number of annotated_line in the source list. * @nr_asm_entries: Number of annotated_line with actual asm instruction in the * source list. - * @max_line_len: Maximum length of objdump output in an annotated_line. + * @widths: Precalculated width of each column in the TUI output. * * disasm_lines are allocated, percentages calculated and all sorted by percentage * when the annotation is about to be presented, so the percentages are for @@ -265,7 +265,15 @@ struct annotated_source { int nr_histograms; int nr_entries; int nr_asm_entries; - u16 max_line_len; + struct { + u8 addr; + u8 jumps; + u8 target; + u8 min_addr; + u8 max_addr; + u8 max_ins_name; + u16 max_line_len; + } widths; }; struct annotation_line *annotated_source__get_line(struct annotated_source *src, @@ -302,14 +310,6 @@ struct LOCKABLE annotation { u64 start; int nr_events; int max_jump_sources; - struct { - u8 addr; - u8 jumps; - u8 target; - u8 min_addr; - u8 max_addr; - u8 max_ins_name; - } widths; struct annotated_source *src; struct annotated_branch *branch; }; -- cgit v1.2.3 From f6b18ababa5ea8d7f798aa452eae83058fd09c59 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:14 -0700 Subject: perf annotate: Move 'max_jump_sources' struct to 'struct annotated_source' It's only used in 'perf annotate' output which means functions with actual samples. No need to consume memory for every symbol ('struct annotation'). Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate.c | 6 +++--- tools/perf/util/annotate.h | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 032642a0b4b6..0e16c268e329 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -49,7 +49,7 @@ static int ui_browser__jumps_percent_color(struct ui_browser *browser, int nr, b if (current && (!browser->use_navkeypressed || browser->navkeypressed)) return HE_COLORSET_SELECTED; - if (nr == notes->max_jump_sources) + if (nr == notes->src->max_jump_sources) return HE_COLORSET_TOP; if (nr > 1) return HE_COLORSET_MEDIUM; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0be744bb529c..1fd51856d78f 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1373,8 +1373,8 @@ annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) if (target == NULL) continue; - if (++target->jump_sources > notes->max_jump_sources) - notes->max_jump_sources = target->jump_sources; + if (++target->jump_sources > notes->src->max_jump_sources) + notes->src->max_jump_sources = target->jump_sources; } } @@ -1432,7 +1432,7 @@ annotation__init_column_widths(struct annotation *notes, struct symbol *sym) notes->src->widths.addr = notes->src->widths.target = notes->src->widths.min_addr = hex_width(symbol__size(sym)); notes->src->widths.max_addr = hex_width(sym->end); - notes->src->widths.jumps = width_jumps(notes->max_jump_sources); + notes->src->widths.jumps = width_jumps(notes->src->max_jump_sources); notes->src->widths.max_ins_name = annotation__max_ins_name(notes); } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 402ae774426b..382705311d28 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -250,6 +250,8 @@ struct cyc_hist { * @nr_entries: Number of annotated_line in the source list. * @nr_asm_entries: Number of annotated_line with actual asm instruction in the * source list. + * @max_jump_sources: Maximum number of jump instructions targeting to the same + * instruction. * @widths: Precalculated width of each column in the TUI output. * * disasm_lines are allocated, percentages calculated and all sorted by percentage @@ -265,6 +267,7 @@ struct annotated_source { int nr_histograms; int nr_entries; int nr_asm_entries; + int max_jump_sources; struct { u8 addr; u8 jumps; @@ -309,7 +312,6 @@ struct annotated_branch { struct LOCKABLE annotation { u64 start; int nr_events; - int max_jump_sources; struct annotated_source *src; struct annotated_branch *branch; }; -- cgit v1.2.3 From 6f94a72d45295741b8be64da40a86780c7681a3b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:15 -0700 Subject: perf annotate: Move nr_events struct to 'struct annotated_source' It's only used in 'perf annotate' output which means functions with actual samples. No need to consume memory for every symbol ('struct annotation'). Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240404175716.1225482-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 6 +++--- tools/perf/util/annotate.h | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1fd51856d78f..5f79ae0bccfd 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1584,7 +1584,7 @@ static double annotation_line__max_percent(struct annotation_line *al, double percent_max = 0.0; int i; - for (i = 0; i < notes->nr_events; i++) { + for (i = 0; i < notes->src->nr_events; i++) { double percent; percent = annotation_data__percent(&al->data[i], @@ -1674,7 +1674,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (al->offset != -1 && percent_max != 0.0) { int i; - for (i = 0; i < notes->nr_events; i++) { + for (i = 0; i < notes->src->nr_events; i++) { double percent; percent = annotation_data__percent(&al->data[i], percent_type); @@ -1846,7 +1846,7 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, return err; annotation__init_column_widths(notes, sym); - notes->nr_events = nr_pcnt; + notes->src->nr_events = nr_pcnt; annotation__update_column_widths(notes); sym->annotate2 = 1; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 382705311d28..d22b9e9a2fad 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -247,6 +247,7 @@ struct cyc_hist { * to see each group separately, that is why symbol__annotate2() * sets src->nr_histograms to evsel->nr_members. * @samples: Hash map of sym_hist_entry. Keyed by event index and offset in symbol. + * @nr_events: Number of events in the current output. * @nr_entries: Number of annotated_line in the source list. * @nr_asm_entries: Number of annotated_line with actual asm instruction in the * source list. @@ -265,6 +266,7 @@ struct annotated_source { struct sym_hist *histograms; struct hashmap *samples; int nr_histograms; + int nr_events; int nr_entries; int nr_asm_entries; int max_jump_sources; @@ -311,7 +313,6 @@ struct annotated_branch { struct LOCKABLE annotation { u64 start; - int nr_events; struct annotated_source *src; struct annotated_branch *branch; }; @@ -335,7 +336,7 @@ static inline int annotation__cycles_width(struct annotation *notes) static inline int annotation__pcnt_width(struct annotation *notes) { - return (symbol_conf.show_total_period ? 12 : 7) * notes->nr_events; + return (symbol_conf.show_total_period ? 12 : 7) * notes->src->nr_events; } static inline bool annotation_line__filter(struct annotation_line *al) -- cgit v1.2.3 From 8c004c7a608a278548e4a7f25df6bae0b0a04370 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 4 Apr 2024 10:57:16 -0700 Subject: perf annotate: Move 'start' field struct to 'struct annotated_source' It's only used in 'perf annotate' output which means functions with actual samples. No need to consume memory for every symbol ('struct annotation'). Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240404175716.1225482-10-namhyung@kernel.org Cc: Ian Rogers Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Ingo Molnar Cc: Kan Liang Cc: LKML Cc: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 10 +++++----- tools/perf/util/annotate.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 5f79ae0bccfd..4db49611c386 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -909,9 +909,9 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, args.arch = arch; args.ms = *ms; if (annotate_opts.full_addr) - notes->start = map__objdump_2mem(ms->map, ms->sym->start); + notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); else - notes->start = map__rip_2objdump(ms->map, ms->sym->start); + notes->src->start = map__rip_2objdump(ms->map, ms->sym->start); return symbol__disassemble(sym, &args); } @@ -1456,9 +1456,9 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m annotate_opts.full_addr = !annotate_opts.full_addr; if (annotate_opts.full_addr) - notes->start = map__objdump_2mem(ms->map, ms->sym->start); + notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); else - notes->start = map__rip_2objdump(ms->map, ms->sym->start); + notes->src->start = map__rip_2objdump(ms->map, ms->sym->start); annotation__update_column_widths(notes); } @@ -1766,7 +1766,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati int color = -1; if (!annotate_opts.use_offset) - addr += notes->start; + addr += notes->src->start; if (!annotate_opts.use_offset) { printed = scnprintf(bf, sizeof(bf), "%" PRIx64 ": ", addr); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d22b9e9a2fad..d5c821c22f79 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -270,6 +270,7 @@ struct annotated_source { int nr_entries; int nr_asm_entries; int max_jump_sources; + u64 start; struct { u8 addr; u8 jumps; @@ -312,7 +313,6 @@ struct annotated_branch { }; struct LOCKABLE annotation { - u64 start; struct annotated_source *src; struct annotated_branch *branch; }; -- cgit v1.2.3 From 705c09bb3cdffb141986598ad4ff9c9b0a66c3bd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 5 Apr 2024 00:09:30 -0700 Subject: tools subcmd: Add check_if_command_finished() Add non-blocking function to check if a 'struct child_process' has completed. If the process has completed the exit code is stored in the 'struct child_process' so that finish_command() returns it. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240405070931.1231245-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/run-command.c | 70 +++++++++++++++++++++++++++--------------- tools/lib/subcmd/run-command.h | 3 ++ 2 files changed, 49 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/lib/subcmd/run-command.c b/tools/lib/subcmd/run-command.c index d435eb42354b..4e3a557a2f37 100644 --- a/tools/lib/subcmd/run-command.c +++ b/tools/lib/subcmd/run-command.c @@ -165,43 +165,65 @@ int start_command(struct child_process *cmd) return 0; } -static int wait_or_whine(pid_t pid) +static int wait_or_whine(struct child_process *cmd, bool block) { - char sbuf[STRERR_BUFSIZE]; + bool finished = cmd->finished; + int result = cmd->finish_result; - for (;;) { + while (!finished) { int status, code; - pid_t waiting = waitpid(pid, &status, 0); + pid_t waiting = waitpid(cmd->pid, &status, block ? 0 : WNOHANG); + + if (!block && waiting == 0) + break; + + if (waiting < 0 && errno == EINTR) + continue; + finished = true; if (waiting < 0) { - if (errno == EINTR) - continue; + char sbuf[STRERR_BUFSIZE]; + fprintf(stderr, " Error: waitpid failed (%s)", str_error_r(errno, sbuf, sizeof(sbuf))); - return -ERR_RUN_COMMAND_WAITPID; - } - if (waiting != pid) - return -ERR_RUN_COMMAND_WAITPID_WRONG_PID; - if (WIFSIGNALED(status)) - return -ERR_RUN_COMMAND_WAITPID_SIGNAL; - - if (!WIFEXITED(status)) - return -ERR_RUN_COMMAND_WAITPID_NOEXIT; - code = WEXITSTATUS(status); - switch (code) { - case 127: - return -ERR_RUN_COMMAND_EXEC; - case 0: - return 0; - default: - return -code; + result = -ERR_RUN_COMMAND_WAITPID; + } else if (waiting != cmd->pid) { + result = -ERR_RUN_COMMAND_WAITPID_WRONG_PID; + } else if (WIFSIGNALED(status)) { + result = -ERR_RUN_COMMAND_WAITPID_SIGNAL; + } else if (!WIFEXITED(status)) { + result = -ERR_RUN_COMMAND_WAITPID_NOEXIT; + } else { + code = WEXITSTATUS(status); + switch (code) { + case 127: + result = -ERR_RUN_COMMAND_EXEC; + break; + case 0: + result = 0; + break; + default: + result = -code; + break; + } } } + if (finished) { + cmd->finished = 1; + cmd->finish_result = result; + } + return result; +} + +int check_if_command_finished(struct child_process *cmd) +{ + wait_or_whine(cmd, /*block=*/false); + return cmd->finished; } int finish_command(struct child_process *cmd) { - return wait_or_whine(cmd->pid); + return wait_or_whine(cmd, /*block=*/true); } int run_command(struct child_process *cmd) diff --git a/tools/lib/subcmd/run-command.h b/tools/lib/subcmd/run-command.h index d794138a797f..b2d39de6e690 100644 --- a/tools/lib/subcmd/run-command.h +++ b/tools/lib/subcmd/run-command.h @@ -41,17 +41,20 @@ struct child_process { int err; const char *dir; const char *const *env; + int finish_result; unsigned no_stdin:1; unsigned no_stdout:1; unsigned no_stderr:1; unsigned exec_cmd:1; /* if this is to be external sub-command */ unsigned stdout_to_stderr:1; + unsigned finished:1; void (*preexec_cb)(void); /* If set, call function in child rather than doing an exec. */ int (*no_exec_cmd)(struct child_process *process); }; int start_command(struct child_process *); +int check_if_command_finished(struct child_process *); int finish_command(struct child_process *); int run_command(struct child_process *); -- cgit v1.2.3 From bb761fcb821738e8d3b720e1460d3783db74c68a Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 6 Apr 2024 22:46:13 +0800 Subject: selftests/bpf: eliminate warning of get_cgroup_id_from_path() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The output goes like this if I make samples/bpf: ...warning: no previous prototype for ‘get_cgroup_id_from_path’... Make this function static could solve the warning problem since no one outside of the file calls it. Signed-off-by: Jason Xing Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240406144613.4434-1-kerneljasonxing@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/cgroup_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 19be9c63d5e8..f2952a65dcc2 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -429,7 +429,7 @@ int create_and_get_cgroup(const char *relative_path) * which is an invalid cgroup id. * If there is a failure, it prints the error to stderr. */ -unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) +static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) { int dirfd, err, flags, mount_id, fhsize; union { -- cgit v1.2.3 From 1a140b46da8f3be8e55fbd0950653c9a235b6ce9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 17 Feb 2024 17:57:35 -0800 Subject: rcutorture: Enable RCU priority boosting for TREE09 The TREE09 rcutorture scenario exhausts memory from time to time, and this is due to a reader being preempted and blocking grace periods, thus preventing recycling of the memory used in callback-flooding tests. This commit therefore enables RCU priority boosting and sets the boosting delay to 100 milliseconds after grace-period start. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/configs/rcu/TREE09 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index fc45645bb5f4..9ecd1b4e653d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -10,8 +10,9 @@ CONFIG_NO_HZ_FULL=n CONFIG_RCU_TRACE=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_BOOST=n +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=100 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -#CHECK#CONFIG_RCU_EXPERT=n +CONFIG_RCU_EXPERT=y CONFIG_KPROBES=n CONFIG_FTRACE=n -- cgit v1.2.3 From d0a2ba197bcbeaa795c5c961d927bcaf55964669 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Apr 2024 16:11:34 -0700 Subject: selftests/bpf: Add tests for atomics in bpf_arena. Add selftests for atomic instructions in bpf_arena. Signed-off-by: Alexei Starovoitov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240405231134.17274-2-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../selftests/bpf/prog_tests/arena_atomics.c | 186 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/arena_atomics.c | 178 ++++++++++++++++++++ 4 files changed, 366 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_atomics.c create mode 100644 tools/testing/selftests/bpf/progs/arena_atomics.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 0445ac38bc07..cf657fc35619 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,3 +10,4 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) +arena_atomics diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index f4a2f66a683d..c34adf39eeb2 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -6,3 +6,4 @@ stacktrace_build_id # compare_map_keys stackid_hmap vs. sta verifier_iterating_callbacks verifier_arena # JIT does not support arena arena_htab # JIT does not support arena +arena_atomics diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c new file mode 100644 index 000000000000..0807a48a58ee --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include "arena_atomics.skel.h" + +static void test_add(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->add64_value, 3, "add64_value"); + ASSERT_EQ(skel->arena->add64_result, 1, "add64_result"); + + ASSERT_EQ(skel->arena->add32_value, 3, "add32_value"); + ASSERT_EQ(skel->arena->add32_result, 1, "add32_result"); + + ASSERT_EQ(skel->arena->add_stack_value_copy, 3, "add_stack_value"); + ASSERT_EQ(skel->arena->add_stack_result, 1, "add_stack_result"); + + ASSERT_EQ(skel->arena->add_noreturn_value, 3, "add_noreturn_value"); +} + +static void test_sub(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.sub); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->sub64_value, -1, "sub64_value"); + ASSERT_EQ(skel->arena->sub64_result, 1, "sub64_result"); + + ASSERT_EQ(skel->arena->sub32_value, -1, "sub32_value"); + ASSERT_EQ(skel->arena->sub32_result, 1, "sub32_result"); + + ASSERT_EQ(skel->arena->sub_stack_value_copy, -1, "sub_stack_value"); + ASSERT_EQ(skel->arena->sub_stack_result, 1, "sub_stack_result"); + + ASSERT_EQ(skel->arena->sub_noreturn_value, -1, "sub_noreturn_value"); +} + +static void test_and(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.and); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->and64_value, 0x010ull << 32, "and64_value"); + ASSERT_EQ(skel->arena->and32_value, 0x010, "and32_value"); +} + +static void test_or(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.or); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->or64_value, 0x111ull << 32, "or64_value"); + ASSERT_EQ(skel->arena->or32_value, 0x111, "or32_value"); +} + +static void test_xor(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xor); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xor64_value, 0x101ull << 32, "xor64_value"); + ASSERT_EQ(skel->arena->xor32_value, 0x101, "xor32_value"); +} + +static void test_cmpxchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.cmpxchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->cmpxchg64_value, 2, "cmpxchg64_value"); + ASSERT_EQ(skel->arena->cmpxchg64_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg64_result_succeed, 1, "cmpxchg_result_succeed"); + + ASSERT_EQ(skel->arena->cmpxchg32_value, 2, "lcmpxchg32_value"); + ASSERT_EQ(skel->arena->cmpxchg32_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed"); +} + +static void test_xchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xchg64_value, 2, "xchg64_value"); + ASSERT_EQ(skel->arena->xchg64_result, 1, "xchg64_result"); + + ASSERT_EQ(skel->arena->xchg32_value, 2, "xchg32_value"); + ASSERT_EQ(skel->arena->xchg32_result, 1, "xchg32_result"); +} + +void test_arena_atomics(void) +{ + struct arena_atomics *skel; + int err; + + skel = arena_atomics__open(); + if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open")) + return; + + if (skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang", + __func__); + test__skip(); + goto cleanup; + } + err = arena_atomics__load(skel); + if (!ASSERT_OK(err, "arena atomics skeleton load")) + return; + skel->bss->pid = getpid(); + + if (test__start_subtest("add")) + test_add(skel); + if (test__start_subtest("sub")) + test_sub(skel); + if (test__start_subtest("and")) + test_and(skel); + if (test__start_subtest("or")) + test_or(skel); + if (test__start_subtest("xor")) + test_xor(skel); + if (test__start_subtest("cmpxchg")) + test_cmpxchg(skel); + if (test__start_subtest("xchg")) + test_xchg(skel); + +cleanup: + arena_atomics__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c new file mode 100644 index 000000000000..55f10563208d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include "bpf_arena_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 10); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +__u32 pid = 0; + +#undef __arena +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) +#define __arena __attribute__((address_space(1))) +#else +#define __arena SEC(".addr_space.1") +#endif + +__u64 __arena add64_value = 1; +__u64 __arena add64_result = 0; +__u32 __arena add32_value = 1; +__u32 __arena add32_result = 0; +__u64 __arena add_stack_value_copy = 0; +__u64 __arena add_stack_result = 0; +__u64 __arena add_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int add(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 add_stack_value = 1; + + add64_result = __sync_fetch_and_add(&add64_value, 2); + add32_result = __sync_fetch_and_add(&add32_value, 2); + add_stack_result = __sync_fetch_and_add(&add_stack_value, 2); + add_stack_value_copy = add_stack_value; + __sync_fetch_and_add(&add_noreturn_value, 2); +#endif + + return 0; +} + +__s64 __arena sub64_value = 1; +__s64 __arena sub64_result = 0; +__s32 __arena sub32_value = 1; +__s32 __arena sub32_result = 0; +__s64 __arena sub_stack_value_copy = 0; +__s64 __arena sub_stack_result = 0; +__s64 __arena sub_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int sub(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 sub_stack_value = 1; + + sub64_result = __sync_fetch_and_sub(&sub64_value, 2); + sub32_result = __sync_fetch_and_sub(&sub32_value, 2); + sub_stack_result = __sync_fetch_and_sub(&sub_stack_value, 2); + sub_stack_value_copy = sub_stack_value; + __sync_fetch_and_sub(&sub_noreturn_value, 2); +#endif + + return 0; +} + +__u64 __arena and64_value = (0x110ull << 32); +__u32 __arena and32_value = 0x110; + +SEC("raw_tp/sys_enter") +int and(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + + __sync_fetch_and_and(&and64_value, 0x011ull << 32); + __sync_fetch_and_and(&and32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena or32_value = 0x110; +__u64 __arena or64_value = (0x110ull << 32); + +SEC("raw_tp/sys_enter") +int or(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_or(&or64_value, 0x011ull << 32); + __sync_fetch_and_or(&or32_value, 0x011); +#endif + + return 0; +} + +__u64 __arena xor64_value = (0x110ull << 32); +__u32 __arena xor32_value = 0x110; + +SEC("raw_tp/sys_enter") +int xor(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_xor(&xor64_value, 0x011ull << 32); + __sync_fetch_and_xor(&xor32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena cmpxchg32_value = 1; +__u32 __arena cmpxchg32_result_fail = 0; +__u32 __arena cmpxchg32_result_succeed = 0; +__u64 __arena cmpxchg64_value = 1; +__u64 __arena cmpxchg64_result_fail = 0; +__u64 __arena cmpxchg64_result_succeed = 0; + +SEC("raw_tp/sys_enter") +int cmpxchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + cmpxchg64_result_fail = __sync_val_compare_and_swap(&cmpxchg64_value, 0, 3); + cmpxchg64_result_succeed = __sync_val_compare_and_swap(&cmpxchg64_value, 1, 2); + + cmpxchg32_result_fail = __sync_val_compare_and_swap(&cmpxchg32_value, 0, 3); + cmpxchg32_result_succeed = __sync_val_compare_and_swap(&cmpxchg32_value, 1, 2); +#endif + + return 0; +} + +__u64 __arena xchg64_value = 1; +__u64 __arena xchg64_result = 0; +__u32 __arena xchg32_value = 1; +__u32 __arena xchg32_result = 0; + +SEC("raw_tp/sys_enter") +int xchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 val64 = 2; + __u32 val32 = 2; + + xchg64_result = __sync_lock_test_and_set(&xchg64_value, val64); + xchg32_result = __sync_lock_test_and_set(&xchg32_value, val32); +#endif + + return 0; +} -- cgit v1.2.3 From 2ca76c12c48b7a2792b21a673ca01a6d8fb2e835 Mon Sep 17 00:00:00 2001 From: Anish Moorthy Date: Thu, 15 Feb 2024 23:54:01 +0000 Subject: KVM: selftests: Report per-vcpu demand paging rate from demand paging test Using the overall demand paging rate to measure performance can be slightly misleading when vCPU accesses are not overlapped. Adding more vCPUs will (usually) increase the overall demand paging rate even if performance remains constant or even degrades on a per-vcpu basis. As such, it makes sense to report both the total and per-vcpu paging rates. Signed-off-by: Anish Moorthy Link: https://lore.kernel.org/r/20240215235405.368539-11-amoorthy@google.com [sean: fix formatting] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/demand_paging_test.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index bf3609f71854..ddacbc6fd1ed 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -133,6 +133,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct uffd_desc **uffd_descs = NULL; struct timespec start; struct timespec ts_diff; + double vcpu_paging_rate; struct kvm_vm *vm; int i; @@ -191,11 +192,15 @@ static void run_test(enum vm_guest_mode mode, void *arg) uffd_stop_demand_paging(uffd_descs[i]); } - pr_info("Total guest execution time: %ld.%.9lds\n", + pr_info("Total guest execution time:\t%ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); - pr_info("Overall demand paging rate: %f pgs/sec\n", - memstress_args.vcpu_args[0].pages * nr_vcpus / - ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC)); + + vcpu_paging_rate = memstress_args.vcpu_args[0].pages / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC); + pr_info("Per-vcpu demand paging rate:\t%f pgs/sec/vcpu\n", + vcpu_paging_rate); + pr_info("Overall demand paging rate:\t%f pgs/sec\n", + vcpu_paging_rate * nr_vcpus); memstress_destroy_vm(vm); -- cgit v1.2.3 From df4ec5aada9da30486d5464f34ffc80acd0373d6 Mon Sep 17 00:00:00 2001 From: Anish Moorthy Date: Thu, 15 Feb 2024 23:54:02 +0000 Subject: KVM: selftests: Allow many vCPUs and reader threads per UFFD in demand paging test At the moment, demand_paging_test does not support profiling/testing multiple vCPU threads concurrently faulting on a single uffd because (a) "-u" (run test in userfaultfd mode) creates a uffd for each vCPU's region, so that each uffd services a single vCPU thread. (b) "-u -o" (userfaultfd mode + overlapped vCPU memory accesses) simply doesn't work: the test tries to register the same memory to multiple uffds, causing an error. Add support for many vcpus per uffd by (1) Keeping "-u" behavior unchanged. (2) Making "-u -a" create a single uffd for all of guest memory. (3) Making "-u -o" implicitly pass "-a", solving the problem in (b). In cases (2) and (3) all vCPU threads fault on a single uffd. With potentially multiple vCPUs per UFFD, it makes sense to allow configuring the number of reader threads per UFFD as well: add the "-r" flag to do so. Signed-off-by: Anish Moorthy Acked-by: James Houghton Link: https://lore.kernel.org/r/20240215235405.368539-12-amoorthy@google.com [sean: fix kernel style violations, use calloc() for arrays] Signed-off-by: Sean Christopherson --- .../selftests/kvm/aarch64/page_fault_test.c | 4 +- tools/testing/selftests/kvm/demand_paging_test.c | 76 ++++++++++++++++---- .../selftests/kvm/include/userfaultfd_util.h | 16 ++++- tools/testing/selftests/kvm/lib/userfaultfd_util.c | 81 ++++++++++++++-------- 4 files changed, 132 insertions(+), 45 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 5972905275cf..a2a158e2c0b8 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -375,14 +375,14 @@ static void setup_uffd(struct kvm_vm *vm, struct test_params *p, *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, pt_args.hva, pt_args.paging_size, - test->uffd_pt_handler); + 1, test->uffd_pt_handler); *data_uffd = NULL; if (test->uffd_data_handler) *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, data_args.hva, data_args.paging_size, - test->uffd_data_handler); + 1, test->uffd_data_handler); } static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index ddacbc6fd1ed..ca258968f6e1 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -77,8 +77,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, copy.mode = 0; r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -89,8 +101,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, cont.range.len = demand_paging_size; r = ioctl(uffd, UFFDIO_CONTINUE, &cont); - if (r == -1) { - pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx, thread %d, errno = %d\n", addr, tid, errno); return r; } @@ -110,7 +134,9 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, struct test_params { int uffd_mode; + bool single_uffd; useconds_t uffd_delay; + int readers_per_uffd; enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; }; @@ -131,11 +157,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct memstress_vcpu_args *vcpu_args; struct test_params *p = arg; struct uffd_desc **uffd_descs = NULL; + uint64_t uffd_region_size; struct timespec start; struct timespec ts_diff; double vcpu_paging_rate; struct kvm_vm *vm; - int i; + int i, num_uffds = 0; vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); @@ -148,7 +175,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) memset(guest_data_prototype, 0xAB, demand_paging_size); if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { - for (i = 0; i < nr_vcpus; i++) { + num_uffds = p->single_uffd ? 1 : nr_vcpus; + for (i = 0; i < num_uffds; i++) { vcpu_args = &memstress_args.vcpu_args[i]; prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), vcpu_args->pages * memstress_args.guest_page_size); @@ -156,9 +184,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) } if (p->uffd_mode) { - uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *)); + num_uffds = p->single_uffd ? 1 : nr_vcpus; + uffd_region_size = nr_vcpus * guest_percpu_mem_size / num_uffds; + + uffd_descs = malloc(num_uffds * sizeof(struct uffd_desc *)); TEST_ASSERT(uffd_descs, "Memory allocation failed"); - for (i = 0; i < nr_vcpus; i++) { + for (i = 0; i < num_uffds; i++) { + struct memstress_vcpu_args *vcpu_args; void *vcpu_hva; vcpu_args = &memstress_args.vcpu_args[i]; @@ -171,7 +203,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) */ uffd_descs[i] = uffd_setup_demand_paging( p->uffd_mode, p->uffd_delay, vcpu_hva, - vcpu_args->pages * memstress_args.guest_page_size, + uffd_region_size, + p->readers_per_uffd, &handle_uffd_page_request); } } @@ -188,7 +221,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (p->uffd_mode) { /* Tell the user fault fd handler threads to quit */ - for (i = 0; i < nr_vcpus; i++) + for (i = 0; i < num_uffds; i++) uffd_stop_demand_paging(uffd_descs[i]); } @@ -212,15 +245,20 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-a]\n" + " [-d uffd_delay_usec] [-r readers_per_uffd] [-b memory]\n" + " [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); kvm_print_vcpu_pinning_help(); + printf(" -a: Use a single userfaultfd for all of guest memory, instead of\n" + " creating one for each region paged by a unique vCPU\n" + " Set implicitly with -o, and no effect without -u.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); + printf(" -r: Set the number of reader threads per uffd.\n"); printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); @@ -239,12 +277,14 @@ int main(int argc, char *argv[]) struct test_params p = { .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, + .readers_per_uffd = 1, + .single_uffd = false, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:c:o")) != -1) { + while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -256,6 +296,9 @@ int main(int argc, char *argv[]) p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); break; + case 'a': + p.single_uffd = true; + break; case 'd': p.uffd_delay = strtoul(optarg, NULL, 0); TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported."); @@ -276,6 +319,13 @@ int main(int argc, char *argv[]) break; case 'o': p.partition_vcpu_memory_access = false; + p.single_uffd = true; + break; + case 'r': + p.readers_per_uffd = atoi(optarg); + TEST_ASSERT(p.readers_per_uffd >= 1, + "Invalid number of readers per uffd %d: must be >=1", + p.readers_per_uffd); break; case 'h': default: diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h index 877449c34592..24f2cc5f4292 100644 --- a/tools/testing/selftests/kvm/include/userfaultfd_util.h +++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h @@ -17,17 +17,27 @@ typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg); -struct uffd_desc { +struct uffd_reader_args { int uffd_mode; int uffd; - int pipefds[2]; useconds_t delay; uffd_handler_t handler; - pthread_t thread; + /* Holds the read end of the pipe for killing the reader. */ + int pipe; +}; + +struct uffd_desc { + int uffd; + uint64_t num_readers; + /* Holds the write ends of the pipes for killing the readers. */ + int *pipefds; + pthread_t *readers; + struct uffd_reader_args *reader_args; }; struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler); void uffd_stop_demand_paging(struct uffd_desc *uffd); diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index f4eef6eb2dc2..96a831c27c7f 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -27,10 +27,8 @@ static void *uffd_handler_thread_fn(void *arg) { - struct uffd_desc *uffd_desc = (struct uffd_desc *)arg; - int uffd = uffd_desc->uffd; - int pipefd = uffd_desc->pipefds[0]; - useconds_t delay = uffd_desc->delay; + struct uffd_reader_args *reader_args = (struct uffd_reader_args *)arg; + int uffd = reader_args->uffd; int64_t pages = 0; struct timespec start; struct timespec ts_diff; @@ -44,7 +42,7 @@ static void *uffd_handler_thread_fn(void *arg) pollfd[0].fd = uffd; pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd; + pollfd[1].fd = reader_args->pipe; pollfd[1].events = POLLIN; r = poll(pollfd, 2, -1); @@ -92,9 +90,9 @@ static void *uffd_handler_thread_fn(void *arg) if (!(msg.event & UFFD_EVENT_PAGEFAULT)) continue; - if (delay) - usleep(delay); - r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg); + if (reader_args->delay) + usleep(reader_args->delay); + r = reader_args->handler(reader_args->uffd_mode, uffd, &msg); if (r < 0) return NULL; pages++; @@ -110,6 +108,7 @@ static void *uffd_handler_thread_fn(void *arg) struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void *hva, uint64_t len, + uint64_t num_readers, uffd_handler_t handler) { struct uffd_desc *uffd_desc; @@ -118,14 +117,25 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; - int ret; + int ret, i; PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", is_minor ? "MINOR" : "MISSING", is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); uffd_desc = malloc(sizeof(struct uffd_desc)); - TEST_ASSERT(uffd_desc, "malloc failed"); + TEST_ASSERT(uffd_desc, "Failed to malloc uffd descriptor"); + + uffd_desc->pipefds = calloc(sizeof(int), num_readers); + TEST_ASSERT(uffd_desc->pipefds, "Failed to alloc pipes"); + + uffd_desc->readers = calloc(sizeof(pthread_t), num_readers); + TEST_ASSERT(uffd_desc->readers, "Failed to alloc reader threads"); + + uffd_desc->reader_args = calloc(sizeof(struct uffd_reader_args), num_readers); + TEST_ASSERT(uffd_desc->reader_args, "Failed to alloc reader_args"); + + uffd_desc->num_readers = num_readers; /* In order to get minor faults, prefault via the alias. */ if (is_minor) @@ -148,18 +158,28 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == expected_ioctls, "missing userfaultfd ioctls"); - ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(!ret, "Failed to set up pipefd"); - - uffd_desc->uffd_mode = uffd_mode; uffd_desc->uffd = uffd; - uffd_desc->delay = delay; - uffd_desc->handler = handler; - pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn, - uffd_desc); + for (i = 0; i < uffd_desc->num_readers; ++i) { + int pipes[2]; + + ret = pipe2((int *) &pipes, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!ret, "Failed to set up pipefd %i for uffd_desc %p", + i, uffd_desc); - PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", - hva, hva + len); + uffd_desc->pipefds[i] = pipes[1]; + + uffd_desc->reader_args[i].uffd_mode = uffd_mode; + uffd_desc->reader_args[i].uffd = uffd; + uffd_desc->reader_args[i].delay = delay; + uffd_desc->reader_args[i].handler = handler; + uffd_desc->reader_args[i].pipe = pipes[0]; + + pthread_create(&uffd_desc->readers[i], NULL, uffd_handler_thread_fn, + &uffd_desc->reader_args[i]); + + PER_VCPU_DEBUG("Created uffd thread %i for HVA range [%p, %p)\n", + i, hva, hva + len); + } return uffd_desc; } @@ -167,19 +187,26 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, void uffd_stop_demand_paging(struct uffd_desc *uffd) { char c = 0; - int ret; + int i; - ret = write(uffd->pipefds[1], &c, 1); - TEST_ASSERT(ret == 1, "Unable to write to pipefd"); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(write(uffd->pipefds[i], &c, 1) == 1, + "Unable to write to pipefd %i for uffd_desc %p", i, uffd); - ret = pthread_join(uffd->thread, NULL); - TEST_ASSERT(ret == 0, "Pthread_join failed."); + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(!pthread_join(uffd->readers[i], NULL), + "Pthread_join failed on reader %i for uffd_desc %p", i, uffd); close(uffd->uffd); - close(uffd->pipefds[1]); - close(uffd->pipefds[0]); + for (i = 0; i < uffd->num_readers; ++i) { + close(uffd->pipefds[i]); + close(uffd->reader_args[i].pipe); + } + free(uffd->pipefds); + free(uffd->readers); + free(uffd->reader_args); free(uffd); } -- cgit v1.2.3 From 0cba6442e9e2dfabea042b899c99f5bfda5ab582 Mon Sep 17 00:00:00 2001 From: Anish Moorthy Date: Thu, 15 Feb 2024 23:54:03 +0000 Subject: KVM: selftests: Use EPOLL in userfaultfd_util reader threads With multiple reader threads POLLing a single UFFD, the demand paging test suffers from the thundering herd problem: performance degrades as the number of reader threads is increased. Solve this issue [1] by switching the the polling mechanism to EPOLL + EPOLLEXCLUSIVE. Also, change the error-handling convention of uffd_handler_thread_fn. Instead of just printing errors and returning early from the polling loop, check for them via TEST_ASSERT(). "return NULL" is reserved for a successful exit from uffd_handler_thread_fn, i.e. one triggered by a write to the exit pipe. Performance samples generated by the command in [2] are given below. Num Reader Threads, Paging Rate (POLL), Paging Rate (EPOLL) 1 249k 185k 2 201k 235k 4 186k 155k 16 150k 217k 32 89k 198k [1] Single-vCPU performance does suffer somewhat. [2] ./demand_paging_test -u MINOR -s shmem -v 4 -o -r Signed-off-by: Anish Moorthy Acked-by: James Houghton Link: https://lore.kernel.org/r/20240215235405.368539-13-amoorthy@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/demand_paging_test.c | 1 - tools/testing/selftests/kvm/lib/userfaultfd_util.c | 74 ++++++++++------------ 2 files changed, 35 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index ca258968f6e1..056ff1c87345 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index 96a831c27c7f..0ba866c4af69 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "kvm_util.h" @@ -32,60 +33,55 @@ static void *uffd_handler_thread_fn(void *arg) int64_t pages = 0; struct timespec start; struct timespec ts_diff; + struct epoll_event evt; + int epollfd; + + epollfd = epoll_create(1); + TEST_ASSERT(epollfd >= 0, "Failed to create epollfd."); + + evt.events = EPOLLIN | EPOLLEXCLUSIVE; + evt.data.u32 = 0; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, uffd, &evt), + "Failed to add uffd to epollfd"); + + evt.events = EPOLLIN; + evt.data.u32 = 1; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, reader_args->pipe, &evt), + "Failed to add pipe to epollfd"); clock_gettime(CLOCK_MONOTONIC, &start); while (1) { struct uffd_msg msg; - struct pollfd pollfd[2]; - char tmp_chr; int r; - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = reader_args->pipe; - pollfd[1].events = POLLIN; - - r = poll(pollfd, 2, -1); - switch (r) { - case -1: - pr_info("poll err"); - continue; - case 0: - continue; - case 1: - break; - default: - pr_info("Polling uffd returned %d", r); - return NULL; - } + r = epoll_wait(epollfd, &evt, 1, -1); + TEST_ASSERT(r == 1, + "Unexpected number of events (%d) from epoll, errno = %d", + r, errno); - if (pollfd[0].revents & POLLERR) { - pr_info("uffd revents has POLLERR"); - return NULL; - } + if (evt.data.u32 == 1) { + char tmp_chr; - if (pollfd[1].revents & POLLIN) { - r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on pipe."); + r = read(reader_args->pipe, &tmp_chr, 1); TEST_ASSERT(r == 1, - "Error reading pipefd in UFFD thread"); + "Error reading pipefd in uffd reader thread"); break; } - if (!(pollfd[0].revents & POLLIN)) - continue; + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on uffd."); r = read(uffd, &msg, sizeof(msg)); if (r == -1) { - if (errno == EAGAIN) - continue; - pr_info("Read of uffd got errno %d\n", errno); - return NULL; + TEST_ASSERT(errno == EAGAIN, + "Error reading from UFFD: errno = %d", errno); + continue; } - if (r != sizeof(msg)) { - pr_info("Read on uffd returned unexpected size: %d bytes", r); - return NULL; - } + TEST_ASSERT(r == sizeof(msg), + "Read on uffd returned unexpected number of bytes (%d)", r); if (!(msg.event & UFFD_EVENT_PAGEFAULT)) continue; @@ -93,8 +89,8 @@ static void *uffd_handler_thread_fn(void *arg) if (reader_args->delay) usleep(reader_args->delay); r = reader_args->handler(reader_args->uffd_mode, uffd, &msg); - if (r < 0) - return NULL; + TEST_ASSERT(r >= 0, + "Reader thread handler fn returned negative value %d", r); pages++; } -- cgit v1.2.3 From 9f92c06e184074930174e469205f4e78338651f8 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 19 Oct 2023 11:59:00 +0200 Subject: KVM: selftests: Use TAP in the steal_time test For easier use of the tests in automation and for having some status information for the user while the test is running, let's provide some TAP output in this test. Signed-off-by: Thomas Huth Reviewed-by: Muhammad Usama Anjum Reviewed-by: Andrew Jones Reviewed-by: Zhao Liu Link: https://lore.kernel.org/r/20231019095900.450467-1-thuth@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/steal_time.c | 47 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index bae0c5026f82..4be5a1ffa06a 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -83,20 +83,18 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - int i; - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" steal: %lld\n", st->steal); - pr_info(" version: %d\n", st->version); - pr_info(" flags: %d\n", st->flags); - pr_info(" preempted: %d\n", st->preempted); - pr_info(" u8_pad: "); - for (i = 0; i < 3; ++i) - pr_info("%d", st->u8_pad[i]); - pr_info("\n pad: "); - for (i = 0; i < 11; ++i) - pr_info("%d", st->pad[i]); - pr_info("\n"); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" steal: %lld\n", st->steal); + ksft_print_msg(" version: %d\n", st->version); + ksft_print_msg(" flags: %d\n", st->flags); + ksft_print_msg(" preempted: %d\n", st->preempted); + ksft_print_msg(" u8_pad: %d %d %d\n", + st->u8_pad[0], st->u8_pad[1], st->u8_pad[2]); + ksft_print_msg(" pad: %d %d %d %d %d %d %d %d %d %d %d\n", + st->pad[0], st->pad[1], st->pad[2], st->pad[3], + st->pad[4], st->pad[5], st->pad[6], st->pad[7], + st->pad[8], st->pad[9], st->pad[10]); } #elif defined(__aarch64__) @@ -199,10 +197,10 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" rev: %d\n", st->rev); - pr_info(" attr: %d\n", st->attr); - pr_info(" st_time: %ld\n", st->st_time); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" rev: %d\n", st->rev); + ksft_print_msg(" attr: %d\n", st->attr); + ksft_print_msg(" st_time: %ld\n", st->st_time); } #elif defined(__riscv) @@ -366,7 +364,9 @@ int main(int ac, char **av) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); + ksft_print_header(); TEST_REQUIRE(is_steal_time_supported(vcpus[0])); + ksft_set_plan(NR_VCPUS); /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { @@ -407,14 +407,15 @@ int main(int ac, char **av) run_delay, stolen_time); if (verbose) { - pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, - guest_stolen_time[i], stolen_time); - if (stolen_time == run_delay) - pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); - pr_info("\n"); + ksft_print_msg("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld%s\n", + i, guest_stolen_time[i], stolen_time, + stolen_time == run_delay ? + " (BONUS: guest test-stolen-time even exactly matches test-run_delay)" : ""); steal_time_dump(vm, i); } + ksft_test_result_pass("vcpu%d\n", i); } - return 0; + /* Print results and exit() accordingly */ + ksft_finished(); } -- cgit v1.2.3 From 1553a1c48281243359a9529a10ddb551f3b967ab Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 5 Mar 2024 18:37:06 +0000 Subject: RISC-V: drop SOC_VIRT for ARCH_VIRT The ARCH_ and SOC_ versions of this symbol have persisted for quite a while now in parallel. Generated .config files from previous LTS kernels should have both. Finally remove SOC_VIRT and update all config files using it. Acked-by: Palmer Dabbelt Signed-off-by: Conor Dooley --- arch/riscv/Kconfig.socs | 3 --- arch/riscv/configs/defconfig | 2 +- arch/riscv/configs/nommu_virt_defconfig | 2 +- tools/testing/kunit/qemu_configs/riscv.py | 2 +- tools/testing/selftests/wireguard/qemu/arch/riscv32.config | 2 +- tools/testing/selftests/wireguard/qemu/arch/riscv64.config | 2 +- 6 files changed, 5 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 2797412a7ec9..f51bb24bc84c 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -52,9 +52,6 @@ config ARCH_THEAD This enables support for the RISC-V based T-HEAD SoCs. config ARCH_VIRT - def_bool SOC_VIRT - -config SOC_VIRT bool "QEMU Virt Machine" select CLINT_TIMER if RISCV_M_MODE select POWER_RESET diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 8f2bec476af2..d485e1de8a78 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -32,7 +32,7 @@ CONFIG_ARCH_SOPHGO=y CONFIG_SOC_STARFIVE=y CONFIG_ARCH_SUNXI=y CONFIG_ARCH_THEAD=y -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y CONFIG_PM=y diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index b794e2f8144e..de8143d1f738 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -24,7 +24,7 @@ CONFIG_EXPERT=y CONFIG_SLUB=y CONFIG_SLUB_TINY=y # CONFIG_MMU is not set -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_NONPORTABLE=y CONFIG_SMP=y CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x10000000,115200n8 console=ttyS0" diff --git a/tools/testing/kunit/qemu_configs/riscv.py b/tools/testing/kunit/qemu_configs/riscv.py index 12a1d525978a..c87758030ff7 100644 --- a/tools/testing/kunit/qemu_configs/riscv.py +++ b/tools/testing/kunit/qemu_configs/riscv.py @@ -13,7 +13,7 @@ if not os.path.isfile(OPENSBI_PATH): QEMU_ARCH = QemuArchParams(linux_arch='riscv', kconfig=''' -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config index a7f8e8a95625..66290cf289a9 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config +++ b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config @@ -2,7 +2,7 @@ CONFIG_NONPORTABLE=y CONFIG_ARCH_RV32I=y CONFIG_MMU=y CONFIG_FPU=y -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_RISCV_ISA_FALLBACK=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config index daeb3e5e0965..db1aa9f388b9 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config +++ b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config @@ -1,7 +1,7 @@ CONFIG_ARCH_RV64I=y CONFIG_MMU=y CONFIG_FPU=y -CONFIG_SOC_VIRT=y +CONFIG_ARCH_VIRT=y CONFIG_RISCV_ISA_FALLBACK=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -- cgit v1.2.3 From c8a1495947ffcab18d9a85144ab4abc570720e65 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 15 Mar 2024 15:44:40 +0100 Subject: selftests/hid: add KASAN to the VM tests It's always a good idea to have KASAN in tests. Link: https://lore.kernel.org/r/20240315-b4-hid-bpf-new-funcs-v4-3-079c282469d3@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/config.common | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common index 0f456dbab62f..45b5570441ce 100644 --- a/tools/testing/selftests/hid/config.common +++ b/tools/testing/selftests/hid/config.common @@ -238,3 +238,4 @@ CONFIG_VLAN_8021Q=y CONFIG_XFRM_SUB_POLICY=y CONFIG_XFRM_USER=y CONFIG_ZEROPLUS_FF=y +CONFIG_KASAN=y -- cgit v1.2.3 From db624e82c55f227b84ac9ebfa3de2f6f5fad666b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 15 Mar 2024 15:44:41 +0100 Subject: selftests/hid: Add test for hid_bpf_hw_output_report This time we need to ensure uhid receives it, thus the new mutex and condition. Link: https://lore.kernel.org/r/20240315-b4-hid-bpf-new-funcs-v4-4-079c282469d3@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 63 ++++++++++++++++++++++ tools/testing/selftests/hid/progs/hid.c | 24 +++++++++ .../testing/selftests/hid/progs/hid_bpf_helpers.h | 2 + 3 files changed, 89 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 2cf96f818f25..8332014838b0 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -16,6 +16,11 @@ #define SHOW_UHID_DEBUG 0 +#define min(a, b) \ + ({ __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; }) + static unsigned char rdesc[] = { 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ 0x09, 0x21, /* Usage (Vendor Usage 0x21) */ @@ -111,6 +116,10 @@ struct hid_hw_request_syscall_args { static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER; +static unsigned char output_report[10]; + /* no need to protect uhid_stopped, only one thread accesses it */ static bool uhid_stopped; @@ -205,6 +214,13 @@ static int uhid_event(struct __test_metadata *_metadata, int fd) break; case UHID_OUTPUT: UHID_LOG("UHID_OUTPUT from uhid-dev"); + + pthread_mutex_lock(&uhid_output_mtx); + memcpy(output_report, + ev.u.output.data, + min(ev.u.output.size, sizeof(output_report))); + pthread_cond_signal(&uhid_output_cond); + pthread_mutex_unlock(&uhid_output_mtx); break; case UHID_GET_REPORT: UHID_LOG("UHID_GET_REPORT from uhid-dev"); @@ -733,6 +749,53 @@ TEST_F(hid_bpf, test_hid_change_report) ASSERT_EQ(buf[2], 0) TH_LOG("leftovers_from_previous_test"); } +/* + * Call hid_bpf_hw_output_report against the given uhid device, + * check that the program is called and does the expected. + */ +TEST_F(hid_bpf, test_hid_user_output_report_call) +{ + struct hid_hw_request_syscall_args args = { + .retval = -1, + .size = 10, + }; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs, + .ctx_in = &args, + .ctx_size_in = sizeof(args), + ); + int err, cond_err, prog_fd; + struct timespec time_to_wait; + + LOAD_BPF; + + args.hid = self->hid_id; + args.data[0] = 1; /* report ID */ + args.data[1] = 2; /* report ID */ + args.data[2] = 42; /* report ID */ + + prog_fd = bpf_program__fd(self->skel->progs.hid_user_output_report); + + pthread_mutex_lock(&uhid_output_mtx); + + memset(output_report, 0, sizeof(output_report)); + clock_gettime(CLOCK_REALTIME, &time_to_wait); + time_to_wait.tv_sec += 2; + + err = bpf_prog_test_run_opts(prog_fd, &tattrs); + cond_err = pthread_cond_timedwait(&uhid_output_cond, &uhid_output_mtx, &time_to_wait); + + ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts"); + ASSERT_OK(cond_err) TH_LOG("error while calling waiting for the condition"); + + ASSERT_EQ(args.retval, 3); + + ASSERT_EQ(output_report[0], 1); + ASSERT_EQ(output_report[1], 2); + ASSERT_EQ(output_report[2], 42); + + pthread_mutex_unlock(&uhid_output_mtx); +} + /* * Attach hid_user_raw_request to the given uhid device, * call the bpf program from userspace diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c index 1e558826b809..2c2b679a83b1 100644 --- a/tools/testing/selftests/hid/progs/hid.c +++ b/tools/testing/selftests/hid/progs/hid.c @@ -101,6 +101,30 @@ int hid_user_raw_request(struct hid_hw_request_syscall_args *args) return 0; } +SEC("syscall") +int hid_user_output_report(struct hid_hw_request_syscall_args *args) +{ + struct hid_bpf_ctx *ctx; + const size_t size = args->size; + int i, ret = 0; + + if (size > sizeof(args->data)) + return -7; /* -E2BIG */ + + ctx = hid_bpf_allocate_context(args->hid); + if (!ctx) + return -1; /* EPERM check */ + + ret = hid_bpf_hw_output_report(ctx, + args->data, + size); + args->retval = ret; + + hid_bpf_release_context(ctx); + + return 0; +} + static const __u8 rdesc[] = { 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x32, /* USAGE (Z) */ diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 65e657ac1198..50c6a0d5765e 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -94,5 +94,7 @@ extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, size_t buf__sz, enum hid_report_type type, enum hid_class_request reqtype) __ksym; +extern int hid_bpf_hw_output_report(struct hid_bpf_ctx *ctx, + __u8 *buf, size_t buf__sz) __ksym; #endif /* __HID_BPF_HELPERS_H */ -- cgit v1.2.3 From 2c0e8ced7d4bf746923fc424415844d695f07808 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 15 Mar 2024 15:44:43 +0100 Subject: selftests/hid: add tests for hid_bpf_input_report Usual way of testing, we call the function and ensures we receive the event Link: https://lore.kernel.org/r/20240315-b4-hid-bpf-new-funcs-v4-6-079c282469d3@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 49 +++++++++++++++++++++- tools/testing/selftests/hid/progs/hid.c | 22 ++++++++++ .../testing/selftests/hid/progs/hid_bpf_helpers.h | 4 ++ 3 files changed, 73 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 8332014838b0..f825623e3edc 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -749,6 +749,52 @@ TEST_F(hid_bpf, test_hid_change_report) ASSERT_EQ(buf[2], 0) TH_LOG("leftovers_from_previous_test"); } +/* + * Call hid_bpf_input_report against the given uhid device, + * check that the program is called and does the expected. + */ +TEST_F(hid_bpf, test_hid_user_input_report_call) +{ + struct hid_hw_request_syscall_args args = { + .retval = -1, + .size = 10, + }; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs, + .ctx_in = &args, + .ctx_size_in = sizeof(args), + ); + __u8 buf[10] = {0}; + int err, prog_fd; + + LOAD_BPF; + + args.hid = self->hid_id; + args.data[0] = 1; /* report ID */ + args.data[1] = 2; /* report ID */ + args.data[2] = 42; /* report ID */ + + prog_fd = bpf_program__fd(self->skel->progs.hid_user_input_report); + + /* check that there is no data to read from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, -1) TH_LOG("read_hidraw"); + + err = bpf_prog_test_run_opts(prog_fd, &tattrs); + + ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts"); + + ASSERT_EQ(args.retval, 0); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 2); + ASSERT_EQ(buf[2], 42); +} + /* * Call hid_bpf_hw_output_report against the given uhid device, * check that the program is called and does the expected. @@ -797,8 +843,7 @@ TEST_F(hid_bpf, test_hid_user_output_report_call) } /* - * Attach hid_user_raw_request to the given uhid device, - * call the bpf program from userspace + * Call hid_hw_raw_request against the given uhid device, * check that the program is called and does the expected. */ TEST_F(hid_bpf, test_hid_user_raw_request_call) diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c index 2c2b679a83b1..f67d35def142 100644 --- a/tools/testing/selftests/hid/progs/hid.c +++ b/tools/testing/selftests/hid/progs/hid.c @@ -125,6 +125,28 @@ int hid_user_output_report(struct hid_hw_request_syscall_args *args) return 0; } +SEC("syscall") +int hid_user_input_report(struct hid_hw_request_syscall_args *args) +{ + struct hid_bpf_ctx *ctx; + const size_t size = args->size; + int i, ret = 0; + + if (size > sizeof(args->data)) + return -7; /* -E2BIG */ + + ctx = hid_bpf_allocate_context(args->hid); + if (!ctx) + return -1; /* EPERM check */ + + ret = hid_bpf_input_report(ctx, HID_INPUT_REPORT, args->data, size); + args->retval = ret; + + hid_bpf_release_context(ctx); + + return 0; +} + static const __u8 rdesc[] = { 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x32, /* USAGE (Z) */ diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 50c6a0d5765e..9cd56821d0f1 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -96,5 +96,9 @@ extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, enum hid_class_request reqtype) __ksym; extern int hid_bpf_hw_output_report(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz) __ksym; +extern int hid_bpf_input_report(struct hid_bpf_ctx *ctx, + enum hid_report_type type, + __u8 *data, + size_t buf__sz) __ksym; #endif /* __HID_BPF_HELPERS_H */ -- cgit v1.2.3 From e59f0e93e92e0ddfd17e3373d586218cf638571e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 8 Apr 2024 20:15:46 -0700 Subject: selftests: move bpf-offload test from bpf to net We're building more python tests on the netdev side, and some of the classes from the venerable BPF offload tests can be reused. Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240409031549.3531084-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/bpf/Makefile | 1 - .../testing/selftests/bpf/progs/sample_map_ret0.c | 34 - tools/testing/selftests/bpf/progs/sample_ret0.c | 7 - tools/testing/selftests/bpf/test_offload.py | 1405 -------------------- tools/testing/selftests/net/Makefile | 8 +- tools/testing/selftests/net/bpf_offload.py | 1405 ++++++++++++++++++++ tools/testing/selftests/net/sample_map_ret0.bpf.c | 34 + tools/testing/selftests/net/sample_ret0.bpf.c | 7 + 8 files changed, 1453 insertions(+), 1448 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/sample_map_ret0.c delete mode 100644 tools/testing/selftests/bpf/progs/sample_ret0.c delete mode 100755 tools/testing/selftests/bpf/test_offload.py create mode 100755 tools/testing/selftests/net/bpf_offload.py create mode 100644 tools/testing/selftests/net/sample_map_ret0.bpf.c create mode 100644 tools/testing/selftests/net/sample_ret0.bpf.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b9eb40d6343..b0be07f29dde 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -102,7 +102,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ test_xdp_veth.sh \ - test_offload.py \ test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ diff --git a/tools/testing/selftests/bpf/progs/sample_map_ret0.c b/tools/testing/selftests/bpf/progs/sample_map_ret0.c deleted file mode 100644 index 495990d355ef..000000000000 --- a/tools/testing/selftests/bpf/progs/sample_map_ret0.c +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ -#include -#include - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __u32); - __type(value, long); - __uint(max_entries, 2); -} htab SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, __u32); - __type(value, long); - __uint(max_entries, 2); -} array SEC(".maps"); - -/* Sample program which should always load for testing control paths. */ -SEC(".text") int func() -{ - __u64 key64 = 0; - __u32 key = 0; - long *value; - - value = bpf_map_lookup_elem(&htab, &key); - if (!value) - return 1; - value = bpf_map_lookup_elem(&array, &key64); - if (!value) - return 1; - - return 0; -} diff --git a/tools/testing/selftests/bpf/progs/sample_ret0.c b/tools/testing/selftests/bpf/progs/sample_ret0.c deleted file mode 100644 index fec99750d6ea..000000000000 --- a/tools/testing/selftests/bpf/progs/sample_ret0.c +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ - -/* Sample program which should always load for testing control paths. */ -int func() -{ - return 0; -} diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py deleted file mode 100755 index 6157f884d091..000000000000 --- a/tools/testing/selftests/bpf/test_offload.py +++ /dev/null @@ -1,1405 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2017 Netronome Systems, Inc. -# Copyright (c) 2019 Mellanox Technologies. All rights reserved -# -# This software is licensed under the GNU General License Version 2, -# June 1991 as shown in the file COPYING in the top-level directory of this -# source tree. -# -# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" -# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, -# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE -# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME -# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - -from datetime import datetime -import argparse -import errno -import json -import os -import pprint -import random -import re -import stat -import string -import struct -import subprocess -import time -import traceback - -logfile = None -log_level = 1 -skip_extack = False -bpf_test_dir = os.path.dirname(os.path.realpath(__file__)) -pp = pprint.PrettyPrinter() -devs = [] # devices we created for clean up -files = [] # files to be removed -netns = [] # net namespaces to be removed - -def log_get_sec(level=0): - return "*" * (log_level + level) - -def log_level_inc(add=1): - global log_level - log_level += add - -def log_level_dec(sub=1): - global log_level - log_level -= sub - -def log_level_set(level): - global log_level - log_level = level - -def log(header, data, level=None): - """ - Output to an optional log. - """ - if logfile is None: - return - if level is not None: - log_level_set(level) - - if not isinstance(data, str): - data = pp.pformat(data) - - if len(header): - logfile.write("\n" + log_get_sec() + " ") - logfile.write(header) - if len(header) and len(data.strip()): - logfile.write("\n") - logfile.write(data) - -def skip(cond, msg): - if not cond: - return - print("SKIP: " + msg) - log("SKIP: " + msg, "", level=1) - os.sys.exit(0) - -def fail(cond, msg): - if not cond: - return - print("FAIL: " + msg) - tb = "".join(traceback.extract_stack().format()) - print(tb) - log("FAIL: " + msg, tb, level=1) - os.sys.exit(1) - -def start_test(msg): - log(msg, "", level=1) - log_level_inc() - print(msg) - -def cmd(cmd, shell=True, include_stderr=False, background=False, fail=True): - """ - Run a command in subprocess and return tuple of (retval, stdout); - optionally return stderr as well as third value. - """ - proc = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if background: - msg = "%s START: %s" % (log_get_sec(1), - datetime.now().strftime("%H:%M:%S.%f")) - log("BKG " + proc.args, msg) - return proc - - return cmd_result(proc, include_stderr=include_stderr, fail=fail) - -def cmd_result(proc, include_stderr=False, fail=False): - stdout, stderr = proc.communicate() - stdout = stdout.decode("utf-8") - stderr = stderr.decode("utf-8") - proc.stdout.close() - proc.stderr.close() - - stderr = "\n" + stderr - if stderr[-1] == "\n": - stderr = stderr[:-1] - - sec = log_get_sec(1) - log("CMD " + proc.args, - "RETCODE: %d\n%s STDOUT:\n%s%s STDERR:%s\n%s END: %s" % - (proc.returncode, sec, stdout, sec, stderr, - sec, datetime.now().strftime("%H:%M:%S.%f"))) - - if proc.returncode != 0 and fail: - if len(stderr) > 0 and stderr[-1] == "\n": - stderr = stderr[:-1] - raise Exception("Command failed: %s\n%s" % (proc.args, stderr)) - - if include_stderr: - return proc.returncode, stdout, stderr - else: - return proc.returncode, stdout - -def rm(f): - cmd("rm -f %s" % (f)) - if f in files: - files.remove(f) - -def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False): - params = "" - if JSON: - params += "%s " % (flags["json"]) - - if ns != "": - ns = "ip netns exec %s " % (ns) - - if include_stderr: - ret, stdout, stderr = cmd(ns + name + " " + params + args, - fail=fail, include_stderr=True) - else: - ret, stdout = cmd(ns + name + " " + params + args, - fail=fail, include_stderr=False) - - if JSON and len(stdout.strip()) != 0: - out = json.loads(stdout) - else: - out = stdout - - if include_stderr: - return ret, out, stderr - else: - return ret, out - -def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False): - return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, - fail=fail, include_stderr=include_stderr) - -def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True): - _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) - # Remove the base progs - for p in base_progs: - if p in progs: - progs.remove(p) - if exclude_orphaned: - progs = [ p for p in progs if not p['orphaned'] ] - if expected is not None: - if len(progs) != expected: - fail(True, "%d BPF programs loaded, expected %d" % - (len(progs), expected)) - return progs - -def bpftool_map_list(expected=None, ns=""): - _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) - # Remove the base maps - maps = [m for m in maps if m not in base_maps and m.get('name') and m.get('name') not in base_map_names] - if expected is not None: - if len(maps) != expected: - fail(True, "%d BPF maps loaded, expected %d" % - (len(maps), expected)) - return maps - -def bpftool_prog_list_wait(expected=0, n_retry=20): - for i in range(n_retry): - nprogs = len(bpftool_prog_list()) - if nprogs == expected: - return - time.sleep(0.05) - raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) - -def bpftool_map_list_wait(expected=0, n_retry=20): - for i in range(n_retry): - nmaps = len(bpftool_map_list()) - if nmaps == expected: - return - time.sleep(0.05) - raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) - -def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None, - fail=True, include_stderr=False): - args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name) - if prog_type is not None: - args += " type " + prog_type - if dev is not None: - args += " dev " + dev - if len(maps): - args += " map " + " map ".join(maps) - - res = bpftool(args, fail=fail, include_stderr=include_stderr) - if res[0] == 0: - files.append(file_name) - return res - -def ip(args, force=False, JSON=True, ns="", fail=True, include_stderr=False): - if force: - args = "-force " + args - return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns, - fail=fail, include_stderr=include_stderr) - -def tc(args, JSON=True, ns="", fail=True, include_stderr=False): - return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns, - fail=fail, include_stderr=include_stderr) - -def ethtool(dev, opt, args, fail=True): - return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail) - -def bpf_obj(name, sec=".text", path=bpf_test_dir,): - return "obj %s sec %s" % (os.path.join(path, name), sec) - -def bpf_pinned(name): - return "pinned %s" % (name) - -def bpf_bytecode(bytecode): - return "bytecode \"%s\"" % (bytecode) - -def mknetns(n_retry=10): - for i in range(n_retry): - name = ''.join([random.choice(string.ascii_letters) for i in range(8)]) - ret, _ = ip("netns add %s" % (name), fail=False) - if ret == 0: - netns.append(name) - return name - return None - -def int2str(fmt, val): - ret = [] - for b in struct.pack(fmt, val): - ret.append(int(b)) - return " ".join(map(lambda x: str(x), ret)) - -def str2int(strtab): - inttab = [] - for i in strtab: - inttab.append(int(i, 16)) - ba = bytearray(inttab) - if len(strtab) == 4: - fmt = "I" - elif len(strtab) == 8: - fmt = "Q" - else: - raise Exception("String array of len %d can't be unpacked to an int" % - (len(strtab))) - return struct.unpack(fmt, ba)[0] - -class DebugfsDir: - """ - Class for accessing DebugFS directories as a dictionary. - """ - - def __init__(self, path): - self.path = path - self._dict = self._debugfs_dir_read(path) - - def __len__(self): - return len(self._dict.keys()) - - def __getitem__(self, key): - if type(key) is int: - key = list(self._dict.keys())[key] - return self._dict[key] - - def __setitem__(self, key, value): - log("DebugFS set %s = %s" % (key, value), "") - log_level_inc() - - cmd("echo '%s' > %s/%s" % (value, self.path, key)) - log_level_dec() - - _, out = cmd('cat %s/%s' % (self.path, key)) - self._dict[key] = out.strip() - - def _debugfs_dir_read(self, path): - dfs = {} - - log("DebugFS state for %s" % (path), "") - log_level_inc(add=2) - - _, out = cmd('ls ' + path) - for f in out.split(): - if f == "ports": - continue - - p = os.path.join(path, f) - if not os.stat(p).st_mode & stat.S_IRUSR: - continue - - if os.path.isfile(p): - # We need to init trap_flow_action_cookie before read it - if f == "trap_flow_action_cookie": - cmd('echo deadbeef > %s/%s' % (path, f)) - _, out = cmd('cat %s/%s' % (path, f)) - dfs[f] = out.strip() - elif os.path.isdir(p): - dfs[f] = DebugfsDir(p) - else: - raise Exception("%s is neither file nor directory" % (p)) - - log_level_dec() - log("DebugFS state", dfs) - log_level_dec() - - return dfs - -class NetdevSimDev: - """ - Class for netdevsim bus device and its attributes. - """ - @staticmethod - def ctrl_write(path, val): - fullpath = os.path.join("/sys/bus/netdevsim/", path) - try: - with open(fullpath, "w") as f: - f.write(val) - except OSError as e: - log("WRITE %s: %r" % (fullpath, val), -e.errno) - raise e - log("WRITE %s: %r" % (fullpath, val), 0) - - def __init__(self, port_count=1): - addr = 0 - while True: - try: - self.ctrl_write("new_device", "%u %u" % (addr, port_count)) - except OSError as e: - if e.errno == errno.ENOSPC: - addr += 1 - continue - raise e - break - self.addr = addr - - # As probe of netdevsim device might happen from a workqueue, - # so wait here until all netdevs appear. - self.wait_for_netdevs(port_count) - - ret, out = cmd("udevadm settle", fail=False) - if ret: - raise Exception("udevadm settle failed") - ifnames = self.get_ifnames() - - devs.append(self) - self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr - - self.nsims = [] - for port_index in range(port_count): - self.nsims.append(NetdevSim(self, port_index, ifnames[port_index])) - - def get_ifnames(self): - ifnames = [] - listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr) - for ifname in listdir: - ifnames.append(ifname) - ifnames.sort() - return ifnames - - def wait_for_netdevs(self, port_count): - timeout = 5 - timeout_start = time.time() - - while True: - try: - ifnames = self.get_ifnames() - except FileNotFoundError as e: - ifnames = [] - if len(ifnames) == port_count: - break - if time.time() < timeout_start + timeout: - continue - raise Exception("netdevices did not appear within timeout") - - def dfs_num_bound_progs(self): - path = os.path.join(self.dfs_dir, "bpf_bound_progs") - _, progs = cmd('ls %s' % (path)) - return len(progs.split()) - - def dfs_get_bound_progs(self, expected): - progs = DebugfsDir(os.path.join(self.dfs_dir, "bpf_bound_progs")) - if expected is not None: - if len(progs) != expected: - fail(True, "%d BPF programs bound, expected %d" % - (len(progs), expected)) - return progs - - def remove(self): - self.ctrl_write("del_device", "%u" % (self.addr, )) - devs.remove(self) - - def remove_nsim(self, nsim): - self.nsims.remove(nsim) - self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), - "%u" % (nsim.port_index, )) - -class NetdevSim: - """ - Class for netdevsim netdevice and its attributes. - """ - - def __init__(self, nsimdev, port_index, ifname): - # In case udev renamed the netdev to according to new schema, - # check if the name matches the port_index. - nsimnamere = re.compile("eni\d+np(\d+)") - match = nsimnamere.match(ifname) - if match and int(match.groups()[0]) != port_index + 1: - raise Exception("netdevice name mismatches the expected one") - - self.nsimdev = nsimdev - self.port_index = port_index - self.ns = "" - self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) - self.dfs_refresh() - _, [self.dev] = ip("link show dev %s" % ifname) - - def __getitem__(self, key): - return self.dev[key] - - def remove(self): - self.nsimdev.remove_nsim(self) - - def dfs_refresh(self): - self.dfs = DebugfsDir(self.dfs_dir) - return self.dfs - - def dfs_read(self, f): - path = os.path.join(self.dfs_dir, f) - _, data = cmd('cat %s' % (path)) - return data.strip() - - def wait_for_flush(self, bound=0, total=0, n_retry=20): - for i in range(n_retry): - nbound = self.nsimdev.dfs_num_bound_progs() - nprogs = len(bpftool_prog_list()) - if nbound == bound and nprogs == total: - return - time.sleep(0.05) - raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs)) - - def set_ns(self, ns): - name = "1" if ns == "" else ns - ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns) - self.ns = ns - - def set_mtu(self, mtu, fail=True): - return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu), - fail=fail) - - def set_xdp(self, bpf, mode, force=False, JSON=True, verbose=False, - fail=True, include_stderr=False): - if verbose: - bpf += " verbose" - return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf), - force=force, JSON=JSON, - fail=fail, include_stderr=include_stderr) - - def unset_xdp(self, mode, force=False, JSON=True, - fail=True, include_stderr=False): - return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode), - force=force, JSON=JSON, - fail=fail, include_stderr=include_stderr) - - def ip_link_show(self, xdp): - _, link = ip("link show dev %s" % (self['ifname'])) - if len(link) > 1: - raise Exception("Multiple objects on ip link show") - if len(link) < 1: - return {} - fail(xdp != "xdp" in link, - "XDP program not reporting in iplink (reported %s, expected %s)" % - ("xdp" in link, xdp)) - return link[0] - - def tc_add_ingress(self): - tc("qdisc add dev %s ingress" % (self['ifname'])) - - def tc_del_ingress(self): - tc("qdisc del dev %s ingress" % (self['ifname'])) - - def tc_flush_filters(self, bound=0, total=0): - self.tc_del_ingress() - self.tc_add_ingress() - self.wait_for_flush(bound=bound, total=total) - - def tc_show_ingress(self, expected=None): - # No JSON support, oh well... - flags = ["skip_sw", "skip_hw", "in_hw"] - named = ["protocol", "pref", "chain", "handle", "id", "tag"] - - args = "-s filter show dev %s ingress" % (self['ifname']) - _, out = tc(args, JSON=False) - - filters = [] - lines = out.split('\n') - for line in lines: - words = line.split() - if "handle" not in words: - continue - fltr = {} - for flag in flags: - fltr[flag] = flag in words - for name in named: - try: - idx = words.index(name) - fltr[name] = words[idx + 1] - except ValueError: - pass - filters.append(fltr) - - if expected is not None: - fail(len(filters) != expected, - "%d ingress filters loaded, expected %d" % - (len(filters), expected)) - return filters - - def cls_filter_op(self, op, qdisc="ingress", prio=None, handle=None, - chain=None, cls="", params="", - fail=True, include_stderr=False): - spec = "" - if prio is not None: - spec += " prio %d" % (prio) - if handle: - spec += " handle %s" % (handle) - if chain is not None: - spec += " chain %d" % (chain) - - return tc("filter {op} dev {dev} {qdisc} {spec} {cls} {params}"\ - .format(op=op, dev=self['ifname'], qdisc=qdisc, spec=spec, - cls=cls, params=params), - fail=fail, include_stderr=include_stderr) - - def cls_bpf_add_filter(self, bpf, op="add", prio=None, handle=None, - chain=None, da=False, verbose=False, - skip_sw=False, skip_hw=False, - fail=True, include_stderr=False): - cls = "bpf " + bpf - - params = "" - if da: - params += " da" - if verbose: - params += " verbose" - if skip_sw: - params += " skip_sw" - if skip_hw: - params += " skip_hw" - - return self.cls_filter_op(op=op, prio=prio, handle=handle, cls=cls, - chain=chain, params=params, - fail=fail, include_stderr=include_stderr) - - def set_ethtool_tc_offloads(self, enable, fail=True): - args = "hw-tc-offload %s" % ("on" if enable else "off") - return ethtool(self, "-K", args, fail=fail) - -################################################################################ -def clean_up(): - global files, netns, devs - - for dev in devs: - dev.remove() - for f in files: - cmd("rm -f %s" % (f)) - for ns in netns: - cmd("ip netns delete %s" % (ns)) - files = [] - netns = [] - -def pin_prog(file_name, idx=0): - progs = bpftool_prog_list(expected=(idx + 1)) - prog = progs[idx] - bpftool("prog pin id %d %s" % (prog["id"], file_name)) - files.append(file_name) - - return file_name, bpf_pinned(file_name) - -def pin_map(file_name, idx=0, expected=1): - maps = bpftool_map_list(expected=expected) - m = maps[idx] - bpftool("map pin id %d %s" % (m["id"], file_name)) - files.append(file_name) - - return file_name, bpf_pinned(file_name) - -def check_dev_info_removed(prog_file=None, map_file=None): - bpftool_prog_list(expected=0) - bpftool_prog_list(expected=1, exclude_orphaned=False) - ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) - fail(ret != 0, "failed to show prog with removed device") - - bpftool_map_list(expected=0) - ret, err = bpftool("map show pin %s" % (map_file), fail=False) - fail(ret == 0, "Showing map with removed device did not fail") - fail(err["error"].find("No such device") == -1, - "Showing map with removed device expected ENODEV, error is %s" % - (err["error"])) - -def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): - progs = bpftool_prog_list(expected=1, ns=ns) - prog = progs[0] - - fail("dev" not in prog.keys(), "Device parameters not reported") - dev = prog["dev"] - fail("ifindex" not in dev.keys(), "Device parameters not reported") - fail("ns_dev" not in dev.keys(), "Device parameters not reported") - fail("ns_inode" not in dev.keys(), "Device parameters not reported") - - if not other_ns: - fail("ifname" not in dev.keys(), "Ifname not reported") - fail(dev["ifname"] != sim["ifname"], - "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"])) - else: - fail("ifname" in dev.keys(), "Ifname is reported for other ns") - - maps = bpftool_map_list(expected=2, ns=ns) - for m in maps: - fail("dev" not in m.keys(), "Device parameters not reported") - fail(dev != m["dev"], "Map's device different than program's") - -def check_extack(output, reference, args): - if skip_extack: - return - lines = output.split("\n") - comp = len(lines) >= 2 and lines[1] == 'Error: ' + reference - fail(not comp, "Missing or incorrect netlink extack message") - -def check_extack_nsim(output, reference, args): - check_extack(output, "netdevsim: " + reference, args) - -def check_no_extack(res, needle): - fail((res[1] + res[2]).count(needle) or (res[1] + res[2]).count("Warning:"), - "Found '%s' in command output, leaky extack?" % (needle)) - -def check_verifier_log(output, reference): - lines = output.split("\n") - for l in reversed(lines): - if l == reference: - return - fail(True, "Missing or incorrect message from netdevsim in verifier log") - -def check_multi_basic(two_xdps): - fail(two_xdps["mode"] != 4, "Bad mode reported with multiple programs") - fail("prog" in two_xdps, "Base program reported in multi program mode") - fail(len(two_xdps["attached"]) != 2, - "Wrong attached program count with two programs") - fail(two_xdps["attached"][0]["prog"]["id"] == - two_xdps["attached"][1]["prog"]["id"], - "Offloaded and other programs have the same id") - -def test_spurios_extack(sim, obj, skip_hw, needle): - res = sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=skip_hw, - include_stderr=True) - check_no_extack(res, needle) - res = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, - skip_hw=skip_hw, include_stderr=True) - check_no_extack(res, needle) - res = sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf", - include_stderr=True) - check_no_extack(res, needle) - -def test_multi_prog(simdev, sim, obj, modename, modeid): - start_test("Test multi-attachment XDP - %s + offload..." % - (modename or "default", )) - sim.set_xdp(obj, "offload") - xdp = sim.ip_link_show(xdp=True)["xdp"] - offloaded = sim.dfs_read("bpf_offloaded_id") - fail("prog" not in xdp, "Base program not reported in single program mode") - fail(len(xdp["attached"]) != 1, - "Wrong attached program count with one program") - - sim.set_xdp(obj, modename) - two_xdps = sim.ip_link_show(xdp=True)["xdp"] - - fail(xdp["attached"][0] not in two_xdps["attached"], - "Offload program not reported after other activated") - check_multi_basic(two_xdps) - - offloaded2 = sim.dfs_read("bpf_offloaded_id") - fail(offloaded != offloaded2, - "Offload ID changed after loading other program") - - start_test("Test multi-attachment XDP - replace...") - ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) - fail(ret == 0, "Replaced one of programs without -force") - check_extack(err, "XDP program already attached.", args) - - start_test("Test multi-attachment XDP - remove without mode...") - ret, _, err = sim.unset_xdp("", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program without a mode flag") - check_extack(err, "More than one program loaded, unset mode is ambiguous.", args) - - sim.unset_xdp("offload") - xdp = sim.ip_link_show(xdp=True)["xdp"] - offloaded = sim.dfs_read("bpf_offloaded_id") - - fail(xdp["mode"] != modeid, "Bad mode reported after multiple programs") - fail("prog" not in xdp, - "Base program not reported after multi program mode") - fail(xdp["attached"][0] not in two_xdps["attached"], - "Offload program not reported after other activated") - fail(len(xdp["attached"]) != 1, - "Wrong attached program count with remaining programs") - fail(offloaded != "0", "Offload ID reported with only other program left") - - start_test("Test multi-attachment XDP - reattach...") - sim.set_xdp(obj, "offload") - two_xdps = sim.ip_link_show(xdp=True)["xdp"] - - fail(xdp["attached"][0] not in two_xdps["attached"], - "Other program not reported after offload activated") - check_multi_basic(two_xdps) - - start_test("Test multi-attachment XDP - device remove...") - simdev.remove() - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_ethtool_tc_offloads(True) - return [simdev, sim] - -# Parse command line -parser = argparse.ArgumentParser() -parser.add_argument("--log", help="output verbose log to given file") -args = parser.parse_args() -if args.log: - logfile = open(args.log, 'w+') - logfile.write("# -*-Org-*-") - -log("Prepare...", "", level=1) -log_level_inc() - -# Check permissions -skip(os.getuid() != 0, "test must be run as root") - -# Check tools -ret, progs = bpftool("prog", fail=False) -skip(ret != 0, "bpftool not installed") -base_progs = progs -_, base_maps = bpftool("map") -base_map_names = [ - 'pid_iter.rodata', # created on each bpftool invocation - 'libbpf_det_bind', # created on each bpftool invocation -] - -# Check netdevsim -if not os.path.isdir("/sys/bus/netdevsim/"): - ret, out = cmd("modprobe netdevsim", fail=False) - skip(ret != 0, "netdevsim module could not be loaded") - -# Check debugfs -_, out = cmd("mount") -if out.find("/sys/kernel/debug type debugfs") == -1: - cmd("mount -t debugfs none /sys/kernel/debug") - -# Check samples are compiled -samples = ["sample_ret0.bpf.o", "sample_map_ret0.bpf.o"] -for s in samples: - ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False) - skip(ret != 0, "sample %s/%s not found, please compile it" % - (bpf_test_dir, s)) - -# Check if iproute2 is built with libmnl (needed by extack support) -_, _, err = cmd("tc qdisc delete dev lo handle 0", - fail=False, include_stderr=True) -if err.find("Error: Failed to find qdisc with specified handle.") == -1: - print("Warning: no extack message in iproute2 output, libmnl missing?") - log("Warning: no extack message in iproute2 output, libmnl missing?", "") - skip_extack = True - -# Check if net namespaces seem to work -ns = mknetns() -skip(ns is None, "Could not create a net namespace") -cmd("ip netns delete %s" % (ns)) -netns = [] - -try: - obj = bpf_obj("sample_ret0.bpf.o") - bytecode = bpf_bytecode("1,6 0 0 4294967295,") - - start_test("Test destruction of generic XDP...") - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_xdp(obj, "generic") - simdev.remove() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.tc_add_ingress() - - start_test("Test TC non-offloaded...") - ret, _ = sim.cls_bpf_add_filter(obj, skip_hw=True, fail=False) - fail(ret != 0, "Software TC filter did not load") - - start_test("Test TC non-offloaded isn't getting bound...") - ret, _ = sim.cls_bpf_add_filter(obj, fail=False) - fail(ret != 0, "Software TC filter did not load") - simdev.dfs_get_bound_progs(expected=0) - - sim.tc_flush_filters() - - start_test("Test TC offloads are off by default...") - ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "TC filter loaded without enabling TC offloads") - check_extack(err, "TC offload is disabled on net device.", args) - sim.wait_for_flush() - - sim.set_ethtool_tc_offloads(True) - sim.dfs["bpf_tc_non_bound_accept"] = "Y" - - start_test("Test TC offload by default...") - ret, _ = sim.cls_bpf_add_filter(obj, fail=False) - fail(ret != 0, "Software TC filter did not load") - simdev.dfs_get_bound_progs(expected=0) - ingress = sim.tc_show_ingress(expected=1) - fltr = ingress[0] - fail(not fltr["in_hw"], "Filter not offloaded by default") - - sim.tc_flush_filters() - - start_test("Test TC cBPF bytcode tries offload by default...") - ret, _ = sim.cls_bpf_add_filter(bytecode, fail=False) - fail(ret != 0, "Software TC filter did not load") - simdev.dfs_get_bound_progs(expected=0) - ingress = sim.tc_show_ingress(expected=1) - fltr = ingress[0] - fail(not fltr["in_hw"], "Bytecode not offloaded by default") - - sim.tc_flush_filters() - sim.dfs["bpf_tc_non_bound_accept"] = "N" - - start_test("Test TC cBPF unbound bytecode doesn't offload...") - ret, _, err = sim.cls_bpf_add_filter(bytecode, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "TC bytecode loaded for offload") - check_extack_nsim(err, "netdevsim configured to reject unbound programs.", - args) - sim.wait_for_flush() - - start_test("Test non-0 chain offload...") - ret, _, err = sim.cls_bpf_add_filter(obj, chain=1, prio=1, handle=1, - skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "Offloaded a filter to chain other than 0") - check_extack(err, "Driver supports only offload of chain 0.", args) - sim.tc_flush_filters() - - start_test("Test TC replace...") - sim.cls_bpf_add_filter(obj, prio=1, handle=1) - sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_sw=True) - sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_sw=True) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=True) - sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_hw=True) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - start_test("Test TC replace bad flags...") - for i in range(3): - for j in range(3): - ret, _ = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, - skip_sw=(j == 1), skip_hw=(j == 2), - fail=False) - fail(bool(ret) != bool(j), - "Software TC incorrect load in replace test, iteration %d" % - (j)) - sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") - - start_test("Test spurious extack from the driver...") - test_spurios_extack(sim, obj, False, "netdevsim") - test_spurios_extack(sim, obj, True, "netdevsim") - - sim.set_ethtool_tc_offloads(False) - - test_spurios_extack(sim, obj, False, "TC offload is disabled") - test_spurios_extack(sim, obj, True, "TC offload is disabled") - - sim.set_ethtool_tc_offloads(True) - - sim.tc_flush_filters() - - start_test("Test TC offloads failure...") - sim.dfs["dev/bpf_bind_verifier_accept"] = 0 - ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "TC filter did not reject with TC offloads enabled") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") - sim.dfs["dev/bpf_bind_verifier_accept"] = 1 - - start_test("Test TC offloads work...") - ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, - fail=False, include_stderr=True) - fail(ret != 0, "TC filter did not load with TC offloads enabled") - - start_test("Test TC offload basics...") - dfs = simdev.dfs_get_bound_progs(expected=1) - progs = bpftool_prog_list(expected=1) - ingress = sim.tc_show_ingress(expected=1) - - dprog = dfs[0] - prog = progs[0] - fltr = ingress[0] - fail(fltr["skip_hw"], "TC does reports 'skip_hw' on offloaded filter") - fail(not fltr["in_hw"], "TC does not report 'in_hw' for offloaded filter") - fail(not fltr["skip_sw"], "TC does not report 'skip_sw' back") - - start_test("Test TC offload is device-bound...") - fail(str(prog["id"]) != fltr["id"], "Program IDs don't match") - fail(prog["tag"] != fltr["tag"], "Program tags don't match") - fail(fltr["id"] != dprog["id"], "Program IDs don't match") - fail(dprog["state"] != "xlated", "Offloaded program state not translated") - fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") - - start_test("Test disabling TC offloads is rejected while filters installed...") - ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) - fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...") - sim.set_ethtool_tc_offloads(True) - - start_test("Test qdisc removal frees things...") - sim.tc_flush_filters() - sim.tc_show_ingress(expected=0) - - start_test("Test disabling TC offloads is OK without filters...") - ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) - fail(ret != 0, - "Driver refused to disable TC offloads without filters installed...") - - sim.set_ethtool_tc_offloads(True) - - start_test("Test destroying device gets rid of TC filters...") - sim.cls_bpf_add_filter(obj, skip_sw=True) - simdev.remove() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_ethtool_tc_offloads(True) - - start_test("Test destroying device gets rid of XDP...") - sim.set_xdp(obj, "offload") - simdev.remove() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_ethtool_tc_offloads(True) - - start_test("Test XDP prog reporting...") - sim.set_xdp(obj, "drv") - ipl = sim.ip_link_show(xdp=True) - progs = bpftool_prog_list(expected=1) - fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], - "Loaded program has wrong ID") - - start_test("Test XDP prog replace without force...") - ret, _ = sim.set_xdp(obj, "drv", fail=False) - fail(ret == 0, "Replaced XDP program without -force") - sim.wait_for_flush(total=1) - - start_test("Test XDP prog replace with force...") - ret, _ = sim.set_xdp(obj, "drv", force=True, fail=False) - fail(ret != 0, "Could not replace XDP program with -force") - bpftool_prog_list_wait(expected=1) - ipl = sim.ip_link_show(xdp=True) - progs = bpftool_prog_list(expected=1) - fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], - "Loaded program has wrong ID") - fail("dev" in progs[0].keys(), - "Device parameters reported for non-offloaded program") - - start_test("Test XDP prog replace with bad flags...") - ret, _, err = sim.set_xdp(obj, "generic", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Replaced XDP program with a program in different mode") - check_extack(err, - "Native and generic XDP can't be active at the same time.", - args) - - start_test("Test MTU restrictions...") - ret, _ = sim.set_mtu(9000, fail=False) - fail(ret == 0, - "Driver should refuse to increase MTU to 9000 with XDP loaded...") - sim.unset_xdp("drv") - bpftool_prog_list_wait(expected=0) - sim.set_mtu(9000) - ret, _, err = sim.set_xdp(obj, "drv", fail=False, include_stderr=True) - fail(ret == 0, "Driver should refuse to load program with MTU of 9000...") - check_extack_nsim(err, "MTU too large w/ XDP enabled.", args) - sim.set_mtu(1500) - - sim.wait_for_flush() - start_test("Test non-offload XDP attaching to HW...") - bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/nooffload") - nooffload = bpf_pinned("/sys/fs/bpf/nooffload") - ret, _, err = sim.set_xdp(nooffload, "offload", - fail=False, include_stderr=True) - fail(ret == 0, "attached non-offloaded XDP program to HW") - check_extack_nsim(err, "xdpoffload of non-bound program.", args) - rm("/sys/fs/bpf/nooffload") - - start_test("Test offload XDP attaching to drv...") - bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", - dev=sim['ifname']) - offload = bpf_pinned("/sys/fs/bpf/offload") - ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) - fail(ret == 0, "attached offloaded XDP program to drv") - check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args) - rm("/sys/fs/bpf/offload") - sim.wait_for_flush() - - start_test("Test XDP load failure...") - sim.dfs["dev/bpf_bind_verifier_accept"] = 0 - ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", - dev=sim['ifname'], fail=False, include_stderr=True) - fail(ret == 0, "verifier should fail on load") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") - sim.dfs["dev/bpf_bind_verifier_accept"] = 1 - sim.wait_for_flush() - - start_test("Test XDP offload...") - _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True) - ipl = sim.ip_link_show(xdp=True) - link_xdp = ipl["xdp"]["prog"] - progs = bpftool_prog_list(expected=1) - prog = progs[0] - fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID") - - start_test("Test XDP offload is device bound...") - dfs = simdev.dfs_get_bound_progs(expected=1) - dprog = dfs[0] - - fail(prog["id"] != link_xdp["id"], "Program IDs don't match") - fail(prog["tag"] != link_xdp["tag"], "Program tags don't match") - fail(str(link_xdp["id"]) != dprog["id"], "Program IDs don't match") - fail(dprog["state"] != "xlated", "Offloaded program state not translated") - fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") - - start_test("Test removing XDP program many times...") - sim.unset_xdp("offload") - sim.unset_xdp("offload") - sim.unset_xdp("drv") - sim.unset_xdp("drv") - sim.unset_xdp("") - sim.unset_xdp("") - bpftool_prog_list_wait(expected=0) - - start_test("Test attempt to use a program for a wrong device...") - simdev2 = NetdevSimDev() - sim2, = simdev2.nsims - sim2.set_xdp(obj, "offload") - pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") - - ret, _, err = sim.set_xdp(pinned, "offload", - fail=False, include_stderr=True) - fail(ret == 0, "Pinned program loaded for a different device accepted") - check_extack(err, "Program bound to different device.", args) - simdev2.remove() - ret, _, err = sim.set_xdp(pinned, "offload", - fail=False, include_stderr=True) - fail(ret == 0, "Pinned program loaded for a removed device accepted") - check_extack(err, "Program bound to different device.", args) - rm(pin_file) - bpftool_prog_list_wait(expected=0) - - simdev, sim = test_multi_prog(simdev, sim, obj, "", 1) - simdev, sim = test_multi_prog(simdev, sim, obj, "drv", 1) - simdev, sim = test_multi_prog(simdev, sim, obj, "generic", 2) - - start_test("Test mixing of TC and XDP...") - sim.tc_add_ingress() - sim.set_xdp(obj, "offload") - ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "Loading TC when XDP active should fail") - check_extack_nsim(err, "driver and netdev offload states mismatch.", args) - sim.unset_xdp("offload") - sim.wait_for_flush() - - sim.cls_bpf_add_filter(obj, skip_sw=True) - ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) - fail(ret == 0, "Loading XDP when TC active should fail") - check_extack_nsim(err, "TC program is already loaded.", args) - - start_test("Test binding TC from pinned...") - pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") - sim.tc_flush_filters(bound=1, total=1) - sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True) - sim.tc_flush_filters(bound=1, total=1) - - start_test("Test binding XDP from pinned...") - sim.set_xdp(obj, "offload") - pin_file, pinned = pin_prog("/sys/fs/bpf/tmp2", idx=1) - - sim.set_xdp(pinned, "offload", force=True) - sim.unset_xdp("offload") - sim.set_xdp(pinned, "offload", force=True) - sim.unset_xdp("offload") - - start_test("Test offload of wrong type fails...") - ret, _ = sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True, fail=False) - fail(ret == 0, "Managed to attach XDP program to TC") - - start_test("Test asking for TC offload of two filters...") - sim.cls_bpf_add_filter(obj, da=True, skip_sw=True) - ret, _, err = sim.cls_bpf_add_filter(obj, da=True, skip_sw=True, - fail=False, include_stderr=True) - fail(ret == 0, "Managed to offload two TC filters at the same time") - check_extack_nsim(err, "driver and netdev offload states mismatch.", args) - - sim.tc_flush_filters(bound=2, total=2) - - start_test("Test if netdev removal waits for translation...") - delay_msec = 500 - sim.dfs["dev/bpf_bind_verifier_delay"] = delay_msec - start = time.time() - cmd_line = "tc filter add dev %s ingress bpf %s da skip_sw" % \ - (sim['ifname'], obj) - tc_proc = cmd(cmd_line, background=True, fail=False) - # Wait for the verifier to start - while simdev.dfs_num_bound_progs() <= 2: - pass - simdev.remove() - end = time.time() - ret, _ = cmd_result(tc_proc, fail=False) - time_diff = end - start - log("Time", "start:\t%s\nend:\t%s\ndiff:\t%s" % (start, end, time_diff)) - - fail(ret == 0, "Managed to load TC filter on a unregistering device") - delay_sec = delay_msec * 0.001 - fail(time_diff < delay_sec, "Removal process took %s, expected %s" % - (time_diff, delay_sec)) - - # Remove all pinned files and reinstantiate the netdev - clean_up() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - map_obj = bpf_obj("sample_map_ret0.bpf.o") - start_test("Test loading program with maps...") - sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - - start_test("Test bpftool bound info reporting (own ns)...") - check_dev_info(False, "") - - start_test("Test bpftool bound info reporting (other ns)...") - ns = mknetns() - sim.set_ns(ns) - check_dev_info(True, "") - - start_test("Test bpftool bound info reporting (remote ns)...") - check_dev_info(False, ns) - - start_test("Test bpftool bound info reporting (back to own ns)...") - sim.set_ns("") - check_dev_info(False, "") - - prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog") - map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2) - simdev.remove() - - start_test("Test bpftool bound info reporting (removed dev)...") - check_dev_info_removed(prog_file=prog_file, map_file=map_file) - - # Remove all pinned files and reinstantiate the netdev - clean_up() - bpftool_prog_list_wait(expected=0) - - simdev = NetdevSimDev() - sim, = simdev.nsims - - start_test("Test map update (no flags)...") - sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - maps = bpftool_map_list(expected=2) - array = maps[0] if maps[0]["type"] == "array" else maps[1] - htab = maps[0] if maps[0]["type"] == "hash" else maps[1] - for m in maps: - for i in range(2): - bpftool("map update id %d key %s value %s" % - (m["id"], int2str("I", i), int2str("Q", i * 3))) - - for m in maps: - ret, _ = bpftool("map update id %d key %s value %s" % - (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), - fail=False) - fail(ret == 0, "added too many entries") - - start_test("Test map update (exists)...") - for m in maps: - for i in range(2): - bpftool("map update id %d key %s value %s exist" % - (m["id"], int2str("I", i), int2str("Q", i * 3))) - - for m in maps: - ret, err = bpftool("map update id %d key %s value %s exist" % - (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), - fail=False) - fail(ret == 0, "updated non-existing key") - fail(err["error"].find("No such file or directory") == -1, - "expected ENOENT, error is '%s'" % (err["error"])) - - start_test("Test map update (noexist)...") - for m in maps: - for i in range(2): - ret, err = bpftool("map update id %d key %s value %s noexist" % - (m["id"], int2str("I", i), int2str("Q", i * 3)), - fail=False) - fail(ret == 0, "updated existing key") - fail(err["error"].find("File exists") == -1, - "expected EEXIST, error is '%s'" % (err["error"])) - - start_test("Test map dump...") - for m in maps: - _, entries = bpftool("map dump id %d" % (m["id"])) - for i in range(2): - key = str2int(entries[i]["key"]) - fail(key != i, "expected key %d, got %d" % (key, i)) - val = str2int(entries[i]["value"]) - fail(val != i * 3, "expected value %d, got %d" % (val, i * 3)) - - start_test("Test map getnext...") - for m in maps: - _, entry = bpftool("map getnext id %d" % (m["id"])) - key = str2int(entry["next_key"]) - fail(key != 0, "next key %d, expected %d" % (key, 0)) - _, entry = bpftool("map getnext id %d key %s" % - (m["id"], int2str("I", 0))) - key = str2int(entry["next_key"]) - fail(key != 1, "next key %d, expected %d" % (key, 1)) - ret, err = bpftool("map getnext id %d key %s" % - (m["id"], int2str("I", 1)), fail=False) - fail(ret == 0, "got next key past the end of map") - fail(err["error"].find("No such file or directory") == -1, - "expected ENOENT, error is '%s'" % (err["error"])) - - start_test("Test map delete (htab)...") - for i in range(2): - bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i))) - - start_test("Test map delete (array)...") - for i in range(2): - ret, err = bpftool("map delete id %d key %s" % - (htab["id"], int2str("I", i)), fail=False) - fail(ret == 0, "removed entry from an array") - fail(err["error"].find("No such file or directory") == -1, - "expected ENOENT, error is '%s'" % (err["error"])) - - start_test("Test map remove...") - sim.unset_xdp("offload") - bpftool_map_list_wait(expected=0) - simdev.remove() - - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - simdev.remove() - bpftool_map_list_wait(expected=0) - - start_test("Test map creation fail path...") - simdev = NetdevSimDev() - sim, = simdev.nsims - sim.dfs["bpf_map_accept"] = "N" - ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) - fail(ret == 0, - "netdevsim didn't refuse to create a map with offload disabled") - - simdev.remove() - - start_test("Test multi-dev ASIC program reuse...") - simdevA = NetdevSimDev() - simA, = simdevA.nsims - simdevB = NetdevSimDev(3) - simB1, simB2, simB3 = simdevB.nsims - sims = (simA, simB1, simB2, simB3) - simB = (simB1, simB2, simB3) - - bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA", - dev=simA['ifname']) - progA = bpf_pinned("/sys/fs/bpf/nsimA") - bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB", - dev=simB1['ifname']) - progB = bpf_pinned("/sys/fs/bpf/nsimB") - - simA.set_xdp(progA, "offload", JSON=False) - for d in simdevB.nsims: - d.set_xdp(progB, "offload", JSON=False) - - start_test("Test multi-dev ASIC cross-dev replace...") - ret, _ = simA.set_xdp(progB, "offload", force=True, JSON=False, fail=False) - fail(ret == 0, "cross-ASIC program allowed") - for d in simdevB.nsims: - ret, _ = d.set_xdp(progA, "offload", force=True, JSON=False, fail=False) - fail(ret == 0, "cross-ASIC program allowed") - - start_test("Test multi-dev ASIC cross-dev install...") - for d in sims: - d.unset_xdp("offload") - - ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False, - fail=False, include_stderr=True) - fail(ret == 0, "cross-ASIC program allowed") - check_extack(err, "Program bound to different device.", args) - for d in simdevB.nsims: - ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False, - fail=False, include_stderr=True) - fail(ret == 0, "cross-ASIC program allowed") - check_extack(err, "Program bound to different device.", args) - - start_test("Test multi-dev ASIC cross-dev map reuse...") - - mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0] - mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0] - - ret, _ = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", - dev=simB3['ifname'], - maps=["idx 0 id %d" % (mapB)], - fail=False) - fail(ret != 0, "couldn't reuse a map on the same ASIC") - rm("/sys/fs/bpf/nsimB_") - - ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA_", - dev=simA['ifname'], - maps=["idx 0 id %d" % (mapB)], - fail=False, include_stderr=True) - fail(ret == 0, "could reuse a map on a different ASIC") - fail(err.count("offload device mismatch between prog and map") == 0, - "error message missing for cross-ASIC map") - - ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", - dev=simB1['ifname'], - maps=["idx 0 id %d" % (mapA)], - fail=False, include_stderr=True) - fail(ret == 0, "could reuse a map on a different ASIC") - fail(err.count("offload device mismatch between prog and map") == 0, - "error message missing for cross-ASIC map") - - start_test("Test multi-dev ASIC cross-dev destruction...") - bpftool_prog_list_wait(expected=2) - - simdevA.remove() - bpftool_prog_list_wait(expected=1) - - ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] - fail(ifnameB != simB1['ifname'], "program not bound to original device") - simB1.remove() - bpftool_prog_list_wait(expected=1) - - start_test("Test multi-dev ASIC cross-dev destruction - move...") - ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] - fail(ifnameB not in (simB2['ifname'], simB3['ifname']), - "program not bound to remaining devices") - - simB2.remove() - ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] - fail(ifnameB != simB3['ifname'], "program not bound to remaining device") - - simB3.remove() - simdevB.remove() - bpftool_prog_list_wait(expected=0) - - start_test("Test multi-dev ASIC cross-dev destruction - orphaned...") - ret, out = bpftool("prog show %s" % (progB), fail=False) - fail(ret != 0, "couldn't get information about orphaned program") - - print("%s: OK" % (os.path.basename(__file__))) - -finally: - log("Clean up...", "", level=1) - log_level_inc() - clean_up() diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 5e34c93aa51b..e8bfa715aa49 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -84,6 +84,8 @@ TEST_GEN_FILES += sctp_hello TEST_GEN_FILES += csum TEST_GEN_FILES += nat6to4.o TEST_GEN_FILES += xdp_dummy.o +TEST_GEN_FILES += sample_ret0.bpf.o +TEST_GEN_FILES += sample_map_ret0.bpf.o TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh @@ -93,6 +95,7 @@ TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += fdb_flush.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh +TEST_PROGS += bpf_offload.py TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh @@ -142,7 +145,10 @@ endif CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) -$(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o: $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) +BPF_PROG_OBJS := $(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o \ + $(OUTPUT)/sample_map_ret0.bpf.o $(OUTPUT)/sample_ret0.bpf.o + +$(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) $(CLANG) -O2 --target=bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py new file mode 100755 index 000000000000..6157f884d091 --- /dev/null +++ b/tools/testing/selftests/net/bpf_offload.py @@ -0,0 +1,1405 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2017 Netronome Systems, Inc. +# Copyright (c) 2019 Mellanox Technologies. All rights reserved +# +# This software is licensed under the GNU General License Version 2, +# June 1991 as shown in the file COPYING in the top-level directory of this +# source tree. +# +# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" +# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, +# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE +# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME +# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +from datetime import datetime +import argparse +import errno +import json +import os +import pprint +import random +import re +import stat +import string +import struct +import subprocess +import time +import traceback + +logfile = None +log_level = 1 +skip_extack = False +bpf_test_dir = os.path.dirname(os.path.realpath(__file__)) +pp = pprint.PrettyPrinter() +devs = [] # devices we created for clean up +files = [] # files to be removed +netns = [] # net namespaces to be removed + +def log_get_sec(level=0): + return "*" * (log_level + level) + +def log_level_inc(add=1): + global log_level + log_level += add + +def log_level_dec(sub=1): + global log_level + log_level -= sub + +def log_level_set(level): + global log_level + log_level = level + +def log(header, data, level=None): + """ + Output to an optional log. + """ + if logfile is None: + return + if level is not None: + log_level_set(level) + + if not isinstance(data, str): + data = pp.pformat(data) + + if len(header): + logfile.write("\n" + log_get_sec() + " ") + logfile.write(header) + if len(header) and len(data.strip()): + logfile.write("\n") + logfile.write(data) + +def skip(cond, msg): + if not cond: + return + print("SKIP: " + msg) + log("SKIP: " + msg, "", level=1) + os.sys.exit(0) + +def fail(cond, msg): + if not cond: + return + print("FAIL: " + msg) + tb = "".join(traceback.extract_stack().format()) + print(tb) + log("FAIL: " + msg, tb, level=1) + os.sys.exit(1) + +def start_test(msg): + log(msg, "", level=1) + log_level_inc() + print(msg) + +def cmd(cmd, shell=True, include_stderr=False, background=False, fail=True): + """ + Run a command in subprocess and return tuple of (retval, stdout); + optionally return stderr as well as third value. + """ + proc = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if background: + msg = "%s START: %s" % (log_get_sec(1), + datetime.now().strftime("%H:%M:%S.%f")) + log("BKG " + proc.args, msg) + return proc + + return cmd_result(proc, include_stderr=include_stderr, fail=fail) + +def cmd_result(proc, include_stderr=False, fail=False): + stdout, stderr = proc.communicate() + stdout = stdout.decode("utf-8") + stderr = stderr.decode("utf-8") + proc.stdout.close() + proc.stderr.close() + + stderr = "\n" + stderr + if stderr[-1] == "\n": + stderr = stderr[:-1] + + sec = log_get_sec(1) + log("CMD " + proc.args, + "RETCODE: %d\n%s STDOUT:\n%s%s STDERR:%s\n%s END: %s" % + (proc.returncode, sec, stdout, sec, stderr, + sec, datetime.now().strftime("%H:%M:%S.%f"))) + + if proc.returncode != 0 and fail: + if len(stderr) > 0 and stderr[-1] == "\n": + stderr = stderr[:-1] + raise Exception("Command failed: %s\n%s" % (proc.args, stderr)) + + if include_stderr: + return proc.returncode, stdout, stderr + else: + return proc.returncode, stdout + +def rm(f): + cmd("rm -f %s" % (f)) + if f in files: + files.remove(f) + +def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False): + params = "" + if JSON: + params += "%s " % (flags["json"]) + + if ns != "": + ns = "ip netns exec %s " % (ns) + + if include_stderr: + ret, stdout, stderr = cmd(ns + name + " " + params + args, + fail=fail, include_stderr=True) + else: + ret, stdout = cmd(ns + name + " " + params + args, + fail=fail, include_stderr=False) + + if JSON and len(stdout.strip()) != 0: + out = json.loads(stdout) + else: + out = stdout + + if include_stderr: + return ret, out, stderr + else: + return ret, out + +def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False): + return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, + fail=fail, include_stderr=include_stderr) + +def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True): + _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) + # Remove the base progs + for p in base_progs: + if p in progs: + progs.remove(p) + if exclude_orphaned: + progs = [ p for p in progs if not p['orphaned'] ] + if expected is not None: + if len(progs) != expected: + fail(True, "%d BPF programs loaded, expected %d" % + (len(progs), expected)) + return progs + +def bpftool_map_list(expected=None, ns=""): + _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) + # Remove the base maps + maps = [m for m in maps if m not in base_maps and m.get('name') and m.get('name') not in base_map_names] + if expected is not None: + if len(maps) != expected: + fail(True, "%d BPF maps loaded, expected %d" % + (len(maps), expected)) + return maps + +def bpftool_prog_list_wait(expected=0, n_retry=20): + for i in range(n_retry): + nprogs = len(bpftool_prog_list()) + if nprogs == expected: + return + time.sleep(0.05) + raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) + +def bpftool_map_list_wait(expected=0, n_retry=20): + for i in range(n_retry): + nmaps = len(bpftool_map_list()) + if nmaps == expected: + return + time.sleep(0.05) + raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) + +def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None, + fail=True, include_stderr=False): + args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name) + if prog_type is not None: + args += " type " + prog_type + if dev is not None: + args += " dev " + dev + if len(maps): + args += " map " + " map ".join(maps) + + res = bpftool(args, fail=fail, include_stderr=include_stderr) + if res[0] == 0: + files.append(file_name) + return res + +def ip(args, force=False, JSON=True, ns="", fail=True, include_stderr=False): + if force: + args = "-force " + args + return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns, + fail=fail, include_stderr=include_stderr) + +def tc(args, JSON=True, ns="", fail=True, include_stderr=False): + return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns, + fail=fail, include_stderr=include_stderr) + +def ethtool(dev, opt, args, fail=True): + return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail) + +def bpf_obj(name, sec=".text", path=bpf_test_dir,): + return "obj %s sec %s" % (os.path.join(path, name), sec) + +def bpf_pinned(name): + return "pinned %s" % (name) + +def bpf_bytecode(bytecode): + return "bytecode \"%s\"" % (bytecode) + +def mknetns(n_retry=10): + for i in range(n_retry): + name = ''.join([random.choice(string.ascii_letters) for i in range(8)]) + ret, _ = ip("netns add %s" % (name), fail=False) + if ret == 0: + netns.append(name) + return name + return None + +def int2str(fmt, val): + ret = [] + for b in struct.pack(fmt, val): + ret.append(int(b)) + return " ".join(map(lambda x: str(x), ret)) + +def str2int(strtab): + inttab = [] + for i in strtab: + inttab.append(int(i, 16)) + ba = bytearray(inttab) + if len(strtab) == 4: + fmt = "I" + elif len(strtab) == 8: + fmt = "Q" + else: + raise Exception("String array of len %d can't be unpacked to an int" % + (len(strtab))) + return struct.unpack(fmt, ba)[0] + +class DebugfsDir: + """ + Class for accessing DebugFS directories as a dictionary. + """ + + def __init__(self, path): + self.path = path + self._dict = self._debugfs_dir_read(path) + + def __len__(self): + return len(self._dict.keys()) + + def __getitem__(self, key): + if type(key) is int: + key = list(self._dict.keys())[key] + return self._dict[key] + + def __setitem__(self, key, value): + log("DebugFS set %s = %s" % (key, value), "") + log_level_inc() + + cmd("echo '%s' > %s/%s" % (value, self.path, key)) + log_level_dec() + + _, out = cmd('cat %s/%s' % (self.path, key)) + self._dict[key] = out.strip() + + def _debugfs_dir_read(self, path): + dfs = {} + + log("DebugFS state for %s" % (path), "") + log_level_inc(add=2) + + _, out = cmd('ls ' + path) + for f in out.split(): + if f == "ports": + continue + + p = os.path.join(path, f) + if not os.stat(p).st_mode & stat.S_IRUSR: + continue + + if os.path.isfile(p): + # We need to init trap_flow_action_cookie before read it + if f == "trap_flow_action_cookie": + cmd('echo deadbeef > %s/%s' % (path, f)) + _, out = cmd('cat %s/%s' % (path, f)) + dfs[f] = out.strip() + elif os.path.isdir(p): + dfs[f] = DebugfsDir(p) + else: + raise Exception("%s is neither file nor directory" % (p)) + + log_level_dec() + log("DebugFS state", dfs) + log_level_dec() + + return dfs + +class NetdevSimDev: + """ + Class for netdevsim bus device and its attributes. + """ + @staticmethod + def ctrl_write(path, val): + fullpath = os.path.join("/sys/bus/netdevsim/", path) + try: + with open(fullpath, "w") as f: + f.write(val) + except OSError as e: + log("WRITE %s: %r" % (fullpath, val), -e.errno) + raise e + log("WRITE %s: %r" % (fullpath, val), 0) + + def __init__(self, port_count=1): + addr = 0 + while True: + try: + self.ctrl_write("new_device", "%u %u" % (addr, port_count)) + except OSError as e: + if e.errno == errno.ENOSPC: + addr += 1 + continue + raise e + break + self.addr = addr + + # As probe of netdevsim device might happen from a workqueue, + # so wait here until all netdevs appear. + self.wait_for_netdevs(port_count) + + ret, out = cmd("udevadm settle", fail=False) + if ret: + raise Exception("udevadm settle failed") + ifnames = self.get_ifnames() + + devs.append(self) + self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr + + self.nsims = [] + for port_index in range(port_count): + self.nsims.append(NetdevSim(self, port_index, ifnames[port_index])) + + def get_ifnames(self): + ifnames = [] + listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr) + for ifname in listdir: + ifnames.append(ifname) + ifnames.sort() + return ifnames + + def wait_for_netdevs(self, port_count): + timeout = 5 + timeout_start = time.time() + + while True: + try: + ifnames = self.get_ifnames() + except FileNotFoundError as e: + ifnames = [] + if len(ifnames) == port_count: + break + if time.time() < timeout_start + timeout: + continue + raise Exception("netdevices did not appear within timeout") + + def dfs_num_bound_progs(self): + path = os.path.join(self.dfs_dir, "bpf_bound_progs") + _, progs = cmd('ls %s' % (path)) + return len(progs.split()) + + def dfs_get_bound_progs(self, expected): + progs = DebugfsDir(os.path.join(self.dfs_dir, "bpf_bound_progs")) + if expected is not None: + if len(progs) != expected: + fail(True, "%d BPF programs bound, expected %d" % + (len(progs), expected)) + return progs + + def remove(self): + self.ctrl_write("del_device", "%u" % (self.addr, )) + devs.remove(self) + + def remove_nsim(self, nsim): + self.nsims.remove(nsim) + self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), + "%u" % (nsim.port_index, )) + +class NetdevSim: + """ + Class for netdevsim netdevice and its attributes. + """ + + def __init__(self, nsimdev, port_index, ifname): + # In case udev renamed the netdev to according to new schema, + # check if the name matches the port_index. + nsimnamere = re.compile("eni\d+np(\d+)") + match = nsimnamere.match(ifname) + if match and int(match.groups()[0]) != port_index + 1: + raise Exception("netdevice name mismatches the expected one") + + self.nsimdev = nsimdev + self.port_index = port_index + self.ns = "" + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) + self.dfs_refresh() + _, [self.dev] = ip("link show dev %s" % ifname) + + def __getitem__(self, key): + return self.dev[key] + + def remove(self): + self.nsimdev.remove_nsim(self) + + def dfs_refresh(self): + self.dfs = DebugfsDir(self.dfs_dir) + return self.dfs + + def dfs_read(self, f): + path = os.path.join(self.dfs_dir, f) + _, data = cmd('cat %s' % (path)) + return data.strip() + + def wait_for_flush(self, bound=0, total=0, n_retry=20): + for i in range(n_retry): + nbound = self.nsimdev.dfs_num_bound_progs() + nprogs = len(bpftool_prog_list()) + if nbound == bound and nprogs == total: + return + time.sleep(0.05) + raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs)) + + def set_ns(self, ns): + name = "1" if ns == "" else ns + ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns) + self.ns = ns + + def set_mtu(self, mtu, fail=True): + return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu), + fail=fail) + + def set_xdp(self, bpf, mode, force=False, JSON=True, verbose=False, + fail=True, include_stderr=False): + if verbose: + bpf += " verbose" + return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf), + force=force, JSON=JSON, + fail=fail, include_stderr=include_stderr) + + def unset_xdp(self, mode, force=False, JSON=True, + fail=True, include_stderr=False): + return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode), + force=force, JSON=JSON, + fail=fail, include_stderr=include_stderr) + + def ip_link_show(self, xdp): + _, link = ip("link show dev %s" % (self['ifname'])) + if len(link) > 1: + raise Exception("Multiple objects on ip link show") + if len(link) < 1: + return {} + fail(xdp != "xdp" in link, + "XDP program not reporting in iplink (reported %s, expected %s)" % + ("xdp" in link, xdp)) + return link[0] + + def tc_add_ingress(self): + tc("qdisc add dev %s ingress" % (self['ifname'])) + + def tc_del_ingress(self): + tc("qdisc del dev %s ingress" % (self['ifname'])) + + def tc_flush_filters(self, bound=0, total=0): + self.tc_del_ingress() + self.tc_add_ingress() + self.wait_for_flush(bound=bound, total=total) + + def tc_show_ingress(self, expected=None): + # No JSON support, oh well... + flags = ["skip_sw", "skip_hw", "in_hw"] + named = ["protocol", "pref", "chain", "handle", "id", "tag"] + + args = "-s filter show dev %s ingress" % (self['ifname']) + _, out = tc(args, JSON=False) + + filters = [] + lines = out.split('\n') + for line in lines: + words = line.split() + if "handle" not in words: + continue + fltr = {} + for flag in flags: + fltr[flag] = flag in words + for name in named: + try: + idx = words.index(name) + fltr[name] = words[idx + 1] + except ValueError: + pass + filters.append(fltr) + + if expected is not None: + fail(len(filters) != expected, + "%d ingress filters loaded, expected %d" % + (len(filters), expected)) + return filters + + def cls_filter_op(self, op, qdisc="ingress", prio=None, handle=None, + chain=None, cls="", params="", + fail=True, include_stderr=False): + spec = "" + if prio is not None: + spec += " prio %d" % (prio) + if handle: + spec += " handle %s" % (handle) + if chain is not None: + spec += " chain %d" % (chain) + + return tc("filter {op} dev {dev} {qdisc} {spec} {cls} {params}"\ + .format(op=op, dev=self['ifname'], qdisc=qdisc, spec=spec, + cls=cls, params=params), + fail=fail, include_stderr=include_stderr) + + def cls_bpf_add_filter(self, bpf, op="add", prio=None, handle=None, + chain=None, da=False, verbose=False, + skip_sw=False, skip_hw=False, + fail=True, include_stderr=False): + cls = "bpf " + bpf + + params = "" + if da: + params += " da" + if verbose: + params += " verbose" + if skip_sw: + params += " skip_sw" + if skip_hw: + params += " skip_hw" + + return self.cls_filter_op(op=op, prio=prio, handle=handle, cls=cls, + chain=chain, params=params, + fail=fail, include_stderr=include_stderr) + + def set_ethtool_tc_offloads(self, enable, fail=True): + args = "hw-tc-offload %s" % ("on" if enable else "off") + return ethtool(self, "-K", args, fail=fail) + +################################################################################ +def clean_up(): + global files, netns, devs + + for dev in devs: + dev.remove() + for f in files: + cmd("rm -f %s" % (f)) + for ns in netns: + cmd("ip netns delete %s" % (ns)) + files = [] + netns = [] + +def pin_prog(file_name, idx=0): + progs = bpftool_prog_list(expected=(idx + 1)) + prog = progs[idx] + bpftool("prog pin id %d %s" % (prog["id"], file_name)) + files.append(file_name) + + return file_name, bpf_pinned(file_name) + +def pin_map(file_name, idx=0, expected=1): + maps = bpftool_map_list(expected=expected) + m = maps[idx] + bpftool("map pin id %d %s" % (m["id"], file_name)) + files.append(file_name) + + return file_name, bpf_pinned(file_name) + +def check_dev_info_removed(prog_file=None, map_file=None): + bpftool_prog_list(expected=0) + bpftool_prog_list(expected=1, exclude_orphaned=False) + ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) + fail(ret != 0, "failed to show prog with removed device") + + bpftool_map_list(expected=0) + ret, err = bpftool("map show pin %s" % (map_file), fail=False) + fail(ret == 0, "Showing map with removed device did not fail") + fail(err["error"].find("No such device") == -1, + "Showing map with removed device expected ENODEV, error is %s" % + (err["error"])) + +def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): + progs = bpftool_prog_list(expected=1, ns=ns) + prog = progs[0] + + fail("dev" not in prog.keys(), "Device parameters not reported") + dev = prog["dev"] + fail("ifindex" not in dev.keys(), "Device parameters not reported") + fail("ns_dev" not in dev.keys(), "Device parameters not reported") + fail("ns_inode" not in dev.keys(), "Device parameters not reported") + + if not other_ns: + fail("ifname" not in dev.keys(), "Ifname not reported") + fail(dev["ifname"] != sim["ifname"], + "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"])) + else: + fail("ifname" in dev.keys(), "Ifname is reported for other ns") + + maps = bpftool_map_list(expected=2, ns=ns) + for m in maps: + fail("dev" not in m.keys(), "Device parameters not reported") + fail(dev != m["dev"], "Map's device different than program's") + +def check_extack(output, reference, args): + if skip_extack: + return + lines = output.split("\n") + comp = len(lines) >= 2 and lines[1] == 'Error: ' + reference + fail(not comp, "Missing or incorrect netlink extack message") + +def check_extack_nsim(output, reference, args): + check_extack(output, "netdevsim: " + reference, args) + +def check_no_extack(res, needle): + fail((res[1] + res[2]).count(needle) or (res[1] + res[2]).count("Warning:"), + "Found '%s' in command output, leaky extack?" % (needle)) + +def check_verifier_log(output, reference): + lines = output.split("\n") + for l in reversed(lines): + if l == reference: + return + fail(True, "Missing or incorrect message from netdevsim in verifier log") + +def check_multi_basic(two_xdps): + fail(two_xdps["mode"] != 4, "Bad mode reported with multiple programs") + fail("prog" in two_xdps, "Base program reported in multi program mode") + fail(len(two_xdps["attached"]) != 2, + "Wrong attached program count with two programs") + fail(two_xdps["attached"][0]["prog"]["id"] == + two_xdps["attached"][1]["prog"]["id"], + "Offloaded and other programs have the same id") + +def test_spurios_extack(sim, obj, skip_hw, needle): + res = sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=skip_hw, + include_stderr=True) + check_no_extack(res, needle) + res = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, + skip_hw=skip_hw, include_stderr=True) + check_no_extack(res, needle) + res = sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf", + include_stderr=True) + check_no_extack(res, needle) + +def test_multi_prog(simdev, sim, obj, modename, modeid): + start_test("Test multi-attachment XDP - %s + offload..." % + (modename or "default", )) + sim.set_xdp(obj, "offload") + xdp = sim.ip_link_show(xdp=True)["xdp"] + offloaded = sim.dfs_read("bpf_offloaded_id") + fail("prog" not in xdp, "Base program not reported in single program mode") + fail(len(xdp["attached"]) != 1, + "Wrong attached program count with one program") + + sim.set_xdp(obj, modename) + two_xdps = sim.ip_link_show(xdp=True)["xdp"] + + fail(xdp["attached"][0] not in two_xdps["attached"], + "Offload program not reported after other activated") + check_multi_basic(two_xdps) + + offloaded2 = sim.dfs_read("bpf_offloaded_id") + fail(offloaded != offloaded2, + "Offload ID changed after loading other program") + + start_test("Test multi-attachment XDP - replace...") + ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) + fail(ret == 0, "Replaced one of programs without -force") + check_extack(err, "XDP program already attached.", args) + + start_test("Test multi-attachment XDP - remove without mode...") + ret, _, err = sim.unset_xdp("", force=True, + fail=False, include_stderr=True) + fail(ret == 0, "Removed program without a mode flag") + check_extack(err, "More than one program loaded, unset mode is ambiguous.", args) + + sim.unset_xdp("offload") + xdp = sim.ip_link_show(xdp=True)["xdp"] + offloaded = sim.dfs_read("bpf_offloaded_id") + + fail(xdp["mode"] != modeid, "Bad mode reported after multiple programs") + fail("prog" not in xdp, + "Base program not reported after multi program mode") + fail(xdp["attached"][0] not in two_xdps["attached"], + "Offload program not reported after other activated") + fail(len(xdp["attached"]) != 1, + "Wrong attached program count with remaining programs") + fail(offloaded != "0", "Offload ID reported with only other program left") + + start_test("Test multi-attachment XDP - reattach...") + sim.set_xdp(obj, "offload") + two_xdps = sim.ip_link_show(xdp=True)["xdp"] + + fail(xdp["attached"][0] not in two_xdps["attached"], + "Other program not reported after offload activated") + check_multi_basic(two_xdps) + + start_test("Test multi-attachment XDP - device remove...") + simdev.remove() + + simdev = NetdevSimDev() + sim, = simdev.nsims + sim.set_ethtool_tc_offloads(True) + return [simdev, sim] + +# Parse command line +parser = argparse.ArgumentParser() +parser.add_argument("--log", help="output verbose log to given file") +args = parser.parse_args() +if args.log: + logfile = open(args.log, 'w+') + logfile.write("# -*-Org-*-") + +log("Prepare...", "", level=1) +log_level_inc() + +# Check permissions +skip(os.getuid() != 0, "test must be run as root") + +# Check tools +ret, progs = bpftool("prog", fail=False) +skip(ret != 0, "bpftool not installed") +base_progs = progs +_, base_maps = bpftool("map") +base_map_names = [ + 'pid_iter.rodata', # created on each bpftool invocation + 'libbpf_det_bind', # created on each bpftool invocation +] + +# Check netdevsim +if not os.path.isdir("/sys/bus/netdevsim/"): + ret, out = cmd("modprobe netdevsim", fail=False) + skip(ret != 0, "netdevsim module could not be loaded") + +# Check debugfs +_, out = cmd("mount") +if out.find("/sys/kernel/debug type debugfs") == -1: + cmd("mount -t debugfs none /sys/kernel/debug") + +# Check samples are compiled +samples = ["sample_ret0.bpf.o", "sample_map_ret0.bpf.o"] +for s in samples: + ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False) + skip(ret != 0, "sample %s/%s not found, please compile it" % + (bpf_test_dir, s)) + +# Check if iproute2 is built with libmnl (needed by extack support) +_, _, err = cmd("tc qdisc delete dev lo handle 0", + fail=False, include_stderr=True) +if err.find("Error: Failed to find qdisc with specified handle.") == -1: + print("Warning: no extack message in iproute2 output, libmnl missing?") + log("Warning: no extack message in iproute2 output, libmnl missing?", "") + skip_extack = True + +# Check if net namespaces seem to work +ns = mknetns() +skip(ns is None, "Could not create a net namespace") +cmd("ip netns delete %s" % (ns)) +netns = [] + +try: + obj = bpf_obj("sample_ret0.bpf.o") + bytecode = bpf_bytecode("1,6 0 0 4294967295,") + + start_test("Test destruction of generic XDP...") + simdev = NetdevSimDev() + sim, = simdev.nsims + sim.set_xdp(obj, "generic") + simdev.remove() + bpftool_prog_list_wait(expected=0) + + simdev = NetdevSimDev() + sim, = simdev.nsims + sim.tc_add_ingress() + + start_test("Test TC non-offloaded...") + ret, _ = sim.cls_bpf_add_filter(obj, skip_hw=True, fail=False) + fail(ret != 0, "Software TC filter did not load") + + start_test("Test TC non-offloaded isn't getting bound...") + ret, _ = sim.cls_bpf_add_filter(obj, fail=False) + fail(ret != 0, "Software TC filter did not load") + simdev.dfs_get_bound_progs(expected=0) + + sim.tc_flush_filters() + + start_test("Test TC offloads are off by default...") + ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC filter loaded without enabling TC offloads") + check_extack(err, "TC offload is disabled on net device.", args) + sim.wait_for_flush() + + sim.set_ethtool_tc_offloads(True) + sim.dfs["bpf_tc_non_bound_accept"] = "Y" + + start_test("Test TC offload by default...") + ret, _ = sim.cls_bpf_add_filter(obj, fail=False) + fail(ret != 0, "Software TC filter did not load") + simdev.dfs_get_bound_progs(expected=0) + ingress = sim.tc_show_ingress(expected=1) + fltr = ingress[0] + fail(not fltr["in_hw"], "Filter not offloaded by default") + + sim.tc_flush_filters() + + start_test("Test TC cBPF bytcode tries offload by default...") + ret, _ = sim.cls_bpf_add_filter(bytecode, fail=False) + fail(ret != 0, "Software TC filter did not load") + simdev.dfs_get_bound_progs(expected=0) + ingress = sim.tc_show_ingress(expected=1) + fltr = ingress[0] + fail(not fltr["in_hw"], "Bytecode not offloaded by default") + + sim.tc_flush_filters() + sim.dfs["bpf_tc_non_bound_accept"] = "N" + + start_test("Test TC cBPF unbound bytecode doesn't offload...") + ret, _, err = sim.cls_bpf_add_filter(bytecode, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC bytecode loaded for offload") + check_extack_nsim(err, "netdevsim configured to reject unbound programs.", + args) + sim.wait_for_flush() + + start_test("Test non-0 chain offload...") + ret, _, err = sim.cls_bpf_add_filter(obj, chain=1, prio=1, handle=1, + skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "Offloaded a filter to chain other than 0") + check_extack(err, "Driver supports only offload of chain 0.", args) + sim.tc_flush_filters() + + start_test("Test TC replace...") + sim.cls_bpf_add_filter(obj, prio=1, handle=1) + sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_sw=True) + sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_sw=True) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=True) + sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_hw=True) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + start_test("Test TC replace bad flags...") + for i in range(3): + for j in range(3): + ret, _ = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, + skip_sw=(j == 1), skip_hw=(j == 2), + fail=False) + fail(bool(ret) != bool(j), + "Software TC incorrect load in replace test, iteration %d" % + (j)) + sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf") + + start_test("Test spurious extack from the driver...") + test_spurios_extack(sim, obj, False, "netdevsim") + test_spurios_extack(sim, obj, True, "netdevsim") + + sim.set_ethtool_tc_offloads(False) + + test_spurios_extack(sim, obj, False, "TC offload is disabled") + test_spurios_extack(sim, obj, True, "TC offload is disabled") + + sim.set_ethtool_tc_offloads(True) + + sim.tc_flush_filters() + + start_test("Test TC offloads failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC filter did not reject with TC offloads enabled") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + + start_test("Test TC offloads work...") + ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret != 0, "TC filter did not load with TC offloads enabled") + + start_test("Test TC offload basics...") + dfs = simdev.dfs_get_bound_progs(expected=1) + progs = bpftool_prog_list(expected=1) + ingress = sim.tc_show_ingress(expected=1) + + dprog = dfs[0] + prog = progs[0] + fltr = ingress[0] + fail(fltr["skip_hw"], "TC does reports 'skip_hw' on offloaded filter") + fail(not fltr["in_hw"], "TC does not report 'in_hw' for offloaded filter") + fail(not fltr["skip_sw"], "TC does not report 'skip_sw' back") + + start_test("Test TC offload is device-bound...") + fail(str(prog["id"]) != fltr["id"], "Program IDs don't match") + fail(prog["tag"] != fltr["tag"], "Program tags don't match") + fail(fltr["id"] != dprog["id"], "Program IDs don't match") + fail(dprog["state"] != "xlated", "Offloaded program state not translated") + fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") + + start_test("Test disabling TC offloads is rejected while filters installed...") + ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) + fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...") + sim.set_ethtool_tc_offloads(True) + + start_test("Test qdisc removal frees things...") + sim.tc_flush_filters() + sim.tc_show_ingress(expected=0) + + start_test("Test disabling TC offloads is OK without filters...") + ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) + fail(ret != 0, + "Driver refused to disable TC offloads without filters installed...") + + sim.set_ethtool_tc_offloads(True) + + start_test("Test destroying device gets rid of TC filters...") + sim.cls_bpf_add_filter(obj, skip_sw=True) + simdev.remove() + bpftool_prog_list_wait(expected=0) + + simdev = NetdevSimDev() + sim, = simdev.nsims + sim.set_ethtool_tc_offloads(True) + + start_test("Test destroying device gets rid of XDP...") + sim.set_xdp(obj, "offload") + simdev.remove() + bpftool_prog_list_wait(expected=0) + + simdev = NetdevSimDev() + sim, = simdev.nsims + sim.set_ethtool_tc_offloads(True) + + start_test("Test XDP prog reporting...") + sim.set_xdp(obj, "drv") + ipl = sim.ip_link_show(xdp=True) + progs = bpftool_prog_list(expected=1) + fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], + "Loaded program has wrong ID") + + start_test("Test XDP prog replace without force...") + ret, _ = sim.set_xdp(obj, "drv", fail=False) + fail(ret == 0, "Replaced XDP program without -force") + sim.wait_for_flush(total=1) + + start_test("Test XDP prog replace with force...") + ret, _ = sim.set_xdp(obj, "drv", force=True, fail=False) + fail(ret != 0, "Could not replace XDP program with -force") + bpftool_prog_list_wait(expected=1) + ipl = sim.ip_link_show(xdp=True) + progs = bpftool_prog_list(expected=1) + fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"], + "Loaded program has wrong ID") + fail("dev" in progs[0].keys(), + "Device parameters reported for non-offloaded program") + + start_test("Test XDP prog replace with bad flags...") + ret, _, err = sim.set_xdp(obj, "generic", force=True, + fail=False, include_stderr=True) + fail(ret == 0, "Replaced XDP program with a program in different mode") + check_extack(err, + "Native and generic XDP can't be active at the same time.", + args) + + start_test("Test MTU restrictions...") + ret, _ = sim.set_mtu(9000, fail=False) + fail(ret == 0, + "Driver should refuse to increase MTU to 9000 with XDP loaded...") + sim.unset_xdp("drv") + bpftool_prog_list_wait(expected=0) + sim.set_mtu(9000) + ret, _, err = sim.set_xdp(obj, "drv", fail=False, include_stderr=True) + fail(ret == 0, "Driver should refuse to load program with MTU of 9000...") + check_extack_nsim(err, "MTU too large w/ XDP enabled.", args) + sim.set_mtu(1500) + + sim.wait_for_flush() + start_test("Test non-offload XDP attaching to HW...") + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/nooffload") + nooffload = bpf_pinned("/sys/fs/bpf/nooffload") + ret, _, err = sim.set_xdp(nooffload, "offload", + fail=False, include_stderr=True) + fail(ret == 0, "attached non-offloaded XDP program to HW") + check_extack_nsim(err, "xdpoffload of non-bound program.", args) + rm("/sys/fs/bpf/nooffload") + + start_test("Test offload XDP attaching to drv...") + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", + dev=sim['ifname']) + offload = bpf_pinned("/sys/fs/bpf/offload") + ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) + fail(ret == 0, "attached offloaded XDP program to drv") + check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args) + rm("/sys/fs/bpf/offload") + sim.wait_for_flush() + + start_test("Test XDP load failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", + dev=sim['ifname'], fail=False, include_stderr=True) + fail(ret == 0, "verifier should fail on load") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + sim.wait_for_flush() + + start_test("Test XDP offload...") + _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True) + ipl = sim.ip_link_show(xdp=True) + link_xdp = ipl["xdp"]["prog"] + progs = bpftool_prog_list(expected=1) + prog = progs[0] + fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID") + + start_test("Test XDP offload is device bound...") + dfs = simdev.dfs_get_bound_progs(expected=1) + dprog = dfs[0] + + fail(prog["id"] != link_xdp["id"], "Program IDs don't match") + fail(prog["tag"] != link_xdp["tag"], "Program tags don't match") + fail(str(link_xdp["id"]) != dprog["id"], "Program IDs don't match") + fail(dprog["state"] != "xlated", "Offloaded program state not translated") + fail(dprog["loaded"] != "Y", "Offloaded program is not loaded") + + start_test("Test removing XDP program many times...") + sim.unset_xdp("offload") + sim.unset_xdp("offload") + sim.unset_xdp("drv") + sim.unset_xdp("drv") + sim.unset_xdp("") + sim.unset_xdp("") + bpftool_prog_list_wait(expected=0) + + start_test("Test attempt to use a program for a wrong device...") + simdev2 = NetdevSimDev() + sim2, = simdev2.nsims + sim2.set_xdp(obj, "offload") + pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") + + ret, _, err = sim.set_xdp(pinned, "offload", + fail=False, include_stderr=True) + fail(ret == 0, "Pinned program loaded for a different device accepted") + check_extack(err, "Program bound to different device.", args) + simdev2.remove() + ret, _, err = sim.set_xdp(pinned, "offload", + fail=False, include_stderr=True) + fail(ret == 0, "Pinned program loaded for a removed device accepted") + check_extack(err, "Program bound to different device.", args) + rm(pin_file) + bpftool_prog_list_wait(expected=0) + + simdev, sim = test_multi_prog(simdev, sim, obj, "", 1) + simdev, sim = test_multi_prog(simdev, sim, obj, "drv", 1) + simdev, sim = test_multi_prog(simdev, sim, obj, "generic", 2) + + start_test("Test mixing of TC and XDP...") + sim.tc_add_ingress() + sim.set_xdp(obj, "offload") + ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "Loading TC when XDP active should fail") + check_extack_nsim(err, "driver and netdev offload states mismatch.", args) + sim.unset_xdp("offload") + sim.wait_for_flush() + + sim.cls_bpf_add_filter(obj, skip_sw=True) + ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True) + fail(ret == 0, "Loading XDP when TC active should fail") + check_extack_nsim(err, "TC program is already loaded.", args) + + start_test("Test binding TC from pinned...") + pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") + sim.tc_flush_filters(bound=1, total=1) + sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True) + sim.tc_flush_filters(bound=1, total=1) + + start_test("Test binding XDP from pinned...") + sim.set_xdp(obj, "offload") + pin_file, pinned = pin_prog("/sys/fs/bpf/tmp2", idx=1) + + sim.set_xdp(pinned, "offload", force=True) + sim.unset_xdp("offload") + sim.set_xdp(pinned, "offload", force=True) + sim.unset_xdp("offload") + + start_test("Test offload of wrong type fails...") + ret, _ = sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True, fail=False) + fail(ret == 0, "Managed to attach XDP program to TC") + + start_test("Test asking for TC offload of two filters...") + sim.cls_bpf_add_filter(obj, da=True, skip_sw=True) + ret, _, err = sim.cls_bpf_add_filter(obj, da=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "Managed to offload two TC filters at the same time") + check_extack_nsim(err, "driver and netdev offload states mismatch.", args) + + sim.tc_flush_filters(bound=2, total=2) + + start_test("Test if netdev removal waits for translation...") + delay_msec = 500 + sim.dfs["dev/bpf_bind_verifier_delay"] = delay_msec + start = time.time() + cmd_line = "tc filter add dev %s ingress bpf %s da skip_sw" % \ + (sim['ifname'], obj) + tc_proc = cmd(cmd_line, background=True, fail=False) + # Wait for the verifier to start + while simdev.dfs_num_bound_progs() <= 2: + pass + simdev.remove() + end = time.time() + ret, _ = cmd_result(tc_proc, fail=False) + time_diff = end - start + log("Time", "start:\t%s\nend:\t%s\ndiff:\t%s" % (start, end, time_diff)) + + fail(ret == 0, "Managed to load TC filter on a unregistering device") + delay_sec = delay_msec * 0.001 + fail(time_diff < delay_sec, "Removal process took %s, expected %s" % + (time_diff, delay_sec)) + + # Remove all pinned files and reinstantiate the netdev + clean_up() + bpftool_prog_list_wait(expected=0) + + simdev = NetdevSimDev() + sim, = simdev.nsims + map_obj = bpf_obj("sample_map_ret0.bpf.o") + start_test("Test loading program with maps...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + + start_test("Test bpftool bound info reporting (own ns)...") + check_dev_info(False, "") + + start_test("Test bpftool bound info reporting (other ns)...") + ns = mknetns() + sim.set_ns(ns) + check_dev_info(True, "") + + start_test("Test bpftool bound info reporting (remote ns)...") + check_dev_info(False, ns) + + start_test("Test bpftool bound info reporting (back to own ns)...") + sim.set_ns("") + check_dev_info(False, "") + + prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog") + map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2) + simdev.remove() + + start_test("Test bpftool bound info reporting (removed dev)...") + check_dev_info_removed(prog_file=prog_file, map_file=map_file) + + # Remove all pinned files and reinstantiate the netdev + clean_up() + bpftool_prog_list_wait(expected=0) + + simdev = NetdevSimDev() + sim, = simdev.nsims + + start_test("Test map update (no flags)...") + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + maps = bpftool_map_list(expected=2) + array = maps[0] if maps[0]["type"] == "array" else maps[1] + htab = maps[0] if maps[0]["type"] == "hash" else maps[1] + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, _ = bpftool("map update id %d key %s value %s" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "added too many entries") + + start_test("Test map update (exists)...") + for m in maps: + for i in range(2): + bpftool("map update id %d key %s value %s exist" % + (m["id"], int2str("I", i), int2str("Q", i * 3))) + + for m in maps: + ret, err = bpftool("map update id %d key %s value %s exist" % + (m["id"], int2str("I", 3), int2str("Q", 3 * 3)), + fail=False) + fail(ret == 0, "updated non-existing key") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map update (noexist)...") + for m in maps: + for i in range(2): + ret, err = bpftool("map update id %d key %s value %s noexist" % + (m["id"], int2str("I", i), int2str("Q", i * 3)), + fail=False) + fail(ret == 0, "updated existing key") + fail(err["error"].find("File exists") == -1, + "expected EEXIST, error is '%s'" % (err["error"])) + + start_test("Test map dump...") + for m in maps: + _, entries = bpftool("map dump id %d" % (m["id"])) + for i in range(2): + key = str2int(entries[i]["key"]) + fail(key != i, "expected key %d, got %d" % (key, i)) + val = str2int(entries[i]["value"]) + fail(val != i * 3, "expected value %d, got %d" % (val, i * 3)) + + start_test("Test map getnext...") + for m in maps: + _, entry = bpftool("map getnext id %d" % (m["id"])) + key = str2int(entry["next_key"]) + fail(key != 0, "next key %d, expected %d" % (key, 0)) + _, entry = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 0))) + key = str2int(entry["next_key"]) + fail(key != 1, "next key %d, expected %d" % (key, 1)) + ret, err = bpftool("map getnext id %d key %s" % + (m["id"], int2str("I", 1)), fail=False) + fail(ret == 0, "got next key past the end of map") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map delete (htab)...") + for i in range(2): + bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i))) + + start_test("Test map delete (array)...") + for i in range(2): + ret, err = bpftool("map delete id %d key %s" % + (htab["id"], int2str("I", i)), fail=False) + fail(ret == 0, "removed entry from an array") + fail(err["error"].find("No such file or directory") == -1, + "expected ENOENT, error is '%s'" % (err["error"])) + + start_test("Test map remove...") + sim.unset_xdp("offload") + bpftool_map_list_wait(expected=0) + simdev.remove() + + simdev = NetdevSimDev() + sim, = simdev.nsims + sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON + simdev.remove() + bpftool_map_list_wait(expected=0) + + start_test("Test map creation fail path...") + simdev = NetdevSimDev() + sim, = simdev.nsims + sim.dfs["bpf_map_accept"] = "N" + ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) + fail(ret == 0, + "netdevsim didn't refuse to create a map with offload disabled") + + simdev.remove() + + start_test("Test multi-dev ASIC program reuse...") + simdevA = NetdevSimDev() + simA, = simdevA.nsims + simdevB = NetdevSimDev(3) + simB1, simB2, simB3 = simdevB.nsims + sims = (simA, simB1, simB2, simB3) + simB = (simB1, simB2, simB3) + + bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA", + dev=simA['ifname']) + progA = bpf_pinned("/sys/fs/bpf/nsimA") + bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB", + dev=simB1['ifname']) + progB = bpf_pinned("/sys/fs/bpf/nsimB") + + simA.set_xdp(progA, "offload", JSON=False) + for d in simdevB.nsims: + d.set_xdp(progB, "offload", JSON=False) + + start_test("Test multi-dev ASIC cross-dev replace...") + ret, _ = simA.set_xdp(progB, "offload", force=True, JSON=False, fail=False) + fail(ret == 0, "cross-ASIC program allowed") + for d in simdevB.nsims: + ret, _ = d.set_xdp(progA, "offload", force=True, JSON=False, fail=False) + fail(ret == 0, "cross-ASIC program allowed") + + start_test("Test multi-dev ASIC cross-dev install...") + for d in sims: + d.unset_xdp("offload") + + ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False, + fail=False, include_stderr=True) + fail(ret == 0, "cross-ASIC program allowed") + check_extack(err, "Program bound to different device.", args) + for d in simdevB.nsims: + ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False, + fail=False, include_stderr=True) + fail(ret == 0, "cross-ASIC program allowed") + check_extack(err, "Program bound to different device.", args) + + start_test("Test multi-dev ASIC cross-dev map reuse...") + + mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0] + mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0] + + ret, _ = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", + dev=simB3['ifname'], + maps=["idx 0 id %d" % (mapB)], + fail=False) + fail(ret != 0, "couldn't reuse a map on the same ASIC") + rm("/sys/fs/bpf/nsimB_") + + ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimA_", + dev=simA['ifname'], + maps=["idx 0 id %d" % (mapB)], + fail=False, include_stderr=True) + fail(ret == 0, "could reuse a map on a different ASIC") + fail(err.count("offload device mismatch between prog and map") == 0, + "error message missing for cross-ASIC map") + + ret, _, err = bpftool_prog_load("sample_map_ret0.bpf.o", "/sys/fs/bpf/nsimB_", + dev=simB1['ifname'], + maps=["idx 0 id %d" % (mapA)], + fail=False, include_stderr=True) + fail(ret == 0, "could reuse a map on a different ASIC") + fail(err.count("offload device mismatch between prog and map") == 0, + "error message missing for cross-ASIC map") + + start_test("Test multi-dev ASIC cross-dev destruction...") + bpftool_prog_list_wait(expected=2) + + simdevA.remove() + bpftool_prog_list_wait(expected=1) + + ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] + fail(ifnameB != simB1['ifname'], "program not bound to original device") + simB1.remove() + bpftool_prog_list_wait(expected=1) + + start_test("Test multi-dev ASIC cross-dev destruction - move...") + ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] + fail(ifnameB not in (simB2['ifname'], simB3['ifname']), + "program not bound to remaining devices") + + simB2.remove() + ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"] + fail(ifnameB != simB3['ifname'], "program not bound to remaining device") + + simB3.remove() + simdevB.remove() + bpftool_prog_list_wait(expected=0) + + start_test("Test multi-dev ASIC cross-dev destruction - orphaned...") + ret, out = bpftool("prog show %s" % (progB), fail=False) + fail(ret != 0, "couldn't get information about orphaned program") + + print("%s: OK" % (os.path.basename(__file__))) + +finally: + log("Clean up...", "", level=1) + log_level_inc() + clean_up() diff --git a/tools/testing/selftests/net/sample_map_ret0.bpf.c b/tools/testing/selftests/net/sample_map_ret0.bpf.c new file mode 100644 index 000000000000..495990d355ef --- /dev/null +++ b/tools/testing/selftests/net/sample_map_ret0.bpf.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u32); + __type(value, long); + __uint(max_entries, 2); +} htab SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, long); + __uint(max_entries, 2); +} array SEC(".maps"); + +/* Sample program which should always load for testing control paths. */ +SEC(".text") int func() +{ + __u64 key64 = 0; + __u32 key = 0; + long *value; + + value = bpf_map_lookup_elem(&htab, &key); + if (!value) + return 1; + value = bpf_map_lookup_elem(&array, &key64); + if (!value) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/net/sample_ret0.bpf.c b/tools/testing/selftests/net/sample_ret0.bpf.c new file mode 100644 index 000000000000..fec99750d6ea --- /dev/null +++ b/tools/testing/selftests/net/sample_ret0.bpf.c @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ + +/* Sample program which should always load for testing control paths. */ +int func() +{ + return 0; +} -- cgit v1.2.3 From fc50c698c28bcf307dfb14ba9d0b3cacb091c1cb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 8 Apr 2024 20:15:47 -0700 Subject: selftests: net: bpf_offload: wait for maps Maps are removed asynchronously. Either there's a bigger delay now or the test has always been flaky. Retry waiting in the loop. Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240409031549.3531084-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bpf_offload.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py index 6157f884d091..174dba1a48d3 100755 --- a/tools/testing/selftests/net/bpf_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -201,11 +201,11 @@ def bpftool_prog_list_wait(expected=0, n_retry=20): time.sleep(0.05) raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) -def bpftool_map_list_wait(expected=0, n_retry=20): +def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): for i in range(n_retry): - nmaps = len(bpftool_map_list()) - if nmaps == expected: - return + maps = bpftool_map_list(ns=ns) + if len(maps) == expected: + return maps time.sleep(0.05) raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) @@ -605,7 +605,7 @@ def pin_prog(file_name, idx=0): return file_name, bpf_pinned(file_name) def pin_map(file_name, idx=0, expected=1): - maps = bpftool_map_list(expected=expected) + maps = bpftool_map_list_wait(expected=expected) m = maps[idx] bpftool("map pin id %d %s" % (m["id"], file_name)) files.append(file_name) @@ -618,7 +618,7 @@ def check_dev_info_removed(prog_file=None, map_file=None): ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) fail(ret != 0, "failed to show prog with removed device") - bpftool_map_list(expected=0) + bpftool_map_list_wait(expected=0) ret, err = bpftool("map show pin %s" % (map_file), fail=False) fail(ret == 0, "Showing map with removed device did not fail") fail(err["error"].find("No such device") == -1, @@ -642,7 +642,7 @@ def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False): else: fail("ifname" in dev.keys(), "Ifname is reported for other ns") - maps = bpftool_map_list(expected=2, ns=ns) + maps = bpftool_map_list_wait(expected=2, ns=ns) for m in maps: fail("dev" not in m.keys(), "Device parameters not reported") fail(dev != m["dev"], "Map's device different than program's") @@ -1206,7 +1206,7 @@ try: start_test("Test map update (no flags)...") sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON - maps = bpftool_map_list(expected=2) + maps = bpftool_map_list_wait(expected=2) array = maps[0] if maps[0]["type"] == "array" else maps[1] htab = maps[0] if maps[0]["type"] == "hash" else maps[1] for m in maps: -- cgit v1.2.3 From b1c2ce11d42886d08cfa28e38ee07f2b606ced0b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 8 Apr 2024 20:15:48 -0700 Subject: selftests: net: declare section names for bpf_offload Non-ancient ip (iproute2-5.15.0, libbpf 0.7.0) refuses to load the sample with maps because we don't generate BTF: libbpf: BTF is required, but is missing or corrupted. ERROR: opening BPF object file failed Enable BTF by adding -g to clang flags. With that done neither of the programs load: libbpf: prog 'func': error relocating .BTF.ext function info: -22 libbpf: prog 'func': failed to relocate calls: -22 libbpf: failed to load object 'ksft-net-drv/net/sample_ret0.bpf.o' Andrii explains that this is because we don't specify section names for the code. Add the section names, too. Acked-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240409031549.3531084-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 3 ++- tools/testing/selftests/net/bpf_offload.py | 2 +- tools/testing/selftests/net/sample_map_ret0.bpf.c | 2 +- tools/testing/selftests/net/sample_ret0.bpf.c | 3 +++ 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index e8bfa715aa49..a3c781cb8367 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -149,7 +149,8 @@ BPF_PROG_OBJS := $(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o \ $(OUTPUT)/sample_map_ret0.bpf.o $(OUTPUT)/sample_ret0.bpf.o $(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) - $(CLANG) -O2 --target=bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@ + $(CLANG) -O2 -g --target=bpf $(CCINCLUDE) $(CLANG_SYS_INCLUDES) \ + -c $< -o $@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(APIDIR)/linux/bpf.h \ diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py index 174dba1a48d3..76b53ac2c8c6 100755 --- a/tools/testing/selftests/net/bpf_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -237,7 +237,7 @@ def tc(args, JSON=True, ns="", fail=True, include_stderr=False): def ethtool(dev, opt, args, fail=True): return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail) -def bpf_obj(name, sec=".text", path=bpf_test_dir,): +def bpf_obj(name, sec="xdp", path=bpf_test_dir,): return "obj %s sec %s" % (os.path.join(path, name), sec) def bpf_pinned(name): diff --git a/tools/testing/selftests/net/sample_map_ret0.bpf.c b/tools/testing/selftests/net/sample_map_ret0.bpf.c index 495990d355ef..43ca92594926 100644 --- a/tools/testing/selftests/net/sample_map_ret0.bpf.c +++ b/tools/testing/selftests/net/sample_map_ret0.bpf.c @@ -17,7 +17,7 @@ struct { } array SEC(".maps"); /* Sample program which should always load for testing control paths. */ -SEC(".text") int func() +SEC("xdp") int func() { __u64 key64 = 0; __u32 key = 0; diff --git a/tools/testing/selftests/net/sample_ret0.bpf.c b/tools/testing/selftests/net/sample_ret0.bpf.c index fec99750d6ea..1df5ca98bb65 100644 --- a/tools/testing/selftests/net/sample_ret0.bpf.c +++ b/tools/testing/selftests/net/sample_ret0.bpf.c @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +#define SEC(name) __attribute__((section(name), used)) + /* Sample program which should always load for testing control paths. */ +SEC("xdp") int func() { return 0; -- cgit v1.2.3 From 6ce2b689932ba8288ceef9a82c1caf029b0b23f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 8 Apr 2024 20:15:49 -0700 Subject: selftests: net: reuse common code in bpf_offload net/lib/py/nsim.py already contains the most useful parts of the netdevsim wrapper classes. Reuse them. Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240409031549.3531084-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bpf_offload.py | 124 +++++++---------------------- tools/testing/selftests/net/lib/py/nsim.py | 9 ++- 2 files changed, 37 insertions(+), 96 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py index 76b53ac2c8c6..3efe44f6e92a 100755 --- a/tools/testing/selftests/net/bpf_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -29,6 +29,9 @@ import subprocess import time import traceback +from lib.py import NetdevSim, NetdevSimDev + + logfile = None log_level = 1 skip_extack = False @@ -145,8 +148,10 @@ def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False): if JSON: params += "%s " % (flags["json"]) - if ns != "": + if ns: ns = "ip netns exec %s " % (ns) + elif ns is None: + ns = "" if include_stderr: ret, stdout, stderr = cmd(ns + name + " " + params + args, @@ -334,72 +339,16 @@ class DebugfsDir: return dfs -class NetdevSimDev: +class BpfNetdevSimDev(NetdevSimDev): """ Class for netdevsim bus device and its attributes. """ - @staticmethod - def ctrl_write(path, val): - fullpath = os.path.join("/sys/bus/netdevsim/", path) - try: - with open(fullpath, "w") as f: - f.write(val) - except OSError as e: - log("WRITE %s: %r" % (fullpath, val), -e.errno) - raise e - log("WRITE %s: %r" % (fullpath, val), 0) - - def __init__(self, port_count=1): - addr = 0 - while True: - try: - self.ctrl_write("new_device", "%u %u" % (addr, port_count)) - except OSError as e: - if e.errno == errno.ENOSPC: - addr += 1 - continue - raise e - break - self.addr = addr - - # As probe of netdevsim device might happen from a workqueue, - # so wait here until all netdevs appear. - self.wait_for_netdevs(port_count) - - ret, out = cmd("udevadm settle", fail=False) - if ret: - raise Exception("udevadm settle failed") - ifnames = self.get_ifnames() - + def __init__(self, port_count=1, ns=None): + super().__init__(port_count, ns=ns) devs.append(self) - self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr - - self.nsims = [] - for port_index in range(port_count): - self.nsims.append(NetdevSim(self, port_index, ifnames[port_index])) - - def get_ifnames(self): - ifnames = [] - listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr) - for ifname in listdir: - ifnames.append(ifname) - ifnames.sort() - return ifnames - - def wait_for_netdevs(self, port_count): - timeout = 5 - timeout_start = time.time() - - while True: - try: - ifnames = self.get_ifnames() - except FileNotFoundError as e: - ifnames = [] - if len(ifnames) == port_count: - break - if time.time() < timeout_start + timeout: - continue - raise Exception("netdevices did not appear within timeout") + + def _make_port(self, port_index, ifname): + return BpfNetdevSim(self, port_index, ifname, self.ns) def dfs_num_bound_progs(self): path = os.path.join(self.dfs_dir, "bpf_bound_progs") @@ -415,33 +364,20 @@ class NetdevSimDev: return progs def remove(self): - self.ctrl_write("del_device", "%u" % (self.addr, )) + super().remove() devs.remove(self) - def remove_nsim(self, nsim): - self.nsims.remove(nsim) - self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ), - "%u" % (nsim.port_index, )) -class NetdevSim: +class BpfNetdevSim(NetdevSim): """ Class for netdevsim netdevice and its attributes. """ - def __init__(self, nsimdev, port_index, ifname): - # In case udev renamed the netdev to according to new schema, - # check if the name matches the port_index. - nsimnamere = re.compile("eni\d+np(\d+)") - match = nsimnamere.match(ifname) - if match and int(match.groups()[0]) != port_index + 1: - raise Exception("netdevice name mismatches the expected one") - - self.nsimdev = nsimdev - self.port_index = port_index - self.ns = "" + def __init__(self, nsimdev, port_index, ifname, ns=None): + super().__init__(nsimdev, port_index, ifname, ns=ns) + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) self.dfs_refresh() - _, [self.dev] = ip("link show dev %s" % ifname) def __getitem__(self, key): return self.dev[key] @@ -468,7 +404,7 @@ class NetdevSim: raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs)) def set_ns(self, ns): - name = "1" if ns == "" else ns + name = ns if ns else "1" ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns) self.ns = ns @@ -744,7 +680,7 @@ def test_multi_prog(simdev, sim, obj, modename, modeid): start_test("Test multi-attachment XDP - device remove...") simdev.remove() - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) return [simdev, sim] @@ -809,13 +745,13 @@ try: bytecode = bpf_bytecode("1,6 0 0 4294967295,") start_test("Test destruction of generic XDP...") - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_xdp(obj, "generic") simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.tc_add_ingress() @@ -967,7 +903,7 @@ try: simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) @@ -976,7 +912,7 @@ try: simdev.remove() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_ethtool_tc_offloads(True) @@ -1080,7 +1016,7 @@ try: bpftool_prog_list_wait(expected=0) start_test("Test attempt to use a program for a wrong device...") - simdev2 = NetdevSimDev() + simdev2 = BpfNetdevSimDev() sim2, = simdev2.nsims sim2.set_xdp(obj, "offload") pin_file, pinned = pin_prog("/sys/fs/bpf/tmp") @@ -1169,7 +1105,7 @@ try: clean_up() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims map_obj = bpf_obj("sample_map_ret0.bpf.o") start_test("Test loading program with maps...") @@ -1201,7 +1137,7 @@ try: clean_up() bpftool_prog_list_wait(expected=0) - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims start_test("Test map update (no flags)...") @@ -1285,14 +1221,14 @@ try: bpftool_map_list_wait(expected=0) simdev.remove() - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON simdev.remove() bpftool_map_list_wait(expected=0) start_test("Test map creation fail path...") - simdev = NetdevSimDev() + simdev = BpfNetdevSimDev() sim, = simdev.nsims sim.dfs["bpf_map_accept"] = "N" ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False) @@ -1302,9 +1238,9 @@ try: simdev.remove() start_test("Test multi-dev ASIC program reuse...") - simdevA = NetdevSimDev() + simdevA = BpfNetdevSimDev() simA, = simdevA.nsims - simdevB = NetdevSimDev(3) + simdevB = BpfNetdevSimDev(3) simB1, simB2, simB3 = simdevB.nsims sims = (simA, simB1, simB2, simB3) simB = (simB1, simB2, simB3) diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py index b2d696e12805..97457aca7e08 100644 --- a/tools/testing/selftests/net/lib/py/nsim.py +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -21,8 +21,11 @@ class NetdevSim: if match and int(match.groups()[0]) != port_index + 1: raise Exception("netdevice name mismatches the expected one") + self.ifname = ifname self.nsimdev = nsimdev self.port_index = port_index + self.ns = ns + self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) ret = ip("-j link show dev %s" % ifname, ns=ns) self.dev = json.loads(ret.stdout)[0] @@ -79,8 +82,10 @@ class NetdevSimDev: self.nsims = [] for port_index in range(port_count): - self.nsims.append(NetdevSim(self, port_index, ifnames[port_index], - ns=ns)) + self.nsims.append(self._make_port(port_index, ifnames[port_index])) + + def _make_port(self, port_index, ifname): + return NetdevSim(self, port_index, ifname, self.ns) def get_ifnames(self): ifnames = [] -- cgit v1.2.3 From 791f4641142e2aced85de082e5783b4fb0b977c2 Mon Sep 17 00:00:00 2001 From: Brennan Xavier McManus Date: Tue, 9 Jan 2024 18:44:02 -0500 Subject: tools/nolibc/stdlib: fix memory error in realloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass user_p_len to memcpy() instead of heap->len to prevent realloc() from copying an extra sizeof(heap) bytes from beyond the allocated region. Signed-off-by: Brennan Xavier McManus Cc: stable@vger.kernel.org Reviewed-by: Ammar Faizi Fixes: 0e0ff638400be8f497a35b51a4751fd823f6bd6a ("tools/nolibc/stdlib: Implement `malloc()`, `calloc()`, `realloc()` and `free()`") Signed-off-by: Willy Tarreau Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/stdlib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index bacfd35c5156..5be9d3c7435a 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -185,7 +185,7 @@ void *realloc(void *old_ptr, size_t new_size) if (__builtin_expect(!ret, 0)) return NULL; - memcpy(ret, heap->user_p, heap->len); + memcpy(ret, heap->user_p, user_p_len); munmap(heap, heap->len); return ret; } -- cgit v1.2.3 From 689230b674188163fe56b3aecd7d01f79ca518e6 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Sun, 18 Feb 2024 16:51:03 -0300 Subject: tools/nolibc/string: export strlen() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As with commit 8d304a374023, "tools/nolibc/string: export memset() and memmove()", gcc -Os without -ffreestanding may fail to compile with: cc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib -lgcc -static -o test test.c /usr/bin/ld: /tmp/cccIasKL.o: in function `main': test.c:(.text.startup+0x1e): undefined reference to `strlen' collect2: error: ld returned 1 exit status As on the aforementioned commit, this patch adds a section to export this function so compilation works on those cases too. Signed-off-by: Rodrigo Campos Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index a01c69dd495f..ed15c22b1b2a 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -123,7 +123,7 @@ char *strcpy(char *dst, const char *src) * thus itself, hence the asm() statement below that's meant to disable this * confusing practice. */ -static __attribute__((unused)) +__attribute__((weak,unused,section(".text.nolibc_strlen"))) size_t strlen(const char *str) { size_t len; -- cgit v1.2.3 From 34d232c39a1e05ba734dc6ad9dc01d15788cd91d Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Sun, 18 Feb 2024 16:51:04 -0300 Subject: tools/nolibc: Fix strlcat() return code and size usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return code should always be strlen(src) + strnlen(dst, size). Let's make sure to copy at most size-1 bytes from src and null-terminate the dst buffer if we did copied something. While we can use strnlen() and strncpy() to implement strlcat(), this is simple enough and results in shorter code when compiled. Signed-off-by: Rodrigo Campos Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/string.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index ed15c22b1b2a..cc51fd6b63d0 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -187,22 +187,31 @@ char *strndup(const char *str, size_t maxlen) static __attribute__((unused)) size_t strlcat(char *dst, const char *src, size_t size) { - size_t len; - char c; + size_t len = 0; - for (len = 0; dst[len]; len++) - ; + for (; len < size; len++) { + if (dst[len] == '\0') + break; + } - for (;;) { - c = *src; - if (len < size) - dst[len] = c; - if (!c) + /* + * We want len < size-1. But as size is unsigned and can wrap + * around, we use len + 1 instead. + */ + while (len + 1 < size) { + dst[len] = *src; + if (*src == '\0') break; len++; src++; } + if (len < size) + dst[len] = '\0'; + + while (*src++) + len++; + return len; } -- cgit v1.2.3 From fbffce819e5ac151e137f881b89a9c1da0ebb76c Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Sun, 18 Feb 2024 16:51:05 -0300 Subject: tools/nolibc: Fix strlcpy() return code and size usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return code should always be strlen(src), and we should copy at most size-1 bytes. While we are there, make sure to null-terminate the dst buffer if we copied something. Signed-off-by: Rodrigo Campos Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/string.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index cc51fd6b63d0..565230a4ad47 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -219,16 +219,18 @@ static __attribute__((unused)) size_t strlcpy(char *dst, const char *src, size_t size) { size_t len; - char c; - for (len = 0;;) { - c = src[len]; - if (len < size) - dst[len] = c; - if (!c) - break; - len++; + for (len = 0; len < size; len++) { + dst[len] = src[len]; + if (!dst[len]) + return len; } + if (size) + dst[size-1] = '\0'; + + while (src[len]) + len++; + return len; } -- cgit v1.2.3 From 1063649cf531c276740b7f011df2eed82227ba92 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Sun, 18 Feb 2024 16:51:06 -0300 Subject: selftests/nolibc: Add tests for strlcat() and strlcpy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I've verified that the tests matches libbsd's strlcat()/strlcpy() implementation. Please note that as strlcat()/strlcpy() are not part of the libc, the tests are only compiled when using nolibc. Signed-off-by: Rodrigo Campos Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 6ba4f8275ac4..d373fc14706c 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -600,6 +600,25 @@ int expect_strne(const char *expr, int llen, const char *cmp) return ret; } +#define EXPECT_STRBUFEQ(cond, expr, buf, val, cmp) \ + do { if (!(cond)) result(llen, SKIPPED); else ret += expect_str_buf_eq(expr, buf, val, llen, cmp); } while (0) + +static __attribute__((unused)) +int expect_str_buf_eq(size_t expr, const char *buf, size_t val, int llen, const char *cmp) +{ + llen += printf(" = %lu <%s> ", expr, buf); + if (strcmp(buf, cmp) != 0) { + result(llen, FAIL); + return 1; + } + if (expr != val) { + result(llen, FAIL); + return 1; + } + + result(llen, OK); + return 0; +} /* declare tests based on line numbers. There must be exactly one test per line. */ #define CASE_TEST(name) \ @@ -991,6 +1010,14 @@ int run_stdlib(int min, int max) for (test = min; test >= 0 && test <= max; test++) { int llen = 0; /* line length */ + /* For functions that take a long buffer, like strlcat() + * Add some more chars after the \0, to test functions that overwrite the buffer set + * the \0 at the exact right position. + */ + char buf[10] = "test123456"; + buf[4] = '\0'; + + /* avoid leaving empty lines below, this will insert holes into * test numbers. */ @@ -1007,6 +1034,19 @@ int run_stdlib(int min, int max) CASE_TEST(strchr_foobar_z); EXPECT_STRZR(1, strchr("foobar", 'z')); break; CASE_TEST(strrchr_foobar_o); EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break; CASE_TEST(strrchr_foobar_z); EXPECT_STRZR(1, strrchr("foobar", 'z')); break; +#ifdef NOLIBC + CASE_TEST(strlcat_0); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcat_1); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 1), buf, 4, "test"); break; + CASE_TEST(strlcat_5); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 5), buf, 7, "test"); break; + CASE_TEST(strlcat_6); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 6), buf, 7, "testb"); break; + CASE_TEST(strlcat_7); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 7), buf, 7, "testba"); break; + CASE_TEST(strlcat_8); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 8), buf, 7, "testbar"); break; + CASE_TEST(strlcpy_0); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcpy_1); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 1), buf, 3, ""); break; + CASE_TEST(strlcpy_2); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 2), buf, 3, "b"); break; + CASE_TEST(strlcpy_3); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 3), buf, 3, "ba"); break; + CASE_TEST(strlcpy_4); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 4), buf, 3, "bar"); break; +#endif CASE_TEST(memcmp_20_20); EXPECT_EQ(1, memcmp("aaa\x20", "aaa\x20", 4), 0); break; CASE_TEST(memcmp_20_60); EXPECT_LT(1, memcmp("aaa\x20", "aaa\x60", 4), 0); break; CASE_TEST(memcmp_60_20); EXPECT_GT(1, memcmp("aaa\x60", "aaa\x20", 4), 0); break; -- cgit v1.2.3 From e93b912ecf6ab113c9d4ec9ced2fa30dfac24c70 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 10 Apr 2024 23:27:06 +0200 Subject: tools/nolibc/string: remove open-coded strnlen() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The same header already defines an implementation of strnlen(), so use it. Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/string.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 565230a4ad47..f9ab28421e6d 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -187,12 +187,7 @@ char *strndup(const char *str, size_t maxlen) static __attribute__((unused)) size_t strlcat(char *dst, const char *src, size_t size) { - size_t len = 0; - - for (; len < size; len++) { - if (dst[len] == '\0') - break; - } + size_t len = strnlen(dst, size); /* * We want len < size-1. But as size is unsigned and can wrap -- cgit v1.2.3 From 7e36c3372fd5a1e76e0f6028d7ef64a8ace5eb55 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Apr 2024 14:08:16 +0300 Subject: selftests: fib_rule_tests: Add VRF tests After commit 40867d74c374 ("net: Add l3mdev index to flow struct and avoid oif reset for port devices") it is possible to configure FIB rules that match on iif / oif being a l3mdev port. It was not possible before as these parameters were reset to the ifindex of the l3mdev device itself prior to the FIB rules lookup. Add tests that cover this functionality as it does not seem to be covered by existing ones and I am aware of at least one user that needs this functionality in addition to the one mentioned in [1]. Reuse the existing FIB rules tests by simply configuring a VRF prior to the test and removing it afterwards. Differentiate the output of the non-VRF tests from the VRF tests by appending "(VRF)" to the test name if a l3mdev FIB rule is present. Verified that these tests do fail on kernel 5.15.y which does not include the previously mentioned commit: # ./fib_rule_tests.sh -t fib_rule6_vrf [...] TEST: rule6 check: oif redirect to table (VRF) [FAIL] [...] TEST: rule6 check: iif redirect to table (VRF) [FAIL] # ./fib_rule_tests.sh -t fib_rule4_vrf [...] TEST: rule4 check: oif redirect to table (VRF) [FAIL] [...] TEST: rule4 check: iif redirect to table (VRF) [FAIL] [1] https://lore.kernel.org/netdev/20200922131122.GB1601@ICIPI.localdomain/ Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240409110816.2508498-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 46 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 51157a5559b7..7c01f58a20de 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -9,6 +9,7 @@ PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} RTABLE=100 RTABLE_PEER=101 +RTABLE_VRF=102 GW_IP4=192.51.100.2 SRC_IP=192.51.100.3 GW_IP6=2001:db8:1::2 @@ -17,7 +18,14 @@ SRC_IP6=2001:db8:1::3 DEV_ADDR=192.51.100.1 DEV_ADDR6=2001:db8:1::1 DEV=dummy0 -TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect" +TESTS=" + fib_rule6 + fib_rule4 + fib_rule6_connect + fib_rule4_connect + fib_rule6_vrf + fib_rule4_vrf +" SELFTEST_PATH="" @@ -27,13 +35,18 @@ log_test() local expected=$2 local msg="$3" + $IP rule show | grep -q l3mdev + if [ $? -eq 0 ]; then + msg="$msg (VRF)" + fi + if [ ${rc} -eq ${expected} ]; then nsuccess=$((nsuccess+1)) - printf "\n TEST: %-50s [ OK ]\n" "${msg}" + printf "\n TEST: %-60s [ OK ]\n" "${msg}" else ret=1 nfail=$((nfail+1)) - printf "\n TEST: %-50s [FAIL]\n" "${msg}" + printf "\n TEST: %-60s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" @@ -130,6 +143,17 @@ cleanup_peer() ip netns del $peerns } +setup_vrf() +{ + $IP link add name vrf0 up type vrf table $RTABLE_VRF + $IP link set dev $DEV master vrf0 +} + +cleanup_vrf() +{ + $IP link del dev vrf0 +} + fib_check_iproute_support() { ip rule help 2>&1 | grep -q $1 @@ -248,6 +272,13 @@ fib_rule6_test() fi } +fib_rule6_vrf_test() +{ + setup_vrf + fib_rule6_test + cleanup_vrf +} + # Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly # taken into account when connecting the socket and when sending packets. fib_rule6_connect_test() @@ -385,6 +416,13 @@ fib_rule4_test() fi } +fib_rule4_vrf_test() +{ + setup_vrf + fib_rule4_test + cleanup_vrf +} + # Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken # into account when connecting the socket and when sending packets. fib_rule4_connect_test() @@ -467,6 +505,8 @@ do fib_rule4_test|fib_rule4) fib_rule4_test;; fib_rule6_connect_test|fib_rule6_connect) fib_rule6_connect_test;; fib_rule4_connect_test|fib_rule4_connect) fib_rule4_connect_test;; + fib_rule6_vrf_test|fib_rule6_vrf) fib_rule6_vrf_test;; + fib_rule4_vrf_test|fib_rule4_vrf) fib_rule4_vrf_test;; help) echo "Test names: $TESTS"; exit 0;; -- cgit v1.2.3 From 699c23f02c65cbfc3e638f14ce0d70c23a2e1f02 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:27 -0700 Subject: bpf: Add bpf_link support for sk_msg and sk_skb progs Add bpf_link support for sk_msg and sk_skb programs. We have an internal request to support bpf_link for sk_msg programs so user space can have a uniform handling with bpf_link based libbpf APIs. Using bpf_link based libbpf API also has a benefit which makes system robust by decoupling prog life cycle and attachment life cycle. Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043527.3737160-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 + include/linux/skmsg.h | 4 + include/uapi/linux/bpf.h | 5 + kernel/bpf/syscall.c | 4 + net/core/sock_map.c | 263 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 5 + 6 files changed, 271 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 62762390c93d..5034c1b4ded7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2996,6 +2996,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); @@ -3094,6 +3095,11 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd2799..9c8dd4c01412 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,10 @@ struct sk_psock_progs { struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; + struct bpf_link *msg_parser_link; + struct bpf_link *stream_parser_link; + struct bpf_link *stream_verdict_link; + struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6fe9f11c8abe..cee0a7915c08 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -6724,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e44c276e8617..7d392ec83655 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5213,6 +5213,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: + ret = sock_map_link_create(attr, prog); + break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 27d733c0f65e..63c016b4c169 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); fdput(f); return ret; } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: @@ -1454,55 +1466,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. + */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1527,7 +1568,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1657,6 +1698,196 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. */ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. + */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6fe9f11c8abe..cee0a7915c08 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -6724,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From 849989af61added13b2a9005608b1cf46f36f88b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:32 -0700 Subject: libbpf: Add bpf_link support for BPF_PROG_TYPE_SOCKMAP Introduce a libbpf API function bpf_program__attach_sockmap() which allow user to get a bpf_link for their corresponding programs. Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043532.3737722-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 7 +++++++ tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b091154bc5b5..97eb6e5dd7c8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -149,6 +149,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_TCX] = "tcx", [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", [BPF_LINK_TYPE_NETKIT] = "netkit", + [BPF_LINK_TYPE_SOCKMAP] = "sockmap", }; static const char * const map_type_name[] = { @@ -12533,6 +12534,12 @@ bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) return bpf_program_attach_fd(prog, netns_fd, "netns", NULL); } +struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd) +{ + return bpf_program_attach_fd(prog, map_fd, "sockmap", NULL); +} + struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) { /* target_fd/target_ifindex use the same field in LINK_CREATE */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 4f775a6dcaa0..1333ae20ebe6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -795,6 +795,8 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); LIBBPF_API struct bpf_link * bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); LIBBPF_API struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd); +LIBBPF_API struct bpf_link * bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 23d82bba021a..c1ce8aa3520b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -419,6 +419,7 @@ LIBBPF_1.4.0 { LIBBPF_1.5.0 { global: + bpf_program__attach_sockmap; ring__consume_n; ring_buffer__consume_n; } LIBBPF_1.4.0; -- cgit v1.2.3 From 1f3e2091d25b2b140967480177fcaee2f0eebfb1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:37 -0700 Subject: bpftool: Add link dump support for BPF_LINK_TYPE_SOCKMAP An example output looks like: $ bpftool link 1776: sk_skb prog 49730 map_id 0 attach_type sk_skb_verdict pids test_progs(8424) 1777: sk_skb prog 49755 map_id 0 attach_type sk_skb_stream_verdict pids test_progs(8424) 1778: sk_msg prog 49770 map_id 8208 attach_type sk_msg_verdict pids test_progs(8424) Reviewed-by: John Fastabend Reviewed-by: Quentin Monnet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043537.3737928-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/link.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index afde9d0c2ea1..5cd503b763d7 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -526,6 +526,10 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) show_link_ifindex_json(info->netkit.ifindex, json_wtr); show_link_attach_type_json(info->netkit.attach_type, json_wtr); break; + case BPF_LINK_TYPE_SOCKMAP: + jsonw_uint_field(json_wtr, "map_id", info->sockmap.map_id); + show_link_attach_type_json(info->sockmap.attach_type, json_wtr); + break; case BPF_LINK_TYPE_XDP: show_link_ifindex_json(info->xdp.ifindex, json_wtr); break; @@ -915,6 +919,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) show_link_ifindex_plain(info->netkit.ifindex); show_link_attach_type_plain(info->netkit.attach_type); break; + case BPF_LINK_TYPE_SOCKMAP: + printf("\n\t"); + printf("map_id %u ", info->sockmap.map_id); + show_link_attach_type_plain(info->sockmap.attach_type); + break; case BPF_LINK_TYPE_XDP: printf("\n\t"); show_link_ifindex_plain(info->xdp.ifindex); -- cgit v1.2.3 From a15d58b2bc82abd8c4c994af158b0410424a18d3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:42 -0700 Subject: selftests/bpf: Refactor out helper functions for a few tests These helper functions will be used later new tests as well. There are no functionality change. Acked-by: Eduard Zingerman Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043542.3738166-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 38 +++++++++++++--------- .../selftests/bpf/progs/test_skmsg_load_helpers.c | 9 +++-- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 77e26ecffa9d..63fb2da7930a 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -475,30 +475,19 @@ out: test_sockmap_drop_prog__destroy(drop); } -static void test_sockmap_skb_verdict_peek(void) +static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; - struct test_sockmap_pass_prog *pass; + int err, s, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - pass = test_sockmap_pass_prog__open_and_load(); - if (!ASSERT_OK_PTR(pass, "open_and_load")) - return; - verdict = bpf_program__fd(pass->progs.prog_skb_verdict); - map = bpf_map__fd(pass->maps.sock_map_rx); - - err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach")) - goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; + return; err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); if (!ASSERT_OK(err, "create_pairs(s)")) - goto out; + return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) @@ -520,6 +509,25 @@ static void test_sockmap_skb_verdict_peek(void) out_close: close(c1); close(p1); +} + +static void test_sockmap_skb_verdict_peek(void) +{ + struct test_sockmap_pass_prog *pass; + int err, map, verdict; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); + map = bpf_map__fd(pass->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + out: test_sockmap_pass_prog__destroy(pass); } diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c index 45e8fc75a739..b753672f04c9 100644 --- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -24,8 +24,7 @@ struct { __type(value, __u64); } socket_storage SEC(".maps"); -SEC("sk_msg") -int prog_msg_verdict(struct sk_msg_md *msg) +static int prog_msg_verdict_common(struct sk_msg_md *msg) { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); int verdict = SK_PASS; @@ -44,4 +43,10 @@ int prog_msg_verdict(struct sk_msg_md *msg) return verdict; } +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 8ba218e625f0dfb3ef46fe0721dcdf565726ff76 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:47 -0700 Subject: selftests/bpf: Add some tests with new bpf_program__attach_sockmap() APIs Add a few more tests in sockmap_basic.c and sockmap_listen.c to test bpf_link based APIs for SK_MSG and SK_SKB programs. Link attach/detach/update are all tested. All tests are passed. Acked-by: Eduard Zingerman Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043547.3738448-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 133 +++++++++++++++++++++ .../selftests/bpf/prog_tests/sockmap_listen.c | 38 ++++++ .../selftests/bpf/progs/test_skmsg_load_helpers.c | 18 +++ .../selftests/bpf/progs/test_sockmap_pass_prog.c | 17 ++- .../bpf/progs/test_sockmap_skb_verdict_attach.c | 2 +- 5 files changed, 206 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 63fb2da7930a..1337153eb0ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -131,6 +131,65 @@ out: test_skmsg_load_helpers__destroy(skel); } +static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) +{ + struct bpf_program *prog, *prog_clone, *prog_clone2; + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts); + struct test_skmsg_load_helpers *skel; + struct bpf_link *link, *link2; + int err, map; + + skel = test_skmsg_load_helpers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load")) + return; + + prog = skel->progs.prog_msg_verdict; + prog_clone = skel->progs.prog_msg_verdict_clone; + prog_clone2 = skel->progs.prog_msg_verdict_clone2; + map = bpf_map__fd(skel->maps.sock_map); + + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + /* Fail since bpf_link for the same prog has been created. */ + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_ERR(err, "bpf_prog_attach")) + goto out; + + /* Fail since bpf_link for the same prog type has been created. */ + link2 = bpf_program__attach_sockmap(prog_clone, map); + if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) { + bpf_link__detach(link2); + goto out; + } + + err = bpf_link__update_program(link, prog_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different type attempts to do update. */ + err = bpf_link__update_program(link, skel->progs.prog_skb_verdict); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + /* Fail since the old prog does not match the one in the kernel. */ + opts.old_prog_fd = bpf_program__fd(prog_clone2); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_ERR(err, "bpf_link_update")) + goto out; + + opts.old_prog_fd = bpf_program__fd(prog_clone); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_link_update")) + goto out; +out: + bpf_link__detach(link); + test_skmsg_load_helpers__destroy(skel); +} + static void test_sockmap_update(enum bpf_map_type map_type) { int err, prog, src; @@ -298,6 +357,40 @@ out: test_sockmap_skb_verdict_attach__destroy(skel); } +static void test_sockmap_skb_verdict_attach_with_link(void) +{ + struct test_sockmap_skb_verdict_attach *skel; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + prog = skel->progs.prog_skb_verdict; + map = bpf_map__fd(skel->maps.sock_map); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + bpf_link__detach(link); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + /* Fail since attaching with the same prog/map has been done. */ + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap")) + bpf_link__detach(link); + + err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT); + if (!ASSERT_OK(err, "bpf_prog_detach2")) + goto out; +out: + test_sockmap_skb_verdict_attach__destroy(skel); +} + static __u32 query_prog_id(int prog_fd) { struct bpf_prog_info info = {}; @@ -532,6 +625,38 @@ out: test_sockmap_pass_prog__destroy(pass); } +static void test_sockmap_skb_verdict_peek_with_link(void) +{ + struct test_sockmap_pass_prog *pass; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + prog = pass->progs.prog_skb_verdict; + map = bpf_map__fd(pass->maps.sock_map_rx); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + err = bpf_link__update_program(link, pass->progs.prog_skb_verdict_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different attach type attempts to do update. */ + err = bpf_link__update_program(link, pass->progs.prog_skb_parser); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + ASSERT_EQ(pass->bss->clone_called, 1, "clone_called"); +out: + bpf_link__detach(link); + test_sockmap_pass_prog__destroy(pass); +} + static void test_sockmap_unconnected_unix(void) { int err, map, stream = 0, dgram = 0, zero = 0; @@ -796,6 +921,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, BPF_SK_SKB_VERDICT); } + if (test__start_subtest("sockmap skb_verdict attach_with_link")) + test_sockmap_skb_verdict_attach_with_link(); if (test__start_subtest("sockmap msg_verdict progs query")) test_sockmap_progs_query(BPF_SK_MSG_VERDICT); if (test__start_subtest("sockmap stream_parser progs query")) @@ -812,6 +939,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); + if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) + test_sockmap_skb_verdict_peek_with_link(); if (test__start_subtest("sockmap unconnected af_unix")) test_sockmap_unconnected_unix(); if (test__start_subtest("sockmap one socket to many map entries")) @@ -820,4 +949,8 @@ void test_sockmap_basic(void) test_sockmap_many_maps(); if (test__start_subtest("sockmap same socket replace")) test_sockmap_same_sock(); + if (test__start_subtest("sockmap sk_msg attach sockmap helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg attach sockhash helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index a92807bfcd13..e91b59366030 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -767,6 +767,24 @@ static void test_msg_redir_to_connected(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_connected_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int prog_msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int link_fd; + + link_fd = bpf_link_create(prog_msg_verdict, sock_map, BPF_SK_MSG_VERDICT, NULL); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + return; + + redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + close(link_fd); +} + static void redir_to_listening(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { @@ -869,6 +887,24 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + struct bpf_program *verdict = skel->progs.prog_msg_verdict; + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + struct bpf_link *link; + + link = bpf_program__attach_sockmap(verdict, sock_map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + return; + + redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + bpf_link__detach(link); +} + static void redir_partial(int family, int sotype, int sock_map, int parser_map) { int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; @@ -1316,7 +1352,9 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, TEST(test_skb_redir_to_listening), TEST(test_skb_redir_partial), TEST(test_msg_redir_to_connected), + TEST(test_msg_redir_to_connected_with_link), TEST(test_msg_redir_to_listening), + TEST(test_msg_redir_to_listening_with_link), }; const char *family_name, *map_name; const struct redir_test *t; diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c index b753672f04c9..996b177324ba 100644 --- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -49,4 +49,22 @@ int prog_msg_verdict(struct sk_msg_md *msg) return prog_msg_verdict_common(msg); } +SEC("sk_msg") +int prog_msg_verdict_clone(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone2(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 1d86a717a290..69aacc96db36 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -23,10 +23,25 @@ struct { __type(value, int); } sock_map_msg SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/stream_verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_PASS; } +int clone_called; + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_clone(struct __sk_buff *skb) +{ + clone_called = 1; + return SK_PASS; +} + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c index 3c69aa971738..d25b0bb30fc0 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -9,7 +9,7 @@ struct { __type(value, __u64); } sock_map SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_DROP; -- cgit v1.2.3 From ffa6b26b4d8a0520b78636ca9373ab842cb3b1a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 10 Apr 2024 08:33:26 -0700 Subject: selftests/bpf: Enable tests for atomics with cpuv4 When looking at Alexei's patch ([1]) which added tests for atomics, I noticed that the tests will be skipped with cpuv4. For example, with latest llvm19, I see: [root@arch-fb-vm1 bpf]# ./test_progs -t arena_atomics #3/1 arena_atomics/add:OK ... #3/7 arena_atomics/xchg:OK #3 arena_atomics:OK Summary: 1/7 PASSED, 0 SKIPPED, 0 FAILED [root@arch-fb-vm1 bpf]# ./test_progs-cpuv4 -t arena_atomics #3 arena_atomics:SKIP Summary: 1/0 PASSED, 1 SKIPPED, 0 FAILED [root@arch-fb-vm1 bpf]# It is perfectly fine to enable atomics-related tests for cpuv4. With this patch, I have [root@arch-fb-vm1 bpf]# ./test_progs-cpuv4 -t arena_atomics #3/1 arena_atomics/add:OK ... #3/7 arena_atomics/xchg:OK #3 arena_atomics:OK Summary: 1/7 PASSED, 0 SKIPPED, 0 FAILED [1] https://lore.kernel.org/r/20240405231134.17274-2-alexei.starovoitov@gmail.com Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410153326.1851055-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index eea5b8deaaf0..edc73f8f5aef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -658,7 +658,7 @@ $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs-cpuv4 test runner. ifneq ($(CLANG_CPUV4),) TRUNNER_BPF_BUILD_RULE := CLANG_CPUV4_BPF_BUILD_RULE -TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS $(eval $(call DEFINE_TEST_RUNNER,test_progs,cpuv4)) endif -- cgit v1.2.3 From a36b69775fcaab42262f6dbf288ccb6d470158ae Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 8 Mar 2024 09:51:22 +0100 Subject: ndtest: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Reviewed-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/c04bfc941a9f5d249b049572c1ae122fe551ee5d.1709886922.git.u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- tools/testing/nvdimm/test/ndtest.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c index b8419f460368..2c6285aae852 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -830,12 +830,11 @@ static int ndtest_bus_register(struct ndtest_priv *p) return 0; } -static int ndtest_remove(struct platform_device *pdev) +static void ndtest_remove(struct platform_device *pdev) { struct ndtest_priv *p = to_ndtest_priv(&pdev->dev); nvdimm_bus_unregister(p->bus); - return 0; } static int ndtest_probe(struct platform_device *pdev) @@ -882,7 +881,7 @@ static const struct platform_device_id ndtest_id[] = { static struct platform_driver ndtest_driver = { .probe = ndtest_probe, - .remove = ndtest_remove, + .remove_new = ndtest_remove, .driver = { .name = KBUILD_MODNAME, }, -- cgit v1.2.3 From 45bab4d746510ac2bdd9102bf152db617ef96a6b Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Sat, 30 Mar 2024 01:52:00 -0700 Subject: tools: hv: Add vmbus_bufring Common userspace interface for read/write from VMBus ringbuffer. This implementation is open for use by any userspace driver or application seeking direct control over VMBus ring buffers. A significant part of this code is borrowed from DPDK. Link: https://github.com/DPDK/dpdk/ Currently this library is not supported for ARM64. Signed-off-by: Mary Hardy Signed-off-by: Saurabh Sengar Reviewed-by: Long Li Link: https://lore.kernel.org/r/1711788723-8593-5-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Greg Kroah-Hartman --- tools/hv/vmbus_bufring.c | 318 +++++++++++++++++++++++++++++++++++++++++++++++ tools/hv/vmbus_bufring.h | 158 +++++++++++++++++++++++ 2 files changed, 476 insertions(+) create mode 100644 tools/hv/vmbus_bufring.c create mode 100644 tools/hv/vmbus_bufring.h (limited to 'tools') diff --git a/tools/hv/vmbus_bufring.c b/tools/hv/vmbus_bufring.c new file mode 100644 index 000000000000..bac32c1109df --- /dev/null +++ b/tools/hv/vmbus_bufring.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Copyright (c) 2009-2012,2016,2023 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vmbus_bufring.h" + +/** + * Compiler barrier. + * + * Guarantees that operation reordering does not occur at compile time + * for operations directly before and after the barrier. + */ +#define rte_compiler_barrier() ({ asm volatile ("" : : : "memory"); }) + +#define VMBUS_RQST_ERROR 0xFFFFFFFFFFFFFFFF +#define ALIGN(val, align) ((typeof(val))((val) & (~((typeof(val))((align) - 1))))) + +void *vmbus_uio_map(int *fd, int size) +{ + void *map; + + map = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0); + if (map == MAP_FAILED) + return NULL; + + return map; +} + +/* Increase bufring index by inc with wraparound */ +static inline uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz) +{ + idx += inc; + if (idx >= sz) + idx -= sz; + + return idx; +} + +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen) +{ + br->vbr = buf; + br->windex = br->vbr->windex; + br->dsize = blen - sizeof(struct vmbus_bufring); +} + +static inline __always_inline void +rte_smp_mb(void) +{ + asm volatile("lock addl $0, -128(%%rsp); " ::: "memory"); +} + +static inline int +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) +{ + uint8_t res; + + asm volatile("lock ; " + "cmpxchgl %[src], %[dst];" + "sete %[res];" + : [res] "=a" (res), /* output */ + [dst] "=m" (*dst) + : [src] "r" (src), /* input */ + "a" (exp), + "m" (*dst) + : "memory"); /* no-clobber list */ + return res; +} + +static inline uint32_t +vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex, + const void *src0, uint32_t cplen) +{ + uint8_t *br_data = tbr->vbr->data; + uint32_t br_dsize = tbr->dsize; + const uint8_t *src = src0; + + /* XXX use double mapping like Linux kernel? */ + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + memcpy(br_data + windex, src, fraglen); + memcpy(br_data, src + fraglen, cplen - fraglen); + } else { + memcpy(br_data + windex, src, cplen); + } + + return vmbus_br_idxinc(windex, cplen, br_dsize); +} + +/* + * Write scattered channel packet to TX bufring. + * + * The offset of this channel packet is written as a 64bits value + * immediately after this channel packet. + * + * The write goes through three stages: + * 1. Reserve space in ring buffer for the new data. + * Writer atomically moves priv_write_index. + * 2. Copy the new data into the ring. + * 3. Update the tail of the ring (visible to host) that indicates + * next read location. Writer updates write_index + */ +static int +vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen) +{ + struct vmbus_bufring *vbr = tbr->vbr; + uint32_t ring_size = tbr->dsize; + uint32_t old_windex, next_windex, windex, total; + uint64_t save_windex; + int i; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + /* Reserve space in ring */ + do { + uint32_t avail; + + /* Get current free location */ + old_windex = tbr->windex; + + /* Prevent compiler reordering this with calculation */ + rte_compiler_barrier(); + + avail = vmbus_br_availwrite(tbr, old_windex); + + /* If not enough space in ring, then tell caller. */ + if (avail <= total) + return -EAGAIN; + + next_windex = vmbus_br_idxinc(old_windex, total, ring_size); + + /* Atomic update of next write_index for other threads */ + } while (!rte_atomic32_cmpset(&tbr->windex, old_windex, next_windex)); + + /* Space from old..new is now reserved */ + windex = old_windex; + for (i = 0; i < iovlen; i++) + windex = vmbus_txbr_copyto(tbr, windex, iov[i].iov_base, iov[i].iov_len); + + /* Set the offset of the current channel packet. */ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* The region reserved should match region used */ + if (windex != next_windex) + return -EINVAL; + + /* Ensure that data is available before updating host index */ + rte_compiler_barrier(); + + /* Checkin for our reservation. wait for our turn to update host */ + while (!rte_atomic32_cmpset(&vbr->windex, old_windex, next_windex)) + _mm_pause(); + + return 0; +} + +int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void *data, + uint32_t dlen, uint32_t flags) +{ + struct vmbus_chanpkt pkt; + unsigned int pktlen, pad_pktlen; + const uint32_t hlen = sizeof(pkt); + uint64_t pad = 0; + struct iovec iov[3]; + int error; + + pktlen = hlen + dlen; + pad_pktlen = ALIGN(pktlen, sizeof(uint64_t)); + + pkt.hdr.type = type; + pkt.hdr.flags = flags; + pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.xactid = VMBUS_RQST_ERROR; + + iov[0].iov_base = &pkt; + iov[0].iov_len = hlen; + iov[1].iov_base = data; + iov[1].iov_len = dlen; + iov[2].iov_base = &pad; + iov[2].iov_len = pad_pktlen - pktlen; + + error = vmbus_txbr_write(txbr, iov, 3); + + return error; +} + +static inline uint32_t +vmbus_rxbr_copyfrom(const struct vmbus_br *rbr, uint32_t rindex, + void *dst0, size_t cplen) +{ + const uint8_t *br_data = rbr->vbr->data; + uint32_t br_dsize = rbr->dsize; + uint8_t *dst = dst0; + + if (cplen > br_dsize - rindex) { + uint32_t fraglen = br_dsize - rindex; + + /* Wrap-around detected. */ + memcpy(dst, br_data + rindex, fraglen); + memcpy(dst + fraglen, br_data, cplen - fraglen); + } else { + memcpy(dst, br_data + rindex, cplen); + } + + return vmbus_br_idxinc(rindex, cplen, br_dsize); +} + +/* Copy data from receive ring but don't change index */ +static int +vmbus_rxbr_peek(const struct vmbus_br *rbr, void *data, size_t dlen) +{ + uint32_t avail; + + /* + * The requested data and the 64bits channel packet + * offset should be there at least. + */ + avail = vmbus_br_availread(rbr); + if (avail < dlen + sizeof(uint64_t)) + return -EAGAIN; + + vmbus_rxbr_copyfrom(rbr, rbr->vbr->rindex, data, dlen); + return 0; +} + +/* + * Copy data from receive ring and change index + * NOTE: + * We assume (dlen + skip) == sizeof(channel packet). + */ +static int +vmbus_rxbr_read(struct vmbus_br *rbr, void *data, size_t dlen, size_t skip) +{ + struct vmbus_bufring *vbr = rbr->vbr; + uint32_t br_dsize = rbr->dsize; + uint32_t rindex; + + if (vmbus_br_availread(rbr) < dlen + skip + sizeof(uint64_t)) + return -EAGAIN; + + /* Record where host was when we started read (for debug) */ + rbr->windex = rbr->vbr->windex; + + /* + * Copy channel packet from RX bufring. + */ + rindex = vmbus_br_idxinc(rbr->vbr->rindex, skip, br_dsize); + rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen); + + /* + * Discard this channel packet's 64bits offset, which is useless to us. + */ + rindex = vmbus_br_idxinc(rindex, sizeof(uint64_t), br_dsize); + + /* Update the read index _after_ the channel packet is fetched. */ + rte_compiler_barrier(); + + vbr->rindex = rindex; + + return 0; +} + +int rte_vmbus_chan_recv_raw(struct vmbus_br *rxbr, + void *data, uint32_t *len) +{ + struct vmbus_chanpkt_hdr pkt; + uint32_t dlen, bufferlen = *len; + int error; + + error = vmbus_rxbr_peek(rxbr, &pkt, sizeof(pkt)); + if (error) + return error; + + if (unlikely(pkt.hlen < VMBUS_CHANPKT_HLEN_MIN)) + /* XXX this channel is dead actually. */ + return -EIO; + + if (unlikely(pkt.hlen > pkt.tlen)) + return -EIO; + + /* Length are in quad words */ + dlen = pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT; + *len = dlen; + + /* If caller buffer is not large enough */ + if (unlikely(dlen > bufferlen)) + return -ENOBUFS; + + /* Read data and skip packet header */ + error = vmbus_rxbr_read(rxbr, data, dlen, 0); + if (error) + return error; + + /* Return the number of bytes read */ + return dlen + sizeof(uint64_t); +} diff --git a/tools/hv/vmbus_bufring.h b/tools/hv/vmbus_bufring.h new file mode 100644 index 000000000000..6e7caacfff57 --- /dev/null +++ b/tools/hv/vmbus_bufring.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ + +#ifndef _VMBUS_BUF_H_ +#define _VMBUS_BUF_H_ + +#include +#include + +#define __packed __attribute__((__packed__)) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define ICMSGHDRFLAG_TRANSACTION 1 +#define ICMSGHDRFLAG_REQUEST 2 +#define ICMSGHDRFLAG_RESPONSE 4 + +#define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100 +#define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr)) +#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \ + (ICMSG_HDR + sizeof(struct icmsg_negotiate) + \ + (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version))) + +/* + * Channel packets + */ + +/* Channel packet flags */ +#define VMBUS_CHANPKT_TYPE_INBAND 0x0006 +#define VMBUS_CHANPKT_TYPE_RXBUF 0x0007 +#define VMBUS_CHANPKT_TYPE_GPA 0x0009 +#define VMBUS_CHANPKT_TYPE_COMP 0x000b + +#define VMBUS_CHANPKT_FLAG_NONE 0 +#define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */ + +#define VMBUS_CHANPKT_SIZE_SHIFT 3 +#define VMBUS_CHANPKT_SIZE_ALIGN BIT(VMBUS_CHANPKT_SIZE_SHIFT) +#define VMBUS_CHANPKT_HLEN_MIN \ + (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT) + +/* + * Buffer ring + */ +struct vmbus_bufring { + volatile uint32_t windex; + volatile uint32_t rindex; + + /* + * Interrupt mask {0,1} + * + * For TX bufring, host set this to 1, when it is processing + * the TX bufring, so that we can safely skip the TX event + * notification to host. + * + * For RX bufring, once this is set to 1 by us, host will not + * further dispatch interrupts to us, even if there are data + * pending on the RX bufring. This effectively disables the + * interrupt of the channel to which this RX bufring is attached. + */ + volatile uint32_t imask; + + /* + * Win8 uses some of the reserved bits to implement + * interrupt driven flow management. On the send side + * we can request that the receiver interrupt the sender + * when the ring transitions from being full to being able + * to handle a message of size "pending_send_sz". + * + * Add necessary state for this enhancement. + */ + volatile uint32_t pending_send; + uint32_t reserved1[12]; + + union { + struct { + uint32_t feat_pending_send_sz:1; + }; + uint32_t value; + } feature_bits; + + /* Pad it to rte_mem_page_size() so that data starts on page boundary */ + uint8_t reserved2[4028]; + + /* + * Ring data starts here + RingDataStartOffset + * !!! DO NOT place any fields below this !!! + */ + uint8_t data[]; +} __packed; + +struct vmbus_br { + struct vmbus_bufring *vbr; + uint32_t dsize; + uint32_t windex; /* next available location */ +}; + +struct vmbus_chanpkt_hdr { + uint16_t type; /* VMBUS_CHANPKT_TYPE_ */ + uint16_t hlen; /* header len, in 8 bytes */ + uint16_t tlen; /* total len, in 8 bytes */ + uint16_t flags; /* VMBUS_CHANPKT_FLAG_ */ + uint64_t xactid; +} __packed; + +struct vmbus_chanpkt { + struct vmbus_chanpkt_hdr hdr; +} __packed; + +struct vmbuspipe_hdr { + unsigned int flags; + unsigned int msgsize; +} __packed; + +struct ic_version { + unsigned short major; + unsigned short minor; +} __packed; + +struct icmsg_negotiate { + unsigned short icframe_vercnt; + unsigned short icmsg_vercnt; + unsigned int reserved; + struct ic_version icversion_data[]; /* any size array */ +} __packed; + +struct icmsg_hdr { + struct ic_version icverframe; + unsigned short icmsgtype; + struct ic_version icvermsg; + unsigned short icmsgsize; + unsigned int status; + unsigned char ictransaction_id; + unsigned char icflags; + unsigned char reserved[2]; +} __packed; + +int rte_vmbus_chan_recv_raw(struct vmbus_br *rxbr, void *data, uint32_t *len); +int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void *data, + uint32_t dlen, uint32_t flags); +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen); +void *vmbus_uio_map(int *fd, int size); + +/* Amount of space available for write */ +static inline uint32_t vmbus_br_availwrite(const struct vmbus_br *br, uint32_t windex) +{ + uint32_t rindex = br->vbr->rindex; + + if (windex >= rindex) + return br->dsize - (windex - rindex); + else + return rindex - windex; +} + +static inline uint32_t vmbus_br_availread(const struct vmbus_br *br) +{ + return br->dsize - vmbus_br_availwrite(br, br->vbr->windex); +} + +#endif /* !_VMBUS_BUF_H_ */ -- cgit v1.2.3 From 82b0945ce2c2d636d5e893ad50210875c929f257 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Sat, 30 Mar 2024 01:52:01 -0700 Subject: tools: hv: Add new fcopy application based on uio driver New fcopy application using uio_hv_generic driver. This application copies file from Hyper-V host to guest VM. A big part of this code is copied from tools/hv/hv_fcopy_daemon.c which this new application is replacing. Signed-off-by: Saurabh Sengar Reviewed-by: Long Li Link: https://lore.kernel.org/r/1711788723-8593-6-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Greg Kroah-Hartman --- tools/hv/Build | 3 +- tools/hv/Makefile | 14 +- tools/hv/hv_fcopy_uio_daemon.c | 490 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 501 insertions(+), 6 deletions(-) create mode 100644 tools/hv/hv_fcopy_uio_daemon.c (limited to 'tools') diff --git a/tools/hv/Build b/tools/hv/Build index 6cf51fa4b306..7d1f1698069b 100644 --- a/tools/hv/Build +++ b/tools/hv/Build @@ -1,3 +1,4 @@ hv_kvp_daemon-y += hv_kvp_daemon.o hv_vss_daemon-y += hv_vss_daemon.o -hv_fcopy_daemon-y += hv_fcopy_daemon.o +hv_fcopy_uio_daemon-y += hv_fcopy_uio_daemon.o +hv_fcopy_uio_daemon-y += vmbus_bufring.o diff --git a/tools/hv/Makefile b/tools/hv/Makefile index fe770e679ae8..bb52871da341 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -2,6 +2,7 @@ # Makefile for Hyper-V tools include ../scripts/Makefile.include +ARCH := $(shell uname -m 2>/dev/null) sbindir ?= /usr/sbin libexecdir ?= /usr/libexec sharedstatedir ?= /var/lib @@ -17,7 +18,10 @@ MAKEFLAGS += -r override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include -ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ALL_TARGETS := hv_kvp_daemon hv_vss_daemon +ifneq ($(ARCH), aarch64) +ALL_TARGETS += hv_fcopy_uio_daemon +endif ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh @@ -39,10 +43,10 @@ $(HV_VSS_DAEMON_IN): FORCE $(OUTPUT)hv_vss_daemon: $(HV_VSS_DAEMON_IN) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ -HV_FCOPY_DAEMON_IN := $(OUTPUT)hv_fcopy_daemon-in.o -$(HV_FCOPY_DAEMON_IN): FORCE - $(Q)$(MAKE) $(build)=hv_fcopy_daemon -$(OUTPUT)hv_fcopy_daemon: $(HV_FCOPY_DAEMON_IN) +HV_FCOPY_UIO_DAEMON_IN := $(OUTPUT)hv_fcopy_uio_daemon-in.o +$(HV_FCOPY_UIO_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_fcopy_uio_daemon +$(OUTPUT)hv_fcopy_uio_daemon: $(HV_FCOPY_UIO_DAEMON_IN) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ clean: diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c new file mode 100644 index 000000000000..3ce316cc9f97 --- /dev/null +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * An implementation of host to guest copy functionality for Linux. + * + * Copyright (C) 2023, Microsoft, Inc. + * + * Author : K. Y. Srinivasan + * Author : Saurabh Sengar + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vmbus_bufring.h" + +#define ICMSGTYPE_NEGOTIATE 0 +#define ICMSGTYPE_FCOPY 7 + +#define WIN8_SRV_MAJOR 1 +#define WIN8_SRV_MINOR 1 +#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) + +#define MAX_FOLDER_NAME 15 +#define MAX_PATH_LEN 15 +#define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio" + +#define FCOPY_VER_COUNT 1 +static const int fcopy_versions[] = { + WIN8_SRV_VERSION +}; + +#define FW_VER_COUNT 1 +static const int fw_versions[] = { + UTIL_FW_VERSION +}; + +#define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */ + +unsigned char desc[HV_RING_SIZE]; + +static int target_fd; +static char target_fname[PATH_MAX]; +static unsigned long long filesize; + +static int hv_fcopy_create_file(char *file_name, char *path_name, __u32 flags) +{ + int error = HV_E_FAIL; + char *q, *p; + + filesize = 0; + p = path_name; + snprintf(target_fname, sizeof(target_fname), "%s/%s", + path_name, file_name); + + /* + * Check to see if the path is already in place; if not, + * create if required. + */ + while ((q = strchr(p, '/')) != NULL) { + if (q == p) { + p++; + continue; + } + *q = '\0'; + if (access(path_name, F_OK)) { + if (flags & CREATE_PATH) { + if (mkdir(path_name, 0755)) { + syslog(LOG_ERR, "Failed to create %s", + path_name); + goto done; + } + } else { + syslog(LOG_ERR, "Invalid path: %s", path_name); + goto done; + } + } + p = q + 1; + *q = '/'; + } + + if (!access(target_fname, F_OK)) { + syslog(LOG_INFO, "File: %s exists", target_fname); + if (!(flags & OVER_WRITE)) { + error = HV_ERROR_ALREADY_EXISTS; + goto done; + } + } + + target_fd = open(target_fname, + O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC, 0744); + if (target_fd == -1) { + syslog(LOG_INFO, "Open Failed: %s", strerror(errno)); + goto done; + } + + error = 0; +done: + if (error) + target_fname[0] = '\0'; + return error; +} + +/* copy the data into the file */ +static int hv_copy_data(struct hv_do_fcopy *cpmsg) +{ + ssize_t len; + int ret = 0; + + len = pwrite(target_fd, cpmsg->data, cpmsg->size, cpmsg->offset); + + filesize += cpmsg->size; + if (len != cpmsg->size) { + switch (errno) { + case ENOSPC: + ret = HV_ERROR_DISK_FULL; + break; + default: + ret = HV_E_FAIL; + break; + } + syslog(LOG_ERR, "pwrite failed to write %llu bytes: %ld (%s)", + filesize, (long)len, strerror(errno)); + } + + return ret; +} + +static int hv_copy_finished(void) +{ + close(target_fd); + target_fname[0] = '\0'; + + return 0; +} + +static void print_usage(char *argv[]) +{ + fprintf(stderr, "Usage: %s [options]\n" + "Options are:\n" + " -n, --no-daemon stay in foreground, don't daemonize\n" + " -h, --help print this help\n", argv[0]); +} + +static bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, unsigned char *buf, + unsigned int buflen, const int *fw_version, int fw_vercnt, + const int *srv_version, int srv_vercnt, + int *nego_fw_version, int *nego_srv_version) +{ + int icframe_major, icframe_minor; + int icmsg_major, icmsg_minor; + int fw_major, fw_minor; + int srv_major, srv_minor; + int i, j; + bool found_match = false; + struct icmsg_negotiate *negop; + + /* Check that there's enough space for icframe_vercnt, icmsg_vercnt */ + if (buflen < ICMSG_HDR + offsetof(struct icmsg_negotiate, reserved)) { + syslog(LOG_ERR, "Invalid icmsg negotiate"); + return false; + } + + icmsghdrp->icmsgsize = 0x10; + negop = (struct icmsg_negotiate *)&buf[ICMSG_HDR]; + + icframe_major = negop->icframe_vercnt; + icframe_minor = 0; + + icmsg_major = negop->icmsg_vercnt; + icmsg_minor = 0; + + /* Validate negop packet */ + if (icframe_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT || + icmsg_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT || + ICMSG_NEGOTIATE_PKT_SIZE(icframe_major, icmsg_major) > buflen) { + syslog(LOG_ERR, "Invalid icmsg negotiate - icframe_major: %u, icmsg_major: %u\n", + icframe_major, icmsg_major); + goto fw_error; + } + + /* + * Select the framework version number we will + * support. + */ + + for (i = 0; i < fw_vercnt; i++) { + fw_major = (fw_version[i] >> 16); + fw_minor = (fw_version[i] & 0xFFFF); + + for (j = 0; j < negop->icframe_vercnt; j++) { + if (negop->icversion_data[j].major == fw_major && + negop->icversion_data[j].minor == fw_minor) { + icframe_major = negop->icversion_data[j].major; + icframe_minor = negop->icversion_data[j].minor; + found_match = true; + break; + } + } + + if (found_match) + break; + } + + if (!found_match) + goto fw_error; + + found_match = false; + + for (i = 0; i < srv_vercnt; i++) { + srv_major = (srv_version[i] >> 16); + srv_minor = (srv_version[i] & 0xFFFF); + + for (j = negop->icframe_vercnt; + (j < negop->icframe_vercnt + negop->icmsg_vercnt); + j++) { + if (negop->icversion_data[j].major == srv_major && + negop->icversion_data[j].minor == srv_minor) { + icmsg_major = negop->icversion_data[j].major; + icmsg_minor = negop->icversion_data[j].minor; + found_match = true; + break; + } + } + + if (found_match) + break; + } + + /* + * Respond with the framework and service + * version numbers we can support. + */ +fw_error: + if (!found_match) { + negop->icframe_vercnt = 0; + negop->icmsg_vercnt = 0; + } else { + negop->icframe_vercnt = 1; + negop->icmsg_vercnt = 1; + } + + if (nego_fw_version) + *nego_fw_version = (icframe_major << 16) | icframe_minor; + + if (nego_srv_version) + *nego_srv_version = (icmsg_major << 16) | icmsg_minor; + + negop->icversion_data[0].major = icframe_major; + negop->icversion_data[0].minor = icframe_minor; + negop->icversion_data[1].major = icmsg_major; + negop->icversion_data[1].minor = icmsg_minor; + + return found_match; +} + +static void wcstoutf8(char *dest, const __u16 *src, size_t dest_size) +{ + size_t len = 0; + + while (len < dest_size) { + if (src[len] < 0x80) + dest[len++] = (char)(*src++); + else + dest[len++] = 'X'; + } + + dest[len] = '\0'; +} + +static int hv_fcopy_start(struct hv_start_fcopy *smsg_in) +{ + setlocale(LC_ALL, "en_US.utf8"); + size_t file_size, path_size; + char *file_name, *path_name; + char *in_file_name = (char *)smsg_in->file_name; + char *in_path_name = (char *)smsg_in->path_name; + + file_size = wcstombs(NULL, (const wchar_t *restrict)in_file_name, 0) + 1; + path_size = wcstombs(NULL, (const wchar_t *restrict)in_path_name, 0) + 1; + + file_name = (char *)malloc(file_size * sizeof(char)); + path_name = (char *)malloc(path_size * sizeof(char)); + + wcstoutf8(file_name, (__u16 *)in_file_name, file_size); + wcstoutf8(path_name, (__u16 *)in_path_name, path_size); + + return hv_fcopy_create_file(file_name, path_name, smsg_in->copy_flags); +} + +static int hv_fcopy_send_data(struct hv_fcopy_hdr *fcopy_msg, int recvlen) +{ + int operation = fcopy_msg->operation; + + /* + * The strings sent from the host are encoded in + * utf16; convert it to utf8 strings. + * The host assures us that the utf16 strings will not exceed + * the max lengths specified. We will however, reserve room + * for the string terminating character - in the utf16s_utf8s() + * function we limit the size of the buffer where the converted + * string is placed to W_MAX_PATH -1 to guarantee + * that the strings can be properly terminated! + */ + + switch (operation) { + case START_FILE_COPY: + return hv_fcopy_start((struct hv_start_fcopy *)fcopy_msg); + case WRITE_TO_FILE: + return hv_copy_data((struct hv_do_fcopy *)fcopy_msg); + case COMPLETE_FCOPY: + return hv_copy_finished(); + } + + return HV_E_FAIL; +} + +/* process the packet recv from host */ +static int fcopy_pkt_process(struct vmbus_br *txbr) +{ + int ret, offset, pktlen; + int fcopy_srv_version; + const struct vmbus_chanpkt_hdr *pkt; + struct hv_fcopy_hdr *fcopy_msg; + struct icmsg_hdr *icmsghdr; + + pkt = (const struct vmbus_chanpkt_hdr *)desc; + offset = pkt->hlen << 3; + pktlen = (pkt->tlen << 3) - offset; + icmsghdr = (struct icmsg_hdr *)&desc[offset + sizeof(struct vmbuspipe_hdr)]; + icmsghdr->status = HV_E_FAIL; + + if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { + if (vmbus_prep_negotiate_resp(icmsghdr, desc + offset, pktlen, fw_versions, + FW_VER_COUNT, fcopy_versions, FCOPY_VER_COUNT, + NULL, &fcopy_srv_version)) { + syslog(LOG_INFO, "FCopy IC version %d.%d", + fcopy_srv_version >> 16, fcopy_srv_version & 0xFFFF); + icmsghdr->status = 0; + } + } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) { + /* Ensure recvlen is big enough to contain hv_fcopy_hdr */ + if (pktlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) { + syslog(LOG_ERR, "Invalid Fcopy hdr. Packet length too small: %u", + pktlen); + return -ENOBUFS; + } + + fcopy_msg = (struct hv_fcopy_hdr *)&desc[offset + ICMSG_HDR]; + icmsghdr->status = hv_fcopy_send_data(fcopy_msg, pktlen); + } + + icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; + ret = rte_vmbus_chan_send(txbr, 0x6, desc + offset, pktlen, 0); + if (ret) { + syslog(LOG_ERR, "Write to ringbuffer failed err: %d", ret); + return ret; + } + + return 0; +} + +static void fcopy_get_first_folder(char *path, char *chan_no) +{ + DIR *dir = opendir(path); + struct dirent *entry; + + if (!dir) { + syslog(LOG_ERR, "Failed to open directory (errno=%s).\n", strerror(errno)); + return; + } + + while ((entry = readdir(dir)) != NULL) { + if (entry->d_type == DT_DIR && strcmp(entry->d_name, ".") != 0 && + strcmp(entry->d_name, "..") != 0) { + strcpy(chan_no, entry->d_name); + break; + } + } + + closedir(dir); +} + +int main(int argc, char *argv[]) +{ + int fcopy_fd = -1, tmp = 1; + int daemonize = 1, long_index = 0, opt, ret = -EINVAL; + struct vmbus_br txbr, rxbr; + void *ring; + uint32_t len = HV_RING_SIZE; + char uio_name[MAX_FOLDER_NAME] = {0}; + char uio_dev_path[MAX_PATH_LEN] = {0}; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"no-daemon", no_argument, 0, 'n' }, + {0, 0, 0, 0 } + }; + + while ((opt = getopt_long(argc, argv, "hn", long_options, + &long_index)) != -1) { + switch (opt) { + case 'n': + daemonize = 0; + break; + case 'h': + default: + print_usage(argv); + goto exit; + } + } + + if (daemonize && daemon(1, 0)) { + syslog(LOG_ERR, "daemon() failed; error: %s", strerror(errno)); + goto exit; + } + + openlog("HV_UIO_FCOPY", 0, LOG_USER); + syslog(LOG_INFO, "starting; pid is:%d", getpid()); + + fcopy_get_first_folder(FCOPY_UIO, uio_name); + snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name); + fcopy_fd = open(uio_dev_path, O_RDWR); + + if (fcopy_fd < 0) { + syslog(LOG_ERR, "open %s failed; error: %d %s", + uio_dev_path, errno, strerror(errno)); + ret = fcopy_fd; + goto exit; + } + + ring = vmbus_uio_map(&fcopy_fd, HV_RING_SIZE); + if (!ring) { + ret = errno; + syslog(LOG_ERR, "mmap ringbuffer failed; error: %d %s", ret, strerror(ret)); + goto close; + } + vmbus_br_setup(&txbr, ring, HV_RING_SIZE); + vmbus_br_setup(&rxbr, (char *)ring + HV_RING_SIZE, HV_RING_SIZE); + + rxbr.vbr->imask = 0; + + while (1) { + /* + * In this loop we process fcopy messages after the + * handshake is complete. + */ + ret = pread(fcopy_fd, &tmp, sizeof(int), 0); + if (ret < 0) { + syslog(LOG_ERR, "pread failed: %s", strerror(errno)); + continue; + } + + len = HV_RING_SIZE; + ret = rte_vmbus_chan_recv_raw(&rxbr, desc, &len); + if (unlikely(ret <= 0)) { + /* This indicates a failure to communicate (or worse) */ + syslog(LOG_ERR, "VMBus channel recv error: %d", ret); + } else { + ret = fcopy_pkt_process(&txbr); + if (ret < 0) + goto close; + + /* Signal host */ + if ((write(fcopy_fd, &tmp, sizeof(int))) != sizeof(int)) { + ret = errno; + syslog(LOG_ERR, "Signal to host failed: %s\n", strerror(ret)); + goto close; + } + } + } +close: + close(fcopy_fd); +exit: + return ret; +} -- cgit v1.2.3 From ec314f61e4fc2d3dd6ea78aa18a5ac276eb1a8e3 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Sat, 30 Mar 2024 01:52:02 -0700 Subject: Drivers: hv: Remove fcopy driver As the new fcopy driver using uio is introduced, remove obsolete driver and application. Signed-off-by: Saurabh Sengar Reviewed-by: Long Li Link: https://lore.kernel.org/r/1711788723-8593-7-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Greg Kroah-Hartman --- drivers/hv/Makefile | 2 +- drivers/hv/hv_fcopy.c | 427 --------------------------------------------- drivers/hv/hv_util.c | 12 -- tools/hv/hv_fcopy_daemon.c | 266 ---------------------------- 4 files changed, 1 insertion(+), 706 deletions(-) delete mode 100644 drivers/hv/hv_fcopy.c delete mode 100644 tools/hv/hv_fcopy_daemon.c (limited to 'tools') diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index d76df5c8c2a9..b992c0ed182b 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -10,7 +10,7 @@ hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ channel_mgmt.o ring_buffer.o hv_trace.o hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o -hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o # Code that must be built-in obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c deleted file mode 100644 index 922d83eb7ddf..000000000000 --- a/drivers/hv/hv_fcopy.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * An implementation of file copy service. - * - * Copyright (C) 2014, Microsoft, Inc. - * - * Author : K. Y. Srinivasan - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include - -#include "hyperv_vmbus.h" -#include "hv_utils_transport.h" - -#define WIN8_SRV_MAJOR 1 -#define WIN8_SRV_MINOR 1 -#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) - -#define FCOPY_VER_COUNT 1 -static const int fcopy_versions[] = { - WIN8_SRV_VERSION -}; - -#define FW_VER_COUNT 1 -static const int fw_versions[] = { - UTIL_FW_VERSION -}; - -/* - * Global state maintained for transaction that is being processed. - * For a class of integration services, including the "file copy service", - * the specified protocol is a "request/response" protocol which means that - * there can only be single outstanding transaction from the host at any - * given point in time. We use this to simplify memory management in this - * driver - we cache and process only one message at a time. - * - * While the request/response protocol is guaranteed by the host, we further - * ensure this by serializing packet processing in this driver - we do not - * read additional packets from the VMBUs until the current packet is fully - * handled. - */ - -static struct { - int state; /* hvutil_device_state */ - int recv_len; /* number of bytes received. */ - struct hv_fcopy_hdr *fcopy_msg; /* current message */ - struct vmbus_channel *recv_channel; /* chn we got the request */ - u64 recv_req_id; /* request ID. */ -} fcopy_transaction; - -static void fcopy_respond_to_host(int error); -static void fcopy_send_data(struct work_struct *dummy); -static void fcopy_timeout_func(struct work_struct *dummy); -static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func); -static DECLARE_WORK(fcopy_send_work, fcopy_send_data); -static const char fcopy_devname[] = "vmbus/hv_fcopy"; -static u8 *recv_buffer; -static struct hvutil_transport *hvt; -/* - * This state maintains the version number registered by the daemon. - */ -static int dm_reg_value; - -static void fcopy_poll_wrapper(void *channel) -{ - /* Transaction is finished, reset the state here to avoid races. */ - fcopy_transaction.state = HVUTIL_READY; - tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); -} - -static void fcopy_timeout_func(struct work_struct *dummy) -{ - /* - * If the timer fires, the user-mode component has not responded; - * process the pending transaction. - */ - fcopy_respond_to_host(HV_E_FAIL); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static void fcopy_register_done(void) -{ - pr_debug("FCP: userspace daemon registered\n"); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static int fcopy_handle_handshake(u32 version) -{ - u32 our_ver = FCOPY_CURRENT_VERSION; - - switch (version) { - case FCOPY_VERSION_0: - /* Daemon doesn't expect us to reply */ - dm_reg_value = version; - break; - case FCOPY_VERSION_1: - /* Daemon expects us to reply with our own version */ - if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver), - fcopy_register_done)) - return -EFAULT; - dm_reg_value = version; - break; - default: - /* - * For now we will fail the registration. - * If and when we have multiple versions to - * deal with, we will be backward compatible. - * We will add this code when needed. - */ - return -EINVAL; - } - pr_debug("FCP: userspace daemon ver. %d connected\n", version); - return 0; -} - -static void fcopy_send_data(struct work_struct *dummy) -{ - struct hv_start_fcopy *smsg_out = NULL; - int operation = fcopy_transaction.fcopy_msg->operation; - struct hv_start_fcopy *smsg_in; - void *out_src; - int rc, out_len; - - /* - * The strings sent from the host are encoded in - * utf16; convert it to utf8 strings. - * The host assures us that the utf16 strings will not exceed - * the max lengths specified. We will however, reserve room - * for the string terminating character - in the utf16s_utf8s() - * function we limit the size of the buffer where the converted - * string is placed to W_MAX_PATH -1 to guarantee - * that the strings can be properly terminated! - */ - - switch (operation) { - case START_FILE_COPY: - out_len = sizeof(struct hv_start_fcopy); - smsg_out = kzalloc(sizeof(*smsg_out), GFP_KERNEL); - if (!smsg_out) - return; - - smsg_out->hdr.operation = operation; - smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg; - - utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->file_name, W_MAX_PATH - 1); - - utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->path_name, W_MAX_PATH - 1); - - smsg_out->copy_flags = smsg_in->copy_flags; - smsg_out->file_size = smsg_in->file_size; - out_src = smsg_out; - break; - - case WRITE_TO_FILE: - out_src = fcopy_transaction.fcopy_msg; - out_len = sizeof(struct hv_do_fcopy); - break; - default: - out_src = fcopy_transaction.fcopy_msg; - out_len = fcopy_transaction.recv_len; - break; - } - - fcopy_transaction.state = HVUTIL_USERSPACE_REQ; - rc = hvutil_transport_send(hvt, out_src, out_len, NULL); - if (rc) { - pr_debug("FCP: failed to communicate to the daemon: %d\n", rc); - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_respond_to_host(HV_E_FAIL); - fcopy_transaction.state = HVUTIL_READY; - } - } - kfree(smsg_out); -} - -/* - * Send a response back to the host. - */ - -static void -fcopy_respond_to_host(int error) -{ - struct icmsg_hdr *icmsghdr; - u32 buf_len; - struct vmbus_channel *channel; - u64 req_id; - - /* - * Copy the global state for completing the transaction. Note that - * only one transaction can be active at a time. This is guaranteed - * by the file copy protocol implemented by the host. Furthermore, - * the "transaction active" state we maintain ensures that there can - * only be one active transaction at a time. - */ - - buf_len = fcopy_transaction.recv_len; - channel = fcopy_transaction.recv_channel; - req_id = fcopy_transaction.recv_req_id; - - icmsghdr = (struct icmsg_hdr *) - &recv_buffer[sizeof(struct vmbuspipe_hdr)]; - - if (channel->onchannel_callback == NULL) - /* - * We have raced with util driver being unloaded; - * silently return. - */ - return; - - icmsghdr->status = error; - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, - VM_PKT_DATA_INBAND, 0); -} - -void hv_fcopy_onchannelcallback(void *context) -{ - struct vmbus_channel *channel = context; - u32 recvlen; - u64 requestid; - struct hv_fcopy_hdr *fcopy_msg; - struct icmsg_hdr *icmsghdr; - int fcopy_srv_version; - - if (fcopy_transaction.state > HVUTIL_READY) - return; - - if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) { - pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n"); - return; - } - - if (!recvlen) - return; - - /* Ensure recvlen is big enough to read header data */ - if (recvlen < ICMSG_HDR) { - pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n", - recvlen); - return; - } - - icmsghdr = (struct icmsg_hdr *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr)]; - - if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdr, - recv_buffer, recvlen, - fw_versions, FW_VER_COUNT, - fcopy_versions, FCOPY_VER_COUNT, - NULL, &fcopy_srv_version)) { - - pr_info("FCopy IC version %d.%d\n", - fcopy_srv_version >> 16, - fcopy_srv_version & 0xFFFF); - } - } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) { - /* Ensure recvlen is big enough to contain hv_fcopy_hdr */ - if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) { - pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n", - recvlen); - return; - } - fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR]; - - /* - * Stash away this global state for completing the - * transaction; note transactions are serialized. - */ - - fcopy_transaction.recv_len = recvlen; - fcopy_transaction.recv_req_id = requestid; - fcopy_transaction.fcopy_msg = fcopy_msg; - - if (fcopy_transaction.state < HVUTIL_READY) { - /* Userspace is not registered yet */ - fcopy_respond_to_host(HV_E_FAIL); - return; - } - fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED; - - /* - * Send the information to the user-level daemon. - */ - schedule_work(&fcopy_send_work); - schedule_delayed_work(&fcopy_timeout_work, - HV_UTIL_TIMEOUT * HZ); - return; - } else { - pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n", - icmsghdr->icmsgtype); - return; - } - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, recvlen, requestid, - VM_PKT_DATA_INBAND, 0); -} - -/* Callback when data is received from userspace */ -static int fcopy_on_msg(void *msg, int len) -{ - int *val = (int *)msg; - - if (len != sizeof(int)) - return -EINVAL; - - if (fcopy_transaction.state == HVUTIL_DEVICE_INIT) - return fcopy_handle_handshake(*val); - - if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ) - return -EINVAL; - - /* - * Complete the transaction by forwarding the result - * to the host. But first, cancel the timeout. - */ - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_transaction.state = HVUTIL_USERSPACE_RECV; - fcopy_respond_to_host(*val); - hv_poll_channel(fcopy_transaction.recv_channel, - fcopy_poll_wrapper); - } - - return 0; -} - -static void fcopy_on_reset(void) -{ - /* - * The daemon has exited; reset the state. - */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - if (cancel_delayed_work_sync(&fcopy_timeout_work)) - fcopy_respond_to_host(HV_E_FAIL); -} - -int hv_fcopy_init(struct hv_util_service *srv) -{ - recv_buffer = srv->recv_buffer; - fcopy_transaction.recv_channel = srv->channel; - fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; - - /* - * When this driver loads, the user level daemon that - * processes the host requests may not yet be running. - * Defer processing channel callbacks until the daemon - * has registered. - */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - hvt = hvutil_transport_init(fcopy_devname, 0, 0, - fcopy_on_msg, fcopy_on_reset); - if (!hvt) - return -EFAULT; - - return 0; -} - -static void hv_fcopy_cancel_work(void) -{ - cancel_delayed_work_sync(&fcopy_timeout_work); - cancel_work_sync(&fcopy_send_work); -} - -int hv_fcopy_pre_suspend(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - struct hv_fcopy_hdr *fcopy_msg; - - /* - * Fake a CANCEL_FCOPY message for the user space daemon in case the - * daemon is in the middle of copying some file. It doesn't matter if - * there is already a message pending to be delivered to the user - * space since we force fcopy_transaction.state to be HVUTIL_READY, so - * the user space daemon's write() will fail with EINVAL (see - * fcopy_on_msg()), and the daemon will reset the device by closing - * and re-opening it. - */ - fcopy_msg = kzalloc(sizeof(*fcopy_msg), GFP_KERNEL); - if (!fcopy_msg) - return -ENOMEM; - - tasklet_disable(&channel->callback_event); - - fcopy_msg->operation = CANCEL_FCOPY; - - hv_fcopy_cancel_work(); - - /* We don't care about the return value. */ - hvutil_transport_send(hvt, fcopy_msg, sizeof(*fcopy_msg), NULL); - - kfree(fcopy_msg); - - fcopy_transaction.state = HVUTIL_READY; - - /* tasklet_enable() will be called in hv_fcopy_pre_resume(). */ - return 0; -} - -int hv_fcopy_pre_resume(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - - tasklet_enable(&channel->callback_event); - - return 0; -} - -void hv_fcopy_deinit(void) -{ - fcopy_transaction.state = HVUTIL_DEVICE_DYING; - - hv_fcopy_cancel_work(); - - hvutil_transport_destroy(hvt); -} diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 9c97c4065fe7..c4f525325790 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -154,14 +154,6 @@ static struct hv_util_service util_vss = { .util_deinit = hv_vss_deinit, }; -static struct hv_util_service util_fcopy = { - .util_cb = hv_fcopy_onchannelcallback, - .util_init = hv_fcopy_init, - .util_pre_suspend = hv_fcopy_pre_suspend, - .util_pre_resume = hv_fcopy_pre_resume, - .util_deinit = hv_fcopy_deinit, -}; - static void perform_shutdown(struct work_struct *dummy) { orderly_poweroff(true); @@ -700,10 +692,6 @@ static const struct hv_vmbus_device_id id_table[] = { { HV_VSS_GUID, .driver_data = (unsigned long)&util_vss }, - /* File copy GUID */ - { HV_FCOPY_GUID, - .driver_data = (unsigned long)&util_fcopy - }, { }, }; diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c deleted file mode 100644 index 16d629b22c25..000000000000 --- a/tools/hv/hv_fcopy_daemon.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * An implementation of host to guest copy functionality for Linux. - * - * Copyright (C) 2014, Microsoft, Inc. - * - * Author : K. Y. Srinivasan - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int target_fd; -static char target_fname[PATH_MAX]; -static unsigned long long filesize; - -static int hv_start_fcopy(struct hv_start_fcopy *smsg) -{ - int error = HV_E_FAIL; - char *q, *p; - - filesize = 0; - p = (char *)smsg->path_name; - snprintf(target_fname, sizeof(target_fname), "%s/%s", - (char *)smsg->path_name, (char *)smsg->file_name); - - syslog(LOG_INFO, "Target file name: %s", target_fname); - /* - * Check to see if the path is already in place; if not, - * create if required. - */ - while ((q = strchr(p, '/')) != NULL) { - if (q == p) { - p++; - continue; - } - *q = '\0'; - if (access((char *)smsg->path_name, F_OK)) { - if (smsg->copy_flags & CREATE_PATH) { - if (mkdir((char *)smsg->path_name, 0755)) { - syslog(LOG_ERR, "Failed to create %s", - (char *)smsg->path_name); - goto done; - } - } else { - syslog(LOG_ERR, "Invalid path: %s", - (char *)smsg->path_name); - goto done; - } - } - p = q + 1; - *q = '/'; - } - - if (!access(target_fname, F_OK)) { - syslog(LOG_INFO, "File: %s exists", target_fname); - if (!(smsg->copy_flags & OVER_WRITE)) { - error = HV_ERROR_ALREADY_EXISTS; - goto done; - } - } - - target_fd = open(target_fname, - O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC, 0744); - if (target_fd == -1) { - syslog(LOG_INFO, "Open Failed: %s", strerror(errno)); - goto done; - } - - error = 0; -done: - if (error) - target_fname[0] = '\0'; - return error; -} - -static int hv_copy_data(struct hv_do_fcopy *cpmsg) -{ - ssize_t bytes_written; - int ret = 0; - - bytes_written = pwrite(target_fd, cpmsg->data, cpmsg->size, - cpmsg->offset); - - filesize += cpmsg->size; - if (bytes_written != cpmsg->size) { - switch (errno) { - case ENOSPC: - ret = HV_ERROR_DISK_FULL; - break; - default: - ret = HV_E_FAIL; - break; - } - syslog(LOG_ERR, "pwrite failed to write %llu bytes: %ld (%s)", - filesize, (long)bytes_written, strerror(errno)); - } - - return ret; -} - -/* - * Reset target_fname to "" in the two below functions for hibernation: if - * the fcopy operation is aborted by hibernation, the daemon should remove the - * partially-copied file; to achieve this, the hv_utils driver always fakes a - * CANCEL_FCOPY message upon suspend, and later when the VM resumes back, - * the daemon calls hv_copy_cancel() to remove the file; if a file is copied - * successfully before suspend, hv_copy_finished() must reset target_fname to - * avoid that the file can be incorrectly removed upon resume, since the faked - * CANCEL_FCOPY message is spurious in this case. - */ -static int hv_copy_finished(void) -{ - close(target_fd); - target_fname[0] = '\0'; - return 0; -} -static int hv_copy_cancel(void) -{ - close(target_fd); - if (strlen(target_fname) > 0) { - unlink(target_fname); - target_fname[0] = '\0'; - } - return 0; - -} - -void print_usage(char *argv[]) -{ - fprintf(stderr, "Usage: %s [options]\n" - "Options are:\n" - " -n, --no-daemon stay in foreground, don't daemonize\n" - " -h, --help print this help\n", argv[0]); -} - -int main(int argc, char *argv[]) -{ - int fcopy_fd = -1; - int error; - int daemonize = 1, long_index = 0, opt; - int version = FCOPY_CURRENT_VERSION; - union { - struct hv_fcopy_hdr hdr; - struct hv_start_fcopy start; - struct hv_do_fcopy copy; - __u32 kernel_modver; - } buffer = { }; - int in_handshake; - - static struct option long_options[] = { - {"help", no_argument, 0, 'h' }, - {"no-daemon", no_argument, 0, 'n' }, - {0, 0, 0, 0 } - }; - - while ((opt = getopt_long(argc, argv, "hn", long_options, - &long_index)) != -1) { - switch (opt) { - case 'n': - daemonize = 0; - break; - case 'h': - default: - print_usage(argv); - exit(EXIT_FAILURE); - } - } - - if (daemonize && daemon(1, 0)) { - syslog(LOG_ERR, "daemon() failed; error: %s", strerror(errno)); - exit(EXIT_FAILURE); - } - - openlog("HV_FCOPY", 0, LOG_USER); - syslog(LOG_INFO, "starting; pid is:%d", getpid()); - -reopen_fcopy_fd: - if (fcopy_fd != -1) - close(fcopy_fd); - /* Remove any possible partially-copied file on error */ - hv_copy_cancel(); - in_handshake = 1; - fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR); - - if (fcopy_fd < 0) { - syslog(LOG_ERR, "open /dev/vmbus/hv_fcopy failed; error: %d %s", - errno, strerror(errno)); - exit(EXIT_FAILURE); - } - - /* - * Register with the kernel. - */ - if ((write(fcopy_fd, &version, sizeof(int))) != sizeof(int)) { - syslog(LOG_ERR, "Registration failed: %s", strerror(errno)); - exit(EXIT_FAILURE); - } - - while (1) { - /* - * In this loop we process fcopy messages after the - * handshake is complete. - */ - ssize_t len; - - len = pread(fcopy_fd, &buffer, sizeof(buffer), 0); - if (len < 0) { - syslog(LOG_ERR, "pread failed: %s", strerror(errno)); - goto reopen_fcopy_fd; - } - - if (in_handshake) { - if (len != sizeof(buffer.kernel_modver)) { - syslog(LOG_ERR, "invalid version negotiation"); - exit(EXIT_FAILURE); - } - in_handshake = 0; - syslog(LOG_INFO, "kernel module version: %u", - buffer.kernel_modver); - continue; - } - - switch (buffer.hdr.operation) { - case START_FILE_COPY: - error = hv_start_fcopy(&buffer.start); - break; - case WRITE_TO_FILE: - error = hv_copy_data(&buffer.copy); - break; - case COMPLETE_FCOPY: - error = hv_copy_finished(); - break; - case CANCEL_FCOPY: - error = hv_copy_cancel(); - break; - - default: - error = HV_E_FAIL; - syslog(LOG_ERR, "Unknown operation: %d", - buffer.hdr.operation); - - } - - /* - * pwrite() may return an error due to the faked CANCEL_FCOPY - * message upon hibernation. Ignore the error by resetting the - * dev file, i.e. closing and re-opening it. - */ - if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) { - syslog(LOG_ERR, "pwrite failed: %s", strerror(errno)); - goto reopen_fcopy_fd; - } - } -} -- cgit v1.2.3 From 22724c89892f5d08a350941734125194b8f97c21 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 10 Apr 2024 11:48:25 +0200 Subject: selftests: mptcp: test last time mptcp_info This patch adds a new helper chk_msk_info() to show the counters in mptcp_info of the given info, and check that the timestamps move forward. Use it to show newly added last_data_sent, last_data_recv and last_ack_recv in mptcp_info in chk_last_time_info(). Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240410-upstream-net-next-20240405-mptcp-last-time-info-v2-2-f95bd6b33e51@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/diag.sh | 53 +++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index bc97ab33a00e..776d43a6922d 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -200,6 +200,58 @@ chk_msk_cestab() "${expected}" "${msg}" "" } +msk_info_get_value() +{ + local port="${1}" + local info="${2}" + + ss -N "${ns}" -inHM dport "${port}" | \ + mptcp_lib_get_info_value "${info}" "${info}" +} + +chk_msk_info() +{ + local port="${1}" + local info="${2}" + local cnt="${3}" + local msg="....chk ${info}" + local delta_ms=250 # half what we waited before, just to be sure + local now + + now=$(msk_info_get_value "${port}" "${info}") + + mptcp_lib_print_title "${msg}" + if { [ -z "${cnt}" ] || [ -z "${now}" ]; } && + ! mptcp_lib_expect_all_features; then + mptcp_lib_pr_skip "Feature probably not supported" + mptcp_lib_result_skip "${msg}" + elif [ "$((cnt + delta_ms))" -lt "${now}" ]; then + mptcp_lib_pr_ok + mptcp_lib_result_pass "${msg}" + else + mptcp_lib_pr_fail "value of ${info} changed by $((now - cnt))ms," \ + "expected at least ${delta_ms}ms" + mptcp_lib_result_fail "${msg}" + ret=${KSFT_FAIL} + fi +} + +chk_last_time_info() +{ + local port="${1}" + local data_sent data_recv ack_recv + + data_sent=$(msk_info_get_value "${port}" "last_data_sent") + data_recv=$(msk_info_get_value "${port}" "last_data_recv") + ack_recv=$(msk_info_get_value "${port}" "last_ack_recv") + + sleep 0.5 # wait to check after if the timestamps difference + + chk_msk_info "${port}" "last_data_sent" "${data_sent}" + chk_msk_info "${port}" "last_data_recv" "${data_recv}" + chk_msk_info "${port}" "last_ack_recv" "${ack_recv}" +} + wait_connected() { local listener_ns="${1}" @@ -233,6 +285,7 @@ echo "b" | \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " +chk_last_time_info 10000 chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" chk_msk_inuse 2 -- cgit v1.2.3 From dfc083a181bac7d36992d21274e6f5820d5518ef Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Apr 2024 08:13:24 -0400 Subject: selftests: kvm: add tests for KVM_SEV_INIT2 Signed-off-by: Paolo Bonzini Message-ID: <20240404121327.3107131-15-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 1 + .../testing/selftests/kvm/include/kvm_util_base.h | 6 +- .../testing/selftests/kvm/set_memory_region_test.c | 8 +- .../testing/selftests/kvm/x86_64/sev_init2_tests.c | 152 +++++++++++++++++++++ 4 files changed, 159 insertions(+), 8 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/sev_init2_tests.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 741c7dc16afc..871e2de3eb05 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -120,6 +120,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test TEST_GEN_PROGS_x86_64 += x86_64/amx_test diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 3e0db283a46a..7c06ceb36643 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -890,17 +890,15 @@ static inline struct kvm_vm *vm_create_barebones(void) return ____vm_create(VM_SHAPE_DEFAULT); } -#ifdef __x86_64__ -static inline struct kvm_vm *vm_create_barebones_protected_vm(void) +static inline struct kvm_vm *vm_create_barebones_type(unsigned long type) { const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, - .type = KVM_X86_SW_PROTECTED_VM, + .type = type, }; return ____vm_create(shape); } -#endif static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) { diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 06b43ed23580..904d58793fc6 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -339,7 +339,7 @@ static void test_invalid_memory_region_flags(void) #ifdef __x86_64__ if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); else #endif vm = vm_create_barebones(); @@ -462,7 +462,7 @@ static void test_add_private_memory_region(void) pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); @@ -471,7 +471,7 @@ static void test_add_private_memory_region(void) test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail"); close(memfd); - vm2 = vm_create_barebones_protected_vm(); + vm2 = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0); test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail"); @@ -499,7 +499,7 @@ static void test_add_overlapping_private_memory_regions(void) pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n"); - vm = vm_create_barebones_protected_vm(); + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c new file mode 100644 index 000000000000..7a4a61be119b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "kselftest.h" + +#define SVM_SEV_FEAT_DEBUG_SWAP 32u + +/* + * Some features may have hidden dependencies, or may only work + * for certain VM types. Err on the side of safety and don't + * expect that all supported features can be passed one by one + * to KVM_SEV_INIT2. + * + * (Well, right now there's only one...) + */ +#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP + +int kvm_fd; +u64 supported_vmsa_features; +bool have_sev_es; + +static int __sev_ioctl(int vm_fd, int cmd_id, void *data) +{ + struct kvm_sev_cmd cmd = { + .id = cmd_id, + .data = (uint64_t)data, + .sev_fd = open_sev_dev_path_or_exit(), + }; + int ret; + + ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); + TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS, + "%d failed: fw error: %d\n", + cmd_id, cmd.error); + + return ret; +} + +static void test_init2(unsigned long vm_type, struct kvm_sev_init *init) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == 0, + "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d", + ret, errno); + kvm_vm_free(vm); +} + +static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "KVM_SEV_INIT2 should fail, %s.", + msg); + kvm_vm_free(vm); +} + +void test_vm_types(void) +{ + test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){}); + + /* + * TODO: check that unsupported types cannot be created. Probably + * a separate selftest. + */ + if (have_sev_es) + test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); + + test_init2_invalid(0, &(struct kvm_sev_init){}, + "VM type is KVM_X86_DEFAULT_VM"); + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){}, + "VM type is KVM_X86_SW_PROTECTED_VM"); +} + +void test_flags(uint32_t vm_type) +{ + int i; + + for (i = 0; i < 32; i++) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .flags = BIT(i) }, + "invalid flag"); +} + +void test_features(uint32_t vm_type, uint64_t supported_features) +{ + int i; + + for (i = 0; i < 64; i++) { + if (!(supported_features & (1u << i))) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, + "unknown feature"); + else if (KNOWN_FEATURES & (1u << i)) + test_init2(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); + } +} + +int main(int argc, char *argv[]) +{ + int kvm_fd = open_kvm_dev_path_or_exit(); + bool have_sev; + + TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES) == 0); + kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES, + &supported_vmsa_features); + + have_sev = kvm_cpu_has(X86_FEATURE_SEV); + TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)), + "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), + "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + + test_vm_types(); + + test_flags(KVM_X86_SEV_VM); + if (have_sev_es) + test_flags(KVM_X86_SEV_ES_VM); + + test_features(KVM_X86_SEV_VM, 0); + if (have_sev_es) + test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); + + return 0; +} -- cgit v1.2.3 From d18c8648166e34d93b9d4a48e975bb24514d4a16 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Apr 2024 08:13:25 -0400 Subject: selftests: kvm: switch to using KVM_X86_*_VM This removes the concept of "subtypes", instead letting the tests use proper VM types that were recently added. While the sev_init_vm() and sev_es_init_vm() are still able to operate with the legacy KVM_SEV_INIT and KVM_SEV_ES_INIT ioctls, this is limited to VMs that are created manually with vm_create_barebones(). Signed-off-by: Paolo Bonzini Message-ID: <20240404121327.3107131-16-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 5 ++-- .../selftests/kvm/include/x86_64/processor.h | 6 ----- tools/testing/selftests/kvm/include/x86_64/sev.h | 16 ++---------- tools/testing/selftests/kvm/lib/kvm_util.c | 1 - tools/testing/selftests/kvm/lib/x86_64/processor.c | 14 ++++++---- tools/testing/selftests/kvm/lib/x86_64/sev.c | 30 +++++++++++++++++++--- 6 files changed, 40 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 7c06ceb36643..8acca8237687 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -93,7 +93,6 @@ enum kvm_mem_region_type { struct kvm_vm { int mode; unsigned long type; - uint8_t subtype; int kvm_fd; int fd; unsigned int pgtable_levels; @@ -200,8 +199,8 @@ enum vm_guest_mode { struct vm_shape { uint32_t type; uint8_t mode; - uint8_t subtype; - uint16_t padding; + uint8_t pad0; + uint16_t pad1; }; kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 81ce37ec407d..74a59c7ce7ed 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -23,12 +23,6 @@ extern bool host_cpu_is_intel; extern bool host_cpu_is_amd; -enum vm_guest_x86_subtype { - VM_SUBTYPE_NONE = 0, - VM_SUBTYPE_SEV, - VM_SUBTYPE_SEV_ES, -}; - /* Forced emulation prefix, used to invoke the emulator unconditionally. */ #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h index 8a1bf88474c9..0719f083351a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/sev.h +++ b/tools/testing/selftests/kvm/include/x86_64/sev.h @@ -67,20 +67,8 @@ kvm_static_assert(SEV_RET_SUCCESS == 0); __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ }) -static inline void sev_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); -} - - -static inline void sev_es_vm_init(struct kvm_vm *vm) -{ - vm->arch.sev_fd = open_sev_dev_path_or_exit(); - - vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); -} +void sev_vm_init(struct kvm_vm *vm); +void sev_es_vm_init(struct kvm_vm *vm); static inline void sev_register_encrypted_memory(struct kvm_vm *vm, struct userspace_mem_region *region) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b2262b5fad9e..9da388100f3a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -276,7 +276,6 @@ struct kvm_vm *____vm_create(struct vm_shape shape) vm->mode = shape.mode; vm->type = shape.type; - vm->subtype = shape.subtype; vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 74a4c736c9ae..9f87ca8b7ab6 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -578,10 +578,11 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm) sync_global_to_guest(vm, host_cpu_is_intel); sync_global_to_guest(vm, host_cpu_is_amd); - if (vm->subtype == VM_SUBTYPE_SEV) - sev_vm_init(vm); - else if (vm->subtype == VM_SUBTYPE_SEV_ES) - sev_es_vm_init(vm); + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + struct kvm_sev_init init = { 0 }; + + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } } void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) @@ -1081,9 +1082,12 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) void kvm_init_vm_address_properties(struct kvm_vm *vm) { - if (vm->subtype == VM_SUBTYPE_SEV || vm->subtype == VM_SUBTYPE_SEV_ES) { + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + vm->arch.sev_fd = open_sev_dev_path_or_exit(); vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); vm->gpa_tag_mask = vm->arch.c_bit; + } else { + vm->arch.sev_fd = -1; } } diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c index e248d3364b9c..597994fa4f41 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/sev.c +++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c @@ -35,6 +35,32 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio } } +void sev_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_es_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_ES_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) { struct kvm_sev_launch_start launch_start = { @@ -91,10 +117,8 @@ struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, struct kvm_vcpu **cpu) { struct vm_shape shape = { - .type = VM_TYPE_DEFAULT, .mode = VM_MODE_DEFAULT, - .subtype = policy & SEV_POLICY_ES ? VM_SUBTYPE_SEV_ES : - VM_SUBTYPE_SEV, + .type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM, }; struct kvm_vm *vm; struct kvm_vcpu *cpus[1]; -- cgit v1.2.3 From 4c180a57b03a3fec46a66c608ff525bf26b3a3be Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Apr 2024 08:13:26 -0400 Subject: selftests: kvm: split "launch" phase of SEV VM creation Allow the caller to set the initial state of the VM. Doing this before sev_vm_launch() matters for SEV-ES, since that is the place where the VMSA is updated and after which the guest state becomes sealed. Signed-off-by: Paolo Bonzini Message-ID: <20240404121327.3107131-17-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/sev.h | 3 ++- tools/testing/selftests/kvm/lib/x86_64/sev.c | 16 ++++++++++------ tools/testing/selftests/kvm/x86_64/sev_smoke_test.c | 7 ++++++- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h index 0719f083351a..82c11c81a956 100644 --- a/tools/testing/selftests/kvm/include/x86_64/sev.h +++ b/tools/testing/selftests/kvm/include/x86_64/sev.h @@ -31,8 +31,9 @@ void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); void sev_vm_launch_finish(struct kvm_vm *vm); -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu); +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); kvm_static_assert(SEV_RET_SUCCESS == 0); diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c index 597994fa4f41..d482029b6004 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/sev.c +++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c @@ -113,26 +113,30 @@ void sev_vm_launch_finish(struct kvm_vm *vm) TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); } -struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu) { struct vm_shape shape = { .mode = VM_MODE_DEFAULT, - .type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM, + .type = type, }; struct kvm_vm *vm; struct kvm_vcpu *cpus[1]; - uint8_t measurement[512]; vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); *cpu = cpus[0]; + return vm; +} + +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) +{ sev_vm_launch(vm, policy); - /* TODO: Validate the measurement is as expected. */ + if (!measurement) + measurement = alloca(256); + sev_vm_launch_measure(vm, measurement); sev_vm_launch_finish(vm); - - return vm; } diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c index 026779f3ed06..234c80dd344d 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -41,7 +41,12 @@ static void test_sev(void *guest_code, uint64_t policy) struct kvm_vm *vm; struct ucall uc; - vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu); + uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); + + /* TODO: Validate the measurement is as expected. */ + vm_sev_launch(vm, policy, NULL); for (;;) { vcpu_run(vcpu); -- cgit v1.2.3 From 8c53183dbaa23db7a650197b92566772b38c1e12 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Apr 2024 08:13:27 -0400 Subject: selftests: kvm: add test for transferring FPU state into VMSA Signed-off-by: Paolo Bonzini Message-ID: <20240404121327.3107131-18-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/sev_smoke_test.c | 89 ++++++++++++++++++++++ 1 file changed, 89 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c index 234c80dd344d..7c70c0da4fb7 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "test_util.h" #include "kvm_util.h" @@ -13,6 +14,8 @@ #include "sev.h" +#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) + static void guest_sev_es_code(void) { /* TODO: Check CPUID after GHCB-based hypercall support is added. */ @@ -35,6 +38,86 @@ static void guest_sev_code(void) GUEST_DONE(); } +/* Stash state passed via VMSA before any compiled code runs. */ +extern void guest_code_xsave(void); +asm("guest_code_xsave:\n" + "mov $-1, %eax\n" + "mov $-1, %edx\n" + "xsave (%rdi)\n" + "jmp guest_sev_es_code"); + +static void compare_xsave(u8 *from_host, u8 *from_guest) +{ + int i; + bool bad = false; + for (i = 0; i < 4095; i++) { + if (from_host[i] != from_guest[i]) { + printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + bad = true; + } + } + + if (bad) + abort(); +} + +static void test_sync_vmsa(uint32_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t gva; + void *hva; + + double x87val = M_PI; + struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; + struct kvm_sregs sregs; + struct kvm_xcrs xcrs = { + .nr_xcrs = 1, + .xcrs[0].xcr = 0, + .xcrs[0].value = XFEATURE_MASK_X87_AVX, + }; + + vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); + gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, + MEM_REGION_TEST_DATA); + hva = addr_gva2hva(vm, gva); + + vcpu_args_set(vcpu, 1, gva); + + vcpu_sregs_get(vcpu, &sregs); + sregs.cr4 |= X86_CR4_OSFXSR | X86_CR4_OSXSAVE; + vcpu_sregs_set(vcpu, &sregs); + + vcpu_xcrs_set(vcpu, &xcrs); + asm("fninit\n" + "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n" + "fldl %3\n" + "xsave (%2)\n" + "fstp %%st\n" + : "=m"(xsave) + : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val) + : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); + vcpu_xsave_set(vcpu, &xsave); + + vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); + + /* This page is shared, so make it decrypted. */ + memset(hva, 0, 4096); + + vcpu_run(vcpu); + + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + + compare_xsave((u8 *)&xsave, (u8 *)hva); + + kvm_vm_free(vm); +} + static void test_sev(void *guest_code, uint64_t policy) { struct kvm_vcpu *vcpu; @@ -87,6 +170,12 @@ int main(int argc, char *argv[]) if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); test_sev(guest_sev_es_code, SEV_POLICY_ES); + + if (kvm_has_cap(KVM_CAP_XCRS) && + (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) { + test_sync_vmsa(0); + test_sync_vmsa(SEV_POLICY_NO_DBG); + } } return 0; -- cgit v1.2.3 From d75142dbeb2bd1587b9cc19f841578f541275a64 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 9 Apr 2024 13:18:40 +0800 Subject: selftests/bpf: Fix umount cgroup2 error in test_sockmap This patch fixes the following "umount cgroup2" error in test_sockmap.c: (cgroup_helpers.c:353: errno: Device or resource busy) umount cgroup2 Cgroup fd cg_fd should be closed before cleanup_cgroup_environment(). Fixes: 13a5f3ffd202 ("bpf: Selftests, sockmap test prog run without setting cgroup") Signed-off-by: Geliang Tang Acked-by: Yonghong Song Link: https://lore.kernel.org/r/0399983bde729708773416b8488bac2cd5e022b8.1712639568.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 024a0faafb3b..43612de44fbf 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -2104,9 +2104,9 @@ out: free(options.whitelist); if (options.blacklist) free(options.blacklist); + close(cg_fd); if (cg_created) cleanup_cgroup_environment(); - close(cg_fd); return err; } -- cgit v1.2.3 From 68acca6e6f99b1f928a2c05b92bb1c272edb8ae7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 11 Apr 2024 13:43:11 +0800 Subject: selftests/bpf: Add struct send_recv_arg Avoid setting total_bytes and stop as global variables, this patch adds a new struct named send_recv_arg to pass arguments between threads. Put these two variables together with fd into this struct and pass it to server thread, so that server thread can access these two variables without setting them as global ones. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/ca1dd703b796f6810985418373e750f7068b4186.1712813933.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 34 ++++++++++++++-------- 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 077b107130f6..64f172f02a9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -21,7 +21,6 @@ static const unsigned int total_bytes = 10 * 1024 * 1024; static int expected_stg = 0xeB9F; -static int stop; static int settcpca(int fd, const char *tcp_ca) { @@ -34,13 +33,20 @@ static int settcpca(int fd, const char *tcp_ca) return 0; } +struct send_recv_arg { + int fd; + uint32_t bytes; + int stop; +}; + static void *server(void *arg) { - int lfd = (int)(long)arg, err = 0, fd; + struct send_recv_arg *a = (struct send_recv_arg *)arg; ssize_t nr_sent = 0, bytes = 0; char batch[1500]; + int err = 0, fd; - fd = accept(lfd, NULL, NULL); + fd = accept(a->fd, NULL, NULL); while (fd == -1) { if (errno == EINTR) continue; @@ -53,9 +59,9 @@ static void *server(void *arg) goto done; } - while (bytes < total_bytes && !READ_ONCE(stop)) { + while (bytes < a->bytes && !READ_ONCE(a->stop)) { nr_sent = send(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); + MIN(a->bytes - bytes, sizeof(batch)), 0); if (nr_sent == -1 && errno == EINTR) continue; if (nr_sent == -1) { @@ -65,13 +71,13 @@ static void *server(void *arg) bytes += nr_sent; } - ASSERT_EQ(bytes, total_bytes, "send"); + ASSERT_EQ(bytes, a->bytes, "send"); done: if (fd >= 0) close(fd); if (err) { - WRITE_ONCE(stop, 1); + WRITE_ONCE(a->stop, 1); return ERR_PTR(err); } return NULL; @@ -80,18 +86,22 @@ done: static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { ssize_t nr_recv = 0, bytes = 0; + struct send_recv_arg arg = { + .bytes = total_bytes, + .stop = 0, + }; int lfd = -1, fd = -1; pthread_t srv_thread; void *thread_ret; char batch[1500]; int err; - WRITE_ONCE(stop, 0); - lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; + arg.fd = lfd; + fd = socket(AF_INET6, SOCK_STREAM, 0); if (!ASSERT_NEQ(fd, -1, "socket")) { close(lfd); @@ -123,12 +133,12 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) goto done; } - err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); + err = pthread_create(&srv_thread, NULL, server, (void *)&arg); if (!ASSERT_OK(err, "pthread_create")) goto done; /* recv total_bytes */ - while (bytes < total_bytes && !READ_ONCE(stop)) { + while (bytes < total_bytes && !READ_ONCE(arg.stop)) { nr_recv = recv(fd, &batch, MIN(total_bytes - bytes, sizeof(batch)), 0); if (nr_recv == -1 && errno == EINTR) @@ -140,7 +150,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) ASSERT_EQ(bytes, total_bytes, "recv"); - WRITE_ONCE(stop, 1); + WRITE_ONCE(arg.stop, 1); pthread_join(srv_thread, &thread_ret); ASSERT_OK(IS_ERR(thread_ret), "thread_ret"); -- cgit v1.2.3 From dc34e44ea6a1c11cc517adc6df527b457acb9eaf Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 11 Apr 2024 13:43:12 +0800 Subject: selftests/bpf: Export send_recv_data helper This patch extracts the code to send and receive data into a new helper named send_recv_data() in network_helpers.c and export it in network_helpers.h. This helper will be used for MPTCP BPF selftests. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/5231103be91fadcce3674a589542c63b6a5eedd4.1712813933.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 102 +++++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 1 + .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 81 +--------------- 3 files changed, 104 insertions(+), 80 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 04175e16195a..dc1fd7af9c7a 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -545,3 +545,105 @@ int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) close(sockfd); return 0; } + +struct send_recv_arg { + int fd; + uint32_t bytes; + int stop; +}; + +static void *send_recv_server(void *arg) +{ + struct send_recv_arg *a = (struct send_recv_arg *)arg; + ssize_t nr_sent = 0, bytes = 0; + char batch[1500]; + int err = 0, fd; + + fd = accept(a->fd, NULL, NULL); + while (fd == -1) { + if (errno == EINTR) + continue; + err = -errno; + goto done; + } + + if (settimeo(fd, 0)) { + err = -errno; + goto done; + } + + while (bytes < a->bytes && !READ_ONCE(a->stop)) { + nr_sent = send(fd, &batch, + MIN(a->bytes - bytes, sizeof(batch)), 0); + if (nr_sent == -1 && errno == EINTR) + continue; + if (nr_sent == -1) { + err = -errno; + break; + } + bytes += nr_sent; + } + + if (bytes != a->bytes) { + log_err("send %zd expected %u", bytes, a->bytes); + if (!err) + err = bytes > a->bytes ? -E2BIG : -EINTR; + } + +done: + if (fd >= 0) + close(fd); + if (err) { + WRITE_ONCE(a->stop, 1); + return ERR_PTR(err); + } + return NULL; +} + +int send_recv_data(int lfd, int fd, uint32_t total_bytes) +{ + ssize_t nr_recv = 0, bytes = 0; + struct send_recv_arg arg = { + .fd = lfd, + .bytes = total_bytes, + .stop = 0, + }; + pthread_t srv_thread; + void *thread_ret; + char batch[1500]; + int err = 0; + + err = pthread_create(&srv_thread, NULL, send_recv_server, (void *)&arg); + if (err) { + log_err("Failed to pthread_create"); + return err; + } + + /* recv total_bytes */ + while (bytes < total_bytes && !READ_ONCE(arg.stop)) { + nr_recv = recv(fd, &batch, + MIN(total_bytes - bytes, sizeof(batch)), 0); + if (nr_recv == -1 && errno == EINTR) + continue; + if (nr_recv == -1) { + err = -errno; + break; + } + bytes += nr_recv; + } + + if (bytes != total_bytes) { + log_err("recv %zd expected %u", bytes, total_bytes); + if (!err) + err = bytes > total_bytes ? -E2BIG : -EINTR; + } + + WRITE_ONCE(arg.stop, 1); + pthread_join(srv_thread, &thread_ret); + if (IS_ERR(thread_ret)) { + log_err("Failed in thread_ret %ld", PTR_ERR(thread_ret)); + err = err ? : PTR_ERR(thread_ret); + } + + return err; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 6457445cc6e2..70f4e4c92733 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -76,6 +76,7 @@ struct nstoken; */ struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); +int send_recv_data(int lfd, int fd, uint32_t total_bytes); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 64f172f02a9a..907bac46c774 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -33,75 +33,15 @@ static int settcpca(int fd, const char *tcp_ca) return 0; } -struct send_recv_arg { - int fd; - uint32_t bytes; - int stop; -}; - -static void *server(void *arg) -{ - struct send_recv_arg *a = (struct send_recv_arg *)arg; - ssize_t nr_sent = 0, bytes = 0; - char batch[1500]; - int err = 0, fd; - - fd = accept(a->fd, NULL, NULL); - while (fd == -1) { - if (errno == EINTR) - continue; - err = -errno; - goto done; - } - - if (settimeo(fd, 0)) { - err = -errno; - goto done; - } - - while (bytes < a->bytes && !READ_ONCE(a->stop)) { - nr_sent = send(fd, &batch, - MIN(a->bytes - bytes, sizeof(batch)), 0); - if (nr_sent == -1 && errno == EINTR) - continue; - if (nr_sent == -1) { - err = -errno; - break; - } - bytes += nr_sent; - } - - ASSERT_EQ(bytes, a->bytes, "send"); - -done: - if (fd >= 0) - close(fd); - if (err) { - WRITE_ONCE(a->stop, 1); - return ERR_PTR(err); - } - return NULL; -} - static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { - ssize_t nr_recv = 0, bytes = 0; - struct send_recv_arg arg = { - .bytes = total_bytes, - .stop = 0, - }; int lfd = -1, fd = -1; - pthread_t srv_thread; - void *thread_ret; - char batch[1500]; int err; lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; - arg.fd = lfd; - fd = socket(AF_INET6, SOCK_STREAM, 0); if (!ASSERT_NEQ(fd, -1, "socket")) { close(lfd); @@ -133,26 +73,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) goto done; } - err = pthread_create(&srv_thread, NULL, server, (void *)&arg); - if (!ASSERT_OK(err, "pthread_create")) - goto done; - - /* recv total_bytes */ - while (bytes < total_bytes && !READ_ONCE(arg.stop)) { - nr_recv = recv(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_recv == -1 && errno == EINTR) - continue; - if (nr_recv == -1) - break; - bytes += nr_recv; - } - - ASSERT_EQ(bytes, total_bytes, "recv"); - - WRITE_ONCE(arg.stop, 1); - pthread_join(srv_thread, &thread_ret); - ASSERT_OK(IS_ERR(thread_ret), "thread_ret"); + ASSERT_OK(send_recv_data(lfd, fd, total_bytes), "send_recv_data"); done: close(lfd); -- cgit v1.2.3 From 657852135d39b75b1b5139839b7388c1d47f3ecc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Apr 2024 14:17:58 -0700 Subject: perf annotate-data: Fix global variable lookup The recent change in the global variable handling added a bug to miss setting the return value even if it found a data type. Also add the type name in the debug message. Fixes: 1ebb5e17ef21b492 ("perf annotate-data: Add get_global_var_type()") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240405211800.1412920-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 043d80791bd0..1047ea9d578c 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1468,8 +1468,10 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) &offset, type_die)) { dloc->type_offset = offset; - pr_debug_dtp("found PC-rel by addr=%#"PRIx64" offset=%#x\n", + pr_debug_dtp("found PC-rel by addr=%#"PRIx64" offset=%#x", dloc->var_addr, offset); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; goto out; } } -- cgit v1.2.3 From 879ebf3c830dba437781d034c536af53b0bff0c0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Apr 2024 14:17:59 -0700 Subject: perf annotate-data: Do not delete non-asm lines For data type profiling, it removed non-instruction lines from the list of annotation lines. It was to simplify the implementation dealing with instructions like to calculate the PC-relative address and to search the shortest path to the target instruction or basic block. But it means that it removes all the comments and debug information in the annotate output like source file name and line numbers. To support both code annotation and data type annotation, it'd be better to keep the non-instruction lines as well. So this change is to skip those lines during the data type profiling and to display them in the normal perf annotate output. No function changes intended (other than having more lines). Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240405211800.1412920-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 6 +++ tools/perf/util/annotate.c | 93 ++++++++++++++++++++++++++++++----------- 2 files changed, 74 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 1047ea9d578c..b69a1cd1577a 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1314,6 +1314,8 @@ static int find_data_type_insn(struct data_loc_info *dloc, int reg, list_for_each_entry(bb, basic_blocks, list) { struct disasm_line *dl = bb->begin; + BUG_ON(bb->begin->al.offset == -1 || bb->end->al.offset == -1); + pr_debug_dtp("bb: [%"PRIx64" - %"PRIx64"]\n", bb->begin->al.offset, bb->end->al.offset); @@ -1321,6 +1323,10 @@ static int find_data_type_insn(struct data_loc_info *dloc, int reg, u64 this_ip = sym->start + dl->al.offset; u64 addr = map__rip_2objdump(dloc->ms->map, this_ip); + /* Skip comment or debug info lines */ + if (dl->al.offset == -1) + continue; + /* Update variable type at this address */ update_var_state(&state, dloc, addr, dl->al.offset, var_types); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 4db49611c386..5d9b79559c73 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2159,23 +2159,10 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, static void symbol__ensure_annotate(struct map_symbol *ms, struct evsel *evsel) { - struct disasm_line *dl, *tmp_dl; - struct annotation *notes; - - notes = symbol__annotation(ms->sym); - if (!list_empty(¬es->src->source)) - return; - - if (symbol__annotate(ms, evsel, NULL) < 0) - return; + struct annotation *notes = symbol__annotation(ms->sym); - /* remove non-insn disasm lines for simplicity */ - list_for_each_entry_safe(dl, tmp_dl, ¬es->src->source, al.node) { - if (dl->al.offset == -1) { - list_del(&dl->al.node); - free(dl); - } - } + if (list_empty(¬es->src->source)) + symbol__annotate(ms, evsel, NULL); } static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip, @@ -2187,6 +2174,9 @@ static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip, notes = symbol__annotation(sym); list_for_each_entry(dl, ¬es->src->source, al.node) { + if (dl->al.offset == -1) + continue; + if (sym->start + dl->al.offset == ip) { /* * llvm-objdump places "lock" in a separate line and @@ -2251,6 +2241,46 @@ static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) return false; } +static struct disasm_line * +annotation__prev_asm_line(struct annotation *notes, struct disasm_line *curr) +{ + struct list_head *sources = ¬es->src->source; + struct disasm_line *prev; + + if (curr == list_first_entry(sources, struct disasm_line, al.node)) + return NULL; + + prev = list_prev_entry(curr, al.node); + while (prev->al.offset == -1 && + prev != list_first_entry(sources, struct disasm_line, al.node)) + prev = list_prev_entry(prev, al.node); + + if (prev->al.offset == -1) + return NULL; + + return prev; +} + +static struct disasm_line * +annotation__next_asm_line(struct annotation *notes, struct disasm_line *curr) +{ + struct list_head *sources = ¬es->src->source; + struct disasm_line *next; + + if (curr == list_last_entry(sources, struct disasm_line, al.node)) + return NULL; + + next = list_next_entry(curr, al.node); + while (next->al.offset == -1 && + next != list_last_entry(sources, struct disasm_line, al.node)) + next = list_next_entry(next, al.node); + + if (next->al.offset == -1) + return NULL; + + return next; +} + u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, struct disasm_line *dl) { @@ -2266,12 +2296,12 @@ u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, * disasm_line. If it's the last one, we can use symbol's end * address directly. */ - if (&dl->al.node == notes->src->source.prev) + next = annotation__next_asm_line(notes, dl); + if (next == NULL) addr = ms->sym->end + offset; - else { - next = list_next_entry(dl, al.node); + else addr = ip + (next->al.offset - dl->al.offset) + offset; - } + return map__rip_2objdump(ms->map, addr); } @@ -2403,10 +2433,13 @@ retry: * from the previous instruction. */ if (dl->al.offset > 0) { + struct annotation *notes; struct disasm_line *prev_dl; - prev_dl = list_prev_entry(dl, al.node); - if (ins__is_fused(arch, prev_dl->ins.name, dl->ins.name)) { + notes = symbol__annotation(ms->sym); + prev_dl = annotation__prev_asm_line(notes, dl); + + if (prev_dl && ins__is_fused(arch, prev_dl->ins.name, dl->ins.name)) { dl = prev_dl; goto retry; } @@ -2511,8 +2544,16 @@ static bool process_basic_block(struct basic_block_data *bb_data, last_dl = list_last_entry(¬es->src->source, struct disasm_line, al.node); + if (last_dl->al.offset == -1) + last_dl = annotation__prev_asm_line(notes, last_dl); + + if (last_dl == NULL) + return false; list_for_each_entry_from(dl, ¬es->src->source, al.node) { + /* Skip comment or debug info line */ + if (dl->al.offset == -1) + continue; /* Found the target instruction */ if (sym->start + dl->al.offset == target) { found = true; @@ -2533,7 +2574,8 @@ static bool process_basic_block(struct basic_block_data *bb_data, /* jump instruction creates new basic block(s) */ next_dl = find_disasm_line(sym, sym->start + dl->ops.target.offset, /*allow_update=*/false); - add_basic_block(bb_data, link, next_dl); + if (next_dl) + add_basic_block(bb_data, link, next_dl); /* * FIXME: determine conditional jumps properly. @@ -2541,8 +2583,9 @@ static bool process_basic_block(struct basic_block_data *bb_data, * next disasm line. */ if (!strstr(dl->ins.name, "jmp")) { - next_dl = list_next_entry(dl, al.node); - add_basic_block(bb_data, link, next_dl); + next_dl = annotation__next_asm_line(notes, dl); + if (next_dl) + add_basic_block(bb_data, link, next_dl); } break; -- cgit v1.2.3 From 0235abd89feaf29fe67d3abbe2c18b7119503537 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Apr 2024 14:18:00 -0700 Subject: perf annotate: Get rid of symbol__ensure_annotate() Now symbol__annotate() is reentrant and it doesn't need to remove non-instruction lines. Let's get rid of symbol__ensure_annotate() and call symbol__annotate() directly. Also we can use it to get the arch pointer instead of calling evsel__get_arch() directly. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240405211800.1412920-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 5d9b79559c73..11da27801d88 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2157,14 +2157,6 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, return 0; } -static void symbol__ensure_annotate(struct map_symbol *ms, struct evsel *evsel) -{ - struct annotation *notes = symbol__annotation(ms->sym); - - if (list_empty(¬es->src->source)) - symbol__annotate(ms, evsel, NULL); -} - static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip, bool allow_update) { @@ -2339,14 +2331,12 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) return NULL; } - if (evsel__get_arch(evsel, &arch) < 0) { + /* Make sure it has the disasm of the function */ + if (symbol__annotate(ms, evsel, &arch) < 0) { ann_data_stat.no_insn++; return NULL; } - /* Make sure it runs objdump to get disasm of the function */ - symbol__ensure_annotate(ms, evsel); - /* * Get a disasm to extract the location from the insn. * This is too slow... -- cgit v1.2.3 From 4b5ee6db2d3cd6249919ae554552856de0e29791 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 3 Apr 2024 09:46:36 -0700 Subject: perf metrics: Remove the "No_group" metric group Rather than place metrics without a metric group in "No_group" place them in a a metric group that is their name. Still allow such metrics to be selected if "No_group" is passed, this change just impacts perf list. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240403164636.3429091-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 79ef6095ab28..6ec083af14a1 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -455,7 +455,7 @@ static int metricgroup__add_to_mep_groups(const struct pmu_metric *pm, const char *g; char *omg, *mg; - mg = strdup(pm->metric_group ?: "No_group"); + mg = strdup(pm->metric_group ?: pm->metric_name); if (!mg) return -ENOMEM; omg = mg; @@ -466,7 +466,7 @@ static int metricgroup__add_to_mep_groups(const struct pmu_metric *pm, if (strlen(g)) me = mep_lookup(groups, g, pm->metric_name); else - me = mep_lookup(groups, "No_group", pm->metric_name); + me = mep_lookup(groups, pm->metric_name, pm->metric_name); if (me) { me->metric_desc = pm->desc; -- cgit v1.2.3 From 256ef072b3842273ce703db18b603b051aca95fe Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 10 Apr 2024 11:34:52 +0100 Subject: perf tests: Make "test data symbol" more robust on Neoverse N1 To prevent anyone from seeing a test failure appear as a regression and thinking that it was caused by their code change, insert some noise into the loop which makes it immune to sampling bias issues (errata 1694299). The "test data symbol" test can fail with any unrelated change that shifts the loop into an unfortunate position in the Perf binary which is almost impossible to debug as the root cause of the test failure. Ultimately it's caused by the referenced errata. Fixes: 60abedb8aa902b06 ("perf test: Introduce script for data symbol testing") Reviewed-by: Ian Rogers Signed-off-by: James Clark Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Spoorthy S Link: https://lore.kernel.org/r/20240410103458.813656-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/workloads/datasym.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/workloads/datasym.c b/tools/perf/tests/workloads/datasym.c index ddd40bc63448..8e08fc75a973 100644 --- a/tools/perf/tests/workloads/datasym.c +++ b/tools/perf/tests/workloads/datasym.c @@ -16,6 +16,22 @@ static int datasym(int argc __maybe_unused, const char **argv __maybe_unused) { for (;;) { buf1.data1++; + if (buf1.data1 == 123) { + /* + * Add some 'noise' in the loop to work around errata + * 1694299 on Arm N1. + * + * Bias exists in SPE sampling which can cause the load + * and store instructions to be skipped entirely. This + * comes and goes randomly depending on the offset the + * linker places the datasym loop at in the Perf binary. + * With an extra branch in the middle of the loop that + * isn't always taken, the instruction stream is no + * longer a continuous repeating pattern that interacts + * badly with the bias. + */ + buf1.data1++; + } buf1.data2 += buf1.data1; } return 0; -- cgit v1.2.3 From 2dade41a533f337447b945239b87ff31a8857890 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 10 Apr 2024 11:34:53 +0100 Subject: perf tests: Apply attributes to all events in object code reading test PERF_PMU_CAP_EXTENDED_HW_TYPE results in multiple events being opened on heterogeneous systems. Currently this test only sets its required attributes on the first event. Not disabling enable_on_exec on the other events causes the test to fail because the forked objdump processes are sampled. No tracking event is opened so Perf only knows about its own mappings causing the objdump samples to give the following error: $ perf test -vvv "object code reading" Reading object code for memory address: 0xffff9aaa55ec thread__find_map failed ---- end(-1) ---- 24: Object code reading : FAILED! Fixes: 251aa040244a3b17 ("perf parse-events: Wildcard most "numeric" events") Reviewed-by: Ian Rogers Signed-off-by: James Clark Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Spoorthy S Link: https://lore.kernel.org/r/20240410103458.813656-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 7a3a7bbbec71..29d2f3ee4e10 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -637,11 +637,11 @@ static int do_test_code_reading(bool try_kcore) evlist__config(evlist, &opts, NULL); - evsel = evlist__first(evlist); - - evsel->core.attr.comm = 1; - evsel->core.attr.disabled = 1; - evsel->core.attr.enable_on_exec = 0; + evlist__for_each_entry(evlist, evsel) { + evsel->core.attr.comm = 1; + evsel->core.attr.disabled = 1; + evsel->core.attr.enable_on_exec = 0; + } ret = evlist__open(evlist); if (ret < 0) { -- cgit v1.2.3 From df12e21d4e15e48a5e7d12e58f1a00742c4177d0 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 10 Apr 2024 11:34:54 +0100 Subject: perf map: Remove kernel map before updating start and end addresses In a debug build there is validation that mmap lists are sorted when taking a lock. In machine__update_kernel_mmap() the start and end addresses are updated resulting in an unsorted list before the map is removed from the list. When the map is removed, the lock is taken which triggers the validation and the failure: $ perf test "object code reading" --- start --- perf: util/maps.c:88: check_invariants: Assertion `map__start(prev) <= map__start(map)' failed. Aborted Fix it by updating the addresses after removal, but before insertion. The bug depends on the ordering and type of debug info on the system and doesn't reproduce everywhere. Fixes: 659ad3492b913c90 ("perf maps: Switch from rbtree to lazily sorted array for addresses") Reviewed-by: Ian Rogers Signed-off-by: James Clark Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Spoorthy S Link: https://lore.kernel.org/r/20240410103458.813656-4-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 5eb9044bc223..a26c8bea58d0 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1549,8 +1549,8 @@ static int machine__update_kernel_mmap(struct machine *machine, updated = map__get(orig); machine->vmlinux_map = updated; - machine__set_kernel_mmap(machine, start, end); maps__remove(machine__kernel_maps(machine), orig); + machine__set_kernel_mmap(machine, start, end); err = maps__insert(machine__kernel_maps(machine), updated); map__put(orig); -- cgit v1.2.3 From 7aa87499797c8e805c93fb0fd457c6babfb44bdb Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 10 Apr 2024 11:34:55 +0100 Subject: perf tests: Remove dependency on lscpu This check can be done with uname which is more portable. At the same time re-arrange it into a standard if statement so that it's more readable. Reviewed-by: Ian Rogers Signed-off-by: James Clark Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Spoorthy S Link: https://lore.kernel.org/r/20240410103458.813656-5-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_arm_callgraph_fp.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_arm_callgraph_fp.sh b/tools/perf/tests/shell/test_arm_callgraph_fp.sh index 83b53591b1ea..61898e256616 100755 --- a/tools/perf/tests/shell/test_arm_callgraph_fp.sh +++ b/tools/perf/tests/shell/test_arm_callgraph_fp.sh @@ -6,7 +6,9 @@ shelldir=$(dirname "$0") # shellcheck source=lib/perf_has_symbol.sh . "${shelldir}"/lib/perf_has_symbol.sh -lscpu | grep -q "aarch64" || exit 2 +if [ "$(uname -m)" != "aarch64" ]; then + exit 2 +fi if perf version --build-options | grep HAVE_DWARF_UNWIND_SUPPORT | grep -q OFF then -- cgit v1.2.3 From eb833488631b171e693a6917eb17b41fdedda659 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 20:32:50 -0700 Subject: perf annotate-data: Skip sample histogram for stack canary It's a pseudo data type and has no field. Fixes: b3c95109c131fcc9 ("perf annotate-data: Add stack canary type") Closes: https://lore.kernel.org/lkml/Zhb6jJneP36Z-or0@x1 Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240411033256.2099646-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 11da27801d88..ec79c120a7d2 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2399,8 +2399,9 @@ retry: mem_type = find_data_type(&dloc); if (mem_type == NULL && is_stack_canary(arch, op_loc)) { - mem_type = &canary_type; - dloc.type_offset = 0; + istat->good++; + he->mem_type_off = 0; + return &canary_type; } if (mem_type) -- cgit v1.2.3 From d9aedc12d3477ab72ec48bb7abb0f038c902c937 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 20:32:51 -0700 Subject: perf annotate: Show progress of sample processing Like 'perf report', it can take a while to process samples. Show a progress window to inform users how that it is not stuck. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240411033256.2099646-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 16e1581207c9..332e1ddcacbd 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -37,6 +37,7 @@ #include "util/map_symbol.h" #include "util/branch.h" #include "util/util.h" +#include "ui/progress.h" #include #include @@ -665,13 +666,23 @@ static int __cmd_annotate(struct perf_annotate *ann) evlist__for_each_entry(session->evlist, pos) { struct hists *hists = evsel__hists(pos); u32 nr_samples = hists->stats.nr_samples; + struct ui_progress prog; if (nr_samples > 0) { total_nr_samples += nr_samples; - hists__collapse_resort(hists, NULL); + + ui_progress__init(&prog, nr_samples, + "Merging related events..."); + hists__collapse_resort(hists, &prog); + ui_progress__finish(); + /* Don't sort callchain */ evsel__reset_sample_bit(pos, CALLCHAIN); - evsel__output_resort(pos, NULL); + + ui_progress__init(&prog, nr_samples, + "Sorting events for output..."); + evsel__output_resort(pos, &prog); + ui_progress__finish(); /* * An event group needs to display other events too. -- cgit v1.2.3 From 9b561be15febda6f9b314c9eab51e157f8f34dea Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 20:32:52 -0700 Subject: perf annotate-data: Add hist_entry__annotate_data_tty() And move the related code into util/annotate-data.c file. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240411033256.2099646-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 106 +------------------------------------ tools/perf/util/annotate-data.c | 112 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/annotate-data.h | 9 ++++ 3 files changed, 122 insertions(+), 105 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 332e1ddcacbd..0812664faa54 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -329,108 +329,6 @@ static int hist_entry__tty_annotate(struct hist_entry *he, return symbol__tty_annotate2(&he->ms, evsel); } -static void print_annotated_data_header(struct hist_entry *he, struct evsel *evsel) -{ - struct dso *dso = map__dso(he->ms.map); - int nr_members = 1; - int nr_samples = he->stat.nr_events; - int width = 7; - const char *val_hdr = "Percent"; - - if (evsel__is_group_event(evsel)) { - struct hist_entry *pair; - - list_for_each_entry(pair, &he->pairs.head, pairs.node) - nr_samples += pair->stat.nr_events; - } - - printf("Annotate type: '%s' in %s (%d samples):\n", - he->mem_type->self.type_name, dso->name, nr_samples); - - if (evsel__is_group_event(evsel)) { - struct evsel *pos; - int i = 0; - - for_each_group_evsel(pos, evsel) - printf(" event[%d] = %s\n", i++, pos->name); - - nr_members = evsel->core.nr_members; - } - - if (symbol_conf.show_total_period) { - width = 11; - val_hdr = "Period"; - } else if (symbol_conf.show_nr_samples) { - width = 7; - val_hdr = "Samples"; - } - - printf("============================================================================\n"); - printf("%*s %10s %10s %s\n", (width + 1) * nr_members, val_hdr, - "offset", "size", "field"); -} - -static void print_annotated_data_value(struct type_hist *h, u64 period, int nr_samples) -{ - double percent = h->period ? (100.0 * period / h->period) : 0; - const char *color = get_percent_color(percent); - - if (symbol_conf.show_total_period) - color_fprintf(stdout, color, " %11" PRIu64, period); - else if (symbol_conf.show_nr_samples) - color_fprintf(stdout, color, " %7d", nr_samples); - else - color_fprintf(stdout, color, " %7.2f", percent); -} - -static void print_annotated_data_type(struct annotated_data_type *mem_type, - struct annotated_member *member, - struct evsel *evsel, int indent) -{ - struct annotated_member *child; - struct type_hist *h = mem_type->histograms[evsel->core.idx]; - int i, nr_events = 1, samples = 0; - u64 period = 0; - int width = symbol_conf.show_total_period ? 11 : 7; - - for (i = 0; i < member->size; i++) { - samples += h->addr[member->offset + i].nr_samples; - period += h->addr[member->offset + i].period; - } - print_annotated_data_value(h, period, samples); - - if (evsel__is_group_event(evsel)) { - struct evsel *pos; - - for_each_group_member(pos, evsel) { - h = mem_type->histograms[pos->core.idx]; - - samples = 0; - period = 0; - for (i = 0; i < member->size; i++) { - samples += h->addr[member->offset + i].nr_samples; - period += h->addr[member->offset + i].period; - } - print_annotated_data_value(h, period, samples); - } - nr_events = evsel->core.nr_members; - } - - printf(" %10d %10d %*s%s\t%s", - member->offset, member->size, indent, "", member->type_name, - member->var_name ?: ""); - - if (!list_empty(&member->children)) - printf(" {\n"); - - list_for_each_entry(child, &member->children, node) - print_annotated_data_type(mem_type, child, evsel, indent + 4); - - if (!list_empty(&member->children)) - printf("%*s}", (width + 1) * nr_events + 24 + indent, ""); - printf(";\n"); -} - static void print_annotate_data_stat(struct annotated_data_stat *s) { #define PRINT_STAT(fld) if (s->fld) printf("%10d : %s\n", s->fld, #fld) @@ -571,9 +469,7 @@ find_next: goto find_next; } - print_annotated_data_header(he, evsel); - print_annotated_data_type(he->mem_type, &he->mem_type->self, evsel, 0); - printf("\n"); + hist_entry__annotate_data_tty(he, evsel); goto find_next; } diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index b69a1cd1577a..b150137a92dc 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -19,6 +19,7 @@ #include "evlist.h" #include "map.h" #include "map_symbol.h" +#include "sort.h" #include "strbuf.h" #include "symbol.h" #include "symbol_conf.h" @@ -1710,3 +1711,114 @@ int annotated_data_type__update_samples(struct annotated_data_type *adt, h->addr[offset].period += period; return 0; } + +static void print_annotated_data_header(struct hist_entry *he, struct evsel *evsel) +{ + struct dso *dso = map__dso(he->ms.map); + int nr_members = 1; + int nr_samples = he->stat.nr_events; + int width = 7; + const char *val_hdr = "Percent"; + + if (evsel__is_group_event(evsel)) { + struct hist_entry *pair; + + list_for_each_entry(pair, &he->pairs.head, pairs.node) + nr_samples += pair->stat.nr_events; + } + + printf("Annotate type: '%s' in %s (%d samples):\n", + he->mem_type->self.type_name, dso->name, nr_samples); + + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + int i = 0; + + for_each_group_evsel(pos, evsel) + printf(" event[%d] = %s\n", i++, pos->name); + + nr_members = evsel->core.nr_members; + } + + if (symbol_conf.show_total_period) { + width = 11; + val_hdr = "Period"; + } else if (symbol_conf.show_nr_samples) { + width = 7; + val_hdr = "Samples"; + } + + printf("============================================================================\n"); + printf("%*s %10s %10s %s\n", (width + 1) * nr_members, val_hdr, + "offset", "size", "field"); +} + +static void print_annotated_data_value(struct type_hist *h, u64 period, int nr_samples) +{ + double percent = h->period ? (100.0 * period / h->period) : 0; + const char *color = get_percent_color(percent); + + if (symbol_conf.show_total_period) + color_fprintf(stdout, color, " %11" PRIu64, period); + else if (symbol_conf.show_nr_samples) + color_fprintf(stdout, color, " %7d", nr_samples); + else + color_fprintf(stdout, color, " %7.2f", percent); +} + +static void print_annotated_data_type(struct annotated_data_type *mem_type, + struct annotated_member *member, + struct evsel *evsel, int indent) +{ + struct annotated_member *child; + struct type_hist *h = mem_type->histograms[evsel->core.idx]; + int i, nr_events = 1, samples = 0; + u64 period = 0; + int width = symbol_conf.show_total_period ? 11 : 7; + + for (i = 0; i < member->size; i++) { + samples += h->addr[member->offset + i].nr_samples; + period += h->addr[member->offset + i].period; + } + print_annotated_data_value(h, period, samples); + + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + + for_each_group_member(pos, evsel) { + h = mem_type->histograms[pos->core.idx]; + + samples = 0; + period = 0; + for (i = 0; i < member->size; i++) { + samples += h->addr[member->offset + i].nr_samples; + period += h->addr[member->offset + i].period; + } + print_annotated_data_value(h, period, samples); + } + nr_events = evsel->core.nr_members; + } + + printf(" %10d %10d %*s%s\t%s", + member->offset, member->size, indent, "", member->type_name, + member->var_name ?: ""); + + if (!list_empty(&member->children)) + printf(" {\n"); + + list_for_each_entry(child, &member->children, node) + print_annotated_data_type(mem_type, child, evsel, indent + 4); + + if (!list_empty(&member->children)) + printf("%*s}", (width + 1) * nr_events + 24 + indent, ""); + printf(";\n"); +} + +int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel) +{ + print_annotated_data_header(he, evsel); + print_annotated_data_type(he->mem_type, &he->mem_type->self, evsel, 0); + printf("\n"); + + return 0; +} diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index fe1e53d6e8c7..01489db267d4 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -10,6 +10,7 @@ struct annotated_op_loc; struct debuginfo; struct evsel; +struct hist_entry; struct map_symbol; struct thread; @@ -156,6 +157,8 @@ void annotated_data_type__tree_delete(struct rb_root *root); /* Release all global variable information in the tree */ void global_var_type__tree_delete(struct rb_root *root); +int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel); + #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * @@ -182,6 +185,12 @@ static inline void global_var_type__tree_delete(struct rb_root *root __maybe_unu { } +static inline int hist_entry__annotate_data_tty(struct hist_entry *he __maybe_unused, + struct evsel *evsel __maybe_unused) +{ + return -1; +} + #endif /* HAVE_DWARF_SUPPORT */ #endif /* _PERF_ANNOTATE_DATA_H */ -- cgit v1.2.3 From d001c7a7f473674353311a79cf140d054b26afcd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 20:32:53 -0700 Subject: perf annotate-data: Add hist_entry__annotate_data_tui() Support data type profiling output on TUI. Testing from Arnaldo: First make sure that the debug information for your workload binaries in embedded in them by building it with '-g' or install the debuginfo packages, since our workload is 'find': root@number:~# type find find is hashed (/usr/bin/find) root@number:~# rpm -qf /usr/bin/find findutils-4.9.0-5.fc39.x86_64 root@number:~# dnf debuginfo-install findutils root@number:~# Then collect some data: root@number:~# echo 1 > /proc/sys/vm/drop_caches root@number:~# perf mem record find / > /dev/null [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.331 MB perf.data (3982 samples) ] root@number:~# Finally do data-type annotation with the following command, that will default, as 'perf report' to the --tui mode, with lines colored to highlight the hotspots, etc. root@number:~# perf annotate --data-type Annotate type: 'struct predicate' (58 samples) Percent Offset Size Field 100.00 0 312 struct predicate { 0.00 0 8 PRED_FUNC pred_func; 0.00 8 8 char* p_name; 0.00 16 4 enum predicate_type p_type; 0.00 20 4 enum predicate_precedence p_prec; 0.00 24 1 _Bool side_effects; 0.00 25 1 _Bool no_default_print; 0.00 26 1 _Bool need_stat; 0.00 27 1 _Bool need_type; 0.00 28 1 _Bool need_inum; 0.00 32 4 enum EvaluationCost p_cost; 0.00 36 4 float est_success_rate; 0.00 40 1 _Bool literal_control_chars; 0.00 41 1 _Bool artificial; 0.00 48 8 char* arg_text; Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240411033256.2099646-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 30 +++- tools/perf/ui/browsers/Build | 1 + tools/perf/ui/browsers/annotate-data.c | 282 +++++++++++++++++++++++++++++++++ tools/perf/util/annotate-data.c | 3 +- tools/perf/util/annotate-data.h | 13 ++ 5 files changed, 324 insertions(+), 5 deletions(-) create mode 100644 tools/perf/ui/browsers/annotate-data.c (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 0812664faa54..6f7104f06c42 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -469,8 +469,32 @@ find_next: goto find_next; } - hist_entry__annotate_data_tty(he, evsel); - goto find_next; + if (use_browser == 1) + key = hist_entry__annotate_data_tui(he, evsel, NULL); + else + key = hist_entry__annotate_data_tty(he, evsel); + + switch (key) { + case -1: + if (!ann->skip_missing) + return; + /* fall through */ + case K_RIGHT: + case '>': + next = rb_next(nd); + break; + case K_LEFT: + case '<': + next = rb_prev(nd); + break; + default: + return; + } + + if (next != NULL) + nd = next; + + continue; } if (use_browser == 2) { @@ -873,9 +897,7 @@ int cmd_annotate(int argc, const char **argv) use_browser = 2; #endif - /* FIXME: only support stdio for now */ if (annotate.data_type) { - use_browser = 0; annotate_opts.annotate_src = false; symbol_conf.annotate_data_member = true; symbol_conf.annotate_data_sample = true; diff --git a/tools/perf/ui/browsers/Build b/tools/perf/ui/browsers/Build index 7a1d5ddaf688..2608b5da3167 100644 --- a/tools/perf/ui/browsers/Build +++ b/tools/perf/ui/browsers/Build @@ -1,4 +1,5 @@ perf-y += annotate.o +perf-y += annotate-data.o perf-y += hists.o perf-y += map.o perf-y += scripts.o diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c new file mode 100644 index 000000000000..fefacaaf16db --- /dev/null +++ b/tools/perf/ui/browsers/annotate-data.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "ui/browser.h" +#include "ui/helpline.h" +#include "ui/keysyms.h" +#include "ui/ui.h" +#include "util/annotate.h" +#include "util/annotate-data.h" +#include "util/evsel.h" +#include "util/sort.h" + +struct annotated_data_browser { + struct ui_browser b; + struct list_head entries; +}; + +struct browser_entry { + struct list_head node; + struct annotated_member *data; + struct type_hist_entry hists; + int indent; +}; + +static void update_hist_entry(struct type_hist_entry *dst, + struct type_hist_entry *src) +{ + dst->nr_samples += src->nr_samples; + dst->period += src->period; +} + +static int get_member_overhead(struct annotated_data_type *adt, + struct browser_entry *entry, + struct evsel *evsel) +{ + struct annotated_member *member = entry->data; + int i; + + for (i = 0; i < member->size; i++) { + struct type_hist *h; + int offset = member->offset + i; + + h = adt->histograms[evsel->core.idx]; + update_hist_entry(&entry->hists, &h->addr[offset]); + } + return 0; +} + +static int add_child_entries(struct annotated_data_browser *browser, + struct annotated_data_type *adt, + struct annotated_member *member, + struct evsel *evsel, int indent) +{ + struct annotated_member *pos; + struct browser_entry *entry; + int nr_entries = 0; + + entry = zalloc(sizeof(*entry)); + if (entry == NULL) + return -1; + + entry->data = member; + entry->indent = indent; + if (get_member_overhead(adt, entry, evsel) < 0) { + free(entry); + return -1; + } + + list_add_tail(&entry->node, &browser->entries); + nr_entries++; + + list_for_each_entry(pos, &member->children, node) { + int nr = add_child_entries(browser, adt, pos, evsel, indent + 1); + + if (nr < 0) + return nr; + + nr_entries += nr; + } + + /* add an entry for the closing bracket ("}") */ + if (!list_empty(&member->children)) { + entry = zalloc(sizeof(*entry)); + if (entry == NULL) + return -1; + + entry->indent = indent; + list_add_tail(&entry->node, &browser->entries); + nr_entries++; + } + + return nr_entries; +} + +static int annotated_data_browser__collect_entries(struct annotated_data_browser *browser) +{ + struct hist_entry *he = browser->b.priv; + struct annotated_data_type *adt = he->mem_type; + struct evsel *evsel = hists_to_evsel(he->hists); + + INIT_LIST_HEAD(&browser->entries); + browser->b.entries = &browser->entries; + browser->b.nr_entries = add_child_entries(browser, adt, &adt->self, + evsel, /*indent=*/0); + return 0; +} + +static void annotated_data_browser__delete_entries(struct annotated_data_browser *browser) +{ + struct browser_entry *pos, *tmp; + + list_for_each_entry_safe(pos, tmp, &browser->entries, node) { + list_del_init(&pos->node); + free(pos); + } +} + +static unsigned int browser__refresh(struct ui_browser *uib) +{ + return ui_browser__list_head_refresh(uib); +} + +static int browser__show(struct ui_browser *uib) +{ + struct hist_entry *he = uib->priv; + struct annotated_data_type *adt = he->mem_type; + const char *help = "Press 'h' for help on key bindings"; + char title[256]; + + snprintf(title, sizeof(title), "Annotate type: '%s' (%d samples)", + adt->self.type_name, he->stat.nr_events); + + if (ui_browser__show(uib, title, help) < 0) + return -1; + + /* second line header */ + ui_browser__gotorc_title(uib, 0, 0); + ui_browser__set_color(uib, HE_COLORSET_ROOT); + + if (symbol_conf.show_total_period) + strcpy(title, "Period"); + else if (symbol_conf.show_nr_samples) + strcpy(title, "Samples"); + else + strcpy(title, "Percent"); + + ui_browser__printf(uib, " %10s %10s %10s %s", + title, "Offset", "Size", "Field"); + ui_browser__write_nstring(uib, "", uib->width); + return 0; +} + +static void browser__write_overhead(struct ui_browser *uib, + struct type_hist *total, + struct type_hist_entry *hist, int row) +{ + u64 period = hist->period; + double percent = total->period ? (100.0 * period / total->period) : 0; + bool current = ui_browser__is_current_entry(uib, row); + int nr_samples = 0; + + ui_browser__set_percent_color(uib, percent, current); + + if (symbol_conf.show_total_period) + ui_browser__printf(uib, " %10" PRIu64, period); + else if (symbol_conf.show_nr_samples) + ui_browser__printf(uib, " %10d", nr_samples); + else + ui_browser__printf(uib, " %10.2f", percent); + + ui_browser__set_percent_color(uib, 0, current); +} + +static void browser__write(struct ui_browser *uib, void *entry, int row) +{ + struct browser_entry *be = entry; + struct annotated_member *member = be->data; + struct hist_entry *he = uib->priv; + struct annotated_data_type *adt = he->mem_type; + struct evsel *evsel = hists_to_evsel(he->hists); + + if (member == NULL) { + bool current = ui_browser__is_current_entry(uib, row); + + /* print the closing bracket */ + ui_browser__set_percent_color(uib, 0, current); + ui_browser__write_nstring(uib, "", 11); + ui_browser__printf(uib, " %10s %10s %*s};", + "", "", be->indent * 4, ""); + ui_browser__write_nstring(uib, "", uib->width); + return; + } + + /* print the number */ + browser__write_overhead(uib, adt->histograms[evsel->core.idx], + &be->hists, row); + + /* print type info */ + if (be->indent == 0 && !member->var_name) { + ui_browser__printf(uib, " %10d %10d %s%s", + member->offset, member->size, + member->type_name, + list_empty(&member->children) ? ";" : " {"); + } else { + ui_browser__printf(uib, " %10d %10d %*s%s\t%s%s", + member->offset, member->size, + be->indent * 4, "", member->type_name, + member->var_name ?: "", + list_empty(&member->children) ? ";" : " {"); + } + /* fill the rest */ + ui_browser__write_nstring(uib, "", uib->width); +} + +static int annotated_data_browser__run(struct annotated_data_browser *browser, + struct evsel *evsel __maybe_unused, + struct hist_browser_timer *hbt) +{ + int delay_secs = hbt ? hbt->refresh : 0; + int key; + + if (browser__show(&browser->b) < 0) + return -1; + + while (1) { + key = ui_browser__run(&browser->b, delay_secs); + + switch (key) { + case K_TIMER: + if (hbt) + hbt->timer(hbt->arg); + continue; + case K_F1: + case 'h': + ui_browser__help_window(&browser->b, + "UP/DOWN/PGUP\n" + "PGDN/SPACE Navigate\n" + " Move to prev/next symbol\n" + "q/ESC/CTRL+C Exit\n\n"); + continue; + case K_LEFT: + case '<': + case '>': + case K_ESC: + case 'q': + case CTRL('c'): + goto out; + default: + continue; + } + } +out: + ui_browser__hide(&browser->b); + return key; +} + +int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, + struct hist_browser_timer *hbt) +{ + struct annotated_data_browser browser = { + .b = { + .refresh = browser__refresh, + .seek = ui_browser__list_head_seek, + .write = browser__write, + .priv = he, + .extra_title_lines = 1, + }, + }; + int ret; + + ui_helpline__push("Press ESC to exit"); + + ret = annotated_data_browser__collect_entries(&browser); + if (ret == 0) + ret = annotated_data_browser__run(&browser, evsel, hbt); + + annotated_data_browser__delete_entries(&browser); + + return ret; +} diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index b150137a92dc..1cd857400038 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1820,5 +1820,6 @@ int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel) print_annotated_data_type(he->mem_type, &he->mem_type->self, evsel, 0); printf("\n"); - return 0; + /* move to the next entry */ + return '>'; } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 01489db267d4..0a57d9f5ee78 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -10,6 +10,7 @@ struct annotated_op_loc; struct debuginfo; struct evsel; +struct hist_browser_timer; struct hist_entry; struct map_symbol; struct thread; @@ -193,4 +194,16 @@ static inline int hist_entry__annotate_data_tty(struct hist_entry *he __maybe_un #endif /* HAVE_DWARF_SUPPORT */ +#ifdef HAVE_SLANG_SUPPORT +int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, + struct hist_browser_timer *hbt); +#else +static inline int hist_entry__annotate_data_tui(struct hist_entry *he __maybe_unused, + struct evsel *evsel __maybe_unused, + struct hist_browser_timer *hbt __maybe_unused) +{ + return -1; +} +#endif /* HAVE_SLANG_SUPPORT */ + #endif /* _PERF_ANNOTATE_DATA_H */ -- cgit v1.2.3 From 2b08f219d592d5b3d17db9a3850a9d793e6c9d83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 20:32:54 -0700 Subject: perf annotate-data: Support event group display in TUI Like in stdio, it should print all events in a group together. Committer notes: Collect it: root@number:~# perf record -a -e '{cpu_core/mem-loads,ldlat=30/P,cpu_core/mem-stores/P}' ^C[ perf record: Woken up 8 times to write data ] [ perf record: Captured and wrote 4.980 MB perf.data (55825 samples) ] root@number:~# Then do it in stdio: root@number:~# perf annotate --stdio --data-type Annotate type: 'union ' in /usr/lib64/libc.so.6 (1131 samples): event[0] = cpu_core/mem-loads,ldlat=30/P event[1] = cpu_core/mem-stores/P ============================================================================ Percent offset size field 100.00 100.00 0 40 union { 100.00 100.00 0 40 struct __pthread_mutex_s __data { 48.61 23.46 0 4 int __lock; 0.00 0.48 4 4 unsigned int __count; 6.38 41.32 8 4 int __owner; 8.74 34.02 12 4 unsigned int __nusers; 35.66 0.26 16 4 int __kind; 0.61 0.45 20 2 short int __spins; 0.00 0.00 22 2 short int __elision; 0.00 0.00 24 16 __pthread_list_t __list { 0.00 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 0.00 32 8 struct __pthread_internal_list* __next; }; }; 0.00 0.00 0 0 char* __size; 48.61 23.94 0 8 long int __align; }; Now with TUI before this patch: root@number:~# perf annotate --tui --data-type Annotate type: 'union ' (790 samples) Percent Offset Size Field 100.00 0 40 union { 100.00 0 40 struct __pthread_mutex_s __data { 48.61 0 4 int __lock; 0.00 4 4 unsigned int __count; 6.38 8 4 int __owner; 8.74 12 4 unsigned int __nusers; 35.66 16 4 int __kind; 0.61 20 2 short int __spins; 0.00 22 2 short int __elision; 0.00 24 16 __pthread_list_t __list { 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 32 8 struct __pthread_internal_list* __next; 0.00 0 0 char* __size; 48.61 0 8 long int __align; }; And now after this patch: Annotate type: 'union ' (790 samples) Percent Offset Size Field 100.00 100.00 0 40 union { 100.00 100.00 0 40 struct __pthread_mutex_s __data { 48.61 23.46 0 4 int __lock; 0.00 0.48 4 4 unsigned int __count; 6.38 41.32 8 4 int __owner; 8.74 34.02 12 4 unsigned int __nusers; 35.66 0.26 16 4 int __kind; 0.61 0.45 20 2 short int __spins; 0.00 0.00 22 2 short int __elision; 0.00 0.00 24 16 __pthread_list_t __list { 0.00 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 0.00 32 8 struct __pthread_internal_list* __next; }; }; 0.00 0.00 0 0 char* __size; 48.61 23.94 0 8 long int __align; }; On a followup patch the --tui output should have this that is present in --stdio: And the --stdio has all the missing info in TUI: Annotate type: 'union ' in /usr/lib64/libc.so.6 (1131 samples): event[0] = cpu_core/mem-loads,ldlat=30/P event[1] = cpu_core/mem-stores/P Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240411033256.2099646-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 50 +++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index fefacaaf16db..a4a0f042f201 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -10,20 +10,27 @@ #include "util/annotate.h" #include "util/annotate-data.h" #include "util/evsel.h" +#include "util/evlist.h" #include "util/sort.h" struct annotated_data_browser { struct ui_browser b; struct list_head entries; + int nr_events; }; struct browser_entry { struct list_head node; struct annotated_member *data; - struct type_hist_entry hists; + struct type_hist_entry *hists; int indent; }; +static struct annotated_data_browser *get_browser(struct ui_browser *uib) +{ + return container_of(uib, struct annotated_data_browser, b); +} + static void update_hist_entry(struct type_hist_entry *dst, struct type_hist_entry *src) { @@ -33,17 +40,21 @@ static void update_hist_entry(struct type_hist_entry *dst, static int get_member_overhead(struct annotated_data_type *adt, struct browser_entry *entry, - struct evsel *evsel) + struct evsel *leader) { struct annotated_member *member = entry->data; - int i; + int i, k; for (i = 0; i < member->size; i++) { struct type_hist *h; + struct evsel *evsel; int offset = member->offset + i; - h = adt->histograms[evsel->core.idx]; - update_hist_entry(&entry->hists, &h->addr[offset]); + for_each_group_evsel(evsel, leader) { + h = adt->histograms[evsel->core.idx]; + k = evsel__group_idx(evsel); + update_hist_entry(&entry->hists[k], &h->addr[offset]); + } } return 0; } @@ -61,6 +72,12 @@ static int add_child_entries(struct annotated_data_browser *browser, if (entry == NULL) return -1; + entry->hists = calloc(browser->nr_events, sizeof(*entry->hists)); + if (entry->hists == NULL) { + free(entry); + return -1; + } + entry->data = member; entry->indent = indent; if (get_member_overhead(adt, entry, evsel) < 0) { @@ -113,6 +130,7 @@ static void annotated_data_browser__delete_entries(struct annotated_data_browser list_for_each_entry_safe(pos, tmp, &browser->entries, node) { list_del_init(&pos->node); + free(pos->hists); free(pos); } } @@ -126,6 +144,7 @@ static int browser__show(struct ui_browser *uib) { struct hist_entry *he = uib->priv; struct annotated_data_type *adt = he->mem_type; + struct annotated_data_browser *browser = get_browser(uib); const char *help = "Press 'h' for help on key bindings"; char title[256]; @@ -146,7 +165,8 @@ static int browser__show(struct ui_browser *uib) else strcpy(title, "Percent"); - ui_browser__printf(uib, " %10s %10s %10s %s", + ui_browser__printf(uib, "%*s %10s %10s %10s %s", + 11 * (browser->nr_events - 1), "", title, "Offset", "Size", "Field"); ui_browser__write_nstring(uib, "", uib->width); return 0; @@ -175,18 +195,20 @@ static void browser__write_overhead(struct ui_browser *uib, static void browser__write(struct ui_browser *uib, void *entry, int row) { + struct annotated_data_browser *browser = get_browser(uib); struct browser_entry *be = entry; struct annotated_member *member = be->data; struct hist_entry *he = uib->priv; struct annotated_data_type *adt = he->mem_type; - struct evsel *evsel = hists_to_evsel(he->hists); + struct evsel *leader = hists_to_evsel(he->hists); + struct evsel *evsel; if (member == NULL) { bool current = ui_browser__is_current_entry(uib, row); /* print the closing bracket */ ui_browser__set_percent_color(uib, 0, current); - ui_browser__write_nstring(uib, "", 11); + ui_browser__write_nstring(uib, "", 11 * browser->nr_events); ui_browser__printf(uib, " %10s %10s %*s};", "", "", be->indent * 4, ""); ui_browser__write_nstring(uib, "", uib->width); @@ -194,8 +216,12 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) } /* print the number */ - browser__write_overhead(uib, adt->histograms[evsel->core.idx], - &be->hists, row); + for_each_group_evsel(evsel, leader) { + struct type_hist *h = adt->histograms[evsel->core.idx]; + int idx = evsel__group_idx(evsel); + + browser__write_overhead(uib, h, &be->hists[idx], row); + } /* print type info */ if (be->indent == 0 && !member->var_name) { @@ -267,11 +293,15 @@ int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, .priv = he, .extra_title_lines = 1, }, + .nr_events = 1, }; int ret; ui_helpline__push("Press ESC to exit"); + if (evsel__is_group_event(evsel)) + browser.nr_events = evsel->core.nr_members; + ret = annotated_data_browser__collect_entries(&browser); if (ret == 0) ret = annotated_data_browser__run(&browser, evsel, hbt); -- cgit v1.2.3 From 0bfbe661a21f7c77a22eb978e0de3b672041e502 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 20:32:55 -0700 Subject: perf report: Add a menu item to annotate data type in TUI When the hist entry has the type info, it should be able to display the annotation browser for the type like in `perf annotate --data-type`. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240411033256.2099646-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 5 +++++ tools/perf/ui/browsers/hists.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dcd93ee5fc24..aaa6427a1224 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1694,6 +1694,11 @@ repeat: else use_browser = 0; + if (report.data_type && use_browser == 1) { + symbol_conf.annotate_data_member = true; + symbol_conf.annotate_data_sample = true; + } + if (sort_order && strstr(sort_order, "ipc")) { parse_options_usage(report_usage, options, "s", 1); goto error; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 0c02b3a8e121..71b32591d61a 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -38,6 +38,7 @@ #include "../ui.h" #include "map.h" #include "annotate.h" +#include "annotate-data.h" #include "srcline.h" #include "string2.h" #include "units.h" @@ -2505,6 +2506,32 @@ add_annotate_opt(struct hist_browser *browser __maybe_unused, return 1; } +static int +do_annotate_type(struct hist_browser *browser, struct popup_action *act) +{ + struct hist_entry *he = browser->he_selection; + + hist_entry__annotate_data_tui(he, act->evsel, browser->hbt); + ui_browser__handle_resize(&browser->b); + return 0; +} + +static int +add_annotate_type_opt(struct hist_browser *browser, + struct popup_action *act, char **optstr, + struct hist_entry *he) +{ + if (he == NULL || he->mem_type == NULL || he->mem_type->histograms == NULL) + return 0; + + if (asprintf(optstr, "Annotate type %s", he->mem_type->self.type_name) < 0) + return 0; + + act->evsel = hists_to_evsel(browser->hists); + act->fn = do_annotate_type; + return 1; +} + static int do_zoom_thread(struct hist_browser *browser, struct popup_action *act) { @@ -3307,6 +3334,10 @@ do_hotkey: // key came straight from options ui__popup_menu() browser->he_selection->ip); } skip_annotation: + nr_options += add_annotate_type_opt(browser, + &actions[nr_options], + &options[nr_options], + browser->he_selection); nr_options += add_thread_opt(browser, &actions[nr_options], &options[nr_options], thread); nr_options += add_dso_opt(browser, &actions[nr_options], -- cgit v1.2.3 From 6cdd977ec24e1538b35a08bde823a74b69e829f2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 20:32:56 -0700 Subject: perf report: Do not collect sample histogram unnecessarily The data type profiling alone doesn't need the sample histogram for functions. It only needs the histogram for the types. Let's remove the condition in the report_callback to check if data type profiling is selected and make sure the annotation has the 'struct annotated_source' instantiated before calling symbol__disassemble(). Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240411033256.2099646-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- tools/perf/util/annotate.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index aaa6427a1224..dafba6e030ef 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -172,7 +172,7 @@ static int hist_iter__report_callback(struct hist_entry_iter *iter, struct mem_info *mi; struct branch_info *bi; - if (!ui__has_annotation() && !rep->symbol_ipc && !rep->data_type) + if (!ui__has_annotation() && !rep->symbol_ipc) return 0; if (sort__mode == SORT_MODE__BRANCH) { diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ec79c120a7d2..7595c8fbc2c5 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -908,6 +908,13 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, args.arch = arch; args.ms = *ms; + + if (notes->src == NULL) { + notes->src = annotated_source__new(); + if (notes->src == NULL) + return -1; + } + if (annotate_opts.full_addr) notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); else -- cgit v1.2.3 From 873a83731f1cc85c8427fdc7a9e24fb270faca44 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Apr 2024 11:51:17 -0700 Subject: perf annotate: Skip DSOs not found In some data file, I see the following messages repeated. It seems it doesn't have DSOs in the system and the dso->binary_type is set to DSO_BINARY_TYPE__NOT_FOUND. Let's skip them to avoid the followings. No output from objdump --start-address=0x0000000000000000 --stop-address=0x00000000000000d4 -d --no-show-raw-insn -C "$1" Error running objdump --start-address=0x0000000000000000 --stop-address=0x0000000000000631 -d --no-show-raw-insn -C "$1" ... Closes: https://lore.kernel.org/linux-perf-users/15e1a2847b8cebab4de57fc68e033086aa6980ce.camel@yandex.ru/ Reported-by: Konstantin Kharlamov Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Konstantin Kharlamov Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240410185117.1987239-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index a1219eb930aa..92937809be85 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1669,6 +1669,8 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) return symbol__disassemble_bpf(sym, args); } else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) { return symbol__disassemble_bpf_image(sym, args); + } else if (dso->binary_type == DSO_BINARY_TYPE__NOT_FOUND) { + return -1; } else if (dso__is_kcore(dso)) { kce.kcore_filename = symfs_filename; kce.addr = map__rip_2objdump(map, sym->start); -- cgit v1.2.3 From 792bc998baf9ae17297b1f93b1edc3ca34a0b7e2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 11 Apr 2024 10:54:47 +0300 Subject: perf record: Fix debug message placement for test consumption evlist__config() might mess up the debug output consumed by test "Test per-thread recording" in "Miscellaneous Intel PT testing". Move it out from between the debug prints: "perf record opening and mmapping events" and "perf record done opening and mmapping events" Fixes: da4062021e0e6da5 ("perf tools: Add debug messages and comments for testing") Closes: https://lore.kernel.org/linux-perf-users/ZhVfc5jYLarnGzKa@x1/ Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240411075447.17306-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 40d2c1c48666..6aeae398ec28 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1355,8 +1355,6 @@ static int record__open(struct record *rec) struct record_opts *opts = &rec->opts; int rc = 0; - evlist__config(evlist, opts, &callchain_param); - evlist__for_each_entry(evlist, pos) { try_again: if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) { @@ -2483,6 +2481,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) evlist__uniquify_name(rec->evlist); + evlist__config(rec->evlist, opts, &callchain_param); + /* Debug message used by test scripts */ pr_debug3("perf record opening and mmapping events\n"); if (record__open(rec) != 0) { -- cgit v1.2.3 From 83acca9f90c700bafd81229f2c2d5debcf6b27bc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 9 Apr 2024 23:42:03 -0700 Subject: perf dsos: Attempt to better abstract DSOs internals Move functions from machine and build-id to dsos. Pass 'struct dsos' rather than internal state. Rename some functions to better represent which data structure they operate on. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Markus Elfring Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Yang Jihong Cc: Yanteng Si Cc: Yicong Yang Cc: zhaimingbing Link: https://lore.kernel.org/r/20240410064214.2755936-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 2 +- tools/perf/builtin-record.c | 2 +- tools/perf/util/build-id.c | 38 +---------------------------------- tools/perf/util/build-id.h | 2 -- tools/perf/util/dso.h | 6 ------ tools/perf/util/dsos.c | 49 ++++++++++++++++++++++++++++++++++++++++++--- tools/perf/util/dsos.h | 19 ++++++++++++++---- tools/perf/util/machine.c | 40 ++++++++---------------------------- tools/perf/util/machine.h | 2 ++ tools/perf/util/session.c | 21 +++++++++++++++++++ tools/perf/util/session.h | 2 ++ 11 files changed, 97 insertions(+), 86 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index eb3ef5c24b66..ef73317e6ae7 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2122,7 +2122,7 @@ static int __cmd_inject(struct perf_inject *inject) */ if (perf_header__has_feat(&session->header, HEADER_BUILD_ID) && inject->have_auxtrace && !inject->itrace_synth_opts.set) - dsos__hit_all(session); + perf_session__dsos_hit_all(session); /* * The AUX areas have been removed and replaced with * synthesized hardware events, so clear the feature flag. diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6aeae398ec28..2ff718d3e202 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1788,7 +1788,7 @@ record__finish_output(struct record *rec) process_buildids(rec); if (rec->buildid_all) - dsos__hit_all(rec->session); + perf_session__dsos_hit_all(rec->session); } perf_session__write_header(rec->session, rec->evlist, fd, true); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 03c64b85383b..a617b1917e6b 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -390,42 +390,6 @@ int perf_session__write_buildid_table(struct perf_session *session, return err; } -static int __dsos__hit_all(struct list_head *head) -{ - struct dso *pos; - - list_for_each_entry(pos, head, node) - pos->hit = true; - - return 0; -} - -static int machine__hit_all_dsos(struct machine *machine) -{ - return __dsos__hit_all(&machine->dsos.head); -} - -int dsos__hit_all(struct perf_session *session) -{ - struct rb_node *nd; - int err; - - err = machine__hit_all_dsos(&session->machines.host); - if (err) - return err; - - for (nd = rb_first_cached(&session->machines.guests); nd; - nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - - err = machine__hit_all_dsos(pos); - if (err) - return err; - } - - return 0; -} - void disable_buildid_cache(void) { no_buildid_cache = true; @@ -992,7 +956,7 @@ int perf_session__cache_build_ids(struct perf_session *session) static bool machine__read_build_ids(struct machine *machine, bool with_hits) { - return __dsos__read_build_ids(&machine->dsos.head, with_hits); + return __dsos__read_build_ids(&machine->dsos, with_hits); } bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index 4e3a1169379b..3fa8bffb07ca 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -39,8 +39,6 @@ int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -int dsos__hit_all(struct perf_session *session); - int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 17dab230a2ca..3d4faad8d5dc 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -233,12 +233,6 @@ struct dso { #define dso__for_each_symbol(dso, pos, n) \ symbols__for_each_entry(&(dso)->symbols, pos, n) -#define dsos__for_each_with_build_id(pos, head) \ - list_for_each_entry(pos, head, node) \ - if (!pos->has_build_id) \ - continue; \ - else - static inline void dso__set_loaded(struct dso *dso) { dso->loaded = true; diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index cf80aa42dd07..e65ef6762bed 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -12,6 +12,35 @@ #include // filename__read_build_id #include +void dsos__init(struct dsos *dsos) +{ + INIT_LIST_HEAD(&dsos->head); + dsos->root = RB_ROOT; + init_rwsem(&dsos->lock); +} + +static void dsos__purge(struct dsos *dsos) +{ + struct dso *pos, *n; + + down_write(&dsos->lock); + + list_for_each_entry_safe(pos, n, &dsos->head, node) { + RB_CLEAR_NODE(&pos->rb_node); + pos->root = NULL; + list_del_init(&pos->node); + dso__put(pos); + } + + up_write(&dsos->lock); +} + +void dsos__exit(struct dsos *dsos) +{ + dsos__purge(dsos); + exit_rwsem(&dsos->lock); +} + static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) { if (a->maj > b->maj) return -1; @@ -73,8 +102,9 @@ int dso__cmp_id(struct dso *a, struct dso *b) return __dso_id__cmp(&a->id, &b->id); } -bool __dsos__read_build_ids(struct list_head *head, bool with_hits) +bool __dsos__read_build_ids(struct dsos *dsos, bool with_hits) { + struct list_head *head = &dsos->head; bool have_build_id = false; struct dso *pos; struct nscookie nsc; @@ -303,9 +333,10 @@ struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id return dso; } -size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, +size_t __dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm) { + struct list_head *head = &dsos->head; struct dso *pos; size_t ret = 0; @@ -320,8 +351,9 @@ size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, return ret; } -size_t __dsos__fprintf(struct list_head *head, FILE *fp) +size_t __dsos__fprintf(struct dsos *dsos, FILE *fp) { + struct list_head *head = &dsos->head; struct dso *pos; size_t ret = 0; @@ -331,3 +363,14 @@ size_t __dsos__fprintf(struct list_head *head, FILE *fp) return ret; } + +int __dsos__hit_all(struct dsos *dsos) +{ + struct list_head *head = &dsos->head; + struct dso *pos; + + list_for_each_entry(pos, head, node) + pos->hit = true; + + return 0; +} diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 5dbec2bc6966..1c81ddf07f8f 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -21,6 +21,15 @@ struct dsos { struct rw_semaphore lock; }; +#define dsos__for_each_with_build_id(pos, head) \ + list_for_each_entry(pos, head, node) \ + if (!pos->has_build_id) \ + continue; \ + else + +void dsos__init(struct dsos *dsos); +void dsos__exit(struct dsos *dsos); + void __dsos__add(struct dsos *dsos, struct dso *dso); void dsos__add(struct dsos *dsos, struct dso *dso); struct dso *__dsos__addnew(struct dsos *dsos, const char *name); @@ -28,13 +37,15 @@ struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short); struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); +bool __dsos__read_build_ids(struct dsos *dsos, bool with_hits); + struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, const char *name, struct dso_id *id); -bool __dsos__read_build_ids(struct list_head *head, bool with_hits); - -size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, +size_t __dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm); -size_t __dsos__fprintf(struct list_head *head, FILE *fp); +size_t __dsos__fprintf(struct dsos *dsos, FILE *fp); + +int __dsos__hit_all(struct dsos *dsos); #endif /* __PERF_DSOS */ diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index a26c8bea58d0..848a94b8602b 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -48,13 +48,6 @@ static struct dso *machine__kernel_dso(struct machine *machine) return map__dso(machine->vmlinux_map); } -static void dsos__init(struct dsos *dsos) -{ - INIT_LIST_HEAD(&dsos->head); - dsos->root = RB_ROOT; - init_rwsem(&dsos->lock); -} - static int machine__set_mmap_name(struct machine *machine) { if (machine__is_host(machine)) @@ -165,28 +158,6 @@ struct machine *machine__new_kallsyms(void) return machine; } -static void dsos__purge(struct dsos *dsos) -{ - struct dso *pos, *n; - - down_write(&dsos->lock); - - list_for_each_entry_safe(pos, n, &dsos->head, node) { - RB_CLEAR_NODE(&pos->rb_node); - pos->root = NULL; - list_del_init(&pos->node); - dso__put(pos); - } - - up_write(&dsos->lock); -} - -static void dsos__exit(struct dsos *dsos) -{ - dsos__purge(dsos); - exit_rwsem(&dsos->lock); -} - void machine__delete_threads(struct machine *machine) { threads__remove_all_threads(&machine->threads); @@ -907,11 +878,11 @@ out: size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) { struct rb_node *nd; - size_t ret = __dsos__fprintf(&machines->host.dsos.head, fp); + size_t ret = __dsos__fprintf(&machines->host.dsos, fp); for (nd = rb_first_cached(&machines->guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret += __dsos__fprintf(&pos->dsos.head, fp); + ret += __dsos__fprintf(&pos->dsos, fp); } return ret; @@ -920,7 +891,7 @@ size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) size_t machine__fprintf_dsos_buildid(struct machine *m, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm) { - return __dsos__fprintf_buildid(&m->dsos.head, fp, skip, parm); + return __dsos__fprintf_buildid(&m->dsos, fp, skip, parm); } size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp, @@ -3306,3 +3277,8 @@ bool machine__is_lock_function(struct machine *machine, u64 addr) return false; } + +int machine__hit_all_dsos(struct machine *machine) +{ + return __dsos__hit_all(&machine->dsos); +} diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 4312f6db6de0..82a47bac8023 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -306,4 +306,6 @@ int machine__map_x86_64_entry_trampolines(struct machine *machine, int machine__resolve(struct machine *machine, struct addr_location *al, struct perf_sample *sample); +int machine__hit_all_dsos(struct machine *machine); + #endif /* __PERF_MACHINE_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 02a932a83c51..a10343b9dcd4 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2916,3 +2916,24 @@ int perf_event__process_id_index(struct perf_session *session, } return 0; } + +int perf_session__dsos_hit_all(struct perf_session *session) +{ + struct rb_node *nd; + int err; + + err = machine__hit_all_dsos(&session->machines.host); + if (err) + return err; + + for (nd = rb_first_cached(&session->machines.guests); nd; + nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + + err = machine__hit_all_dsos(pos); + if (err) + return err; + } + + return 0; +} diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 5064c6ec11e7..3b0256e977a6 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -156,6 +156,8 @@ int perf_session__deliver_synth_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample); +int perf_session__dsos_hit_all(struct perf_session *session); + int perf_event__process_id_index(struct perf_session *session, union perf_event *event); -- cgit v1.2.3 From f649ed80f3cabbf16b228894bb7ecd718da86e47 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 9 Apr 2024 23:42:04 -0700 Subject: perf dsos: Tidy reference counting and locking Move more functionality into dsos.c generally from machine.c, renaming functions to match their new usage. The find function is made to always "get" before returning a dso. Reduce the scope of locks in vdso to match the locking paradigm. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Markus Elfring Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Yang Jihong Cc: Yanteng Si Cc: Yicong Yang Cc: zhaimingbing Link: https://lore.kernel.org/r/20240410064214.2755936-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dsos.c | 73 ++++++++++++++++++++++++++++++++++++++++++----- tools/perf/util/dsos.h | 9 +++++- tools/perf/util/machine.c | 62 ++-------------------------------------- tools/perf/util/map.c | 4 +-- tools/perf/util/vdso.c | 48 +++++++++++++------------------ 5 files changed, 97 insertions(+), 99 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index e65ef6762bed..d269e09005a7 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -181,7 +181,7 @@ struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso * at the end of the list of duplicates. */ if (!dso || (dso == this)) - return this; /* Find matching dso */ + return dso__get(this); /* Find matching dso */ /* * The core kernel DSOs may have duplicated long name. * In this case, the short name should be different. @@ -253,15 +253,20 @@ static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct d if (cmp_short) { list_for_each_entry(pos, &dsos->head, node) if (__dso__cmp_short_name(name, id, pos) == 0) - return pos; + return dso__get(pos); return NULL; } return __dsos__findnew_by_longname_id(&dsos->root, name, id); } -struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short) +struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short) { - return __dsos__find_id(dsos, name, NULL, cmp_short); + struct dso *res; + + down_read(&dsos->lock); + res = __dsos__find_id(dsos, name, NULL, cmp_short); + up_read(&dsos->lock); + return res; } static void dso__set_basename(struct dso *dso) @@ -303,8 +308,6 @@ static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct if (dso != NULL) { __dsos__add(dsos, dso); dso__set_basename(dso); - /* Put dso here because __dsos_add already got it */ - dso__put(dso); } return dso; } @@ -328,7 +331,7 @@ struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id { struct dso *dso; down_write(&dsos->lock); - dso = dso__get(__dsos__findnew_id(dsos, name, id)); + dso = __dsos__findnew_id(dsos, name, id); up_write(&dsos->lock); return dso; } @@ -374,3 +377,59 @@ int __dsos__hit_all(struct dsos *dsos) return 0; } + +struct dso *dsos__findnew_module_dso(struct dsos *dsos, + struct machine *machine, + struct kmod_path *m, + const char *filename) +{ + struct dso *dso; + + down_write(&dsos->lock); + + dso = __dsos__find_id(dsos, m->name, NULL, /*cmp_short=*/true); + if (!dso) { + dso = __dsos__addnew(dsos, m->name); + if (dso == NULL) + goto out_unlock; + + dso__set_module_info(dso, m, machine); + dso__set_long_name(dso, strdup(filename), true); + dso->kernel = DSO_SPACE__KERNEL; + } + +out_unlock: + up_write(&dsos->lock); + return dso; +} + +struct dso *dsos__find_kernel_dso(struct dsos *dsos) +{ + struct dso *dso, *res = NULL; + + down_read(&dsos->lock); + list_for_each_entry(dso, &dsos->head, node) { + /* + * The cpumode passed to is_kernel_module is not the cpumode of + * *this* event. If we insist on passing correct cpumode to + * is_kernel_module, we should record the cpumode when we adding + * this dso to the linked list. + * + * However we don't really need passing correct cpumode. We + * know the correct cpumode must be kernel mode (if not, we + * should not link it onto kernel_dsos list). + * + * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN. + * is_kernel_module() treats it as a kernel cpumode. + */ + if (!dso->kernel || + is_kernel_module(dso->long_name, + PERF_RECORD_MISC_CPUMODE_UNKNOWN)) + continue; + + res = dso__get(dso); + break; + } + up_read(&dsos->lock); + return res; +} diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 1c81ddf07f8f..a7c7f723c5ff 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -10,6 +10,8 @@ struct dso; struct dso_id; +struct kmod_path; +struct machine; /* * DSOs are put into both a list for fast iteration and rbtree for fast @@ -33,7 +35,7 @@ void dsos__exit(struct dsos *dsos); void __dsos__add(struct dsos *dsos, struct dso *dso); void dsos__add(struct dsos *dsos, struct dso *dso); struct dso *__dsos__addnew(struct dsos *dsos, const char *name); -struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short); +struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); @@ -48,4 +50,9 @@ size_t __dsos__fprintf(struct dsos *dsos, FILE *fp); int __dsos__hit_all(struct dsos *dsos); +struct dso *dsos__findnew_module_dso(struct dsos *dsos, struct machine *machine, + struct kmod_path *m, const char *filename); + +struct dso *dsos__find_kernel_dso(struct dsos *dsos); + #endif /* __PERF_DSOS */ diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 848a94b8602b..e2c800a1449b 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -646,31 +646,6 @@ int machine__process_lost_samples_event(struct machine *machine __maybe_unused, return 0; } -static struct dso *machine__findnew_module_dso(struct machine *machine, - struct kmod_path *m, - const char *filename) -{ - struct dso *dso; - - down_write(&machine->dsos.lock); - - dso = __dsos__find(&machine->dsos, m->name, true); - if (!dso) { - dso = __dsos__addnew(&machine->dsos, m->name); - if (dso == NULL) - goto out_unlock; - - dso__set_module_info(dso, m, machine); - dso__set_long_name(dso, strdup(filename), true); - dso->kernel = DSO_SPACE__KERNEL; - } - - dso__get(dso); -out_unlock: - up_write(&machine->dsos.lock); - return dso; -} - int machine__process_aux_event(struct machine *machine __maybe_unused, union perf_event *event) { @@ -854,7 +829,7 @@ static struct map *machine__addnew_module_map(struct machine *machine, u64 start if (kmod_path__parse_name(&m, filename)) return NULL; - dso = machine__findnew_module_dso(machine, &m, filename); + dso = dsos__findnew_module_dso(&machine->dsos, machine, &m, filename); if (dso == NULL) goto out; @@ -1663,40 +1638,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine, * Should be there already, from the build-id table in * the header. */ - struct dso *kernel = NULL; - struct dso *dso; - - down_read(&machine->dsos.lock); - - list_for_each_entry(dso, &machine->dsos.head, node) { - - /* - * The cpumode passed to is_kernel_module is not the - * cpumode of *this* event. If we insist on passing - * correct cpumode to is_kernel_module, we should - * record the cpumode when we adding this dso to the - * linked list. - * - * However we don't really need passing correct - * cpumode. We know the correct cpumode must be kernel - * mode (if not, we should not link it onto kernel_dsos - * list). - * - * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN. - * is_kernel_module() treats it as a kernel cpumode. - */ - - if (!dso->kernel || - is_kernel_module(dso->long_name, - PERF_RECORD_MISC_CPUMODE_UNKNOWN)) - continue; - - - kernel = dso__get(dso); - break; - } - - up_read(&machine->dsos.lock); + struct dso *kernel = dsos__find_kernel_dso(&machine->dsos); if (kernel == NULL) kernel = machine__findnew_dso(machine, machine->mmap_name); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index a5d57c201a30..cca871959f87 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -196,9 +196,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, * reading the header will have the build ID set and all future mmaps will * have it missing. */ - down_read(&machine->dsos.lock); - header_bid_dso = __dsos__find(&machine->dsos, filename, false); - up_read(&machine->dsos.lock); + header_bid_dso = dsos__find(&machine->dsos, filename, false); if (header_bid_dso && header_bid_dso->header_build_id) { dso__set_build_id(dso, &header_bid_dso->bid); dso->header_build_id = 1; diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index df8963796187..35532dcbff74 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -133,8 +133,6 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s if (dso != NULL) { __dsos__add(&machine->dsos, dso); dso__set_long_name(dso, long_name, false); - /* Put dso here because __dsos_add already got it */ - dso__put(dso); } return dso; @@ -252,17 +250,15 @@ static struct dso *__machine__findnew_compat(struct machine *machine, const char *file_name; struct dso *dso; - dso = __dsos__find(&machine->dsos, vdso_file->dso_name, true); + dso = dsos__find(&machine->dsos, vdso_file->dso_name, true); if (dso) - goto out; + return dso; file_name = vdso__get_compat_file(vdso_file); if (!file_name) - goto out; + return NULL; - dso = __machine__addnew_vdso(machine, vdso_file->dso_name, file_name); -out: - return dso; + return __machine__addnew_vdso(machine, vdso_file->dso_name, file_name); } static int __machine__findnew_vdso_compat(struct machine *machine, @@ -308,21 +304,21 @@ static struct dso *machine__find_vdso(struct machine *machine, dso_type = machine__thread_dso_type(machine, thread); switch (dso_type) { case DSO__TYPE_32BIT: - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO32, true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO32, true); if (!dso) { - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO, - true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO, + true); if (dso && dso_type != dso__type(dso, machine)) dso = NULL; } break; case DSO__TYPE_X32BIT: - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSOX32, true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSOX32, true); break; case DSO__TYPE_64BIT: case DSO__TYPE_UNKNOWN: default: - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO, true); + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO, true); break; } @@ -334,37 +330,33 @@ struct dso *machine__findnew_vdso(struct machine *machine, { struct vdso_info *vdso_info; struct dso *dso = NULL; + char *file; - down_write(&machine->dsos.lock); if (!machine->vdso_info) machine->vdso_info = vdso_info__new(); vdso_info = machine->vdso_info; if (!vdso_info) - goto out_unlock; + return NULL; dso = machine__find_vdso(machine, thread); if (dso) - goto out_unlock; + return dso; #if BITS_PER_LONG == 64 if (__machine__findnew_vdso_compat(machine, thread, vdso_info, &dso)) - goto out_unlock; + return dso; #endif - dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO, true); - if (!dso) { - char *file; + dso = dsos__find(&machine->dsos, DSO__NAME_VDSO, true); + if (dso) + return dso; - file = get_file(&vdso_info->vdso); - if (file) - dso = __machine__addnew_vdso(machine, DSO__NAME_VDSO, file); - } + file = get_file(&vdso_info->vdso); + if (!file) + return NULL; -out_unlock: - dso__get(dso); - up_write(&machine->dsos.lock); - return dso; + return __machine__addnew_vdso(machine, DSO__NAME_VDSO, file); } bool dso__is_vdso(struct dso *dso) -- cgit v1.2.3 From 73f3fea2e11dbd92f49807ee1c33429674f71f13 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 9 Apr 2024 23:42:05 -0700 Subject: perf dsos: Introduce dsos__for_each_dso() To better abstract the dsos internals, introduce dsos__for_each_dso that does a callback on each dso. This also means the read lock can be correctly held. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Markus Elfring Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Yang Jihong Cc: Yanteng Si Cc: Yicong Yang Cc: zhaimingbing Link: https://lore.kernel.org/r/20240410064214.2755936-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 25 +++++++++------ tools/perf/util/build-id.c | 76 +++++++++++++++++++++++++-------------------- tools/perf/util/dsos.c | 16 ++++++++++ tools/perf/util/dsos.h | 8 ++--- tools/perf/util/machine.c | 40 +++++++++++++++--------- 5 files changed, 100 insertions(+), 65 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index ef73317e6ae7..ce5e28eaad90 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -1187,23 +1187,28 @@ static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_ process_build_id, machine); } +static int guest_session__add_build_ids_cb(struct dso *dso, void *data) +{ + struct guest_session *gs = data; + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); + + if (!dso->has_build_id) + return 0; + + return synthesize_build_id(inject, dso, gs->machine_pid); + +} + static int guest_session__add_build_ids(struct guest_session *gs) { struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); - struct machine *machine = &gs->session->machines.host; - struct dso *dso; - int ret; /* Build IDs will be put in the Build ID feature section */ perf_header__set_feat(&inject->session->header, HEADER_BUILD_ID); - dsos__for_each_with_build_id(dso, &machine->dsos.head) { - ret = synthesize_build_id(inject, dso, gs->machine_pid); - if (ret) - return ret; - } - - return 0; + return dsos__for_each_dso(&gs->session->machines.host.dsos, + guest_session__add_build_ids_cb, + gs); } static int guest_session__ksymbol_event(struct perf_tool *tool, diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index a617b1917e6b..a6d3c253f19f 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -327,48 +327,56 @@ static int write_buildid(const char *name, size_t name_len, struct build_id *bid return write_padded(fd, name, name_len + 1, len); } -static int machine__write_buildid_table(struct machine *machine, - struct feat_fd *fd) +struct machine__write_buildid_table_cb_args { + struct machine *machine; + struct feat_fd *fd; + u16 kmisc, umisc; +}; + +static int machine__write_buildid_table_cb(struct dso *dso, void *data) { - int err = 0; - struct dso *pos; - u16 kmisc = PERF_RECORD_MISC_KERNEL, - umisc = PERF_RECORD_MISC_USER; + struct machine__write_buildid_table_cb_args *args = data; + const char *name; + size_t name_len; + bool in_kernel = false; - if (!machine__is_host(machine)) { - kmisc = PERF_RECORD_MISC_GUEST_KERNEL; - umisc = PERF_RECORD_MISC_GUEST_USER; - } + if (!dso->has_build_id) + return 0; - dsos__for_each_with_build_id(pos, &machine->dsos.head) { - const char *name; - size_t name_len; - bool in_kernel = false; + if (!dso->hit && !dso__is_vdso(dso)) + return 0; - if (!pos->hit && !dso__is_vdso(pos)) - continue; + if (dso__is_vdso(dso)) { + name = dso->short_name; + name_len = dso->short_name_len; + } else if (dso__is_kcore(dso)) { + name = args->machine->mmap_name; + name_len = strlen(name); + } else { + name = dso->long_name; + name_len = dso->long_name_len; + } - if (dso__is_vdso(pos)) { - name = pos->short_name; - name_len = pos->short_name_len; - } else if (dso__is_kcore(pos)) { - name = machine->mmap_name; - name_len = strlen(name); - } else { - name = pos->long_name; - name_len = pos->long_name_len; - } + in_kernel = dso->kernel || is_kernel_module(name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); + return write_buildid(name, name_len, &dso->bid, args->machine->pid, + in_kernel ? args->kmisc : args->umisc, args->fd); +} - in_kernel = pos->kernel || - is_kernel_module(name, - PERF_RECORD_MISC_CPUMODE_UNKNOWN); - err = write_buildid(name, name_len, &pos->bid, machine->pid, - in_kernel ? kmisc : umisc, fd); - if (err) - break; +static int machine__write_buildid_table(struct machine *machine, struct feat_fd *fd) +{ + struct machine__write_buildid_table_cb_args args = { + .machine = machine, + .fd = fd, + .kmisc = PERF_RECORD_MISC_KERNEL, + .umisc = PERF_RECORD_MISC_USER, + }; + + if (!machine__is_host(machine)) { + args.kmisc = PERF_RECORD_MISC_GUEST_KERNEL; + args.umisc = PERF_RECORD_MISC_GUEST_USER; } - return err; + return dsos__for_each_dso(&machine->dsos, machine__write_buildid_table_cb, &args); } int perf_session__write_buildid_table(struct perf_session *session, diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index d269e09005a7..d43f64939b12 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -433,3 +433,19 @@ struct dso *dsos__find_kernel_dso(struct dsos *dsos) up_read(&dsos->lock); return res; } + +int dsos__for_each_dso(struct dsos *dsos, int (*cb)(struct dso *dso, void *data), void *data) +{ + struct dso *dso; + + down_read(&dsos->lock); + list_for_each_entry(dso, &dsos->head, node) { + int err; + + err = cb(dso, data); + if (err) + return err; + } + up_read(&dsos->lock); + return 0; +} diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index a7c7f723c5ff..317a263f0e37 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -23,12 +23,6 @@ struct dsos { struct rw_semaphore lock; }; -#define dsos__for_each_with_build_id(pos, head) \ - list_for_each_entry(pos, head, node) \ - if (!pos->has_build_id) \ - continue; \ - else - void dsos__init(struct dsos *dsos); void dsos__exit(struct dsos *dsos); @@ -55,4 +49,6 @@ struct dso *dsos__findnew_module_dso(struct dsos *dsos, struct machine *machine, struct dso *dsos__find_kernel_dso(struct dsos *dsos); +int dsos__for_each_dso(struct dsos *dsos, int (*cb)(struct dso *dso, void *data), void *data); + #endif /* __PERF_DSOS */ diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index e2c800a1449b..e3deef38405c 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1562,16 +1562,14 @@ out_put: return ret; } -static bool machine__uses_kcore(struct machine *machine) +static int machine__uses_kcore_cb(struct dso *dso, void *data __maybe_unused) { - struct dso *dso; - - list_for_each_entry(dso, &machine->dsos.head, node) { - if (dso__is_kcore(dso)) - return true; - } + return dso__is_kcore(dso) ? 1 : 0; +} - return false; +static bool machine__uses_kcore(struct machine *machine) +{ + return dsos__for_each_dso(&machine->dsos, machine__uses_kcore_cb, NULL) != 0 ? true : false; } static bool perf_event__is_extra_kernel_mmap(struct machine *machine, @@ -3137,16 +3135,28 @@ char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, ch return sym->name; } +struct machine__for_each_dso_cb_args { + struct machine *machine; + machine__dso_t fn; + void *priv; +}; + +static int machine__for_each_dso_cb(struct dso *dso, void *data) +{ + struct machine__for_each_dso_cb_args *args = data; + + return args->fn(dso, args->machine, args->priv); +} + int machine__for_each_dso(struct machine *machine, machine__dso_t fn, void *priv) { - struct dso *pos; - int err = 0; + struct machine__for_each_dso_cb_args args = { + .machine = machine, + .fn = fn, + .priv = priv, + }; - list_for_each_entry(pos, &machine->dsos.head, node) { - if (fn(pos, machine, priv)) - err = -1; - } - return err; + return dsos__for_each_dso(&machine->dsos, machine__for_each_dso_cb, &args); } int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn, void *priv) -- cgit v1.2.3 From 1d6eff930595eef83cfb8486c122249afb66145d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 9 Apr 2024 23:42:06 -0700 Subject: perf dso: Move dso functions out of dsos.c Move dso and dso_id functions to dso.c to match the struct declarations. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Markus Elfring Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Yang Jihong Cc: Yanteng Si Cc: Yicong Yang Cc: zhaimingbing Link: https://lore.kernel.org/r/20240410064214.2755936-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/dso.h | 4 ++++ tools/perf/util/dsos.c | 61 -------------------------------------------------- 3 files changed, 65 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 6e2a7198b382..ad562743d769 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1269,6 +1269,67 @@ static void dso__set_long_name_id(struct dso *dso, const char *name, struct dso_ __dsos__findnew_link_by_longname_id(root, dso, NULL, id); } +static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) +{ + if (a->maj > b->maj) return -1; + if (a->maj < b->maj) return 1; + + if (a->min > b->min) return -1; + if (a->min < b->min) return 1; + + if (a->ino > b->ino) return -1; + if (a->ino < b->ino) return 1; + + /* + * Synthesized MMAP events have zero ino_generation, avoid comparing + * them with MMAP events with actual ino_generation. + * + * I found it harmful because the mismatch resulted in a new + * dso that did not have a build ID whereas the original dso did have a + * build ID. The build ID was essential because the object was not found + * otherwise. - Adrian + */ + if (a->ino_generation && b->ino_generation) { + if (a->ino_generation > b->ino_generation) return -1; + if (a->ino_generation < b->ino_generation) return 1; + } + + return 0; +} + +bool dso_id__empty(struct dso_id *id) +{ + if (!id) + return true; + + return !id->maj && !id->min && !id->ino && !id->ino_generation; +} + +void dso__inject_id(struct dso *dso, struct dso_id *id) +{ + dso->id.maj = id->maj; + dso->id.min = id->min; + dso->id.ino = id->ino; + dso->id.ino_generation = id->ino_generation; +} + +int dso_id__cmp(struct dso_id *a, struct dso_id *b) +{ + /* + * The second is always dso->id, so zeroes if not set, assume passing + * NULL for a means a zeroed id + */ + if (dso_id__empty(a) || dso_id__empty(b)) + return 0; + + return __dso_id__cmp(a, b); +} + +int dso__cmp_id(struct dso *a, struct dso *b) +{ + return __dso_id__cmp(&a->id, &b->id); +} + void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) { dso__set_long_name_id(dso, name, NULL, name_allocated); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 3d4faad8d5dc..2c295438226d 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -238,6 +238,9 @@ static inline void dso__set_loaded(struct dso *dso) dso->loaded = true; } +int dso_id__cmp(struct dso_id *a, struct dso_id *b); +bool dso_id__empty(struct dso_id *id); + struct dso *dso__new_id(const char *name, struct dso_id *id); struct dso *dso__new(const char *name); void dso__delete(struct dso *dso); @@ -245,6 +248,7 @@ void dso__delete(struct dso *dso); int dso__cmp_id(struct dso *a, struct dso *b); void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated); void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated); +void dso__inject_id(struct dso *dso, struct dso_id *id); int dso__name_len(const struct dso *dso); diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index d43f64939b12..f816927a21ff 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -41,67 +41,6 @@ void dsos__exit(struct dsos *dsos) exit_rwsem(&dsos->lock); } -static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) -{ - if (a->maj > b->maj) return -1; - if (a->maj < b->maj) return 1; - - if (a->min > b->min) return -1; - if (a->min < b->min) return 1; - - if (a->ino > b->ino) return -1; - if (a->ino < b->ino) return 1; - - /* - * Synthesized MMAP events have zero ino_generation, avoid comparing - * them with MMAP events with actual ino_generation. - * - * I found it harmful because the mismatch resulted in a new - * dso that did not have a build ID whereas the original dso did have a - * build ID. The build ID was essential because the object was not found - * otherwise. - Adrian - */ - if (a->ino_generation && b->ino_generation) { - if (a->ino_generation > b->ino_generation) return -1; - if (a->ino_generation < b->ino_generation) return 1; - } - - return 0; -} - -static bool dso_id__empty(struct dso_id *id) -{ - if (!id) - return true; - - return !id->maj && !id->min && !id->ino && !id->ino_generation; -} - -static void dso__inject_id(struct dso *dso, struct dso_id *id) -{ - dso->id.maj = id->maj; - dso->id.min = id->min; - dso->id.ino = id->ino; - dso->id.ino_generation = id->ino_generation; -} - -static int dso_id__cmp(struct dso_id *a, struct dso_id *b) -{ - /* - * The second is always dso->id, so zeroes if not set, assume passing - * NULL for a means a zeroed id - */ - if (dso_id__empty(a) || dso_id__empty(b)) - return 0; - - return __dso_id__cmp(a, b); -} - -int dso__cmp_id(struct dso *a, struct dso *b) -{ - return __dso_id__cmp(&a->id, &b->id); -} - bool __dsos__read_build_ids(struct dsos *dsos, bool with_hits) { struct list_head *head = &dsos->head; -- cgit v1.2.3 From 0ffc8fca5c15a70f32c8aff12c566bbd3991bd0a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 9 Apr 2024 23:42:07 -0700 Subject: perf dsos: Switch more loops to dsos__for_each_dso() Switch loops within dsos.c, add a version that isn't locked. Switch some unlocked loops to hold the read lock. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Markus Elfring Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Yang Jihong Cc: Yanteng Si Cc: Yicong Yang Cc: zhaimingbing Link: https://lore.kernel.org/r/20240410064214.2755936-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/build-id.c | 2 +- tools/perf/util/dsos.c | 258 +++++++++++++++++++++++++++++---------------- tools/perf/util/dsos.h | 8 +- tools/perf/util/machine.c | 8 +- 4 files changed, 174 insertions(+), 102 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index a6d3c253f19f..864bc26b6b46 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -964,7 +964,7 @@ int perf_session__cache_build_ids(struct perf_session *session) static bool machine__read_build_ids(struct machine *machine, bool with_hits) { - return __dsos__read_build_ids(&machine->dsos, with_hits); + return dsos__read_build_ids(&machine->dsos, with_hits); } bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index f816927a21ff..b7fbfb877ae3 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -41,38 +41,65 @@ void dsos__exit(struct dsos *dsos) exit_rwsem(&dsos->lock); } -bool __dsos__read_build_ids(struct dsos *dsos, bool with_hits) + +static int __dsos__for_each_dso(struct dsos *dsos, + int (*cb)(struct dso *dso, void *data), + void *data) +{ + struct dso *dso; + + list_for_each_entry(dso, &dsos->head, node) { + int err; + + err = cb(dso, data); + if (err) + return err; + } + return 0; +} + +struct dsos__read_build_ids_cb_args { + bool with_hits; + bool have_build_id; +}; + +static int dsos__read_build_ids_cb(struct dso *dso, void *data) { - struct list_head *head = &dsos->head; - bool have_build_id = false; - struct dso *pos; + struct dsos__read_build_ids_cb_args *args = data; struct nscookie nsc; - list_for_each_entry(pos, head, node) { - if (with_hits && !pos->hit && !dso__is_vdso(pos)) - continue; - if (pos->has_build_id) { - have_build_id = true; - continue; - } - nsinfo__mountns_enter(pos->nsinfo, &nsc); - if (filename__read_build_id(pos->long_name, &pos->bid) > 0) { - have_build_id = true; - pos->has_build_id = true; - } else if (errno == ENOENT && pos->nsinfo) { - char *new_name = dso__filename_with_chroot(pos, pos->long_name); - - if (new_name && filename__read_build_id(new_name, - &pos->bid) > 0) { - have_build_id = true; - pos->has_build_id = true; - } - free(new_name); + if (args->with_hits && !dso->hit && !dso__is_vdso(dso)) + return 0; + if (dso->has_build_id) { + args->have_build_id = true; + return 0; + } + nsinfo__mountns_enter(dso->nsinfo, &nsc); + if (filename__read_build_id(dso->long_name, &dso->bid) > 0) { + args->have_build_id = true; + dso->has_build_id = true; + } else if (errno == ENOENT && dso->nsinfo) { + char *new_name = dso__filename_with_chroot(dso, dso->long_name); + + if (new_name && filename__read_build_id(new_name, &dso->bid) > 0) { + args->have_build_id = true; + dso->has_build_id = true; } - nsinfo__mountns_exit(&nsc); + free(new_name); } + nsinfo__mountns_exit(&nsc); + return 0; +} - return have_build_id; +bool dsos__read_build_ids(struct dsos *dsos, bool with_hits) +{ + struct dsos__read_build_ids_cb_args args = { + .with_hits = with_hits, + .have_build_id = false, + }; + + dsos__for_each_dso(dsos, dsos__read_build_ids_cb, &args); + return args.have_build_id; } static int __dso__cmp_long_name(const char *long_name, struct dso_id *id, struct dso *b) @@ -105,6 +132,7 @@ struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso if (!name) name = dso->long_name; + /* * Find node with the matching name */ @@ -185,17 +213,40 @@ static struct dso *__dsos__findnew_by_longname_id(struct rb_root *root, const ch return __dsos__findnew_link_by_longname_id(root, NULL, name, id); } +struct dsos__find_id_cb_args { + const char *name; + struct dso_id *id; + struct dso *res; +}; + +static int dsos__find_id_cb(struct dso *dso, void *data) +{ + struct dsos__find_id_cb_args *args = data; + + if (__dso__cmp_short_name(args->name, args->id, dso) == 0) { + args->res = dso__get(dso); + return 1; + } + return 0; + +} + static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, bool cmp_short) { - struct dso *pos; + struct dso *res; if (cmp_short) { - list_for_each_entry(pos, &dsos->head, node) - if (__dso__cmp_short_name(name, id, pos) == 0) - return dso__get(pos); - return NULL; + struct dsos__find_id_cb_args args = { + .name = name, + .id = id, + .res = NULL, + }; + + __dsos__for_each_dso(dsos, dsos__find_id_cb, &args); + return args.res; } - return __dsos__findnew_by_longname_id(&dsos->root, name, id); + res = __dsos__findnew_by_longname_id(&dsos->root, name, id); + return res; } struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short) @@ -275,48 +326,74 @@ struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id return dso; } -size_t __dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, - bool (skip)(struct dso *dso, int parm), int parm) -{ - struct list_head *head = &dsos->head; - struct dso *pos; - size_t ret = 0; +struct dsos__fprintf_buildid_cb_args { + FILE *fp; + bool (*skip)(struct dso *dso, int parm); + int parm; + size_t ret; +}; - list_for_each_entry(pos, head, node) { - char sbuild_id[SBUILD_ID_SIZE]; +static int dsos__fprintf_buildid_cb(struct dso *dso, void *data) +{ + struct dsos__fprintf_buildid_cb_args *args = data; + char sbuild_id[SBUILD_ID_SIZE]; - if (skip && skip(pos, parm)) - continue; - build_id__sprintf(&pos->bid, sbuild_id); - ret += fprintf(fp, "%-40s %s\n", sbuild_id, pos->long_name); - } - return ret; + if (args->skip && args->skip(dso, args->parm)) + return 0; + build_id__sprintf(&dso->bid, sbuild_id); + args->ret += fprintf(args->fp, "%-40s %s\n", sbuild_id, dso->long_name); + return 0; } -size_t __dsos__fprintf(struct dsos *dsos, FILE *fp) +size_t dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, + bool (*skip)(struct dso *dso, int parm), int parm) { - struct list_head *head = &dsos->head; - struct dso *pos; - size_t ret = 0; + struct dsos__fprintf_buildid_cb_args args = { + .fp = fp, + .skip = skip, + .parm = parm, + .ret = 0, + }; + + dsos__for_each_dso(dsos, dsos__fprintf_buildid_cb, &args); + return args.ret; +} - list_for_each_entry(pos, head, node) { - ret += dso__fprintf(pos, fp); - } +struct dsos__fprintf_cb_args { + FILE *fp; + size_t ret; +}; - return ret; +static int dsos__fprintf_cb(struct dso *dso, void *data) +{ + struct dsos__fprintf_cb_args *args = data; + + args->ret += dso__fprintf(dso, args->fp); + return 0; } -int __dsos__hit_all(struct dsos *dsos) +size_t dsos__fprintf(struct dsos *dsos, FILE *fp) { - struct list_head *head = &dsos->head; - struct dso *pos; + struct dsos__fprintf_cb_args args = { + .fp = fp, + .ret = 0, + }; - list_for_each_entry(pos, head, node) - pos->hit = true; + dsos__for_each_dso(dsos, dsos__fprintf_cb, &args); + return args.ret; +} +static int dsos__hit_all_cb(struct dso *dso, void *data __maybe_unused) +{ + dso->hit = true; return 0; } +int dsos__hit_all(struct dsos *dsos) +{ + return dsos__for_each_dso(dsos, dsos__hit_all_cb, NULL); +} + struct dso *dsos__findnew_module_dso(struct dsos *dsos, struct machine *machine, struct kmod_path *m, @@ -342,49 +419,44 @@ out_unlock: return dso; } -struct dso *dsos__find_kernel_dso(struct dsos *dsos) +static int dsos__find_kernel_dso_cb(struct dso *dso, void *data) { - struct dso *dso, *res = NULL; + struct dso **res = data; + /* + * The cpumode passed to is_kernel_module is not the cpumode of *this* + * event. If we insist on passing correct cpumode to is_kernel_module, + * we should record the cpumode when we adding this dso to the linked + * list. + * + * However we don't really need passing correct cpumode. We know the + * correct cpumode must be kernel mode (if not, we should not link it + * onto kernel_dsos list). + * + * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN. + * is_kernel_module() treats it as a kernel cpumode. + */ + if (!dso->kernel || + is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN)) + return 0; - down_read(&dsos->lock); - list_for_each_entry(dso, &dsos->head, node) { - /* - * The cpumode passed to is_kernel_module is not the cpumode of - * *this* event. If we insist on passing correct cpumode to - * is_kernel_module, we should record the cpumode when we adding - * this dso to the linked list. - * - * However we don't really need passing correct cpumode. We - * know the correct cpumode must be kernel mode (if not, we - * should not link it onto kernel_dsos list). - * - * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN. - * is_kernel_module() treats it as a kernel cpumode. - */ - if (!dso->kernel || - is_kernel_module(dso->long_name, - PERF_RECORD_MISC_CPUMODE_UNKNOWN)) - continue; + *res = dso__get(dso); + return 1; +} - res = dso__get(dso); - break; - } - up_read(&dsos->lock); +struct dso *dsos__find_kernel_dso(struct dsos *dsos) +{ + struct dso *res = NULL; + + dsos__for_each_dso(dsos, dsos__find_kernel_dso_cb, &res); return res; } int dsos__for_each_dso(struct dsos *dsos, int (*cb)(struct dso *dso, void *data), void *data) { - struct dso *dso; + int err; down_read(&dsos->lock); - list_for_each_entry(dso, &dsos->head, node) { - int err; - - err = cb(dso, data); - if (err) - return err; - } + err = __dsos__for_each_dso(dsos, cb, data); up_read(&dsos->lock); - return 0; + return err; } diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 317a263f0e37..50bd51523475 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -33,16 +33,16 @@ struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); -bool __dsos__read_build_ids(struct dsos *dsos, bool with_hits); +bool dsos__read_build_ids(struct dsos *dsos, bool with_hits); struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, const char *name, struct dso_id *id); -size_t __dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, +size_t dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm); -size_t __dsos__fprintf(struct dsos *dsos, FILE *fp); +size_t dsos__fprintf(struct dsos *dsos, FILE *fp); -int __dsos__hit_all(struct dsos *dsos); +int dsos__hit_all(struct dsos *dsos); struct dso *dsos__findnew_module_dso(struct dsos *dsos, struct machine *machine, struct kmod_path *m, const char *filename); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index e3deef38405c..c5c895131bca 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -853,11 +853,11 @@ out: size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) { struct rb_node *nd; - size_t ret = __dsos__fprintf(&machines->host.dsos, fp); + size_t ret = dsos__fprintf(&machines->host.dsos, fp); for (nd = rb_first_cached(&machines->guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret += __dsos__fprintf(&pos->dsos, fp); + ret += dsos__fprintf(&pos->dsos, fp); } return ret; @@ -866,7 +866,7 @@ size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) size_t machine__fprintf_dsos_buildid(struct machine *m, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm) { - return __dsos__fprintf_buildid(&m->dsos, fp, skip, parm); + return dsos__fprintf_buildid(&m->dsos, fp, skip, parm); } size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp, @@ -3232,5 +3232,5 @@ bool machine__is_lock_function(struct machine *machine, u64 addr) int machine__hit_all_dsos(struct machine *machine) { - return __dsos__hit_all(&machine->dsos); + return dsos__hit_all(&machine->dsos); } -- cgit v1.2.3 From 23cc4fe44f1df5ccce088a7c9398f96794047c2a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 11 Apr 2024 18:43:00 +0200 Subject: bpftool: Fix typo in error message s/at at/at a/ Signed-off-by: Thorsten Blum Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240411164258.533063-3-thorsten.blum@toblux.com --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 4c4cf16a40ba..1a501cf09e78 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2081,7 +2081,7 @@ static int profile_parse_metrics(int argc, char **argv) NEXT_ARG(); } if (selected_cnt > MAX_NUM_PROFILE_METRICS) { - p_err("too many (%d) metrics, please specify no more than %d metrics at at time", + p_err("too many (%d) metrics, please specify no more than %d metrics at a time", selected_cnt, MAX_NUM_PROFILE_METRICS); return -1; } -- cgit v1.2.3 From 4d4992ff587604455e8843a0e76dce0b99175319 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 10 Apr 2024 16:09:52 +0200 Subject: selftests/bpf: Add read_trace_pipe_iter function We have two printk tests reading trace_pipe in non blocking way, with the very same code. Moving that in new read_trace_pipe_iter function. Current read_trace_pipe is used from samples/bpf and needs to do blocking read and printf of the trace_pipe data, using new read_trace_pipe_iter to implement that. Both printk tests do early checks for the number of found messages and can bail earlier, but I did not find any speed difference w/o that condition, so I did not complicate the change more for that. Some of the samples/bpf programs use read_trace_pipe function, so I kept that interface untouched. I did not see any issues with affected samples/bpf programs other than there's slight change in read_trace_pipe output. The current code uses puts that adds new line after the printed string, so we would occasionally see extra new line. With this patch we read output per lines, so there's no need to use puts and we can use just printf instead without extra new line. Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240410140952.292261-1-jolsa@kernel.org --- .../selftests/bpf/prog_tests/trace_printk.c | 36 ++++--------- .../selftests/bpf/prog_tests/trace_vprintk.c | 36 ++++--------- tools/testing/selftests/bpf/trace_helpers.c | 63 ++++++++++++++-------- tools/testing/selftests/bpf/trace_helpers.h | 2 + 4 files changed, 60 insertions(+), 77 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c index 7b9124d506a5..e56e88596d64 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c @@ -5,18 +5,19 @@ #include "trace_printk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "testing,testing" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_printk(void) { struct trace_printk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_printk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_printk_lskel__open(); if (!ASSERT_OK_PTR(skel, "trace_printk__open")) @@ -35,16 +36,6 @@ void serial_test_trace_printk(void) if (!ASSERT_OK(err, "trace_printk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_printk_lskel__detach(skel); @@ -56,21 +47,12 @@ void serial_test_trace_printk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_printk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_printk_ran, "found")) goto cleanup; cleanup: trace_printk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c index 44ea2fd88f4c..2af6a6f2096a 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c @@ -5,18 +5,19 @@ #include "trace_vprintk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "1,2,3,4,5,6,7,8,9,10" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_vprintk(void) { struct trace_vprintk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_vprintk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_vprintk_lskel__open_and_load(); if (!ASSERT_OK_PTR(skel, "trace_vprintk__open_and_load")) @@ -28,16 +29,6 @@ void serial_test_trace_vprintk(void) if (!ASSERT_OK(err, "trace_vprintk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_vprintk_lskel__detach(skel); @@ -49,14 +40,8 @@ void serial_test_trace_vprintk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_vprintk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_vprintk_ran, "found")) goto cleanup; @@ -66,7 +51,4 @@ void serial_test_trace_vprintk(void) cleanup: trace_vprintk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 7f45b4cb41fe..70e29f316fe7 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -233,29 +233,6 @@ out: return err; } -void read_trace_pipe(void) -{ - int trace_fd; - - if (access(TRACEFS_PIPE, F_OK) == 0) - trace_fd = open(TRACEFS_PIPE, O_RDONLY, 0); - else - trace_fd = open(DEBUGFS_PIPE, O_RDONLY, 0); - if (trace_fd < 0) - return; - - while (1) { - static char buf[4096]; - ssize_t sz; - - sz = read(trace_fd, buf, sizeof(buf) - 1); - if (sz > 0) { - buf[sz] = 0; - puts(buf); - } - } -} - ssize_t get_uprobe_offset(const void *addr) { size_t start, end, base; @@ -413,3 +390,43 @@ out: close(fd); return err; } + +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), void *data, int iter) +{ + size_t buflen, n; + char *buf = NULL; + FILE *fp = NULL; + + if (access(TRACEFS_PIPE, F_OK) == 0) + fp = fopen(TRACEFS_PIPE, "r"); + else + fp = fopen(DEBUGFS_PIPE, "r"); + if (!fp) + return -1; + + /* We do not want to wait forever when iter is specified. */ + if (iter) + fcntl(fileno(fp), F_SETFL, O_NONBLOCK); + + while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) { + if (n > 0) + cb(buf, data); + if (iter && !(--iter)) + break; + } + + free(buf); + if (fp) + fclose(fp); + return 0; +} + +static void trace_pipe_cb(const char *str, void *data) +{ + printf("%s", str); +} + +void read_trace_pipe(void) +{ + read_trace_pipe_iter(trace_pipe_cb, NULL, 0); +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index d1ed71789049..2ce873c9f9aa 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -33,6 +33,8 @@ struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, int kallsyms_find(const char *sym, unsigned long long *addr); void read_trace_pipe(void); +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), + void *data, int iter); ssize_t get_uprobe_offset(const void *addr); ssize_t get_rel_offset(uintptr_t addr); -- cgit v1.2.3 From 20b0027ca1a7ee3b6811f4d67c6015483c6bb456 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 10 Apr 2024 15:23:53 -0700 Subject: perf list: Escape '\r' in JSON output Events like for sapphirerapids have '\r' in the uncore descriptions. The non-escaped versions of this fail JSON validation the the 'perf list' test. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240410222353.1722840-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index e0fe3d178d63..5cab31231551 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -326,6 +326,9 @@ static void fix_escape_fprintf(FILE *fp, struct strbuf *buf, const char *fmt, .. case '\n': strbuf_addstr(buf, "\\n"); break; + case '\r': + strbuf_addstr(buf, "\\r"); + break; case '\\': fallthrough; case '\"': -- cgit v1.2.3 From 646e22eb877cdf618a13e7865ff58939b81304ac Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 8 Apr 2024 19:32:13 -0700 Subject: perf build: Add shellcheck to tools/perf scripts Address shell check errors/warnings in perf-archive.sh and perf-completion.sh. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20240409023216.2342032-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Build | 14 ++++++++++++++ tools/perf/perf-archive.sh | 2 +- tools/perf/perf-completion.sh | 23 ++++++++++++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/Build b/tools/perf/Build index aa7623622834..b0cb7ad8e6ac 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -59,3 +59,17 @@ perf-y += ui/ perf-y += scripts/ gtk-y += ui/gtk/ + +ifdef SHELLCHECK + SHELL_TESTS := $(wildcard *.sh) + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -s bash -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh index f94795794b36..6ed7e52ab881 100755 --- a/tools/perf/perf-archive.sh +++ b/tools/perf/perf-archive.sh @@ -34,7 +34,7 @@ if [ $UNPACK -eq 1 ]; then TARGET=`find . -regex "\./perf.*\.tar\.bz2"` TARGET_NUM=`echo -n "$TARGET" | grep -c '^'` - if [ -z "$TARGET" -o $TARGET_NUM -gt 1 ]; then + if [ -z "$TARGET" ] || [ $TARGET_NUM -gt 1 ]; then echo -e "Error: $TARGET_NUM files found for unpacking:\n$TARGET" echo "Provide the requested file as an argument" exit 1 diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh index f224d79b89e6..69cba3c170d5 100644 --- a/tools/perf/perf-completion.sh +++ b/tools/perf/perf-completion.sh @@ -108,6 +108,8 @@ __perf__ltrim_colon_completions() __perfcomp () { + # Expansion of spaces to array is deliberate. + # shellcheck disable=SC2207 COMPREPLY=( $( compgen -W "$1" -- "$2" ) ) } @@ -127,13 +129,13 @@ __perf_prev_skip_opts () let i=cword-1 cmds_=$($cmd $1 --list-cmds) - prev_skip_opts=() + prev_skip_opts="" while [ $i -ge 0 ]; do - if [[ ${words[i]} == $1 ]]; then + if [[ ${words[i]} == "$1" ]]; then return fi for cmd_ in $cmds_; do - if [[ ${words[i]} == $cmd_ ]]; then + if [[ ${words[i]} == "$cmd_" ]]; then prev_skip_opts=${words[i]} return fi @@ -164,9 +166,10 @@ __perf_main () $prev_skip_opts == @(record|stat|top) ]]; then local cur1=${COMP_WORDS[COMP_CWORD]} - local raw_evts=$($cmd list --raw-dump hw sw cache tracepoint pmu sdt) + local raw_evts local arr s tmp result cpu_evts + raw_evts=$($cmd list --raw-dump hw sw cache tracepoint pmu sdt) # aarch64 doesn't have /sys/bus/event_source/devices/cpu/events if [[ `uname -m` != aarch64 ]]; then cpu_evts=$(ls /sys/bus/event_source/devices/cpu/events) @@ -175,10 +178,12 @@ __perf_main () if [[ "$cur1" == */* && ${cur1#*/} =~ ^[A-Z] ]]; then OLD_IFS="$IFS" IFS=" " + # Expansion of spaces to array is deliberate. + # shellcheck disable=SC2206 arr=($raw_evts) IFS="$OLD_IFS" - for s in ${arr[@]} + for s in "${arr[@]}" do if [[ "$s" == *cpu/* ]]; then tmp=${s#*cpu/} @@ -200,11 +205,13 @@ __perf_main () fi elif [[ $prev == @("--pfm-events") && $prev_skip_opts == @(record|stat|top) ]]; then - local evts=$($cmd list --raw-dump pfm) + local evts + evts=$($cmd list --raw-dump pfm) __perfcomp "$evts" "$cur" elif [[ $prev == @("-M"|"--metrics") && $prev_skip_opts == @(stat) ]]; then - local metrics=$($cmd list --raw-dump metric metricgroup) + local metrics + metrics=$($cmd list --raw-dump metric metricgroup) __perfcomp "$metrics" "$cur" else # List subcommands for perf commands @@ -278,6 +285,8 @@ if [[ -n ${ZSH_VERSION-} ]]; then let cword=CURRENT-1 emulate ksh -c __perf_main let _ret && _default && _ret=0 + # _ret is only assigned 0 or 1, disable inaccurate analysis. + # shellcheck disable=SC2152 return _ret } -- cgit v1.2.3 From ec440763bbfc84738d2b38e0d47cb5185d2408b8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 8 Apr 2024 19:32:14 -0700 Subject: perf arch x86: Add shellcheck to build Add shellcheck for: tools/perf/arch/x86/tests/gen-insn-x86-dat.sh tools/perf/arch/x86/entry/syscalls/syscalltbl.sh Address a minor quoting issue. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20240409023216.2342032-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/Build | 14 ++++++++++++++ tools/perf/arch/x86/tests/Build | 14 ++++++++++++++ tools/perf/arch/x86/tests/gen-insn-x86-dat.sh | 2 +- 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/Build b/tools/perf/arch/x86/Build index a7dd46a5b678..ed37013b4289 100644 --- a/tools/perf/arch/x86/Build +++ b/tools/perf/arch/x86/Build @@ -1,2 +1,16 @@ perf-y += util/ perf-y += tests/ + +ifdef SHELLCHECK + SHELL_TESTS := entry/syscalls/syscalltbl.sh + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build index b87f46e5feea..c1e3b7d39554 100644 --- a/tools/perf/arch/x86/tests/Build +++ b/tools/perf/arch/x86/tests/Build @@ -10,3 +10,17 @@ perf-$(CONFIG_AUXTRACE) += insn-x86.o endif perf-$(CONFIG_X86_64) += bp-modify.o perf-y += amd-ibs-via-core-pmu.o + +ifdef SHELLCHECK + SHELL_TESTS := gen-insn-x86-dat.sh + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh b/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh index 0d0a003a9c5e..89c46532cd5c 100755 --- a/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh +++ b/tools/perf/arch/x86/tests/gen-insn-x86-dat.sh @@ -11,7 +11,7 @@ if [ "$(uname -m)" != "x86_64" ]; then exit 1 fi -cd $(dirname $0) +cd "$(dirname $0)" trap 'echo "Might need a more recent version of binutils"' EXIT -- cgit v1.2.3 From 61ff60aab7d6846a5983804d96a316b22aacefa1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 8 Apr 2024 19:32:15 -0700 Subject: perf util: Add shellcheck to generate-cmdlist.sh Add shellcheck to generate-cmdlist.sh to avoid basic shell script mistakes. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20240409023216.2342032-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index aec5a590e349..292170a99ab6 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -389,3 +389,17 @@ $(OUTPUT)util/vsprintf.o: ../lib/vsprintf.c FORCE $(OUTPUT)util/list_sort.o: ../lib/list_sort.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) + +ifdef SHELLCHECK + SHELL_TESTS := generate-cmdlist.sh + TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) -- cgit v1.2.3 From 2b8c43e7688fa61b842ea2b21d4159c67d6f2fd1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 8 Apr 2024 19:32:16 -0700 Subject: perf trace beauty: Add shellcheck to scripts Add shell check to scripts generating perf trace lookup tables. Fix quoting issue in arch_errno_names.sh. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20240409023216.2342032-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/Build | 14 ++++++++++++++ tools/perf/trace/beauty/arch_errno_names.sh | 8 +++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build index d8ce1b698983..cb3c1399ff40 100644 --- a/tools/perf/trace/beauty/Build +++ b/tools/perf/trace/beauty/Build @@ -20,3 +20,17 @@ perf-y += statx.o perf-y += sync_file_range.o perf-y += timespec.o perf-y += tracepoints/ + +ifdef SHELLCHECK + SHELL_TESTS := $(wildcard trace/beauty/*.sh) + TEST_LOGS := $(SHELL_TESTS:trace/beauty/%=%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -s bash -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/trace/beauty/arch_errno_names.sh b/tools/perf/trace/beauty/arch_errno_names.sh index 7df4bf5b55a3..30d3889b2957 100755 --- a/tools/perf/trace/beauty/arch_errno_names.sh +++ b/tools/perf/trace/beauty/arch_errno_names.sh @@ -60,10 +60,12 @@ create_arch_errno_table_func() printf 'arch_syscalls__strerrno_t *arch_syscalls__strerrno_function(const char *arch)\n' printf '{\n' for arch in $archlist; do - printf '\tif (!strcmp(arch, "%s"))\n' $(arch_string "$arch") - printf '\t\treturn errno_to_name__%s;\n' $(arch_string "$arch") + arch_str=$(arch_string "$arch") + printf '\tif (!strcmp(arch, "%s"))\n' "$arch_str" + printf '\t\treturn errno_to_name__%s;\n' "$arch_str" done - printf '\treturn errno_to_name__%s;\n' $(arch_string "$default") + arch_str=$(arch_string "$default") + printf '\treturn errno_to_name__%s;\n' "$arch_str" printf '}\n' } -- cgit v1.2.3 From 459fee7b508231cd4622b3bd94aaa85e8e16b888 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 5 Apr 2024 21:09:10 -0700 Subject: perf bench uprobe: Remove lib64 from libc.so.6 binary path bpf_program__attach_uprobe_opts will search LD_LIBRARY_PATH and so specifying `/lib64` is unnecessary and causes failures for libc.so.6 paths like `/lib/x86_64-linux-gnu/libc.so.6`. Fixes: 7b47623b8cae8149 ("perf bench uprobe trace_printk: Add entry attaching an BPF program that does a trace_printk") Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andrei Vagin Cc: Ingo Molnar Cc: Kan Liang Cc: Kees Kook Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240406040911.1603801-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c index 5c71fdc419dd..b722ff88fe7d 100644 --- a/tools/perf/bench/uprobe.c +++ b/tools/perf/bench/uprobe.c @@ -47,7 +47,7 @@ static const char * const bench_uprobe_usage[] = { #define bench_uprobe__attach_uprobe(prog) \ skel->links.prog = bpf_program__attach_uprobe_opts(/*prog=*/skel->progs.prog, \ /*pid=*/-1, \ - /*binary_path=*/"/lib64/libc.so.6", \ + /*binary_path=*/"libc.so.6", \ /*func_offset=*/0, \ /*opts=*/&uprobe_opts); \ if (!skel->links.prog) { \ -- cgit v1.2.3 From 988052f4bfcc4ee893ea19e9d9ce888cc8578e5a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 5 Apr 2024 21:09:11 -0700 Subject: perf bench uprobe: Add uretprobe variant of uprobe benchmarks Name benchmarks with _ret at the end to avoid creating a new set of benchmarks. Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andrei Vagin Cc: Ingo Molnar Cc: Kan Liang Cc: Kees Kook Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240406040911.1603801-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/bench.h | 2 ++ tools/perf/bench/uprobe.c | 20 +++++++++++++++++--- tools/perf/builtin-bench.c | 2 ++ tools/perf/util/bpf_skel/bench_uprobe.bpf.c | 16 ++++++++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index faa18e6d2467..9f736423af53 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -46,6 +46,8 @@ int bench_breakpoint_enable(int argc, const char **argv); int bench_uprobe_baseline(int argc, const char **argv); int bench_uprobe_empty(int argc, const char **argv); int bench_uprobe_trace_printk(int argc, const char **argv); +int bench_uprobe_empty_ret(int argc, const char **argv); +int bench_uprobe_trace_printk_ret(int argc, const char **argv); int bench_pmu_scan(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c index b722ff88fe7d..0b90275862e1 100644 --- a/tools/perf/bench/uprobe.c +++ b/tools/perf/bench/uprobe.c @@ -26,9 +26,11 @@ static int loops = LOOPS_DEFAULT; enum bench_uprobe { - BENCH_UPROBE__BASELINE, - BENCH_UPROBE__EMPTY, - BENCH_UPROBE__TRACE_PRINTK, + BENCH_UPROBE__BASELINE, + BENCH_UPROBE__EMPTY, + BENCH_UPROBE__TRACE_PRINTK, + BENCH_UPROBE__EMPTY_RET, + BENCH_UPROBE__TRACE_PRINTK_RET, }; static const struct option options[] = { @@ -81,6 +83,8 @@ static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench) case BENCH_UPROBE__BASELINE: break; case BENCH_UPROBE__EMPTY: bench_uprobe__attach_uprobe(empty); break; case BENCH_UPROBE__TRACE_PRINTK: bench_uprobe__attach_uprobe(trace_printk); break; + case BENCH_UPROBE__EMPTY_RET: bench_uprobe__attach_uprobe(empty_ret); break; + case BENCH_UPROBE__TRACE_PRINTK_RET: bench_uprobe__attach_uprobe(trace_printk_ret); break; default: fprintf(stderr, "Invalid bench: %d\n", bench); goto cleanup; @@ -197,3 +201,13 @@ int bench_uprobe_trace_printk(int argc, const char **argv) { return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK); } + +int bench_uprobe_empty_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY_RET); +} + +int bench_uprobe_trace_printk_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK_RET); +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 1a8898d5b560..2c1a9f3d847a 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -109,6 +109,8 @@ static struct bench uprobe_benchmarks[] = { { "baseline", "Baseline libc usleep(1000) call", bench_uprobe_baseline, }, { "empty", "Attach empty BPF prog to uprobe on usleep, system wide", bench_uprobe_empty, }, { "trace_printk", "Attach trace_printk BPF prog to uprobe on usleep syswide", bench_uprobe_trace_printk, }, + { "empty_ret", "Attach empty BPF prog to uretprobe on usleep, system wide", bench_uprobe_empty_ret, }, + { "trace_printk_ret", "Attach trace_printk BPF prog to uretprobe on usleep syswide", bench_uprobe_trace_printk_ret,}, { NULL, NULL, NULL }, }; diff --git a/tools/perf/util/bpf_skel/bench_uprobe.bpf.c b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c index 2c55896bb33c..a01c7f791fcd 100644 --- a/tools/perf/util/bpf_skel/bench_uprobe.bpf.c +++ b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c @@ -4,6 +4,7 @@ #include unsigned int nr_uprobes; +unsigned int nr_uretprobes; SEC("uprobe") int BPF_UPROBE(empty) @@ -20,4 +21,19 @@ int BPF_UPROBE(trace_printk) return 0; } +SEC("uretprobe") +int BPF_URETPROBE(empty_ret) +{ + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(trace_printk_ret) +{ + char fmt[] = "perf bench uretprobe %u"; + + bpf_trace_printk(fmt, sizeof(fmt), ++nr_uretprobes); + return 0; +} + char LICENSE[] SEC("license") = "Dual BSD/GPL"; -- cgit v1.2.3 From 3f189349e52ac69a8c4fffef339c5fe177d618f7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:06 +0200 Subject: selftests: netfilter: move to net subdir .. so this can start re-using existing lib.sh infra in next patches. Several of these scripts will not work, e.g. because they assume rp_filter is disabled, or reliance on a particular version/flavor of "netcat" tool. Add config settings for them. nft_trans_stress.sh script is removed, it also exists in the nftables userspace selftests. I do not see a reason to keep two versions in different repositories/projects. The settings file is removed for now: It was used to increase the timeout to avoid slow scripts from getting zapped by the 45s timeout, but some of the slow scripts can be sped up. Re-add it later for scripts that cannot be sped up easily. Update MAINTAINERS to reflect that future updates to netfilter scripts should go through netfilter-devel@. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-2-fw@strlen.de Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + tools/testing/selftests/net/netfilter/.gitignore | 6 + tools/testing/selftests/net/netfilter/Makefile | 44 + .../selftests/net/netfilter/audit_logread.c | 165 ++ .../selftests/net/netfilter/br_netfilter.sh | 188 +++ .../selftests/net/netfilter/bridge_brouter.sh | 146 ++ tools/testing/selftests/net/netfilter/config | 37 + .../selftests/net/netfilter/connect_close.c | 136 ++ .../selftests/net/netfilter/conntrack_dump_flush.c | 471 ++++++ .../net/netfilter/conntrack_icmp_related.sh | 315 ++++ .../selftests/net/netfilter/conntrack_ipip_mtu.sh | 207 +++ .../net/netfilter/conntrack_sctp_collision.sh | 89 ++ .../net/netfilter/conntrack_tcp_unreplied.sh | 167 ++ .../selftests/net/netfilter/conntrack_vrf.sh | 241 +++ tools/testing/selftests/net/netfilter/ipvs.sh | 228 +++ tools/testing/selftests/net/netfilter/lib.sh | 3 + .../selftests/net/netfilter/nf_nat_edemux.sh | 127 ++ tools/testing/selftests/net/netfilter/nf_queue.c | 395 +++++ tools/testing/selftests/net/netfilter/nft_audit.sh | 245 +++ .../selftests/net/netfilter/nft_concat_range.sh | 1645 ++++++++++++++++++++ .../net/netfilter/nft_conntrack_helper.sh | 197 +++ tools/testing/selftests/net/netfilter/nft_fib.sh | 273 ++++ .../selftests/net/netfilter/nft_flowtable.sh | 672 ++++++++ tools/testing/selftests/net/netfilter/nft_meta.sh | 142 ++ tools/testing/selftests/net/netfilter/nft_nat.sh | 1224 +++++++++++++++ .../selftests/net/netfilter/nft_nat_zones.sh | 309 ++++ tools/testing/selftests/net/netfilter/nft_queue.sh | 449 ++++++ .../selftests/net/netfilter/nft_synproxy.sh | 117 ++ .../selftests/net/netfilter/nft_zones_many.sh | 163 ++ tools/testing/selftests/net/netfilter/rpath.sh | 169 ++ .../selftests/net/netfilter/sctp_collision.c | 99 ++ tools/testing/selftests/net/netfilter/xt_string.sh | 128 ++ tools/testing/selftests/netfilter/.gitignore | 6 - tools/testing/selftests/netfilter/Makefile | 21 - tools/testing/selftests/netfilter/audit_logread.c | 165 -- .../testing/selftests/netfilter/bridge_brouter.sh | 146 -- .../selftests/netfilter/bridge_netfilter.sh | 188 --- tools/testing/selftests/netfilter/config | 9 - tools/testing/selftests/netfilter/connect_close.c | 136 -- .../selftests/netfilter/conntrack_dump_flush.c | 471 ------ .../selftests/netfilter/conntrack_icmp_related.sh | 315 ---- .../netfilter/conntrack_sctp_collision.sh | 89 -- .../selftests/netfilter/conntrack_tcp_unreplied.sh | 167 -- tools/testing/selftests/netfilter/conntrack_vrf.sh | 241 --- .../selftests/netfilter/ipip-conntrack-mtu.sh | 207 --- tools/testing/selftests/netfilter/ipvs.sh | 228 --- tools/testing/selftests/netfilter/nf-queue.c | 395 ----- tools/testing/selftests/netfilter/nf_nat_edemux.sh | 127 -- tools/testing/selftests/netfilter/nft_audit.sh | 245 --- .../selftests/netfilter/nft_concat_range.sh | 1645 -------------------- .../selftests/netfilter/nft_conntrack_helper.sh | 197 --- tools/testing/selftests/netfilter/nft_fib.sh | 273 ---- tools/testing/selftests/netfilter/nft_flowtable.sh | 672 -------- tools/testing/selftests/netfilter/nft_meta.sh | 142 -- tools/testing/selftests/netfilter/nft_nat.sh | 1224 --------------- tools/testing/selftests/netfilter/nft_nat_zones.sh | 309 ---- tools/testing/selftests/netfilter/nft_queue.sh | 449 ------ tools/testing/selftests/netfilter/nft_synproxy.sh | 117 -- .../selftests/netfilter/nft_trans_stress.sh | 151 -- .../testing/selftests/netfilter/nft_zones_many.sh | 163 -- tools/testing/selftests/netfilter/rpath.sh | 169 -- tools/testing/selftests/netfilter/sctp_collision.c | 99 -- tools/testing/selftests/netfilter/settings | 1 - tools/testing/selftests/netfilter/xt_string.sh | 128 -- 64 files changed, 8798 insertions(+), 8895 deletions(-) create mode 100644 tools/testing/selftests/net/netfilter/.gitignore create mode 100644 tools/testing/selftests/net/netfilter/Makefile create mode 100644 tools/testing/selftests/net/netfilter/audit_logread.c create mode 100755 tools/testing/selftests/net/netfilter/br_netfilter.sh create mode 100755 tools/testing/selftests/net/netfilter/bridge_brouter.sh create mode 100644 tools/testing/selftests/net/netfilter/config create mode 100644 tools/testing/selftests/net/netfilter/connect_close.c create mode 100644 tools/testing/selftests/net/netfilter/conntrack_dump_flush.c create mode 100755 tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh create mode 100755 tools/testing/selftests/net/netfilter/conntrack_vrf.sh create mode 100755 tools/testing/selftests/net/netfilter/ipvs.sh create mode 100644 tools/testing/selftests/net/netfilter/lib.sh create mode 100755 tools/testing/selftests/net/netfilter/nf_nat_edemux.sh create mode 100644 tools/testing/selftests/net/netfilter/nf_queue.c create mode 100755 tools/testing/selftests/net/netfilter/nft_audit.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_concat_range.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_fib.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_flowtable.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_meta.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_nat.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_nat_zones.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_queue.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_synproxy.sh create mode 100755 tools/testing/selftests/net/netfilter/nft_zones_many.sh create mode 100755 tools/testing/selftests/net/netfilter/rpath.sh create mode 100644 tools/testing/selftests/net/netfilter/sctp_collision.c create mode 100755 tools/testing/selftests/net/netfilter/xt_string.sh delete mode 100644 tools/testing/selftests/netfilter/.gitignore delete mode 100644 tools/testing/selftests/netfilter/Makefile delete mode 100644 tools/testing/selftests/netfilter/audit_logread.c delete mode 100755 tools/testing/selftests/netfilter/bridge_brouter.sh delete mode 100644 tools/testing/selftests/netfilter/bridge_netfilter.sh delete mode 100644 tools/testing/selftests/netfilter/config delete mode 100644 tools/testing/selftests/netfilter/connect_close.c delete mode 100644 tools/testing/selftests/netfilter/conntrack_dump_flush.c delete mode 100755 tools/testing/selftests/netfilter/conntrack_icmp_related.sh delete mode 100755 tools/testing/selftests/netfilter/conntrack_sctp_collision.sh delete mode 100755 tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh delete mode 100755 tools/testing/selftests/netfilter/conntrack_vrf.sh delete mode 100755 tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh delete mode 100755 tools/testing/selftests/netfilter/ipvs.sh delete mode 100644 tools/testing/selftests/netfilter/nf-queue.c delete mode 100755 tools/testing/selftests/netfilter/nf_nat_edemux.sh delete mode 100755 tools/testing/selftests/netfilter/nft_audit.sh delete mode 100755 tools/testing/selftests/netfilter/nft_concat_range.sh delete mode 100755 tools/testing/selftests/netfilter/nft_conntrack_helper.sh delete mode 100755 tools/testing/selftests/netfilter/nft_fib.sh delete mode 100755 tools/testing/selftests/netfilter/nft_flowtable.sh delete mode 100755 tools/testing/selftests/netfilter/nft_meta.sh delete mode 100755 tools/testing/selftests/netfilter/nft_nat.sh delete mode 100755 tools/testing/selftests/netfilter/nft_nat_zones.sh delete mode 100755 tools/testing/selftests/netfilter/nft_queue.sh delete mode 100755 tools/testing/selftests/netfilter/nft_synproxy.sh delete mode 100755 tools/testing/selftests/netfilter/nft_trans_stress.sh delete mode 100755 tools/testing/selftests/netfilter/nft_zones_many.sh delete mode 100755 tools/testing/selftests/netfilter/rpath.sh delete mode 100644 tools/testing/selftests/netfilter/sctp_collision.c delete mode 100644 tools/testing/selftests/netfilter/settings delete mode 100755 tools/testing/selftests/netfilter/xt_string.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index d71eb49aaa06..5ba3fe6ac09c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15264,6 +15264,7 @@ F: net/*/netfilter.c F: net/*/netfilter/ F: net/bridge/br_netfilter*.c F: net/netfilter/ +F: tools/testing/selftests/net/netfilter/ NETROM NETWORK LAYER M: Ralf Baechle diff --git a/tools/testing/selftests/net/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore new file mode 100644 index 000000000000..0a64d6d0e29a --- /dev/null +++ b/tools/testing/selftests/net/netfilter/.gitignore @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +audit_logread +connect_close +conntrack_dump_flush +sctp_collision +nf_queue diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile new file mode 100644 index 000000000000..dd9a75a33d28 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0 + +top_srcdir = ../../../../.. + +HOSTPKG_CONFIG := pkg-config +MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) +MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) + +TEST_PROGS := br_netfilter.sh bridge_brouter.sh +TEST_PROGS += conntrack_icmp_related.sh +TEST_PROGS += conntrack_ipip_mtu.sh +TEST_PROGS += conntrack_tcp_unreplied.sh +TEST_PROGS += conntrack_sctp_collision.sh +TEST_PROGS += conntrack_vrf.sh +TEST_PROGS += ipvs.sh +TEST_PROGS += nf_nat_edemux.sh +TEST_PROGS += nft_audit.sh +TEST_PROGS += nft_concat_range.sh +TEST_PROGS += nft_conntrack_helper.sh +TEST_PROGS += nft_fib.sh +TEST_PROGS += nft_flowtable.sh +TEST_PROGS += nft_meta.sh +TEST_PROGS += nft_nat.sh +TEST_PROGS += nft_nat_zones.sh +TEST_PROGS += nft_queue.sh +TEST_PROGS += nft_synproxy.sh +TEST_PROGS += nft_zones_many.sh +TEST_PROGS += rpath.sh +TEST_PROGS += xt_string.sh + +TEST_CUSTOM_PROGS += conntrack_dump_flush + +TEST_GEN_FILES = audit_logread +TEST_GEN_FILES += conntrack_dump_flush +TEST_GEN_FILES += connect_close nf_queue +TEST_GEN_FILES += sctp_collision + +include ../../lib.mk + +$(OUTPUT)/nf_queue: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS) + +$(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS) +$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS) diff --git a/tools/testing/selftests/net/netfilter/audit_logread.c b/tools/testing/selftests/net/netfilter/audit_logread.c new file mode 100644 index 000000000000..a0a880fc2d9d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/audit_logread.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fd; + +#define MAX_AUDIT_MESSAGE_LENGTH 8970 +struct audit_message { + struct nlmsghdr nlh; + union { + struct audit_status s; + char data[MAX_AUDIT_MESSAGE_LENGTH]; + } u; +}; + +int audit_recv(int fd, struct audit_message *rep) +{ + struct sockaddr_nl addr; + socklen_t addrlen = sizeof(addr); + int ret; + + do { + ret = recvfrom(fd, rep, sizeof(*rep), 0, + (struct sockaddr *)&addr, &addrlen); + } while (ret < 0 && errno == EINTR); + + if (ret < 0 || + addrlen != sizeof(addr) || + addr.nl_pid != 0 || + rep->nlh.nlmsg_type == NLMSG_ERROR) /* short-cut for now */ + return -1; + + return ret; +} + +int audit_send(int fd, uint16_t type, uint32_t key, uint32_t val) +{ + static int seq = 0; + struct audit_message msg = { + .nlh = { + .nlmsg_len = NLMSG_SPACE(sizeof(msg.u.s)), + .nlmsg_type = type, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, + .nlmsg_seq = ++seq, + }, + .u.s = { + .mask = key, + .enabled = key == AUDIT_STATUS_ENABLED ? val : 0, + .pid = key == AUDIT_STATUS_PID ? val : 0, + } + }; + struct sockaddr_nl addr = { + .nl_family = AF_NETLINK, + }; + int ret; + + do { + ret = sendto(fd, &msg, msg.nlh.nlmsg_len, 0, + (struct sockaddr *)&addr, sizeof(addr)); + } while (ret < 0 && errno == EINTR); + + if (ret != (int)msg.nlh.nlmsg_len) + return -1; + return 0; +} + +int audit_set(int fd, uint32_t key, uint32_t val) +{ + struct audit_message rep = { 0 }; + int ret; + + ret = audit_send(fd, AUDIT_SET, key, val); + if (ret) + return ret; + + ret = audit_recv(fd, &rep); + if (ret < 0) + return ret; + return 0; +} + +int readlog(int fd) +{ + struct audit_message rep = { 0 }; + int ret = audit_recv(fd, &rep); + const char *sep = ""; + char *k, *v; + + if (ret < 0) + return ret; + + if (rep.nlh.nlmsg_type != AUDIT_NETFILTER_CFG) + return 0; + + /* skip the initial "audit(...): " part */ + strtok(rep.u.data, " "); + + while ((k = strtok(NULL, "="))) { + v = strtok(NULL, " "); + + /* these vary and/or are uninteresting, ignore */ + if (!strcmp(k, "pid") || + !strcmp(k, "comm") || + !strcmp(k, "subj")) + continue; + + /* strip the varying sequence number */ + if (!strcmp(k, "table")) + *strchrnul(v, ':') = '\0'; + + printf("%s%s=%s", sep, k, v); + sep = " "; + } + if (*sep) { + printf("\n"); + fflush(stdout); + } + return 0; +} + +void cleanup(int sig) +{ + audit_set(fd, AUDIT_STATUS_ENABLED, 0); + close(fd); + if (sig) + exit(0); +} + +int main(int argc, char **argv) +{ + struct sigaction act = { + .sa_handler = cleanup, + }; + + fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_AUDIT); + if (fd < 0) { + perror("Can't open netlink socket"); + return -1; + } + + if (sigaction(SIGTERM, &act, NULL) < 0 || + sigaction(SIGINT, &act, NULL) < 0) { + perror("Can't set signal handler"); + close(fd); + return -1; + } + + audit_set(fd, AUDIT_STATUS_ENABLED, 1); + audit_set(fd, AUDIT_STATUS_PID, getpid()); + + while (1) + readlog(fd); +} diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh new file mode 100755 index 000000000000..659b3ab02c8b --- /dev/null +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -0,0 +1,188 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test bridge netfilter + conntrack, a combination that doesn't really work, +# with multicast/broadcast packets racing for hash table insertion. + +# eth0 br0 eth0 +# setup is: ns1 <->,ns0 <-> ns3 +# ns2 <-' `'-> ns4 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns0="ns0-$sfx" +ns1="ns1-$sfx" +ns2="ns2-$sfx" +ns3="ns3-$sfx" +ns4="ns4-$sfx" + +ebtables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ebtables" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +for i in $(seq 0 4); do + eval ip netns add \$ns$i +done + +cleanup() { + for i in $(seq 0 4); do eval ip netns del \$ns$i;done +} + +trap cleanup EXIT + +do_ping() +{ + fromns="$1" + dstip="$2" + + ip netns exec $fromns ping -c 1 -q $dstip > /dev/null + if [ $? -ne 0 ]; then + echo "ERROR: ping from $fromns to $dstip" + ip netns exec ${ns0} nft list ruleset + ret=1 + fi +} + +bcast_ping() +{ + fromns="$1" + dstip="$2" + + for i in $(seq 1 1000); do + ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "ERROR: ping -b from $fromns to $dstip" + ip netns exec ${ns0} nft list ruleset + fi + done +} + +ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1} +if [ $? -ne 0 ]; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi + +ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2 +ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3 +ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4 + +ip -net ${ns0} link set lo up + +for i in $(seq 1 4); do + ip -net ${ns0} link set veth$i up +done + +ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1 +if [ $? -ne 0 ]; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +# make veth0,1,2 part of bridge. +for i in $(seq 1 3); do + ip -net ${ns0} link set veth$i master br0 +done + +# add a macvlan on top of the bridge. +MACVLAN_ADDR=ba:f3:13:37:42:23 +ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private +ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR} +ip -net ${ns0} link set macvlan0 up +ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0 + +# add a macvlan on top of veth4. +MACVLAN_ADDR=ba:f3:13:37:42:24 +ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa +ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR} +ip -net ${ns0} link set macvlan4 up + +# make the macvlan part of the bridge. +# veth4 is not a bridge port, only the macvlan on top of it. +ip -net ${ns0} link set macvlan4 master br0 + +ip -net ${ns0} link set br0 up +ip -net ${ns0} addr add 10.0.0.1/24 dev br0 +ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1 +ret=$? +if [ $ret -ne 0 ] ; then + echo "SKIP: bridge netfilter not available" + ret=$ksft_skip +fi + +# for testing, so namespaces will reply to ping -b probes. +ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 + +# enable conntrack in ns0 and drop broadcast packets in forward to +# avoid them from getting confirmed in the postrouting hook before +# the cloned skb is passed up the stack. +ip netns exec ${ns0} nft -f - < ns0 <-> ns2 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +ebtables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ebtables" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add ns0 +ip netns add ns1 +ip netns add ns2 + +ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 +if [ $? -ne 0 ]; then + echo "SKIP: Can't create veth device" + exit $ksft_skip +fi +ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 + +ip -net ns0 link set lo up +ip -net ns0 link set veth0 up +ip -net ns0 link set veth1 up + +ip -net ns0 link add br0 type bridge +if [ $? -ne 0 ]; then + echo "SKIP: Can't create bridge br0" + exit $ksft_skip +fi + +ip -net ns0 link set veth0 master br0 +ip -net ns0 link set veth1 master br0 +ip -net ns0 link set br0 up +ip -net ns0 addr add 10.0.0.1/24 dev br0 + +# place both in same subnet, ns1 and ns2 connected via ns0:br0 +for i in 1 2; do + ip -net ns$i link set lo up + ip -net ns$i link set eth0 up + ip -net ns$i addr add 10.0.0.1$i/24 dev eth0 +done + +test_ebtables_broute() +{ + local cipt + + # redirect is needed so the dstmac is rewritten to the bridge itself, + # ip stack won't process OTHERHOST (foreign unicast mac) packets. + ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP + if [ $? -ne 0 ]; then + echo "SKIP: Could not add ebtables broute redirect rule" + return $ksft_skip + fi + + # ping netns1, expected to not work (ip forwarding is off) + ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 + if [ $? -eq 0 ]; then + echo "ERROR: ping works, should have failed" 1>&2 + return 1 + fi + + # enable forwarding on both interfaces. + # neither needs an ip address, but at least the bridge needs + # an ip address in same network segment as ns1 and ns2 (ns0 + # needs to be able to determine route for to-be-forwarded packet). + ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1 + ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1 + + sleep 1 + + ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null + if [ $? -ne 0 ]; then + echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 + return 1 + fi + + echo "PASS: ns1/ns2 connectivity with active broute rule" + ip netns exec ns0 ebtables -t broute -F + + # ping netns1, expected to work (frames are bridged) + ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null + if [ $? -ne 0 ]; then + echo "ERROR: ping did not work, but it should (bridged)" 1>&2 + return 1 + fi + + ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP + + # ping netns1, expected to not work (DROP in bridge forward) + ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 + if [ $? -eq 0 ]; then + echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 + return 1 + fi + + # re-activate brouter + ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP + + ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null + if [ $? -ne 0 ]; then + echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 + return 1 + fi + + echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop" + return 0 +} + +# test basic connectivity +ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null +if [ $? -ne 0 ]; then + echo "ERROR: Could not reach ns2 from ns1" 1>&2 + ret=1 +fi + +ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null +if [ $? -ne 0 ]; then + echo "ERROR: Could not reach ns1 from ns2" 1>&2 + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: netns connectivity: ns1 and ns2 can reach each other" +fi + +test_ebtables_broute +ret=$? +for i in 0 1 2; do ip netns del ns$i;done + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config new file mode 100644 index 000000000000..9df6a9f11384 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/config @@ -0,0 +1,37 @@ +CONFIG_AUDIT=y +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_NETFILTER=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP_SCTP=m +CONFIG_IP_VS=m +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_NET_CLS_U32=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_IPIP=m +CONFIG_NET_VRF=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_EVENTS=m +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_IPV4=y +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_CT=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_NAT=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_SYNPROXY=m diff --git a/tools/testing/selftests/net/netfilter/connect_close.c b/tools/testing/selftests/net/netfilter/connect_close.c new file mode 100644 index 000000000000..1c3b0add54c4 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/connect_close.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PORT 12345 +#define RUNTIME 10 + +static struct { + unsigned int timeout; + unsigned int port; +} opts = { + .timeout = RUNTIME, + .port = PORT, +}; + +static void handler(int sig) +{ + _exit(sig == SIGALRM ? 0 : 1); +} + +static void set_timeout(void) +{ + struct sigaction action = { + .sa_handler = handler, + }; + + sigaction(SIGALRM, &action, NULL); + + alarm(opts.timeout); +} + +static void do_connect(const struct sockaddr_in *dst) +{ + int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s >= 0) + fcntl(s, F_SETFL, O_NONBLOCK); + + connect(s, (struct sockaddr *)dst, sizeof(*dst)); + close(s); +} + +static void do_accept(const struct sockaddr_in *src) +{ + int c, one = 1, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s < 0) + return; + + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); + setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + + bind(s, (struct sockaddr *)src, sizeof(*src)); + + listen(s, 16); + + c = accept(s, NULL, NULL); + if (c >= 0) + close(c); + + close(s); +} + +static int accept_loop(void) +{ + struct sockaddr_in src = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &src.sin_addr); + + set_timeout(); + + for (;;) + do_accept(&src); + + return 1; +} + +static int connect_loop(void) +{ + struct sockaddr_in dst = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); + + set_timeout(); + + for (;;) + do_connect(&dst); + + return 1; +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "t:p:")) != -1) { + switch (c) { + case 't': + opts.timeout = atoi(optarg); + break; + case 'p': + opts.port = atoi(optarg); + break; + } + } +} + +int main(int argc, char *argv[]) +{ + pid_t p; + + parse_opts(argc, argv); + + p = fork(); + if (p < 0) + return 111; + + if (p > 0) + return accept_loop(); + + return connect_loop(); +} diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c new file mode 100644 index 000000000000..ca8d6b976c42 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include +#include +#include + +#include +#include +#include +#include +#include "../../kselftest_harness.h" + +#define TEST_ZONE_ID 123 +#define NF_CT_DEFAULT_ZONE_ID 0 + +static int reply_counter; + +static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type, + uint32_t src_ip, uint32_t dst_ip, + uint16_t src_port, uint16_t dst_port) +{ + struct nlattr *nest, *nest_ip, *nest_proto; + + nest = mnl_attr_nest_start(nlh, type); + if (!nest) + return -1; + + nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); + if (!nest_ip) + return -1; + mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, src_ip); + mnl_attr_put_u32(nlh, CTA_IP_V4_DST, dst_ip); + mnl_attr_nest_end(nlh, nest_ip); + + nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); + if (!nest_proto) + return -1; + mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); + mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); + mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); + mnl_attr_nest_end(nlh, nest_proto); + + mnl_attr_nest_end(nlh, nest); +} + +static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type, + struct in6_addr src_ip, struct in6_addr dst_ip, + uint16_t src_port, uint16_t dst_port) +{ + struct nlattr *nest, *nest_ip, *nest_proto; + + nest = mnl_attr_nest_start(nlh, type); + if (!nest) + return -1; + + nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); + if (!nest_ip) + return -1; + mnl_attr_put(nlh, CTA_IP_V6_SRC, sizeof(struct in6_addr), &src_ip); + mnl_attr_put(nlh, CTA_IP_V6_DST, sizeof(struct in6_addr), &dst_ip); + mnl_attr_nest_end(nlh, nest_ip); + + nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); + if (!nest_proto) + return -1; + mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); + mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); + mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); + mnl_attr_nest_end(nlh, nest_proto); + + mnl_attr_nest_end(nlh, nest); +} + +static int build_cta_proto(struct nlmsghdr *nlh) +{ + struct nlattr *nest, *nest_proto; + + nest = mnl_attr_nest_start(nlh, CTA_PROTOINFO); + if (!nest) + return -1; + + nest_proto = mnl_attr_nest_start(nlh, CTA_PROTOINFO_TCP); + if (!nest_proto) + return -1; + mnl_attr_put_u8(nlh, CTA_PROTOINFO_TCP_STATE, TCP_CONNTRACK_ESTABLISHED); + mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, 0x0a0a); + mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_REPLY, 0x0a0a); + mnl_attr_nest_end(nlh, nest_proto); + + mnl_attr_nest_end(nlh, nest); +} + +static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh, + uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *rplnlh; + unsigned int portid; + int err, ret; + + portid = mnl_socket_get_portid(sock); + + ret = build_cta_proto(nlh); + if (ret < 0) { + perror("build_cta_proto"); + return -1; + } + mnl_attr_put_u32(nlh, CTA_TIMEOUT, htonl(20000)); + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); + + if (mnl_socket_sendto(sock, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + return -1; + } + + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + if (ret < 0) { + perror("mnl_socket_recvfrom"); + return ret; + } + + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); + if (ret < 0) { + if (errno == EEXIST) { + /* The entries are probably still there from a previous + * run. So we are good + */ + return 0; + } + perror("mnl_cb_run"); + return ret; + } + + return 0; +} + +static int conntrack_data_generate_v4(struct mnl_socket *sock, uint32_t src_ip, + uint32_t dst_ip, uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct nfgenmsg *nfh; + int ret; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | + NLM_F_ACK | NLM_F_EXCL; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_INET; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + ret = build_cta_tuple_v4(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, 12345, 443); + if (ret < 0) { + perror("build_cta_tuple_v4"); + return ret; + } + ret = build_cta_tuple_v4(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, 443, 12345); + if (ret < 0) { + perror("build_cta_tuple_v4"); + return ret; + } + return conntrack_data_insert(sock, nlh, zone); +} + +static int conntrack_data_generate_v6(struct mnl_socket *sock, + struct in6_addr src_ip, + struct in6_addr dst_ip, + uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct nfgenmsg *nfh; + int ret; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | + NLM_F_ACK | NLM_F_EXCL; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_INET6; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + ret = build_cta_tuple_v6(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, + 12345, 443); + if (ret < 0) { + perror("build_cta_tuple_v6"); + return ret; + } + ret = build_cta_tuple_v6(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, + 12345, 443); + if (ret < 0) { + perror("build_cta_tuple_v6"); + return ret; + } + return conntrack_data_insert(sock, nlh, zone); +} + +static int count_entries(const struct nlmsghdr *nlh, void *data) +{ + reply_counter++; +} + +static int conntracK_count_zone(struct mnl_socket *sock, uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh, *rplnlh; + struct nfgenmsg *nfh; + struct nlattr *nest; + unsigned int portid; + int err, ret; + + portid = mnl_socket_get_portid(sock); + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_UNSPEC; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); + + ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); + if (ret < 0) { + perror("mnl_socket_sendto"); + return ret; + } + + reply_counter = 0; + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + while (ret > 0) { + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, + count_entries, NULL); + if (ret <= MNL_CB_STOP) + break; + + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + } + if (ret < 0) { + perror("mnl_socket_recvfrom"); + return ret; + } + + return reply_counter; +} + +static int conntrack_flush_zone(struct mnl_socket *sock, uint16_t zone) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh, *rplnlh; + struct nfgenmsg *nfh; + struct nlattr *nest; + unsigned int portid; + int err, ret; + + portid = mnl_socket_get_portid(sock); + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_DELETE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = time(NULL); + + nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); + nfh->nfgen_family = AF_UNSPEC; + nfh->version = NFNETLINK_V0; + nfh->res_id = 0; + + mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); + + ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); + if (ret < 0) { + perror("mnl_socket_sendto"); + return ret; + } + + ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); + if (ret < 0) { + perror("mnl_socket_recvfrom"); + return ret; + } + + ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); + if (ret < 0) { + perror("mnl_cb_run"); + return ret; + } + + return 0; +} + +FIXTURE(conntrack_dump_flush) +{ + struct mnl_socket *sock; +}; + +FIXTURE_SETUP(conntrack_dump_flush) +{ + struct in6_addr src, dst; + int ret; + + self->sock = mnl_socket_open(NETLINK_NETFILTER); + if (!self->sock) { + perror("mnl_socket_open"); + exit(EXIT_FAILURE); + } + + if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) { + perror("mnl_socket_bind"); + exit(EXIT_FAILURE); + } + + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + if (ret < 0 && errno == EPERM) + SKIP(return, "Needs to be run as root"); + else if (ret < 0 && errno == EOPNOTSUPP) + SKIP(return, "Kernel does not seem to support conntrack zones"); + + ret = conntrack_data_generate_v4(self->sock, 0xf0f0f0f0, 0xf1f1f1f1, + TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntrack_data_generate_v4(self->sock, 0xf2f2f2f2, 0xf3f3f3f3, + TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 0); + ret = conntrack_data_generate_v4(self->sock, 0xf4f4f4f4, 0xf5f5f5f5, + TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 0); + ret = conntrack_data_generate_v4(self->sock, 0xf6f6f6f6, 0xf7f7f7f7, + NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x01000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x02000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x03000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x04000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 0); + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x05000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x06000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 0); + + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x07000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x08000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_GE(ret, 2); + if (ret > 2) + SKIP(return, "kernel does not support filtering by zone"); +} + +FIXTURE_TEARDOWN(conntrack_dump_flush) +{ +} + +TEST_F(conntrack_dump_flush, test_dump_by_zone) +{ + int ret; + + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 2); +} + +TEST_F(conntrack_dump_flush, test_flush_by_zone) +{ + int ret; + + ret = conntrack_flush_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 2); +} + +TEST_F(conntrack_dump_flush, test_flush_by_zone_default) +{ + int ret; + + ret = conntrack_flush_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh new file mode 100755 index 000000000000..76645aaf2b58 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh @@ -0,0 +1,315 @@ +#!/bin/bash +# +# check that ICMP df-needed/pkttoobig icmp are set are set as related +# state +# +# Setup is: +# +# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2 +# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280). +# ping nsclient2 from nsclient1, checking that conntrack did set RELATED +# 'fragmentation needed' icmp packet. +# +# In addition, nsrouter1 will perform IP masquerading, i.e. also +# check the icmp errors are propagated to the correct host as per +# nat of "established" icmp-echo "connection". + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup() { + for i in 1 2;do ip netns del nsclient$i;done + for i in 1 2;do ip netns del nsrouter$i;done +} + +trap cleanup EXIT + +ipv4() { + echo -n 192.168.$1.2 +} + +ipv6 () { + echo -n dead:$1::2 +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect") + if [ $? -ne 0 ]; then + echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 + ip netns exec $ns nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +check_unknown() +{ + expect="packets 0 bytes 0" + for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do + check_counter $n "unknown" "$expect" + if [ $? -ne 0 ] ;then + return 1 + fi + done + + return 0 +} + +for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do + ip netns add $n + ip -net $n link set lo up +done + +DEV=veth0 +ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1 +DEV=veth0 +ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2 + +DEV=veth0 +ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2 + +DEV=veth0 +for i in 1 2; do + ip -net nsclient$i link set $DEV up + ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV + ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV +done + +ip -net nsrouter1 link set eth1 up +ip -net nsrouter1 link set veth0 up + +ip -net nsrouter2 link set eth1 up +ip -net nsrouter2 link set eth2 up + +ip -net nsclient1 route add default via 192.168.1.1 +ip -net nsclient1 -6 route add default via dead:1::1 + +ip -net nsclient2 route add default via 192.168.2.1 +ip -net nsclient2 route add default via dead:2::1 + +i=3 +ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1 +ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0 +ip -net nsrouter1 addr add dead:1::1/64 dev eth1 +ip -net nsrouter1 addr add dead:3::1/64 dev veth0 +ip -net nsrouter1 route add default via 192.168.3.10 +ip -net nsrouter1 -6 route add default via dead:3::10 + +ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1 +ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2 +ip -net nsrouter2 addr add dead:2::1/64 dev eth1 +ip -net nsrouter2 addr add dead:3::10/64 dev eth2 +ip -net nsrouter2 route add default via 192.168.3.1 +ip -net nsrouter2 route add default via dead:3::1 + +sleep 2 +for i in 4 6; do + ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1 +done + +for netns in nsrouter1 nsrouter2; do +ip netns exec $netns nft -f - </dev/null +if [ $? -ne 0 ]; then + echo "ERROR: netns ip routing/connectivity broken" 1>&2 + cleanup + exit 1 +fi +ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null +if [ $? -ne 0 ]; then + echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 + cleanup + exit 1 +fi + +check_unknown +if [ $? -ne 0 ]; then + ret=1 +fi + +expect="packets 0 bytes 0" +for netns in nsrouter1 nsrouter2 nsclient1;do + check_counter "$netns" "related" "$expect" + if [ $? -ne 0 ]; then + ret=1 + fi +done + +expect="packets 2 bytes 2076" +check_counter nsclient2 "new" "$expect" +if [ $? -ne 0 ]; then + ret=1 +fi + +ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null +if [ $? -eq 0 ]; then + echo "ERROR: ping should have failed with PMTU too big error" 1>&2 + ret=1 +fi + +# nsrouter2 should have generated the icmp error, so +# related counter should be 0 (its in forward). +expect="packets 0 bytes 0" +check_counter "nsrouter2" "related" "$expect" +if [ $? -ne 0 ]; then + ret=1 +fi + +# but nsrouter1 should have seen it, same for nsclient1. +expect="packets 1 bytes 576" +for netns in nsrouter1 nsclient1;do + check_counter "$netns" "related" "$expect" + if [ $? -ne 0 ]; then + ret=1 + fi +done + +ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null +if [ $? -eq 0 ]; then + echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 + ret=1 +fi + +expect="packets 2 bytes 1856" +for netns in nsrouter1 nsclient1;do + check_counter "$netns" "related" "$expect" + if [ $? -ne 0 ]; then + ret=1 + fi +done + +if [ $ret -eq 0 ];then + echo "PASS: icmp mtu error had RELATED state" +else + echo "ERROR: icmp error RELATED state test has failed" +fi + +# add 'bad' route, expect icmp REDIRECT to be generated +ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1 +ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1 + +ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null + +expect="packets 1 bytes 112" +check_counter nsclient1 "redir4" "$expect" +if [ $? -ne 0 ];then + ret=1 +fi + +ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null +expect="packets 1 bytes 192" +check_counter nsclient1 "redir6" "$expect" +if [ $? -ne 0 ];then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: icmp redirects had RELATED state" +else + echo "ERROR: icmp redirect RELATED state test has failed" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh new file mode 100755 index 000000000000..eb9553e4986b --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh @@ -0,0 +1,207 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# Conntrack needs to reassemble fragments in order to have complete +# packets for rule matching. Reassembly can lead to packet loss. + +# Consider the following setup: +# +--------+ +---------+ +--------+ +# |Router A|-------|Wanrouter|-------|Router B| +# | |.IPIP..| |..IPIP.| | +# +--------+ +---------+ +--------+ +# / mtu 1400 \ +# / \ +#+--------+ +--------+ +#|Client A| |Client B| +#| | | | +#+--------+ +--------+ + +# Router A and Router B use IPIP tunnel interfaces to tunnel traffic +# between Client A and Client B over WAN. Wanrouter has MTU 1400 set +# on its interfaces. + +rnd=$(mktemp -u XXXXXXXX) +rx=$(mktemp) + +r_a="ns-ra-$rnd" +r_b="ns-rb-$rnd" +r_w="ns-rw-$rnd" +c_a="ns-ca-$rnd" +c_b="ns-cb-$rnd" + +checktool (){ + if ! $1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} + +checktool "iptables --version" "run test without iptables" +checktool "ip -Version" "run test without ip tool" +checktool "which socat" "run test without socat" +checktool "ip netns add ${r_a}" "create net namespace" + +for n in ${r_b} ${r_w} ${c_a} ${c_b};do + ip netns add ${n} +done + +cleanup() { + for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do + ip netns del ${n} + done + rm -f ${rx} +} + +trap cleanup EXIT + +test_path() { + msg="$1" + + ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null & + + sleep 1 + for i in 1 2 3; do + head -c1400 /dev/zero | tr "\000" "a" | \ + ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000 + done + + wait + + bytes=$(wc -c < ${rx}) + + if [ $bytes -eq 1400 ];then + echo "OK: PMTU $msg connection tracking" + else + echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" + exit 1 + fi +} + +# Detailed setup for Router A +# --------------------------- +# Interfaces: +# eth0: 10.2.2.1/24 +# eth1: 192.168.10.1/24 +# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1 +# Routes: +# 192.168.20.0/24 dev ipip0 (192.168.20.0/24 is subnet of Client B) +# 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) +# No iptables rules at all. + +ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w} +ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a} + +l_addr="10.2.2.1" +r_addr="10.4.4.1" +ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip + +for dev in lo veth0 veth1 ipip0; do + ip -net ${r_a} link set $dev up +done + +ip -net ${r_a} addr add 10.2.2.1/24 dev veth0 +ip -net ${r_a} addr add 192.168.10.1/24 dev veth1 + +ip -net ${r_a} route add 192.168.20.0/24 dev ipip0 +ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254 + +ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Detailed setup for Router B +# --------------------------- +# Interfaces: +# eth0: 10.4.4.1/24 +# eth1: 192.168.20.1/24 +# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1 +# Routes: +# 192.168.10.0/24 dev ipip0 (192.168.10.0/24 is subnet of Client A) +# 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) +# No iptables rules at all. + +ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w} +ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b} + +l_addr="10.4.4.1" +r_addr="10.2.2.1" + +ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip + +for dev in lo veth0 veth1 ipip0; do + ip -net ${r_b} link set $dev up +done + +ip -net ${r_b} addr add 10.4.4.1/24 dev veth0 +ip -net ${r_b} addr add 192.168.20.1/24 dev veth1 + +ip -net ${r_b} route add 192.168.10.0/24 dev ipip0 +ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254 +ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Client A +ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 +ip -net ${c_a} link set dev lo up +ip -net ${c_a} link set dev veth0 up +ip -net ${c_a} route add default via 192.168.10.1 + +# Client A +ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 +ip -net ${c_b} link set dev veth0 up +ip -net ${c_b} link set dev lo up +ip -net ${c_b} route add default via 192.168.20.1 + +# Wan +ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 +ip -net ${r_w} addr add 10.4.4.254/24 dev veth1 + +ip -net ${r_w} link set dev lo up +ip -net ${r_w} link set dev veth0 up mtu 1400 +ip -net ${r_w} link set dev veth1 up mtu 1400 + +ip -net ${r_a} link set dev veth0 mtu 1400 +ip -net ${r_b} link set dev veth0 mtu 1400 + +ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Path MTU discovery +# ------------------ +# Running tracepath from Client A to Client B shows PMTU discovery is working +# as expected: +# +# clienta:~# tracepath 192.168.20.2 +# 1?: [LOCALHOST] pmtu 1500 +# 1: 192.168.10.1 0.867ms +# 1: 192.168.10.1 0.302ms +# 2: 192.168.10.1 0.312ms pmtu 1480 +# 2: no reply +# 3: 192.168.10.1 0.510ms pmtu 1380 +# 3: 192.168.20.2 2.320ms reached +# Resume: pmtu 1380 hops 3 back 3 + +# ip netns exec ${c_a} traceroute --mtu 192.168.20.2 + +# Router A has learned PMTU (1400) to Router B from Wanrouter. +# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B +# from Router A. + +#Send large UDP packet +#--------------------- +#Now we send a 1400 bytes UDP packet from Client A to Client B: + +# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000 +test_path "without" + +# The IPv4 stack on Client A already knows the PMTU to Client B, so the +# UDP packet is sent as two fragments (1380 + 20). Router A forwards the +# fragments between eth1 and ipip0. The fragments fit into the tunnel and +# reach their destination. + +#When sending the large UDP packet again, Router A now reassembles the +#fragments before routing the packet over ipip0. The resulting IPIP +#packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is +#dropped on Router A before sending. + +ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW +test_path "with" diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh new file mode 100755 index 000000000000..a924e595cfd8 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Testing For SCTP COLLISION SCENARIO as Below: +# +# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] +# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] +# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] +# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] +# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] +# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] +# +# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS + +CLIENT_NS=$(mktemp -u client-XXXXXXXX) +CLIENT_IP="198.51.200.1" +CLIENT_PORT=1234 + +SERVER_NS=$(mktemp -u server-XXXXXXXX) +SERVER_IP="198.51.100.1" +SERVER_PORT=1234 + +ROUTER_NS=$(mktemp -u router-XXXXXXXX) +CLIENT_GW="198.51.200.2" +SERVER_GW="198.51.100.2" + +# setup the topo +setup() { + ip net add $CLIENT_NS + ip net add $SERVER_NS + ip net add $ROUTER_NS + ip -n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS + ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS + + ip -n $SERVER_NS link set link0 up + ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0 + ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW + + ip -n $ROUTER_NS link set link1 up + ip -n $ROUTER_NS link set link2 up + ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1 + ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2 + ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1 + + ip -n $CLIENT_NS link set link3 up + ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3 + ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW + + # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with + # tc on $SERVER_NS side + tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb + tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit + tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ + 0xff match u8 2 0xff at 32 flowid 1:1 + tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms + + # simulate the ctstate check on OVS nf_conntrack + ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP + ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP + + # use a smaller number for assoc's max_retrans to reproduce the issue + modprobe sctp + ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3 +} + +cleanup() { + ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null + ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null + ip net del "$CLIENT_NS" + ip net del "$SERVER_NS" + ip net del "$ROUTER_NS" +} + +do_test() { + ip net exec $SERVER_NS ./sctp_collision server \ + $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & + ip net exec $CLIENT_NS ./sctp_collision client \ + $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT +} + +# NOTE: one way to work around the issue is set a smaller hb_interval +# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 + +# run the test case +trap cleanup EXIT +setup && \ +echo "Test for SCTP Collision in nf_conntrack:" && \ +do_test && echo "PASS!" +exit $? diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh new file mode 100755 index 000000000000..e7d7bf13cff5 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check that UNREPLIED tcp conntrack will eventually timeout. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +waittime=20 +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup() { + ip netns pids $ns1 | xargs kill 2>/dev/null + ip netns pids $ns2 | xargs kill 2>/dev/null + + ip netns del $ns1 + ip netns del $ns2 +} + +ipv4() { + echo -n 192.168.$1.2 +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect") + if [ $? -ne 0 ]; then + echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 + ip netns exec $ns2 nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +# Create test namespaces +ip netns add $ns1 || exit 1 + +trap cleanup EXIT + +ip netns add $ns2 || exit 1 + +# Connect the namespace to the host using a veth pair +ip -net $ns1 link add name veth1 type veth peer name veth2 +ip -net $ns1 link set netns $ns2 dev veth2 + +ip -net $ns1 link set up dev lo +ip -net $ns2 link set up dev lo +ip -net $ns1 link set up dev veth1 +ip -net $ns2 link set up dev veth2 + +ip -net $ns2 addr add 10.11.11.2/24 dev veth2 +ip -net $ns2 route add default via 10.11.11.1 + +ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1 + +# add a rule inside NS so we enable conntrack +ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT + +ip -net $ns1 addr add 10.11.11.1/24 dev veth1 +ip -net $ns1 route add 10.99.99.99 via 10.11.11.2 + +# Check connectivity works +ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1 + +ip netns exec $ns2 nc -l -p 8080 < /dev/null & + +# however, conntrack entries are there + +ip netns exec $ns2 nft -f - < $ns2 to the virtual ip" +ip netns exec $ns1 bash -c 'while true ; do + nc -p 60000 10.99.99.99 80 + sleep 1 + done' & + +sleep 1 + +ip netns exec $ns2 nft -f - </dev/null | wc -l) +if [ $count -eq 0 ]; then + echo "ERROR: $ns2 did not pick up tcp connection from peer" + exit 1 +fi + +echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect" +for i in $(seq 1 $waittime); do + echo -n "." + + sleep 1 + + count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) + if [ $count -gt 0 ]; then + echo + echo "PASS: redirection took effect after $i seconds" + break + fi + + m=$((i%20)) + if [ $m -eq 0 ]; then + echo " waited for $i seconds" + fi +done + +expect="packets 1 bytes 60" +check_counter "$ns2" "redir" "$expect" +if [ $? -ne 0 ]; then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: redirection counter has expected values" +else + echo "ERROR: no tcp connection was redirected" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh new file mode 100755 index 000000000000..8b5ea9234588 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -0,0 +1,241 @@ +#!/bin/sh + +# This script demonstrates interaction of conntrack and vrf. +# The vrf driver calls the netfilter hooks again, with oif/iif +# pointing at the VRF device. +# +# For ingress, this means first iteration has iifname of lower/real +# device. In this script, thats veth0. +# Second iteration is iifname set to vrf device, tvrf in this script. +# +# For egress, this is reversed: first iteration has the vrf device, +# second iteration is done with the lower/real/veth0 device. +# +# test_ct_zone_in demonstrates unexpected change of nftables +# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack +# connection on VRF rcv" +# +# It was possible to assign conntrack zone to a packet (or mark it for +# `notracking`) in the prerouting chain before conntrack, based on real iif. +# +# After the change, the zone assignment is lost and the zone is assigned based +# on the VRF master interface (in case such a rule exists). +# assignment is lost. Instead, assignment based on the `iif` matching +# Thus it is impossible to distinguish packets based on the original +# interface. +# +# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem +# that was supposed to be fixed by the commit mentioned above to make sure +# that any fix to test case 1 won't break masquerade again. + +ksft_skip=4 + +IP0=172.30.30.1 +IP1=172.30.30.2 +PFXL=30 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns0="ns0-$sfx" +ns1="ns1-$sfx" + +cleanup() +{ + ip netns pids $ns0 | xargs kill 2>/dev/null + ip netns pids $ns1 | xargs kill 2>/dev/null + + ip netns del $ns0 $ns1 +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add "$ns0" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $ns0" + exit $ksft_skip +fi +ip netns add "$ns1" + +trap cleanup EXIT + +ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0 +ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 +ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 + +ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not add veth device" + exit $ksft_skip +fi + +ip -net $ns0 li add tvrf type vrf table 9876 +if [ $? -ne 0 ];then + echo "SKIP: Could not add vrf device" + exit $ksft_skip +fi + +ip -net $ns0 li set lo up + +ip -net $ns0 li set veth0 master tvrf +ip -net $ns0 li set tvrf up +ip -net $ns0 li set veth0 up +ip -net $ns1 li set veth0 up + +ip -net $ns0 addr add $IP0/$PFXL dev veth0 +ip -net $ns1 addr add $IP1/$PFXL dev veth0 + +ip netns exec $ns1 iperf3 -s > /dev/null 2>&1& +if [ $? -ne 0 ];then + echo "SKIP: Could not start iperf3" + exit $ksft_skip +fi + +# test vrf ingress handling. +# The incoming connection should be placed in conntrack zone 1, +# as decided by the first iteration of the ruleset. +test_ct_zone_in() +{ +ip netns exec $ns0 nft -f - < /dev/null + + # should be in zone 1, not zone 2 + count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) + if [ $count -eq 1 ]; then + echo "PASS: entry found in conntrack zone 1" + else + echo "FAIL: entry not found in conntrack zone 1" + count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) + if [ $count -eq 1 ]; then + echo "FAIL: entry found in zone 2 instead" + else + echo "FAIL: entry not in zone 1 or 2, dumping table" + ip netns exec $ns0 conntrack -L + ip netns exec $ns0 nft list ruleset + fi + fi +} + +# add masq rule that gets evaluated w. outif set to vrf device. +# This tests the first iteration of the packet through conntrack, +# oifname is the vrf device. +test_masquerade_vrf() +{ + local qdisc=$1 + + if [ "$qdisc" != "default" ]; then + tc -net $ns0 qdisc add dev tvrf root $qdisc + fi + + ip netns exec $ns0 conntrack -F 2>/dev/null + +ip netns exec $ns0 nft -f - </dev/null + if [ $? -ne 0 ]; then + echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device" + ret=1 + return + fi + + # must also check that nat table was evaluated on second (lower device) iteration. + ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' && + ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]' + if [ $? -eq 0 ]; then + echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" + else + echo "FAIL: vrf rules have unexpected counter value" + ret=1 + fi + + if [ "$qdisc" != "default" ]; then + tc -net $ns0 qdisc del dev tvrf root + fi +} + +# add masq rule that gets evaluated w. outif set to veth device. +# This tests the 2nd iteration of the packet through conntrack, +# oifname is the lower device (veth0 in this case). +test_masquerade_veth() +{ + ip netns exec $ns0 conntrack -F 2>/dev/null +ip netns exec $ns0 nft -f - < /dev/null + if [ $? -ne 0 ]; then + echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device" + ret=1 + return + fi + + # must also check that nat table was evaluated on second (lower device) iteration. + ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' + if [ $? -eq 0 ]; then + echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device" + else + echo "FAIL: vrf masq rule has unexpected counter value" + ret=1 + fi +} + +test_ct_zone_in +test_masquerade_vrf "default" +test_masquerade_vrf "pfifo" +test_masquerade_veth + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh new file mode 100755 index 000000000000..c3b8f90c497e --- /dev/null +++ b/tools/testing/selftests/net/netfilter/ipvs.sh @@ -0,0 +1,228 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# End-to-end ipvs test suite +# Topology: +#--------------------------------------------------------------+ +# | | +# ns0 | ns1 | +# ----------- | ----------- ----------- | +# | veth01 | --------- | veth10 | | veth12 | | +# ----------- peer ----------- ----------- | +# | | | | +# ----------- | | | +# | br0 | |----------------- peer |--------------| +# ----------- | | | +# | | | | +# ---------- peer ---------- ----------- | +# | veth02 | --------- | veth20 | | veth21 | | +# ---------- | ---------- ----------- | +# | ns2 | +# | | +#--------------------------------------------------------------+ +# +# We assume that all network driver are loaded +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 +GREEN='\033[0;92m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +readonly port=8080 + +readonly vip_v4=207.175.44.110 +readonly cip_v4=10.0.0.2 +readonly gip_v4=10.0.0.1 +readonly dip_v4=172.16.0.1 +readonly rip_v4=172.16.0.2 +readonly sip_v4=10.0.0.3 + +readonly infile="$(mktemp)" +readonly outfile="$(mktemp)" +readonly datalen=32 + +sysipvsnet="/proc/sys/net/ipv4/vs/" +if [ ! -d $sysipvsnet ]; then + modprobe -q ip_vs + if [ $? -ne 0 ]; then + echo "skip: could not run test without ipvs module" + exit $ksft_skip + fi +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ipvsadm -v > /dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "SKIP: Could not run test without ipvsadm" + exit $ksft_skip +fi + +setup() { + ip netns add ns0 + ip netns add ns1 + ip netns add ns2 + + ip link add veth01 netns ns0 type veth peer name veth10 netns ns1 + ip link add veth02 netns ns0 type veth peer name veth20 netns ns2 + ip link add veth12 netns ns1 type veth peer name veth21 netns ns2 + + ip netns exec ns0 ip link set veth01 up + ip netns exec ns0 ip link set veth02 up + ip netns exec ns0 ip link add br0 type bridge + ip netns exec ns0 ip link set veth01 master br0 + ip netns exec ns0 ip link set veth02 master br0 + ip netns exec ns0 ip link set br0 up + ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0 + + ip netns exec ns1 ip link set lo up + ip netns exec ns1 ip link set veth10 up + ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10 + ip netns exec ns1 ip link set veth12 up + ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12 + + ip netns exec ns2 ip link set lo up + ip netns exec ns2 ip link set veth21 up + ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21 + ip netns exec ns2 ip link set veth20 up + ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20 + + sleep 1 + + dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none +} + +cleanup() { + for i in 0 1 2 + do + ip netns del ns$i > /dev/null 2>&1 + done + + if [ -f "${outfile}" ]; then + rm "${outfile}" + fi + if [ -f "${infile}" ]; then + rm "${infile}" + fi +} + +server_listen() { + ip netns exec ns2 nc -l -p 8080 > "${outfile}" & + server_pid=$! + sleep 0.2 +} + +client_connect() { + ip netns exec ns0 timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}" +} + +verify_data() { + wait "${server_pid}" + cmp "$infile" "$outfile" 2>/dev/null +} + +test_service() { + server_listen + client_connect + verify_data +} + + +test_dr() { + ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 + + ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 + ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr + ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port} + ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 + + # avoid incorrect arp response + ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 + # avoid reverse route lookup + ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 + + test_service +} + +test_nat() { + ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 + + ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 + ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr + ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port} + ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 + + ip netns exec ns2 ip link del veth20 + ip netns exec ns2 ip route add default via ${dip_v4} dev veth21 + + test_service +} + +test_tun() { + ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 + + ip netns exec ns1 modprobe ipip + ip netns exec ns1 ip link set tunl0 up + ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0 + ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0 + ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0 + ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr + ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port} + ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 + + ip netns exec ns2 modprobe ipip + ip netns exec ns2 ip link set tunl0 up + ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 + ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 + ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 + + test_service +} + +run_tests() { + local errors= + + echo "Testing DR mode..." + cleanup + setup + test_dr + errors=$(( $errors + $? )) + + echo "Testing NAT mode..." + cleanup + setup + test_nat + errors=$(( $errors + $? )) + + echo "Testing Tunnel mode..." + cleanup + setup + test_tun + errors=$(( $errors + $? )) + + return $errors +} + +trap cleanup EXIT + +run_tests + +if [ $? -ne 0 ]; then + echo -e "$(basename $0): ${RED}FAIL${NC}" + exit 1 +fi +echo -e "$(basename $0): ${GREEN}PASS${NC}" +exit 0 diff --git a/tools/testing/selftests/net/netfilter/lib.sh b/tools/testing/selftests/net/netfilter/lib.sh new file mode 100644 index 000000000000..eb109eb527db --- /dev/null +++ b/tools/testing/selftests/net/netfilter/lib.sh @@ -0,0 +1,3 @@ +net_netfilter_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "$net_netfilter_dir/../lib.sh" diff --git a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh new file mode 100755 index 000000000000..a1aa8f4a5828 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh @@ -0,0 +1,127 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test NAT source port clash resolution +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +socatpid=0 + +cleanup() +{ + [ $socatpid -gt 0 ] && kill $socatpid + ip netns del $ns1 + ip netns del $ns2 +} + +socat -h > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without socat" + exit $ksft_skip +fi + +iptables --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add "$ns1" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $ns1" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns2 + +# Connect the namespaces using a veth pair +ip link add name veth2 type veth peer name veth1 +ip link set netns $ns1 dev veth1 +ip link set netns $ns2 dev veth2 + +ip netns exec $ns1 ip link set up dev lo +ip netns exec $ns1 ip link set up dev veth1 +ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 + +ip netns exec $ns2 ip link set up dev lo +ip netns exec $ns2 ip link set up dev veth2 +ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 + +# Create a server in one namespace +ip netns exec $ns1 socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & +socatpid=$! + +# Restrict source port to just one so we don't have to exhaust +# all others. +ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" + +# add a virtual IP using DNAT +ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 + +# ... and route it to the other namespace +ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 + +sleep 1 + +# add a persistent connection from the other namespace +ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & + +sleep 1 + +# ip daddr:dport will be rewritten to 192.168.1.1 5201 +# NAT must reallocate source port 10000 because +# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use +echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null +ret=$? + +# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). +if [ $ret -eq 0 ]; then + echo "PASS: socat can connect via NAT'd address" +else + echo "FAIL: socat cannot connect via NAT'd address" +fi + +# check sport clashres. +ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 +ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 + +sleep 5 | ip netns exec $ns2 socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & +cpid1=$! +sleep 1 + +# if connect succeeds, client closes instantly due to EOF on stdin. +# if connect hangs, it will time out after 5s. +echo | ip netns exec $ns2 socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & +cpid2=$! + +time_then=$(date +%s) +wait $cpid2 +rv=$? +time_now=$(date +%s) + +# Check how much time has elapsed, expectation is for +# 'cpid2' to connect and then exit (and no connect delay). +delta=$((time_now - time_then)) + +if [ $delta -lt 2 -a $rv -eq 0 ]; then + echo "PASS: could connect to service via redirected ports" +else + echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" + ret=1 +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nf_queue.c b/tools/testing/selftests/net/netfilter/nf_queue.c new file mode 100644 index 000000000000..9e56b9d47037 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nf_queue.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct options { + bool count_packets; + bool gso_enabled; + int verbose; + unsigned int queue_num; + unsigned int timeout; + uint32_t verdict; + uint32_t delay_ms; +}; + +static unsigned int queue_stats[5]; +static struct options opts; + +static void help(const char *p) +{ + printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); +} + +static int parse_attr_cb(const struct nlattr *attr, void *data) +{ + const struct nlattr **tb = data; + int type = mnl_attr_get_type(attr); + + /* skip unsupported attribute in user-space */ + if (mnl_attr_type_valid(attr, NFQA_MAX) < 0) + return MNL_CB_OK; + + switch (type) { + case NFQA_MARK: + case NFQA_IFINDEX_INDEV: + case NFQA_IFINDEX_OUTDEV: + case NFQA_IFINDEX_PHYSINDEV: + case NFQA_IFINDEX_PHYSOUTDEV: + if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0) { + perror("mnl_attr_validate"); + return MNL_CB_ERROR; + } + break; + case NFQA_TIMESTAMP: + if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, + sizeof(struct nfqnl_msg_packet_timestamp)) < 0) { + perror("mnl_attr_validate2"); + return MNL_CB_ERROR; + } + break; + case NFQA_HWADDR: + if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, + sizeof(struct nfqnl_msg_packet_hw)) < 0) { + perror("mnl_attr_validate2"); + return MNL_CB_ERROR; + } + break; + case NFQA_PAYLOAD: + break; + } + tb[type] = attr; + return MNL_CB_OK; +} + +static int queue_cb(const struct nlmsghdr *nlh, void *data) +{ + struct nlattr *tb[NFQA_MAX+1] = { 0 }; + struct nfqnl_msg_packet_hdr *ph = NULL; + uint32_t id = 0; + + (void)data; + + mnl_attr_parse(nlh, sizeof(struct nfgenmsg), parse_attr_cb, tb); + if (tb[NFQA_PACKET_HDR]) { + ph = mnl_attr_get_payload(tb[NFQA_PACKET_HDR]); + id = ntohl(ph->packet_id); + + if (opts.verbose > 0) + printf("packet hook=%u, hwproto 0x%x", + ntohs(ph->hw_protocol), ph->hook); + + if (ph->hook >= 5) { + fprintf(stderr, "Unknown hook %d\n", ph->hook); + return MNL_CB_ERROR; + } + + if (opts.verbose > 0) { + uint32_t skbinfo = 0; + + if (tb[NFQA_SKB_INFO]) + skbinfo = ntohl(mnl_attr_get_u32(tb[NFQA_SKB_INFO])); + if (skbinfo & NFQA_SKB_CSUMNOTREADY) + printf(" csumnotready"); + if (skbinfo & NFQA_SKB_GSO) + printf(" gso"); + if (skbinfo & NFQA_SKB_CSUM_NOTVERIFIED) + printf(" csumnotverified"); + puts(""); + } + + if (opts.count_packets) + queue_stats[ph->hook]++; + } + + return MNL_CB_OK + id; +} + +static struct nlmsghdr * +nfq_build_cfg_request(char *buf, uint8_t command, int queue_num) +{ + struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); + struct nfqnl_msg_config_cmd cmd = { + .command = command, + .pf = htons(AF_INET), + }; + struct nfgenmsg *nfg; + + nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; + nlh->nlmsg_flags = NLM_F_REQUEST; + + nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); + + nfg->nfgen_family = AF_UNSPEC; + nfg->version = NFNETLINK_V0; + nfg->res_id = htons(queue_num); + + mnl_attr_put(nlh, NFQA_CFG_CMD, sizeof(cmd), &cmd); + + return nlh; +} + +static struct nlmsghdr * +nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num) +{ + struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); + struct nfqnl_msg_config_params params = { + .copy_range = htonl(range), + .copy_mode = mode, + }; + struct nfgenmsg *nfg; + + nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; + nlh->nlmsg_flags = NLM_F_REQUEST; + + nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); + nfg->nfgen_family = AF_UNSPEC; + nfg->version = NFNETLINK_V0; + nfg->res_id = htons(queue_num); + + mnl_attr_put(nlh, NFQA_CFG_PARAMS, sizeof(params), ¶ms); + + return nlh; +} + +static struct nlmsghdr * +nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd) +{ + struct nfqnl_msg_verdict_hdr vh = { + .verdict = htonl(verd), + .id = htonl(id), + }; + struct nlmsghdr *nlh; + struct nfgenmsg *nfg; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_VERDICT; + nlh->nlmsg_flags = NLM_F_REQUEST; + nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); + nfg->nfgen_family = AF_UNSPEC; + nfg->version = NFNETLINK_V0; + nfg->res_id = htons(queue_num); + + mnl_attr_put(nlh, NFQA_VERDICT_HDR, sizeof(vh), &vh); + + return nlh; +} + +static void print_stats(void) +{ + unsigned int last, total; + int i; + + total = 0; + last = queue_stats[0]; + + for (i = 0; i < 5; i++) { + printf("hook %d packets %08u\n", i, queue_stats[i]); + last = queue_stats[i]; + total += last; + } + + printf("%u packets total\n", total); +} + +struct mnl_socket *open_queue(void) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + unsigned int queue_num; + struct mnl_socket *nl; + struct nlmsghdr *nlh; + struct timeval tv; + uint32_t flags; + + nl = mnl_socket_open(NETLINK_NETFILTER); + if (nl == NULL) { + perror("mnl_socket_open"); + exit(EXIT_FAILURE); + } + + if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) { + perror("mnl_socket_bind"); + exit(EXIT_FAILURE); + } + + queue_num = opts.queue_num; + nlh = nfq_build_cfg_request(buf, NFQNL_CFG_CMD_BIND, queue_num); + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + exit(EXIT_FAILURE); + } + + nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num); + + flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0; + flags |= NFQA_CFG_F_UID_GID; + mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); + mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + exit(EXIT_FAILURE); + } + + memset(&tv, 0, sizeof(tv)); + tv.tv_sec = opts.timeout; + if (opts.timeout && setsockopt(mnl_socket_get_fd(nl), + SOL_SOCKET, SO_RCVTIMEO, + &tv, sizeof(tv))) { + perror("setsockopt(SO_RCVTIMEO)"); + exit(EXIT_FAILURE); + } + + return nl; +} + +static void sleep_ms(uint32_t delay) +{ + struct timespec ts = { .tv_sec = delay / 1000 }; + + delay %= 1000; + + ts.tv_nsec = delay * 1000llu * 1000llu; + + nanosleep(&ts, NULL); +} + +static int mainloop(void) +{ + unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE; + struct mnl_socket *nl; + struct nlmsghdr *nlh; + unsigned int portid; + char *buf; + int ret; + + buf = malloc(buflen); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + nl = open_queue(); + portid = mnl_socket_get_portid(nl); + + for (;;) { + uint32_t id; + + ret = mnl_socket_recvfrom(nl, buf, buflen); + if (ret == -1) { + if (errno == ENOBUFS || errno == EINTR) + continue; + + if (errno == EAGAIN) { + errno = 0; + ret = 0; + break; + } + + perror("mnl_socket_recvfrom"); + exit(EXIT_FAILURE); + } + + ret = mnl_cb_run(buf, ret, 0, portid, queue_cb, NULL); + if (ret < 0) { + perror("mnl_cb_run"); + exit(EXIT_FAILURE); + } + + id = ret - MNL_CB_OK; + if (opts.delay_ms) + sleep_ms(opts.delay_ms); + + nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict); + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + perror("mnl_socket_sendto"); + exit(EXIT_FAILURE); + } + } + + mnl_socket_close(nl); + + return ret; +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) { + switch (c) { + case 'c': + opts.count_packets = true; + break; + case 'h': + help(argv[0]); + exit(0); + break; + case 'q': + opts.queue_num = atoi(optarg); + if (opts.queue_num > 0xffff) + opts.queue_num = 0; + break; + case 'Q': + opts.verdict = atoi(optarg); + if (opts.verdict > 0xffff) { + fprintf(stderr, "Expected destination queue number\n"); + exit(1); + } + + opts.verdict <<= 16; + opts.verdict |= NF_QUEUE; + break; + case 'd': + opts.delay_ms = atoi(optarg); + if (opts.delay_ms == 0) { + fprintf(stderr, "Expected nonzero delay (in milliseconds)\n"); + exit(1); + } + break; + case 't': + opts.timeout = atoi(optarg); + break; + case 'G': + opts.gso_enabled = false; + break; + case 'v': + opts.verbose++; + break; + } + } + + if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) { + fprintf(stderr, "Cannot use same destination and source queue\n"); + exit(1); + } +} + +int main(int argc, char *argv[]) +{ + int ret; + + opts.verdict = NF_ACCEPT; + opts.gso_enabled = true; + + parse_opts(argc, argv); + + ret = mainloop(); + if (opts.count_packets) + print_stats(); + + return ret; +} diff --git a/tools/testing/selftests/net/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh new file mode 100755 index 000000000000..99ed5bd6e840 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_audit.sh @@ -0,0 +1,245 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check that audit logs generated for nft commands are as expected. + +SKIP_RC=4 +RC=0 + +nft --version >/dev/null 2>&1 || { + echo "SKIP: missing nft tool" + exit $SKIP_RC +} + +# Run everything in a separate network namespace +[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } + +# give other scripts a chance to finish - audit_logread sees all activity +sleep 1 + +logfile=$(mktemp) +rulefile=$(mktemp) +echo "logging into $logfile" +./audit_logread >"$logfile" & +logread_pid=$! +trap 'kill $logread_pid; rm -f $logfile $rulefile' EXIT +exec 3<"$logfile" + +do_test() { # (cmd, log) + echo -n "testing for cmd: $1 ... " + cat <&3 >/dev/null + $1 >/dev/null || exit 1 + sleep 0.1 + res=$(diff -a -u <(echo "$2") - <&3) + [ $? -eq 0 ] && { echo "OK"; return; } + echo "FAIL" + grep -v '^\(---\|+++\|@@\)' <<< "$res" + ((RC--)) +} + +nft flush ruleset + +# adding tables, chains and rules + +for table in t1 t2; do + do_test "nft add table $table" \ + "table=$table family=2 entries=1 op=nft_register_table" + + do_test "nft add chain $table c1" \ + "table=$table family=2 entries=1 op=nft_register_chain" + + do_test "nft add chain $table c2; add chain $table c3" \ + "table=$table family=2 entries=2 op=nft_register_chain" + + cmd="add rule $table c1 counter" + + do_test "nft $cmd" \ + "table=$table family=2 entries=1 op=nft_register_rule" + + do_test "nft $cmd; $cmd" \ + "table=$table family=2 entries=2 op=nft_register_rule" + + cmd="" + sep="" + for chain in c2 c3; do + for i in {1..3}; do + cmd+="$sep add rule $table $chain counter" + sep=";" + done + done + do_test "nft $cmd" \ + "table=$table family=2 entries=6 op=nft_register_rule" +done + +for ((i = 0; i < 500; i++)); do + echo "add rule t2 c3 counter accept comment \"rule $i\"" +done >$rulefile +do_test "nft -f $rulefile" \ +'table=t2 family=2 entries=500 op=nft_register_rule' + +# adding sets and elements + +settype='type inet_service; counter' +setelem='{ 22, 80, 443 }' +setblock="{ $settype; elements = $setelem; }" +do_test "nft add set t1 s $setblock" \ +"table=t1 family=2 entries=4 op=nft_register_set" + +do_test "nft add set t1 s2 $setblock; add set t1 s3 { $settype; }" \ +"table=t1 family=2 entries=5 op=nft_register_set" + +do_test "nft add element t1 s3 $setelem" \ +"table=t1 family=2 entries=3 op=nft_register_setelem" + +# adding counters + +do_test 'nft add counter t1 c1' \ +'table=t1 family=2 entries=1 op=nft_register_obj' + +do_test 'nft add counter t2 c1; add counter t2 c2' \ +'table=t2 family=2 entries=2 op=nft_register_obj' + +for ((i = 3; i <= 500; i++)); do + echo "add counter t2 c$i" +done >$rulefile +do_test "nft -f $rulefile" \ +'table=t2 family=2 entries=498 op=nft_register_obj' + +# adding/updating quotas + +do_test 'nft add quota t1 q1 { 10 bytes }' \ +'table=t1 family=2 entries=1 op=nft_register_obj' + +do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \ +'table=t2 family=2 entries=2 op=nft_register_obj' + +for ((i = 3; i <= 500; i++)); do + echo "add quota t2 q$i { 10 bytes }" +done >$rulefile +do_test "nft -f $rulefile" \ +'table=t2 family=2 entries=498 op=nft_register_obj' + +# changing the quota value triggers obj update path +do_test 'nft add quota t1 q1 { 20 bytes }' \ +'table=t1 family=2 entries=1 op=nft_register_obj' + +# resetting rules + +do_test 'nft reset rules t1 c2' \ +'table=t1 family=2 entries=3 op=nft_reset_rule' + +do_test 'nft reset rules table t1' \ +'table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule' + +do_test 'nft reset rules t2 c3' \ +'table=t2 family=2 entries=189 op=nft_reset_rule +table=t2 family=2 entries=188 op=nft_reset_rule +table=t2 family=2 entries=126 op=nft_reset_rule' + +do_test 'nft reset rules t2' \ +'table=t2 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=186 op=nft_reset_rule +table=t2 family=2 entries=188 op=nft_reset_rule +table=t2 family=2 entries=129 op=nft_reset_rule' + +do_test 'nft reset rules' \ +'table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule +table=t1 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=3 op=nft_reset_rule +table=t2 family=2 entries=180 op=nft_reset_rule +table=t2 family=2 entries=188 op=nft_reset_rule +table=t2 family=2 entries=135 op=nft_reset_rule' + +# resetting sets and elements + +elem=(22 ,80 ,443) +relem="" +for i in {1..3}; do + relem+="${elem[((i - 1))]}" + do_test "nft reset element t1 s { $relem }" \ + "table=t1 family=2 entries=$i op=nft_reset_setelem" +done + +do_test 'nft reset set t1 s' \ +'table=t1 family=2 entries=3 op=nft_reset_setelem' + +# resetting counters + +do_test 'nft reset counter t1 c1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset counters t1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset counters t2' \ +'table=t2 family=2 entries=342 op=nft_reset_obj +table=t2 family=2 entries=158 op=nft_reset_obj' + +do_test 'nft reset counters' \ +'table=t1 family=2 entries=1 op=nft_reset_obj +table=t2 family=2 entries=341 op=nft_reset_obj +table=t2 family=2 entries=159 op=nft_reset_obj' + +# resetting quotas + +do_test 'nft reset quota t1 q1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset quotas t1' \ +'table=t1 family=2 entries=1 op=nft_reset_obj' + +do_test 'nft reset quotas t2' \ +'table=t2 family=2 entries=315 op=nft_reset_obj +table=t2 family=2 entries=185 op=nft_reset_obj' + +do_test 'nft reset quotas' \ +'table=t1 family=2 entries=1 op=nft_reset_obj +table=t2 family=2 entries=314 op=nft_reset_obj +table=t2 family=2 entries=186 op=nft_reset_obj' + +# deleting rules + +readarray -t handles < <(nft -a list chain t1 c1 | \ + sed -n 's/.*counter.* handle \(.*\)$/\1/p') + +do_test "nft delete rule t1 c1 handle ${handles[0]}" \ +'table=t1 family=2 entries=1 op=nft_unregister_rule' + +cmd='delete rule t1 c1 handle' +do_test "nft $cmd ${handles[1]}; $cmd ${handles[2]}" \ +'table=t1 family=2 entries=2 op=nft_unregister_rule' + +do_test 'nft flush chain t1 c2' \ +'table=t1 family=2 entries=3 op=nft_unregister_rule' + +do_test 'nft flush table t2' \ +'table=t2 family=2 entries=509 op=nft_unregister_rule' + +# deleting chains + +do_test 'nft delete chain t2 c2' \ +'table=t2 family=2 entries=1 op=nft_unregister_chain' + +# deleting sets and elements + +do_test 'nft delete element t1 s { 22 }' \ +'table=t1 family=2 entries=1 op=nft_unregister_setelem' + +do_test 'nft delete element t1 s { 80, 443 }' \ +'table=t1 family=2 entries=2 op=nft_unregister_setelem' + +do_test 'nft flush set t1 s2' \ +'table=t1 family=2 entries=3 op=nft_unregister_setelem' + +do_test 'nft delete set t1 s2' \ +'table=t1 family=2 entries=1 op=nft_unregister_set' + +do_test 'nft delete set t1 s3' \ +'table=t1 family=2 entries=1 op=nft_unregister_set' + +exit $RC diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh new file mode 100755 index 000000000000..e908009576c7 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -0,0 +1,1645 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# nft_concat_range.sh - Tests for sets with concatenation of ranged fields +# +# Copyright (c) 2019 Red Hat GmbH +# +# Author: Stefano Brivio +# +# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031 +# ^ Configuration and templates sourced with eval, counters reused in subshells + +KSELFTEST_SKIP=4 + +# Available test groups: +# - reported_issues: check for issues that were reported in the past +# - correctness: check that packets match given entries, and only those +# - concurrency: attempt races between insertion, deletion and lookup +# - timeout: check that packets match entries until they expire +# - performance: estimate matching rate, compare with rbtree and hash baselines +TESTS="reported_issues correctness concurrency timeout" +[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance" + +# Set types, defined by TYPE_ variables below +TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto + net_port_net net_mac mac_net net_mac_icmp net6_mac_icmp + net6_port_net6_port net_port_mac_proto_net" + +# Reported bugs, also described by TYPE_ variables below +BUGS="flush_remove_add reload" + +# List of possible paths to pktgen script from kernel tree for performance tests +PKTGEN_SCRIPT_PATHS=" + ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh + pktgen/pktgen_bench_xmit_mode_netif_receive.sh" + +# Definition of set types: +# display display text for test report +# type_spec nftables set type specifier +# chain_spec nftables type specifier for rules mapping to set +# dst call sequence of format_*() functions for destination fields +# src call sequence of format_*() functions for source fields +# start initial integer used to generate addresses and ports +# count count of entries to generate and match +# src_delta number summed to destination generator for source fields +# tools list of tools for correctness and timeout tests, any can be used +# proto L4 protocol of test packets +# +# race_repeat race attempts per thread, 0 disables concurrency test for type +# flood_tools list of tools for concurrency tests, any can be used +# flood_proto L4 protocol of test packets for concurrency tests +# flood_spec nftables type specifier for concurrency tests +# +# perf_duration duration of single pktgen injection test +# perf_spec nftables type specifier for performance tests +# perf_dst format_*() functions for destination fields in performance test +# perf_src format_*() functions for source fields in performance test +# perf_entries number of set entries for performance test +# perf_proto L3 protocol of test packets +TYPE_net_port=" +display net,port +type_spec ipv4_addr . inet_service +chain_spec ip daddr . udp dport +dst addr4 port +src +start 1 +count 5 +src_delta 2000 +tools sendip nc bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto udp +flood_spec ip daddr . udp dport + +perf_duration 5 +perf_spec ip daddr . udp dport +perf_dst addr4 port +perf_src +perf_entries 1000 +perf_proto ipv4 +" + +TYPE_port_net=" +display port,net +type_spec inet_service . ipv4_addr +chain_spec udp dport . ip daddr +dst port addr4 +src +start 1 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto udp +flood_spec udp dport . ip daddr + +perf_duration 5 +perf_spec udp dport . ip daddr +perf_dst port addr4 +perf_src +perf_entries 100 +perf_proto ipv4 +" + +TYPE_net6_port=" +display net6,port +type_spec ipv6_addr . inet_service +chain_spec ip6 daddr . udp dport +dst addr6 port +src +start 10 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp6 + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp6 +flood_spec ip6 daddr . udp dport + +perf_duration 5 +perf_spec ip6 daddr . udp dport +perf_dst addr6 port +perf_src +perf_entries 1000 +perf_proto ipv6 +" + +TYPE_port_proto=" +display port,proto +type_spec inet_service . inet_proto +chain_spec udp dport . meta l4proto +dst port proto +src +start 1 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp + +race_repeat 0 + +perf_duration 5 +perf_spec udp dport . meta l4proto +perf_dst port proto +perf_src +perf_entries 30000 +perf_proto ipv4 +" + +TYPE_net6_port_mac=" +display net6,port,mac +type_spec ipv6_addr . inet_service . ether_addr +chain_spec ip6 daddr . udp dport . ether saddr +dst addr6 port +src mac +start 10 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp6 + +race_repeat 0 + +perf_duration 5 +perf_spec ip6 daddr . udp dport . ether daddr +perf_dst addr6 port mac +perf_src +perf_entries 10 +perf_proto ipv6 +" + +TYPE_net6_port_mac_proto=" +display net6,port,mac,proto +type_spec ipv6_addr . inet_service . ether_addr . inet_proto +chain_spec ip6 daddr . udp dport . ether saddr . meta l4proto +dst addr6 port +src mac proto +start 10 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp6 + +race_repeat 0 + +perf_duration 5 +perf_spec ip6 daddr . udp dport . ether daddr . meta l4proto +perf_dst addr6 port mac proto +perf_src +perf_entries 1000 +perf_proto ipv6 +" + +TYPE_net_port_net=" +display net,port,net +type_spec ipv4_addr . inet_service . ipv4_addr +chain_spec ip daddr . udp dport . ip saddr +dst addr4 port +src addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp +flood_spec ip daddr . udp dport . ip saddr + +perf_duration 0 +" + +TYPE_net6_port_net6_port=" +display net6,port,net6,port +type_spec ipv6_addr . inet_service . ipv6_addr . inet_service +chain_spec ip6 daddr . udp dport . ip6 saddr . udp sport +dst addr6 port +src addr6 port +start 10 +count 5 +src_delta 2000 +tools sendip socat nc +proto udp6 + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp6 +flood_spec ip6 daddr . tcp dport . ip6 saddr . tcp sport + +perf_duration 0 +" + +TYPE_net_port_mac_proto_net=" +display net,port,mac,proto,net +type_spec ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr +chain_spec ip daddr . udp dport . ether saddr . meta l4proto . ip saddr +dst addr4 port +src mac proto addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_mac=" +display net,mac +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp + +race_repeat 0 + +perf_duration 5 +perf_spec ip daddr . ether daddr +perf_dst addr4 mac +perf_src +perf_entries 1000 +perf_proto ipv4 +" + +TYPE_mac_net=" +display mac,net +type_spec ether_addr . ipv4_addr +chain_spec ether saddr . ip saddr +dst +src mac addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat nc bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_mac_icmp=" +display net,mac - ICMP +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 5 +src_delta 2000 +tools ping +proto icmp + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net6_mac_icmp=" +display net6,mac - ICMPv6 +type_spec ipv6_addr . ether_addr +chain_spec ip6 daddr . ether saddr +dst addr6 +src mac +start 10 +count 50 +src_delta 2000 +tools ping +proto icmp6 + +race_repeat 0 + +perf_duration 0 +" + +TYPE_net_port_proto_net=" +display net,port,proto,net +type_spec ipv4_addr . inet_service . inet_proto . ipv4_addr +chain_spec ip daddr . udp dport . meta l4proto . ip saddr +dst addr4 port proto +src addr4 +start 1 +count 5 +src_delta 2000 +tools sendip socat nc +proto udp + +race_repeat 3 +flood_tools iperf3 iperf netperf +flood_proto tcp +flood_spec ip daddr . tcp dport . meta l4proto . ip saddr + +perf_duration 0 +" + +# Definition of tests for bugs reported in the past: +# display display text for test report +TYPE_flush_remove_add=" +display Add two elements, flush, re-add +" + +TYPE_reload=" +display net,mac with reload +type_spec ipv4_addr . ether_addr +chain_spec ip daddr . ether saddr +dst addr4 +src mac +start 1 +count 1 +src_delta 2000 +tools sendip socat nc bash +proto udp + +race_repeat 0 + +perf_duration 0 +" + +# Set template for all tests, types and rules are filled in depending on test +set_template=' +flush ruleset + +table inet filter { + counter test { + packets 0 bytes 0 + } + + set test { + type ${type_spec} + flags interval,timeout + } + + chain input { + type filter hook prerouting priority 0; policy accept; + ${chain_spec} @test counter name \"test\" + } +} + +table netdev perf { + counter test { + packets 0 bytes 0 + } + + counter match { + packets 0 bytes 0 + } + + set test { + type ${type_spec} + flags interval + } + + set norange { + type ${type_spec} + } + + set noconcat { + type ${type_spec%% *} + flags interval + } + + chain test { + type filter hook ingress device veth_a priority 0; + } +} +' + +err_buf= +info_buf= + +# Append string to error buffer +err() { + err_buf="${err_buf}${1} +" +} + +# Append string to information buffer +info() { + info_buf="${info_buf}${1} +" +} + +# Flush error buffer to stdout +err_flush() { + printf "%s" "${err_buf}" + err_buf= +} + +# Flush information buffer to stdout +info_flush() { + printf "%s" "${info_buf}" + info_buf= +} + +# Setup veth pair: this namespace receives traffic, B generates it +setup_veth() { + ip netns add B + ip link add veth_a type veth peer name veth_b || return 1 + + ip link set veth_a up + ip link set veth_b netns B + + ip -n B link set veth_b up + + ip addr add dev veth_a 10.0.0.1 + ip route add default dev veth_a + + ip -6 addr add fe80::1/64 dev veth_a nodad + ip -6 addr add 2001:db8::1/64 dev veth_a nodad + ip -6 route add default dev veth_a + + ip -n B route add default dev veth_b + + ip -6 -n B addr add fe80::2/64 dev veth_b nodad + ip -6 -n B addr add 2001:db8::2/64 dev veth_b nodad + ip -6 -n B route add default dev veth_b + + B() { + ip netns exec B "$@" >/dev/null 2>&1 + } + + sleep 2 +} + +# Fill in set template and initialise set +setup_set() { + eval "echo \"${set_template}\"" | nft -f - +} + +# Check that at least one of the needed tools is available +check_tools() { + [ -z "${tools}" ] && return 0 + + __tools= + for tool in ${tools}; do + if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \ + ! nc -u -w0 1.1.1.1 1 2>/dev/null; then + # Some GNU netcat builds might not support IPv6 + __tools="${__tools} netcat-openbsd" + continue + fi + __tools="${__tools} ${tool}" + + command -v "${tool}" >/dev/null && return 0 + done + err "need one of:${__tools}, skipping" && return 1 +} + +# Set up function to send ICMP packets +setup_send_icmp() { + send_icmp() { + B ping -c1 -W1 "${dst_addr4}" >/dev/null 2>&1 + } +} + +# Set up function to send ICMPv6 packets +setup_send_icmp6() { + if command -v ping6 >/dev/null; then + send_icmp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ping6 -q -c1 -W1 "${dst_addr6}" + } + else + send_icmp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ping -q -6 -c1 -W1 "${dst_addr6}" + } + fi +} + +# Set up function to send single UDP packets on IPv4 +setup_send_udp() { + if command -v sendip >/dev/null; then + send_udp() { + [ -n "${src_port}" ] && src_port="-us ${src_port}" + [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" + [ -n "${src_addr4}" ] && src_addr4="-is ${src_addr4}" + + # shellcheck disable=SC2086 # sendip needs split options + B sendip -p ipv4 -p udp ${src_addr4} ${src_port} \ + ${dst_port} "${dst_addr4}" + + src_port= + dst_port= + src_addr4= + } + elif command -v socat -v >/dev/null; then + send_udp() { + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}" dev veth_b + __socatbind=",bind=${src_addr4}" + if [ -n "${src_port}" ];then + __socatbind="${__socatbind}:${src_port}" + fi + fi + + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + [ -z "${dst_port}" ] && dst_port=12345 + + echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}" + + src_addr4= + src_port= + } + elif command -v nc >/dev/null; then + if nc -u -w0 1.1.1.1 1 2>/dev/null; then + # OpenBSD netcat + nc_opt="-w0" + else + # GNU netcat + nc_opt="-q0" + fi + + send_udp() { + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}" dev veth_b + __src_addr4="-s ${src_addr4}" + fi + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + [ -n "${src_port}" ] && src_port="-p ${src_port}" + + echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \ + "${src_port}" "${dst_addr4}" "${dst_port}" + + src_addr4= + src_port= + } + elif [ -z "$(bash -c 'type -p')" ]; then + send_udp() { + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + B ip route add default dev veth_b + fi + + B bash -c "echo > /dev/udp/${dst_addr4}/${dst_port}" + + if [ -n "${src_addr4}" ]; then + B ip addr del "${src_addr4}/16" dev veth_b + fi + src_addr4= + } + else + return 1 + fi +} + +# Set up function to send single UDP packets on IPv6 +setup_send_udp6() { + if command -v sendip >/dev/null; then + send_udp6() { + [ -n "${src_port}" ] && src_port="-us ${src_port}" + [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" + if [ -n "${src_addr6}" ]; then + src_addr6="-6s ${src_addr6}" + else + src_addr6="-6s 2001:db8::2" + fi + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + B sendip -p ipv6 -p udp ${src_addr6} ${src_port} \ + ${dst_port} "${dst_addr6}" + + src_port= + dst_port= + src_addr6= + } + elif command -v socat -v >/dev/null; then + send_udp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + __socatbind6= + + if [ -n "${src_addr6}" ]; then + if [ -n "${src_addr6} != "${src_addr6_added} ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + + src_addr6_added=${src_addr6} + fi + + __socatbind6=",bind=[${src_addr6}]" + + if [ -n "${src_port}" ] ;then + __socatbind6="${__socatbind6}:${src_port}" + fi + fi + + echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}" + } + elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then + # GNU netcat might not work with IPv6, try next tool + send_udp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + else + src_addr6="2001:db8::2" + fi + [ -n "${src_port}" ] && src_port="-p ${src_port}" + + # shellcheck disable=SC2086 # this needs split options + echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \ + ${dst_addr6} ${dst_port} + + src_addr6= + src_port= + } + elif [ -z "$(bash -c 'type -p')" ]; then + send_udp6() { + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + B ip addr add "${src_addr6}" dev veth_b nodad + B bash -c "echo > /dev/udp/${dst_addr6}/${dst_port}" + ip -6 addr del "${dst_addr6}" dev veth_a 2>/dev/null + } + else + return 1 + fi +} + +# Set up function to send TCP traffic on IPv4 +setup_flood_tcp() { + if command -v iperf3 >/dev/null; then + flood_tcp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b 2>/dev/null + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ + ${src_addr4} -l16 -t 1000 + + src_addr4= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_tcp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 2>/dev/null + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_addr4="${src_addr4}:${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ + -l20 -t 1000 + + src_addr4= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_tcp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="10.0.0.2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -4 ${dst_port} -L "${dst_addr4}" \ + >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B netperf -4 -H "${dst_addr4}" ${dst_port} \ + -L "${src_addr4}" -l 1000 -t TCP_STREAM + + src_addr4= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Set up function to send TCP traffic on IPv6 +setup_flood_tcp6() { + if command -v iperf3 >/dev/null; then + flood_tcp6() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + src_addr6="-B ${src_addr6}" + else + src_addr6="-B 2001:db8::2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -c "${dst_addr6}" ${dst_port} \ + ${src_port} ${src_addr6} -l16 -t 1000 + + src_addr6= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_tcp6() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + src_addr6="-B ${src_addr6}" + else + src_addr6="-B 2001:db8::2" + fi + if [ -n "${src_port}" ]; then + src_addr6="${src_addr6}:${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf -c "${dst_addr6}" -V ${dst_port} \ + ${src_addr6} -l1 -t 1000 + + src_addr6= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_tcp6() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr6}" ]; then + B ip addr add "${src_addr6}" dev veth_b nodad + else + src_addr6="2001:db8::2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip -6 addr add "${dst_addr6}" dev veth_a nodad \ + 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -6 ${dst_port} -L "${dst_addr6}" \ + >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B netperf -6 -H "${dst_addr6}" ${dst_port} \ + -L "${src_addr6}" -l 1000 -t TCP_STREAM + + src_addr6= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Set up function to send UDP traffic on IPv4 +setup_flood_udp() { + if command -v iperf3 >/dev/null; then + flood_udp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 2>/dev/null + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_port="--cport ${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf3 -s -DB "${dst_addr4}" ${dst_port} + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ + ${dst_port} ${src_port} ${src_addr4} + + src_addr4= + src_port= + dst_port= + } + elif command -v iperf >/dev/null; then + flood_udp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + src_addr4="-B ${src_addr4}" + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="-B 10.0.0.2" + fi + if [ -n "${src_port}" ]; then + src_addr4="${src_addr4}:${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ + ${dst_port} ${src_addr4} + + src_addr4= + src_port= + dst_port= + } + elif command -v netperf >/dev/null; then + flood_udp() { + [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" + if [ -n "${src_addr4}" ]; then + B ip addr add "${src_addr4}/16" dev veth_b + else + B ip addr add dev veth_b 10.0.0.2 + src_addr4="10.0.0.2" + fi + if [ -n "${src_port}" ]; then + dst_port="${dst_port},${src_port}" + fi + B ip route add default dev veth_b + ip addr add "${dst_addr4}" dev veth_a 2>/dev/null + + # shellcheck disable=SC2086 # this needs split options + netserver -4 ${dst_port} -L "${dst_addr4}" \ + >/dev/null 2>&1 + sleep 2 + + # shellcheck disable=SC2086 # this needs split options + B netperf -4 -H "${dst_addr4}" ${dst_port} \ + -L "${src_addr4}" -l 1000 -t UDP_STREAM + + src_addr4= + src_port= + dst_port= + } + else + return 1 + fi +} + +# Find pktgen script and set up function to start pktgen injection +setup_perf() { + for pktgen_script_path in ${PKTGEN_SCRIPT_PATHS} __notfound; do + command -v "${pktgen_script_path}" >/dev/null && break + done + [ "${pktgen_script_path}" = "__notfound" ] && return 1 + + perf_ipv4() { + ${pktgen_script_path} -s80 \ + -i veth_a -d "${dst_addr4}" -p "${dst_port}" \ + -m "${dst_mac}" \ + -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & + perf_pid=$! + } + perf_ipv6() { + IP6=6 ${pktgen_script_path} -s100 \ + -i veth_a -d "${dst_addr6}" -p "${dst_port}" \ + -m "${dst_mac}" \ + -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & + perf_pid=$! + } +} + +# Clean up before each test +cleanup() { + nft reset counter inet filter test >/dev/null 2>&1 + nft flush ruleset >/dev/null 2>&1 + ip link del dummy0 2>/dev/null + ip route del default 2>/dev/null + ip -6 route del default 2>/dev/null + ip netns del B 2>/dev/null + ip link del veth_a 2>/dev/null + timeout= + killall iperf3 2>/dev/null + killall iperf 2>/dev/null + killall netperf 2>/dev/null + killall netserver 2>/dev/null + rm -f ${tmp} + sleep 2 +} + +# Entry point for setup functions +setup() { + if [ "$(id -u)" -ne 0 ]; then + echo " need to run as root" + exit ${KSELFTEST_SKIP} + fi + + cleanup + check_tools || return 1 + for arg do + if ! eval setup_"${arg}"; then + err " ${arg} not supported" + return 1 + fi + done +} + +# Format integer into IPv4 address, summing 10.0.0.5 (arbitrary) to it +format_addr4() { + a=$((${1} + 16777216 * 10 + 5)) + printf "%i.%i.%i.%i" \ + "$((a / 16777216))" "$((a % 16777216 / 65536))" \ + "$((a % 65536 / 256))" "$((a % 256))" +} + +# Format integer into IPv6 address, summing 2001:db8:: to it +format_addr6() { + printf "2001:db8::%04x:%04x" "$((${1} / 65536))" "$((${1} % 65536))" +} + +# Format integer into EUI-48 address, summing 00:01:00:00:00:00 to it +format_mac() { + printf "00:01:%02x:%02x:%02x:%02x" \ + "$((${1} / 16777216))" "$((${1} % 16777216 / 65536))" \ + "$((${1} % 65536 / 256))" "$((${1} % 256))" +} + +# Format integer into port, avoid 0 port +format_port() { + printf "%i" "$((${1} % 65534 + 1))" +} + +# Drop suffixed '6' from L4 protocol, if any +format_proto() { + printf "%s" "${proto}" | tr -d 6 +} + +# Format destination and source fields into nft concatenated type +format() { + __start= + __end= + __expr="{ " + + for f in ${dst}; do + [ "${__expr}" != "{ " ] && __expr="${__expr} . " + + __start="$(eval format_"${f}" "${start}")" + __end="$(eval format_"${f}" "${end}")" + + if [ "${f}" = "proto" ]; then + __expr="${__expr}${__start}" + else + __expr="${__expr}${__start}-${__end}" + fi + done + for f in ${src}; do + [ "${__expr}" != "{ " ] && __expr="${__expr} . " + + __start="$(eval format_"${f}" "${srcstart}")" + __end="$(eval format_"${f}" "${srcend}")" + + if [ "${f}" = "proto" ]; then + __expr="${__expr}${__start}" + else + __expr="${__expr}${__start}-${__end}" + fi + done + + if [ -n "${timeout}" ]; then + echo "${__expr} timeout ${timeout}s }" + else + echo "${__expr} }" + fi +} + +# Format destination and source fields into nft type, start element only +format_norange() { + __expr="{ " + + for f in ${dst}; do + [ "${__expr}" != "{ " ] && __expr="${__expr} . " + + __expr="${__expr}$(eval format_"${f}" "${start}")" + done + for f in ${src}; do + __expr="${__expr} . $(eval format_"${f}" "${start}")" + done + + echo "${__expr} }" +} + +# Format first destination field into nft type +format_noconcat() { + for f in ${dst}; do + __start="$(eval format_"${f}" "${start}")" + __end="$(eval format_"${f}" "${end}")" + + if [ "${f}" = "proto" ]; then + echo "{ ${__start} }" + else + echo "{ ${__start}-${__end} }" + fi + return + done +} + +# Add single entry to 'test' set in 'inet filter' table +add() { + if ! nft add element inet filter test "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Format and output entries for sets in 'netdev perf' table +add_perf() { + if [ "${1}" = "test" ]; then + echo "add element netdev perf test $(format)" + elif [ "${1}" = "norange" ]; then + echo "add element netdev perf norange $(format_norange)" + elif [ "${1}" = "noconcat" ]; then + echo "add element netdev perf noconcat $(format_noconcat)" + fi +} + +# Add single entry to 'norange' set in 'netdev perf' table +add_perf_norange() { + if ! nft add element netdev perf norange "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Add single entry to 'noconcat' set in 'netdev perf' table +add_perf_noconcat() { + if ! nft add element netdev perf noconcat "${1}"; then + err "Failed to add ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Delete single entry from set +del() { + if ! nft delete element inet filter test "${1}"; then + err "Failed to delete ${1} given ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Return packet count from 'test' counter in 'inet filter' table +count_packets() { + found=0 + for token in $(nft list counter inet filter test); do + [ ${found} -eq 1 ] && echo "${token}" && return + [ "${token}" = "packets" ] && found=1 + done +} + +# Return packet count from 'test' counter in 'netdev perf' table +count_perf_packets() { + found=0 + for token in $(nft list counter netdev perf test); do + [ ${found} -eq 1 ] && echo "${token}" && return + [ "${token}" = "packets" ] && found=1 + done +} + +# Set MAC addresses, send traffic according to specifier +flood() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval flood_\$proto +} + +# Set MAC addresses, start pktgen injection +perf() { + dst_mac="$(format_mac "${1}")" + ip link set veth_a address "${dst_mac}" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval perf_\$perf_proto +} + +# Set MAC addresses, send single packet, check that it matches, reset counter +send_match() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval send_\$proto + if [ "$(count_packets)" != "1" ]; then + err "${proto} packet to:" + err " $(for f in ${dst}; do + eval format_\$f "${1}"; printf ' '; done)" + err "from:" + err " $(for f in ${src}; do + eval format_\$f "${2}"; printf ' '; done)" + err "should have matched ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi + nft reset counter inet filter test >/dev/null +} + +# Set MAC addresses, send single packet, check that it doesn't match +send_nomatch() { + ip link set veth_a address "$(format_mac "${1}")" + ip -n B link set veth_b address "$(format_mac "${2}")" + + for f in ${dst}; do + eval dst_"$f"=\$\(format_\$f "${1}"\) + done + for f in ${src}; do + eval src_"$f"=\$\(format_\$f "${2}"\) + done + eval send_\$proto + if [ "$(count_packets)" != "0" ]; then + err "${proto} packet to:" + err " $(for f in ${dst}; do + eval format_\$f "${1}"; printf ' '; done)" + err "from:" + err " $(for f in ${src}; do + eval format_\$f "${2}"; printf ' '; done)" + err "should not have matched ruleset:" + err "$(nft -a list ruleset)" + return 1 + fi +} + +# Correctness test template: +# - add ranged element, check that packets match it +# - check that packets outside range don't match it +# - remove some elements, check that packets don't match anymore +test_correctness() { + setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 + + # Delete elements now and then + if [ $((i % 3)) -eq 0 ]; then + del "$(format)" || return 1 + for j in $(seq ${start} \ + $((range_size / 2 + 1)) ${end}); do + send_nomatch "${j}" $((j + src_delta)) \ + || return 1 + done + fi + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done +} + +# Concurrency test template: +# - add all the elements +# - start a thread for each physical thread that: +# - adds all the elements +# - flushes the set +# - adds all the elements +# - flushes the entire ruleset +# - adds the set back +# - adds all the elements +# - delete all the elements +test_concurrency() { + proto=${flood_proto} + tools=${flood_tools} + chain_spec=${flood_spec} + setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP} + + range_size=1 + cstart=${start} + flood_pids= + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + + flood "${i}" $((i + src_delta)) & flood_pids="${flood_pids} $!" + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + sleep 10 + + pids= + for c in $(seq 1 "$(nproc)"); do ( + for r in $(seq 1 "${race_repeat}"); do + range_size=1 + + # $start needs to be local to this subshell + # shellcheck disable=SC2030 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush inet filter test 2>/dev/null + + range_size=1 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush ruleset + setup set 2>/dev/null + + range_size=1 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + range_size=1 + start=${cstart} + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + del "$(format)" 2>/dev/null + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + done + ) & pids="${pids} $!" + done + + # shellcheck disable=SC2046,SC2086 # word splitting wanted here + wait $(for pid in ${pids}; do echo ${pid}; done) + # shellcheck disable=SC2046,SC2086 + kill $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null + # shellcheck disable=SC2046,SC2086 + wait $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null + + return 0 +} + +# Timeout test template: +# - add all the elements with 3s timeout while checking that packets match +# - wait 3s after the last insertion, check that packets don't match any entry +test_timeout() { + setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + + timeout=3 + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + sleep 3 + for i in $(seq ${start} $((start + count))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_nomatch "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done +} + +# Performance test template: +# - add concatenated ranged entries +# - add non-ranged concatenated entries (for hash set matching rate baseline) +# - add ranged entries with first field only (for rbhash baseline) +# - start pktgen injection directly on device rx path of this namespace +# - measure drop only rate, hash and rbtree baselines, then matching rate +test_performance() { + chain_spec=${perf_spec} + dst="${perf_dst}" + src="${perf_src}" + setup veth perf set || return ${KSELFTEST_SKIP} + + first=${start} + range_size=1 + for set in test norange noconcat; do + start=${first} + for i in $(seq ${start} $((start + perf_entries))); do + end=$((start + range_size)) + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + elif [ ${start} -eq ${end} ]; then + end=$((start + 1)) + fi + + add_perf ${set} + + start=$((end + range_size)) + done > "${tmp}" + nft -f "${tmp}" + done + + perf $((end - 1)) ${srcstart} + + sleep 2 + + nft add rule netdev perf test counter name \"test\" drop + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline (drop from netdev hook): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec} @norange \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline hash (non-ranged entries): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec%%. *} @noconcat \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + info " baseline rbtree (match on first field only): ${pps}pps" + handle="$(nft -a list chain netdev perf test | grep counter)" + handle="${handle##* }" + nft delete rule netdev perf test handle "${handle}" + + nft add rule "netdev perf test ${chain_spec} @test \ + counter name \"test\" drop" + nft reset counter netdev perf test >/dev/null 2>&1 + sleep "${perf_duration}" + pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" + p5="$(printf %5s "${perf_entries}")" + info " set with ${p5} full, ranged entries: ${pps}pps" + kill "${perf_pid}" +} + +test_bug_flush_remove_add() { + set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' + elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' + elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' + for i in `seq 1 100`; do + nft add table t ${set_cmd} || return ${KSELFTEST_SKIP} + nft add element t s ${elem1} 2>/dev/null || return 1 + nft flush set t s 2>/dev/null || return 1 + nft add element t s ${elem2} 2>/dev/null || return 1 + done + nft flush ruleset +} + +# - add ranged element, check that packets match it +# - reload the set, check packets still match +test_bug_reload() { + setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + rstart=${start} + + range_size=1 + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + add "$(format)" || return 1 + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + # check kernel does allocate pcpu sctrach map + # for reload with no elemet add/delete + ( echo flush set inet filter test ; + nft list set inet filter test ) | nft -f - + + start=${rstart} + range_size=1 + + for i in $(seq "${start}" $((start + count))); do + end=$((start + range_size)) + + # Avoid negative or zero-sized port ranges + if [ $((end / 65534)) -gt $((start / 65534)) ]; then + start=${end} + end=$((end + 1)) + fi + srcstart=$((start + src_delta)) + srcend=$((end + src_delta)) + + for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + send_match "${j}" $((j + src_delta)) || return 1 + done + + range_size=$((range_size + 1)) + start=$((end + range_size)) + done + + nft flush ruleset +} + +test_reported_issues() { + eval test_bug_"${subtest}" +} + +# Run everything in a separate network namespace +[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } +tmp="$(mktemp)" +trap cleanup EXIT + +# Entry point for test runs +passed=0 +for name in ${TESTS}; do + printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')" + if [ "${name}" = "reported_issues" ]; then + SUBTESTS="${BUGS}" + else + SUBTESTS="${TYPES}" + fi + + for subtest in ${SUBTESTS}; do + eval desc=\$TYPE_"${subtest}" + IFS=' +' + for __line in ${desc}; do + # shellcheck disable=SC2086 + eval ${__line%% *}=\"${__line##* }\"; + done + IFS=' +' + + if [ "${name}" = "concurrency" ] && \ + [ "${race_repeat}" = "0" ]; then + continue + fi + if [ "${name}" = "performance" ] && \ + [ "${perf_duration}" = "0" ]; then + continue + fi + + printf " %-60s " "${display}" + eval test_"${name}" + ret=$? + + if [ $ret -eq 0 ]; then + printf "[ OK ]\n" + info_flush + passed=$((passed + 1)) + elif [ $ret -eq 1 ]; then + printf "[FAIL]\n" + err_flush + exit 1 + elif [ $ret -eq ${KSELFTEST_SKIP} ]; then + printf "[SKIP]\n" + err_flush + fi + done +done + +[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0 diff --git a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh new file mode 100755 index 000000000000..faa7778d7bd1 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# +# This tests connection tracking helper assignment: +# 1. can attach ftp helper to a connection from nft ruleset. +# 2. auto-assign still works. +# +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +testipv6=1 + +cleanup() +{ + ip netns del ${ns1} + ip netns del ${ns2} +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +which nc >/dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without netcat tool" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add ${ns1} +ip netns add ${ns2} + +ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi + +ip -net ${ns1} link set lo up +ip -net ${ns1} link set veth0 up + +ip -net ${ns2} link set lo up +ip -net ${ns2} link set veth0 up + +ip -net ${ns1} addr add 10.0.1.1/24 dev veth0 +ip -net ${ns1} addr add dead:1::1/64 dev veth0 + +ip -net ${ns2} addr add 10.0.1.2/24 dev veth0 +ip -net ${ns2} addr add dead:1::2/64 dev veth0 + +load_ruleset_family() { + local family=$1 + local ns=$2 + +ip netns exec ${ns} nft -f - < /dev/null |grep -q 'helper=ftp' + if [ $? -ne 0 ] ; then + if [ $autoassign -eq 0 ] ;then + echo "FAIL: ${netns} did not show attached helper $message" 1>&2 + ret=1 + else + echo "PASS: ${netns} did not show attached helper $message" 1>&2 + fi + else + if [ $autoassign -eq 0 ] ;then + echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 + else + echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 + ret=1 + fi + fi + + return 0 +} + +test_helper() +{ + local port=$1 + local autoassign=$2 + + if [ $autoassign -eq 0 ] ;then + msg="set via ruleset" + else + msg="auto-assign" + fi + + sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & + + sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & + sleep 1 + + check_for_helper "$ns1" "ip $msg" $port $autoassign + check_for_helper "$ns2" "ip $msg" $port $autoassign + + wait + + if [ $testipv6 -eq 0 ] ;then + return 0 + fi + + ip netns exec ${ns1} conntrack -F 2> /dev/null + ip netns exec ${ns2} conntrack -F 2> /dev/null + + sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null & + + sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null & + sleep 1 + + check_for_helper "$ns1" "ipv6 $msg" $port + check_for_helper "$ns2" "ipv6 $msg" $port + + wait +} + +load_ruleset_family ip ${ns1} +if [ $? -ne 0 ];then + echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 + exit 1 +fi + +load_ruleset_family ip6 ${ns1} +if [ $? -ne 0 ];then + echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 + testipv6=0 +fi + +load_ruleset_family inet ${ns2} +if [ $? -ne 0 ];then + echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 + load_ruleset_family ip ${ns2} + if [ $? -ne 0 ];then + echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 + exit 1 + fi + + if [ $testipv6 -eq 1 ] ;then + load_ruleset_family ip6 ${ns2} + if [ $? -ne 0 ];then + echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 + exit 1 + fi + fi +fi + +test_helper 2121 0 +ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +test_helper 21 1 + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh new file mode 100755 index 000000000000..dff476e45e77 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -0,0 +1,273 @@ +#!/bin/bash +# +# This tests the fib expression. +# +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +nsrouter="nsrouter-$sfx" +timeout=4 + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +cleanup() +{ + ip netns del ${ns1} + ip netns del ${ns2} + ip netns del ${nsrouter} + + [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add ${nsrouter} +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace" + exit $ksft_skip +fi + +trap cleanup EXIT + +dmesg | grep -q ' nft_rpfilter: ' +if [ $? -eq 0 ]; then + dmesg -c | grep ' nft_rpfilter: ' + echo "WARN: a previous test run has failed" 1>&2 +fi + +sysctl -q net.netfilter.nf_log_all_netns=1 +ip netns add ${ns1} +ip netns add ${ns2} + +load_ruleset() { + local netns=$1 + +ip netns exec ${netns} nft -f /dev/stdin <&2 + ip netns exec ${ns} nft list table inet filter + return 1 + fi + + if [ $want -gt 0 ]; then + echo "PASS: fib expression did drop packets for $address" + fi + + return 0 +} + +load_ruleset ${nsrouter} +load_ruleset ${ns1} +load_ruleset ${ns2} + +ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} + +ip -net ${nsrouter} link set lo up +ip -net ${nsrouter} link set veth0 up +ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 +ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 + +ip -net ${nsrouter} link set veth1 up +ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 +ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 + +ip -net ${ns1} link set lo up +ip -net ${ns1} link set eth0 up + +ip -net ${ns2} link set lo up +ip -net ${ns2} link set eth0 up + +ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net ${ns1} route add default via 10.0.1.1 +ip -net ${ns1} route add default via dead:1::1 + +ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns2} addr add dead:2::99/64 dev eth0 +ip -net ${ns2} route add default via 10.0.2.1 +ip -net ${ns2} route add default via dead:2::1 + +test_ping() { + local daddr4=$1 + local daddr6=$2 + + ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null + ret=$? + if [ $ret -ne 0 ];then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 + return 1 + fi + + ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null + ret=$? + if [ $ret -ne 0 ];then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 + return 1 + fi + + return 0 +} + +ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null + +sleep 3 + +test_ping 10.0.2.1 dead:2::1 || exit 1 +check_drops || exit 1 + +test_ping 10.0.2.99 dead:2::99 || exit 1 +check_drops || exit 1 + +echo "PASS: fib expression did not cause unwanted packet drops" + +ip netns exec ${nsrouter} nft flush table inet filter + +ip -net ${ns1} route del default +ip -net ${ns1} -6 route del default + +ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr del dead:1::99/64 dev eth0 + +ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns1} addr add dead:2::99/64 dev eth0 + +ip -net ${ns1} route add default via 10.0.2.1 +ip -net ${ns1} -6 route add default via dead:2::1 + +ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 + +# switch to ruleset that doesn't log, this time +# its expected that this does drop the packets. +load_ruleset_count ${nsrouter} + +# ns1 has a default route, but nsrouter does not. +# must not check return value, ping to 1.1.1.1 will +# fail. +check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 +check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 + +ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null +check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 + +sleep 2 +ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null +check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 + +# delete all rules +ip netns exec ${ns1} nft flush ruleset +ip netns exec ${ns2} nft flush ruleset +ip netns exec ${nsrouter} nft flush ruleset + +ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 + +ip -net ${ns1} addr del 10.0.2.99/24 dev eth0 +ip -net ${ns1} addr del dead:2::99/64 dev eth0 + +ip -net ${nsrouter} addr del dead:2::1/64 dev veth0 + +# ... pbr ruleset for the router, check iif+oif. +load_pbr_ruleset ${nsrouter} +if [ $? -ne 0 ] ; then + echo "SKIP: Could not load fib forward ruleset" + exit $ksft_skip +fi + +ip -net ${nsrouter} rule add from all table 128 +ip -net ${nsrouter} rule add from all iif veth0 table 129 +ip -net ${nsrouter} route add table 128 to 10.0.1.0/24 dev veth0 +ip -net ${nsrouter} route add table 129 to 10.0.2.0/24 dev veth1 + +# drop main ipv4 table +ip -net ${nsrouter} -4 rule delete table main + +test_ping 10.0.2.99 dead:2::99 +if [ $? -ne 0 ] ; then + ip -net ${nsrouter} nft list ruleset + echo "FAIL: fib mismatch in pbr setup" + exit 1 +fi + +echo "PASS: fib expression forward check with policy based routing" +exit 0 diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh new file mode 100755 index 000000000000..a32f490f7539 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -0,0 +1,672 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This tests basic flowtable functionality. +# Creates following default topology: +# +# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000) +# Router1 is the one doing flow offloading, Router2 has no special +# purpose other than having a link that is smaller than either Originator +# and responder, i.e. TCPMSS announced values are too large and will still +# result in fragmentation and/or PMTU discovery. +# +# You can check with different Orgininator/Link/Responder MTU eg: +# nft_flowtable.sh -o8000 -l1500 -r2000 +# + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +nsr1="nsr1-$sfx" +nsr2="nsr2-$sfx" + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +nsin="" +ns1out="" +ns2out="" + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +checktool (){ + if ! $1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} + +checktool "nft --version" "run test without nft tool" +checktool "ip -Version" "run test without ip tool" +checktool "which nc" "run test without nc (netcat)" +checktool "ip netns add $nsr1" "create net namespace $nsr1" + +ip netns add $ns1 +ip netns add $ns2 +ip netns add $nsr2 + +cleanup() { + ip netns del $ns1 + ip netns del $ns2 + ip netns del $nsr1 + ip netns del $nsr2 + + rm -f "$nsin" "$ns1out" "$ns2out" + + [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +trap cleanup EXIT + +sysctl -q net.netfilter.nf_log_all_netns=1 + +ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1 +ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 + +ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 + +for dev in lo veth0 veth1; do + ip -net $nsr1 link set $dev up + ip -net $nsr2 link set $dev up +done + +ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net $nsr1 addr add dead:1::1/64 dev veth0 + +ip -net $nsr2 addr add 10.0.2.1/24 dev veth1 +ip -net $nsr2 addr add dead:2::1/64 dev veth1 + +# set different MTUs so we need to push packets coming from ns1 (large MTU) +# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), +# or to do PTMU discovery (send ICMP error back to originator). +# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers +# is NOT the lowest link mtu. + +omtu=9000 +lmtu=1500 +rmtu=2000 + +usage(){ + echo "nft_flowtable.sh [OPTIONS]" + echo + echo "MTU options" + echo " -o originator" + echo " -l link" + echo " -r responder" + exit 1 +} + +while getopts "o:l:r:" o +do + case $o in + o) omtu=$OPTARG;; + l) lmtu=$OPTARG;; + r) rmtu=$OPTARG;; + *) usage;; + esac +done + +if ! ip -net $nsr1 link set veth0 mtu $omtu; then + exit 1 +fi + +ip -net $ns1 link set eth0 mtu $omtu + +if ! ip -net $nsr2 link set veth1 mtu $rmtu; then + exit 1 +fi + +ip -net $ns2 link set eth0 mtu $rmtu + +# transfer-net between nsr1 and nsr2. +# these addresses are not used for connections. +ip -net $nsr1 addr add 192.168.10.1/24 dev veth1 +ip -net $nsr1 addr add fee1:2::1/64 dev veth1 + +ip -net $nsr2 addr add 192.168.10.2/24 dev veth0 +ip -net $nsr2 addr add fee1:2::2/64 dev veth0 + +for i in 0 1; do + ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null +done + +for ns in $ns1 $ns2;do + ip -net $ns link set lo up + ip -net $ns link set eth0 up + + if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then + echo "ERROR: Check Originator/Responder values (problem during address addition)" + exit 1 + fi + # don't set ip DF bit for first two tests + ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null +done + +ip -net $ns1 addr add 10.0.1.99/24 dev eth0 +ip -net $ns2 addr add 10.0.2.99/24 dev eth0 +ip -net $ns1 route add default via 10.0.1.1 +ip -net $ns2 route add default via 10.0.2.1 +ip -net $ns1 addr add dead:1::99/64 dev eth0 +ip -net $ns2 addr add dead:2::99/64 dev eth0 +ip -net $ns1 route add default via dead:1::1 +ip -net $ns2 route add default via dead:2::1 + +ip -net $nsr1 route add default via 192.168.10.2 +ip -net $nsr2 route add default via 192.168.10.1 + +ip netns exec $nsr1 nft -f - < /dev/null; then + echo "ERROR: $ns1 cannot reach ns2" 1>&2 + exit 1 +fi + +if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then + echo "ERROR: $ns2 cannot reach $ns1" 1>&2 + exit 1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" +fi + +nsin=$(mktemp) +ns1out=$(mktemp) +ns2out=$(mktemp) + +make_file() +{ + name=$1 + + SIZE=$((RANDOM % (1024 * 128))) + SIZE=$((SIZE + (1024 * 8))) + TSIZE=$((SIZE * 1024)) + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + + SIZE=$((RANDOM % 1024)) + SIZE=$((SIZE + 128)) + TSIZE=$((TSIZE + SIZE)) + dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null +} + +check_counters() +{ + local what=$1 + local ok=1 + + local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets) + local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets) + + local orig_cnt=${orig#*bytes} + local repl_cnt=${repl#*bytes} + + local fs=$(du -sb $nsin) + local max_orig=${fs%%/*} + local max_repl=$((max_orig/4)) + + if [ $orig_cnt -gt $max_orig ];then + echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 + ret=1 + ok=0 + fi + + if [ $repl_cnt -gt $max_repl ];then + echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 + ret=1 + ok=0 + fi + + if [ $ok -eq 1 ]; then + echo "PASS: $what" + fi +} + +check_dscp() +{ + local what=$1 + local ok=1 + + local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets) + + local pc4=${counter%*bytes*} + local pc4=${pc4#*packets} + + local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets) + local pc4z=${counter%*bytes*} + local pc4z=${pc4z#*packets} + + case "$what" in + "dscp_none") + if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_fwd") + if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_ingress") + if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + "dscp_egress") + if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + ret=1 + ok=0 + fi + ;; + *) + echo "FAIL: Unknown DSCP check" 1>&2 + ret=1 + ok=0 + esac + + if [ $ok -eq 1 ] ;then + echo "PASS: $what: dscp packet counters match" + fi +} + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + if ! cmp "$in" "$out" > /dev/null 2>&1; then + echo "FAIL: file mismatch for $what" 1>&2 + ls -l "$in" + ls -l "$out" + return 1 + fi + + return 0 +} + +test_tcp_forwarding_ip() +{ + local nsa=$1 + local nsb=$2 + local dstip=$3 + local dstport=$4 + local lret=0 + + ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" & + lpid=$! + + sleep 1 + ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & + cpid=$! + + sleep 1 + + prev="$(ls -l $ns1out $ns2out)" + sleep 1 + + while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do + sleep 1; + prev="$(ls -l $ns1out $ns2out)" + done + + if test -d /proc/"$lpid"/; then + kill $lpid + fi + + if test -d /proc/"$cpid"/; then + kill $cpid + fi + + wait $lpid + wait $cpid + + if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then + lret=1 + fi + + if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then + lret=1 + fi + + return $lret +} + +test_tcp_forwarding() +{ + test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + + return $? +} + +test_tcp_forwarding_set_dscp() +{ + check_dscp "dscp_none" + +ip netns exec $nsr1 nft -f - <&2 + ip netns exec $nsr1 nft list ruleset + ret=1 +fi + +# delete default route, i.e. ns2 won't be able to reach ns1 and +# will depend on ns1 being masqueraded in nsr1. +# expect ns1 has nsr1 address. +ip -net $ns2 route del default via 10.0.2.1 +ip -net $ns2 route del default via dead:2::1 +ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 + +# Second test: +# Same, but with NAT enabled. Same as in first test: we expect normal forward path +# to handle most packets. +ip netns exec $nsr1 nft -f - <&2 + exit 0 +fi + +if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then + echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 + ip netns exec $nsr1 nft list ruleset + ret=1 +fi + +# Third test: +# Same as second test, but with PMTU discovery enabled. This +# means that we expect the fastpath to handle packets as soon +# as the endpoints adjust the packet size. +ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null + +# reset counters. +# With pmtu in-place we'll also check that nft counters +# are lower than file size and packets were forwarded via flowtable layer. +# For earlier tests (large mtus), packets cannot be handled via flowtable +# (except pure acks and other small packets). +ip netns exec $nsr1 nft reset counters table inet filter >/dev/null + +if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then + echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 + ip netns exec $nsr1 nft list ruleset +fi + +# Another test: +# Add bridge interface br0 to Router1, with NAT enabled. +ip -net $nsr1 link add name br0 type bridge +ip -net $nsr1 addr flush dev veth0 +ip -net $nsr1 link set up dev veth0 +ip -net $nsr1 link set veth0 master br0 +ip -net $nsr1 addr add 10.0.1.1/24 dev br0 +ip -net $nsr1 addr add dead:1::1/64 dev br0 +ip -net $nsr1 link set up dev br0 + +ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null + +# br0 with NAT enabled. +ip netns exec $nsr1 nft -f - <&2 + ip netns exec $nsr1 nft list ruleset + ret=1 +fi + + +# Another test: +# Add bridge interface br0 to Router1, with NAT and VLAN. +ip -net $nsr1 link set veth0 nomaster +ip -net $nsr1 link set down dev veth0 +ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10 +ip -net $nsr1 link set up dev veth0 +ip -net $nsr1 link set up dev veth0.10 +ip -net $nsr1 link set veth0.10 master br0 + +ip -net $ns1 addr flush dev eth0 +ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10 +ip -net $ns1 link set eth0 up +ip -net $ns1 link set eth0.10 up +ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 +ip -net $ns1 route add default via 10.0.1.1 +ip -net $ns1 addr add dead:1::99/64 dev eth0.10 + +if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then + echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 + ip netns exec $nsr1 nft list ruleset + ret=1 +fi + +# restore test topology (remove bridge and VLAN) +ip -net $nsr1 link set veth0 nomaster +ip -net $nsr1 link set veth0 down +ip -net $nsr1 link set veth0.10 down +ip -net $nsr1 link delete veth0.10 type vlan +ip -net $nsr1 link delete br0 type bridge +ip -net $ns1 addr flush dev eth0.10 +ip -net $ns1 link set eth0.10 down +ip -net $ns1 link set eth0 down +ip -net $ns1 link delete eth0.10 type vlan + +# restore address in ns1 and nsr1 +ip -net $ns1 link set eth0 up +ip -net $ns1 addr add 10.0.1.99/24 dev eth0 +ip -net $ns1 route add default via 10.0.1.1 +ip -net $ns1 addr add dead:1::99/64 dev eth0 +ip -net $ns1 route add default via dead:1::1 +ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net $nsr1 addr add dead:1::1/64 dev veth0 +ip -net $nsr1 link set up dev veth0 + +KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) +KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) +SPI1=$RANDOM +SPI2=$RANDOM + +if [ $SPI1 -eq $SPI2 ]; then + SPI2=$((SPI2+1)) +fi + +do_esp() { + local ns=$1 + local me=$2 + local remote=$3 + local lnet=$4 + local rnet=$5 + local spi_out=$6 + local spi_in=$7 + + ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet + ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet + + # to encrypt packets as they go out (includes forwarded packets that need encapsulation) + ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow + # to fwd decrypted packets after esp processing: + ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow + +} + +do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 + +do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 + +ip netns exec $nsr1 nft delete table ip nat + +# restore default routes +ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 +ip -net $ns2 route add default via 10.0.2.1 +ip -net $ns2 route add default via dead:2::1 + +if test_tcp_forwarding $ns1 $ns2; then + check_counters "ipsec tunnel mode for ns1/ns2" +else + echo "FAIL: ipsec tunnel mode for ns1/ns2" + ip netns exec $nsr1 nft list ruleset 1>&2 + ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2 +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_meta.sh b/tools/testing/selftests/net/netfilter/nft_meta.sh new file mode 100755 index 000000000000..f33154c04d34 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_meta.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# check iif/iifname/oifgroup/iiftype match. + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +sfx=$(mktemp -u "XXXXXXXX") +ns0="ns0-$sfx" + +if ! nft --version > /dev/null 2>&1; then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +cleanup() +{ + ip netns del "$ns0" +} + +ip netns add "$ns0" +ip -net "$ns0" link set lo up +ip -net "$ns0" addr add 127.0.0.1 dev lo + +trap cleanup EXIT + +currentyear=$(date +%Y) +lastyear=$((currentyear-1)) +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null + +check_lo_counters "2" true + +check_one_counter oskuidcounter "1" true +check_one_counter oskgidcounter "1" true +check_one_counter imarkcounter "1" true +check_one_counter omarkcounter "1" true +check_one_counter ilastyearcounter "0" true + +if [ $ret -eq 0 ];then + echo "OK: nftables meta iif/oif counters at expected values" +else + exit $ret +fi + +#First CPU execution and counter +taskset -p 01 $$ > /dev/null +ip netns exec "$ns0" nft reset counters > /dev/null +ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null +check_one_counter icpu0counter "2" true + +if [ $ret -eq 0 ];then + echo "OK: nftables meta cpu counter at expected values" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh new file mode 100755 index 000000000000..dd40d9f6f259 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_nat.sh @@ -0,0 +1,1224 @@ +#!/bin/bash +# +# This test is for basic NAT functionality: snat, dnat, redirect, masquerade. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 +test_inet_nat=true + +sfx=$(mktemp -u "XXXXXXXX") +ns0="ns0-$sfx" +ns1="ns1-$sfx" +ns2="ns2-$sfx" + +cleanup() +{ + for i in 0 1 2; do ip netns del ns$i-"$sfx";done +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add "$ns0" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $ns0" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add "$ns1" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $ns1" + exit $ksft_skip +fi + +ip netns add "$ns2" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $ns2" + exit $ksft_skip +fi + +ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2" + +ip -net "$ns0" link set lo up +ip -net "$ns0" link set veth0 up +ip -net "$ns0" addr add 10.0.1.1/24 dev veth0 +ip -net "$ns0" addr add dead:1::1/64 dev veth0 + +ip -net "$ns0" link set veth1 up +ip -net "$ns0" addr add 10.0.2.1/24 dev veth1 +ip -net "$ns0" addr add dead:2::1/64 dev veth1 + +for i in 1 2; do + ip -net ns$i-$sfx link set lo up + ip -net ns$i-$sfx link set eth0 up + ip -net ns$i-$sfx addr add 10.0.$i.99/24 dev eth0 + ip -net ns$i-$sfx route add default via 10.0.$i.1 + ip -net ns$i-$sfx addr add dead:$i::99/64 dev eth0 + ip -net ns$i-$sfx route add default via dead:$i::1 +done + +bad_counter() +{ + local ns=$1 + local counter=$2 + local expect=$3 + local tag=$4 + + echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2 + ip netns exec $ns nft list counter inet filter $counter 1>&2 +} + +check_counters() +{ + ns=$1 + local lret=0 + + cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84") + if [ $? -ne 0 ]; then + bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1" + lret=1 + fi + cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84") + if [ $? -ne 0 ]; then + bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2" + lret=1 + fi + + expect="packets 1 bytes 104" + cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter $ns ns0in6 "$expect" "check_counters 3" + lret=1 + fi + cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter $ns ns0out6 "$expect" "check_counters 4" + lret=1 + fi + + return $lret +} + +check_ns0_counters() +{ + local ns=$1 + local lret=0 + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1" + lret=1 + fi + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns0in6 "packets 0 bytes 0" + lret=1 + fi + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2" + lret=1 + fi + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 " + lret=1 + fi + + for dir in "in" "out" ; do + expect="packets 1 bytes 84" + cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4" + lret=1 + fi + + expect="packets 1 bytes 104" + cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5" + lret=1 + fi + done + + return $lret +} + +reset_counters() +{ + for i in 0 1 2;do + ip netns exec ns$i-$sfx nft reset counters inet > /dev/null + done +} + +test_local_dnat6() +{ + local family=$1 + local lret=0 + local IPF="" + + if [ $family = "inet" ];then + IPF="ip6" + fi + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null + if [ $? -ne 0 ]; then + lret=1 + echo "ERROR: ping6 failed" + return $lret + fi + + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1" + lret=1 + fi + done + + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2" + lret=1 + fi + done + + # expect 0 count in ns1 + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3" + lret=1 + fi + done + + # expect 1 packet in ns2 + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4" + lret=1 + fi + done + + test $lret -eq 0 && echo "PASS: ipv6 ping to $ns1 was $family NATted to $ns2" + ip netns exec "$ns0" nft flush chain ip6 nat output + + return $lret +} + +test_local_dnat() +{ + local family=$1 + local lret=0 + local IPF="" + + if [ $family = "inet" ];then + IPF="ip" + fi + +ip netns exec "$ns0" nft -f /dev/stdin </dev/null +table $family nat { + chain output { + type nat hook output priority 0; policy accept; + ip daddr 10.0.1.99 dnat $IPF to 10.0.2.99 + } +} +EOF + if [ $? -ne 0 ]; then + if [ $family = "inet" ];then + echo "SKIP: inet nat tests" + test_inet_nat=false + return $ksft_skip + fi + echo "SKIP: Could not add add $family dnat hook" + return $ksft_skip + fi + + # ping netns1, expect rewrite to netns2 + ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null + if [ $? -ne 0 ]; then + lret=1 + echo "ERROR: ping failed" + return $lret + fi + + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1" + lret=1 + fi + done + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2" + lret=1 + fi + done + + # expect 0 count in ns1 + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3" + lret=1 + fi + done + + # expect 1 packet in ns2 + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4" + lret=1 + fi + done + + test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2" + + ip netns exec "$ns0" nft flush chain $family nat output + + reset_counters + ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null + if [ $? -ne 0 ]; then + lret=1 + echo "ERROR: ping failed" + return $lret + fi + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5" + lret=1 + fi + done + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6" + lret=1 + fi + done + + # expect 1 count in ns1 + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7" + lret=1 + fi + done + + # expect 0 packet in ns2 + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8" + lret=1 + fi + done + + test $lret -eq 0 && echo "PASS: ping to $ns1 OK after $family nat output chain flush" + + return $lret +} + +test_local_dnat_portonly() +{ + local family=$1 + local daddr=$2 + local lret=0 + local sr_s + local sr_r + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null + + ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" + return 1 + lret=1 + fi + + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1" + lret=1 + fi + + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2" + lret=1 + fi + done + + reset_counters + +# add masquerading rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" + lret=1 + fi + + # ns1 should have seen packets from ns0, due to masquerade + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3" + lret=1 + fi + + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5" + lret=1 + fi + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6" + lret=1 + fi + done + + ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)" + lret=1 + fi + + ip netns exec "$ns0" nft flush chain $family nat postrouting + if [ $? -ne 0 ]; then + echo "ERROR: Could not flush $family nat postrouting" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IPv6 masquerade $natflags for $ns2" + + return $lret +} + +test_masquerade() +{ + local family=$1 + local natflags=$2 + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from "$ns2" $natflags" + lret=1 + fi + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1" + lret=1 + fi + + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2" + lret=1 + fi + done + + reset_counters + +# add masquerading rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" + lret=1 + fi + + # ns1 should have seen packets from ns0, due to masquerade + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3" + lret=1 + fi + + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5" + lret=1 + fi + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6" + lret=1 + fi + done + + ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)" + lret=1 + fi + + ip netns exec "$ns0" nft flush chain $family nat postrouting + if [ $? -ne 0 ]; then + echo "ERROR: Could not flush $family nat postrouting" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IP masquerade $natflags for $ns2" + + return $lret +} + +test_redirect6() +{ + local family=$1 + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null + + ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" + lret=1 + fi + + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1" + lret=1 + fi + + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2" + lret=1 + fi + done + + reset_counters + +# add redirect rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect" + lret=1 + fi + + # ns1 should have seen no packets from ns2, due to redirection + expect="packets 0 bytes 0" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3" + lret=1 + fi + done + + # ns0 should have seen packets from ns2, due to masquerade + expect="packets 1 bytes 104" + for dir in "in6" "out6" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4" + lret=1 + fi + done + + ip netns exec "$ns0" nft delete table $family nat + if [ $? -ne 0 ]; then + echo "ERROR: Could not delete $family nat table" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IPv6 redirection for $ns2" + + return $lret +} + +test_redirect() +{ + local family=$1 + local lret=0 + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2" + lret=1 + fi + + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1" + lret=1 + fi + + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2" + lret=1 + fi + done + + reset_counters + +# add redirect rule +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect" + lret=1 + fi + + # ns1 should have seen no packets from ns2, due to redirection + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3" + lret=1 + fi + done + + # ns0 should have seen packets from ns2, due to masquerade + expect="packets 1 bytes 84" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4" + lret=1 + fi + done + + ip netns exec "$ns0" nft delete table $family nat + if [ $? -ne 0 ]; then + echo "ERROR: Could not delete $family nat table" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: $family IP redirection for $ns2" + + return $lret +} + +# test port shadowing. +# create two listening services, one on router (ns0), one +# on client (ns2), which is masqueraded from ns1 point of view. +# ns2 sends udp packet coming from service port to ns1, on a highport. +# Later, if n1 uses same highport to connect to ns0:service, packet +# might be port-forwarded to ns2 instead. + +# second argument tells if we expect the 'fake-entry' to take effect +# (CLIENT) or not (ROUTER). +test_port_shadow() +{ + local test=$1 + local expect=$2 + local daddrc="10.0.1.99" + local daddrs="10.0.1.1" + local result="" + local logmsg="" + + # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. + echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405 + + echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 & + sc_r=$! + + echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport & + sc_c=$! + + sleep 0.3 + + # ns1 tries to connect to ns0:1405. With default settings this should connect + # to client, it matches the conntrack entry created above. + + result=$(echo "data" | ip netns exec "$ns1" timeout 1 socat - UDP:"$daddrs":1405,sourceport=41404) + + if [ "$result" = "$expect" ] ;then + echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}" + else + echo "ERROR: portshadow test $test: got reply from \"$result\", not $expect as intended" + ret=1 + fi + + kill $sc_r $sc_c 2>/dev/null + + # flush udp entries for next test round, if any + ip netns exec "$ns0" conntrack -F >/dev/null 2>&1 +} + +# This prevents port shadow of router service via packet filter, +# packets claiming to originate from service port from internal +# network are dropped. +test_port_shadow_filter() +{ + local family=$1 + +ip netns exec "$ns0" nft -f /dev/stdin </dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not run nat port shadowing test without conntrack tool" + return + fi + + socat -h > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not run nat port shadowing test without socat tool" + return + fi + + ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns0" nft -f /dev/stdin < /dev/null + ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + + ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" + return 1 + fi + +ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" + lret=1 + fi + + # ns1 should have seen packets from .2.2, due to stateless rewrite. + expect="packets 1 bytes 84" + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" + lret=1 + fi + + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" + lret=1 + fi + done + + # ns1 should not have seen packets from ns2, due to masquerade + expect="packets 0 bytes 0" + for dir in "in" "out" ; do + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" + lret=1 + fi + + cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" + lret=1 + fi + done + + reset_counters + + socat -h > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not run stateless nat frag test without socat tool" + if [ $lret -eq 0 ]; then + return $ksft_skip + fi + + ip netns exec "$ns0" nft delete table ip stateless + return $lret + fi + + local tmpfile=$(mktemp) + dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null + + local outfile=$(mktemp) + ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & + sc_r=$! + + sleep 1 + # re-do with large ping -> ip fragmentation + ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null + if [ $? -ne 0 ] ; then + echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 + lret=1 + fi + + wait + + cmp "$tmpfile" "$outfile" + if [ $? -ne 0 ]; then + ls -l "$tmpfile" "$outfile" + echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 + lret=1 + fi + + rm -f "$tmpfile" "$outfile" + + # ns1 should have seen packets from 2.2, due to stateless rewrite. + expect="packets 3 bytes 4164" + cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") + if [ $? -ne 0 ]; then + bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" + lret=1 + fi + + ip netns exec "$ns0" nft delete table ip stateless + if [ $? -ne 0 ]; then + echo "ERROR: Could not delete table ip stateless" 1>&2 + lret=1 + fi + + test $lret -eq 0 && echo "PASS: IP statless for $ns2" + + return $lret +} + +# ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 +for i in 0 1 2; do +ip netns exec ns$i-$sfx nft -f /dev/stdin < /dev/null + if [ $? -ne 0 ];then + echo "ERROR: Could not reach other namespace(s)" 1>&2 + ret=1 + fi + + ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null + if [ $? -ne 0 ];then + echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 + ret=1 + fi + check_counters ns$i-$sfx + if [ $? -ne 0 ]; then + ret=1 + fi + + check_ns0_counters ns$i + if [ $? -ne 0 ]; then + ret=1 + fi + reset_counters +done + +if [ $ret -eq 0 ];then + echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2" +fi + +reset_counters +test_local_dnat ip +test_local_dnat6 ip6 + +reset_counters +test_local_dnat_portonly inet 10.0.1.99 + +reset_counters +$test_inet_nat && test_local_dnat inet +$test_inet_nat && test_local_dnat6 inet + +for flags in "" "fully-random"; do +reset_counters +test_masquerade ip $flags +test_masquerade6 ip6 $flags +reset_counters +$test_inet_nat && test_masquerade inet $flags +$test_inet_nat && test_masquerade6 inet $flags +done + +reset_counters +test_redirect ip +test_redirect6 ip6 +reset_counters +$test_inet_nat && test_redirect inet +$test_inet_nat && test_redirect6 inet + +test_port_shadowing +test_stateless_nat_ip + +if [ $ret -ne 0 ];then + echo -n "FAIL: " + nft --version +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh new file mode 100755 index 000000000000..b9ab37380f33 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh @@ -0,0 +1,309 @@ +#!/bin/bash +# +# Test connection tracking zone and NAT source port reallocation support. +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# Don't increase too much, 2000 clients should work +# just fine but script can then take several minutes with +# KASAN/debug builds. +maxclients=100 + +have_iperf=1 +ret=0 + +# client1---. +# veth1-. +# | +# NAT Gateway --veth0--> Server +# | | +# veth2-' | +# client2---' | +# .... | +# clientX----vethX---' + +# All clients share identical IP address. +# NAT Gateway uses policy routing and conntrack zones to isolate client +# namespaces. Each client connects to Server, each with colliding tuples: +# clientsaddr:10000 -> serveraddr:dport +# NAT Gateway is supposed to do port reallocation for each of the +# connections. + +sfx=$(mktemp -u "XXXXXXXX") +gw="ns-gw-$sfx" +cl1="ns-cl1-$sfx" +cl2="ns-cl2-$sfx" +srv="ns-srv-$sfx" + +v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) +v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) +v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) +v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null) +v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null) +v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) + +cleanup() +{ + ip netns del $gw + ip netns del $srv + for i in $(seq 1 $maxclients); do + ip netns del ns-cl$i-$sfx 2>/dev/null + done + + sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +conntrack -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without conntrack tool" + exit $ksft_skip +fi + +iperf3 -v >/dev/null 2>&1 +if [ $? -ne 0 ];then + have_iperf=0 +fi + +ip netns add "$gw" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $gw" + exit $ksft_skip +fi +ip -net "$gw" link set lo up + +trap cleanup EXIT + +ip netns add "$srv" +if [ $? -ne 0 ];then + echo "SKIP: Could not create server netns $srv" + exit $ksft_skip +fi + +ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" +ip -net "$gw" link set veth0 up +ip -net "$srv" link set lo up +ip -net "$srv" link set eth0 up + +sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null +sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + + ip netns add "$cl" + if [ $? -ne 0 ];then + echo "SKIP: Could not create client netns $cl" + exit $ksft_skip + fi + ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip + fi +done + +for i in $(seq 1 $maxclients);do + cl="ns-cl$i-$sfx" + echo netns exec "$cl" ip link set lo up + echo netns exec "$cl" ip link set eth0 up + echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 + echo netns exec "$gw" ip link set veth$i up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 + + # clients have same IP addresses. + echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 + echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 + echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 + + # NB: same addresses on client-facing interfaces. + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i + echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i + + # gw: policy routing + echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) + echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) +done | ip -batch /dev/stdin + +ip -net "$gw" addr add 10.3.0.1/24 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 + +ip -net "$srv" addr add 10.3.0.99/24 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 + +ip netns exec $gw nft -f /dev/stdin< /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null + +# useful for debugging: allows to use 'ping' from clients to gateway. +ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null +ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null + +for i in $(seq 1 $maxclients); do + cl="ns-cl$i-$sfx" + ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & + if [ $? -ne 0 ]; then + echo FAIL: Ping failure from $cl 1>&2 + ret=1 + break + fi +done + +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth$i" 1>&2 + ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 + break + fi +done + +ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" + ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 +fi + +if [ $ret -eq 0 ]; then + echo "PASS: ping test from all $maxclients namespaces" +fi + +if [ $have_iperf -eq 0 ];then + echo "SKIP: iperf3 not installed" + if [ $ret -ne 0 ];then + exit $ret + fi + exit $ksft_skip +fi + +ip netns exec $srv iperf3 -s > /dev/null 2>&1 & +iperfpid=$! +sleep 1 + +for i in $(seq 1 $maxclients); do + if [ $ret -ne 0 ]; then + break + fi + cl="ns-cl$i-$sfx" + ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null + if [ $? -ne 0 ]; then + echo FAIL: Failure to connect for $cl 1>&2 + ip netns exec $gw conntrack -S 1>&2 + ret=1 + fi +done +if [ $ret -eq 0 ];then + echo "PASS: iperf3 connections for all $maxclients net namespaces" +fi + +kill $iperfpid +wait + +for i in $(seq 1 $maxclients); do + ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null + if [ $? -ne 0 ];then + ret=1 + echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 + break + fi +done +if [ $ret -eq 0 ];then + echo "PASS: Found client connection for all $maxclients net namespaces" +fi + +ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null +if [ $? -ne 0 ];then + ret=1 + echo "FAIL: cannot find return entry on veth0" 1>&2 +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh new file mode 100755 index 000000000000..2eb65887e570 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -0,0 +1,449 @@ +#!/bin/bash +# +# This tests nf_queue: +# 1. can process packets from all hooks +# 2. support running nfqueue from more than one base chain +# +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +nsrouter="nsrouter-$sfx" +timeout=4 + +cleanup() +{ + ip netns pids ${ns1} | xargs kill 2>/dev/null + ip netns pids ${ns2} | xargs kill 2>/dev/null + ip netns pids ${nsrouter} | xargs kill 2>/dev/null + + ip netns del ${ns1} + ip netns del ${ns2} + ip netns del ${nsrouter} + rm -f "$TMPFILE0" + rm -f "$TMPFILE1" + rm -f "$TMPFILE2" "$TMPFILE3" +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add ${nsrouter} +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace" + exit $ksft_skip +fi + +TMPFILE0=$(mktemp) +TMPFILE1=$(mktemp) +TMPFILE2=$(mktemp) +TMPFILE3=$(mktemp) +trap cleanup EXIT + +ip netns add ${ns1} +ip netns add ${ns2} + +ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} + +ip -net ${nsrouter} link set lo up +ip -net ${nsrouter} link set veth0 up +ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 +ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 + +ip -net ${nsrouter} link set veth1 up +ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 +ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 + +ip -net ${ns1} link set lo up +ip -net ${ns1} link set eth0 up + +ip -net ${ns2} link set lo up +ip -net ${ns2} link set eth0 up + +ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net ${ns1} route add default via 10.0.1.1 +ip -net ${ns1} route add default via dead:1::1 + +ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns2} addr add dead:2::99/64 dev eth0 +ip -net ${ns2} route add default via 10.0.2.1 +ip -net ${ns2} route add default via dead:2::1 + +load_ruleset() { + local name=$1 + local prio=$2 + +ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + return 0 +} + +test_ping_router() { + ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null + if [ $? -ne 0 ];then + return 1 + fi + + return 0 +} + +test_queue_blackhole() { + local proto=$1 + +ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null + lret=$? + elif [ $proto = "ip6" ]; then + ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null + lret=$? + else + lret=111 + fi + + # queue without bypass keyword should drop traffic if no listener exists. + if [ $lret -eq 0 ];then + echo "FAIL: $proto expected failure, got $lret" 1>&2 + exit 1 + fi + + ip netns exec ${nsrouter} nft delete table $proto blackh + if [ $? -ne 0 ] ;then + echo "FAIL: $proto: Could not delete blackh table" + exit 1 + fi + + echo "PASS: $proto: statement with no listener results in packet drop" +} + +test_queue() +{ + local expected=$1 + local last="" + + # spawn nf_queue listeners + ip netns exec ${nsrouter} ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec ${nsrouter} ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" & + sleep 1 + test_ping + ret=$? + if [ $ret -ne 0 ];then + echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2 + exit $ret + fi + + test_ping_router + ret=$? + if [ $ret -ne 0 ];then + echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2 + exit $ret + fi + + wait + ret=$? + + for file in $TMPFILE0 $TMPFILE1; do + last=$(tail -n1 "$file") + if [ x"$last" != x"$expected packets total" ]; then + echo "FAIL: Expected $expected packets total, but got $last" 1>&2 + cat "$file" 1>&2 + + ip netns exec ${nsrouter} nft list ruleset + exit 1 + fi + done + + echo "PASS: Expected and received $last" +} + +test_tcp_forward() +{ + ip netns exec ${nsrouter} ./nf_queue -q 2 -t $timeout & + local nfqpid=$! + + tmpfile=$(mktemp) || exit 1 + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile + ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + local rpid=$! + + sleep 1 + ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null & + + rm -f "$tmpfile" + + wait $rpid + wait $lpid + [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" +} + +test_tcp_localhost() +{ + tmpfile=$(mktemp) || exit 1 + + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile + ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + local rpid=$! + + ip netns exec ${nsrouter} ./nf_queue -q 3 -t $timeout & + local nfqpid=$! + + sleep 1 + ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null + rm -f "$tmpfile" + + wait $rpid + [ $? -eq 0 ] && echo "PASS: tcp via loopback" + wait 2>/dev/null +} + +test_tcp_localhost_connectclose() +{ + tmpfile=$(mktemp) || exit 1 + + ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & + + ip netns exec ${nsrouter} ./nf_queue -q 3 -t $timeout & + local nfqpid=$! + + sleep 1 + rm -f "$tmpfile" + + wait $rpid + [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" + wait 2>/dev/null +} + +test_tcp_localhost_requeue() +{ +ip netns exec ${nsrouter} nft -f /dev/stdin </dev/null & + local rpid=$! + + ip netns exec ${nsrouter} ./nf_queue -c -q 1 -t $timeout > "$TMPFILE2" & + + # nfqueue 1 will be called via output hook. But this time, + # re-queue the packet to nfqueue program on queue 2. + ip netns exec ${nsrouter} ./nf_queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & + + sleep 1 + ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null + rm -f "$tmpfile" + + wait + + if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then + echo "FAIL: lost packets during requeue?!" 1>&2 + return + fi + + echo "PASS: tcp via loopback and re-queueing" +} + +test_icmp_vrf() { + ip -net $ns1 link add tvrf type vrf table 9876 + if [ $? -ne 0 ];then + echo "SKIP: Could not add vrf device" + return + fi + + ip -net $ns1 li set eth0 master tvrf + ip -net $ns1 li set tvrf up + + ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 +ip netns exec ${ns1} nft -f /dev/stdin < /dev/null + + for n in output post; do + for d in tvrf eth0; do + ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1" + if [ $? -ne 0 ] ; then + echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 + ip netns exec ${ns1} nft list ruleset + ret=1 + return + fi + done + done + + wait $nfqpid + [ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf" + wait 2>/dev/null +} + +ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + +load_ruleset "filter" 0 + +sleep 3 + +test_ping +ret=$? +if [ $ret -eq 0 ];then + # queue bypass works (rules were skipped, no listener) + echo "PASS: ${ns1} can reach ${ns2}" +else + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 + exit $ret +fi + +test_queue_blackhole ip +test_queue_blackhole ip6 + +# dummy ruleset to add base chains between the +# queueing rules. We don't want the second reinject +# to re-execute the old hooks. +load_counter_ruleset 10 + +# we are hooking all: prerouting/input/forward/output/postrouting. +# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: +# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). +# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. +# so we expect that userspace program receives 10 packets. +test_queue 10 + +# same. We queue to a second program as well. +load_ruleset "filter2" 20 +test_queue 20 + +test_tcp_forward +test_tcp_localhost +test_tcp_localhost_connectclose +test_tcp_localhost_requeue +test_icmp_vrf + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_synproxy.sh b/tools/testing/selftests/net/netfilter/nft_synproxy.sh new file mode 100755 index 000000000000..b62933b680d6 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_synproxy.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +rnd=$(mktemp -u XXXXXXXX) +nsr="nsr-$rnd" # synproxy machine +ns1="ns1-$rnd" # iperf client +ns2="ns2-$rnd" # iperf server + +checktool (){ + if ! $1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} + +checktool "nft --version" "run test without nft tool" +checktool "ip -Version" "run test without ip tool" +checktool "iperf3 --version" "run test without iperf3" +checktool "ip netns add $nsr" "create net namespace" + +modprobe -q nf_conntrack + +ip netns add $ns1 +ip netns add $ns2 + +cleanup() { + ip netns pids $ns1 | xargs kill 2>/dev/null + ip netns pids $ns2 | xargs kill 2>/dev/null + ip netns del $ns1 + ip netns del $ns2 + + ip netns del $nsr +} + +trap cleanup EXIT + +ip link add veth0 netns $nsr type veth peer name eth0 netns $ns1 +ip link add veth1 netns $nsr type veth peer name eth0 netns $ns2 + +for dev in lo veth0 veth1; do +ip -net $nsr link set $dev up +done + +ip -net $nsr addr add 10.0.1.1/24 dev veth0 +ip -net $nsr addr add 10.0.2.1/24 dev veth1 + +ip netns exec $nsr sysctl -q net.ipv4.conf.veth0.forwarding=1 +ip netns exec $nsr sysctl -q net.ipv4.conf.veth1.forwarding=1 +ip netns exec $nsr sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 + +for n in $ns1 $ns2; do + ip -net $n link set lo up + ip -net $n link set eth0 up +done +ip -net $ns1 addr add 10.0.1.99/24 dev eth0 +ip -net $ns2 addr add 10.0.2.99/24 dev eth0 +ip -net $ns1 route add default via 10.0.1.1 +ip -net $ns2 route add default via 10.0.2.1 + +# test basic connectivity +if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then + echo "ERROR: $ns1 cannot reach $ns2" 1>&2 + exit 1 +fi + +if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then + echo "ERROR: $ns2 cannot reach $ns1" 1>&2 + exit 1 +fi + +ip netns exec $ns2 iperf3 -s > /dev/null 2>&1 & +# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & + +sleep 1 + +ip netns exec $nsr nft -f - < /dev/null + +if [ $? -ne 0 ]; then + echo "FAIL: iperf3 returned an error" 1>&2 + ret=$? + ip netns exec $nsr nft list ruleset +else + echo "PASS: synproxy connection successful" +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh new file mode 100755 index 000000000000..5a8db0b48928 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# Test insertion speed for packets with identical addresses/ports +# that are all placed in distinct conntrack zones. + +sfx=$(mktemp -u "XXXXXXXX") +ns="ns-$sfx" + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +zones=2000 +have_ct_tool=0 +ret=0 + +cleanup() +{ + ip netns del $ns +} + +checktool (){ + if ! $1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} + +checktool "nft --version" "run test without nft tool" +checktool "ip -Version" "run test without ip tool" +checktool "socat -V" "run test without socat tool" +checktool "ip netns add $ns" "create net namespace" + +trap cleanup EXIT + +conntrack -V > /dev/null 2>&1 +if [ $? -eq 0 ];then + have_ct_tool=1 +fi + +ip -net "$ns" link set lo up + +test_zones() { + local max_zones=$1 + +ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 +ip netns exec $ns nft -f /dev/stdin</dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 + if [ $? -ne 0 ] ;then + ret=1 + break + fi + + stop=$(date +%s%3N) + local duration=$((stop-start)) + echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" + done + + if [ $have_ct_tool -eq 1 ]; then + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries from packet path in $duration ms total" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" + ret=1 + fi + fi + + if [ $ret -ne 0 ];then + echo "FAIL: insert $max_zones entries from packet path" 1>&2 + fi +} + +test_conntrack_tool() { + local max_zones=$1 + + ip netns exec $ns conntrack -F >/dev/null 2>/dev/null + + local outerstart=$(date +%s%3N) + local start=$(date +%s%3N) + local stop=$start + local i=0 + while [ $i -lt $max_zones ]; do + i=$((i + 1)) + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 + if [ $? -ne 0 ];then + ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null + echo "FAIL: conntrack -I returned an error" + ret=1 + break + fi + + if [ $((i%1000)) -eq 0 ];then + stop=$(date +%s%3N) + + local duration=$((stop-start)) + echo "PASS: added 1000 entries in $duration ms (now $i total)" + start=$stop + fi + done + + local count=$(ip netns exec "$ns" conntrack -C) + local duration=$((stop-outerstart)) + + if [ $count -eq $max_zones ]; then + echo "PASS: inserted $count entries via ctnetlink in $duration ms" + else + ip netns exec $ns conntrack -S 1>&2 + echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" + ret=1 + fi +} + +test_zones $zones + +if [ $have_ct_tool -eq 1 ];then + test_conntrack_tool $zones +else + echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" + if [ $ret -eq 0 ];then + exit $ksft_skip + fi +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/rpath.sh b/tools/testing/selftests/net/netfilter/rpath.sh new file mode 100755 index 000000000000..5289c8447a41 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/rpath.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# return code to signal skipped test +ksft_skip=4 + +# search for legacy iptables (it uses the xtables extensions +if iptables-legacy --version >/dev/null 2>&1; then + iptables='iptables-legacy' +elif iptables --version >/dev/null 2>&1; then + iptables='iptables' +else + iptables='' +fi + +if ip6tables-legacy --version >/dev/null 2>&1; then + ip6tables='ip6tables-legacy' +elif ip6tables --version >/dev/null 2>&1; then + ip6tables='ip6tables' +else + ip6tables='' +fi + +if nft --version >/dev/null 2>&1; then + nft='nft' +else + nft='' +fi + +if [ -z "$iptables$ip6tables$nft" ]; then + echo "SKIP: Test needs iptables, ip6tables or nft" + exit $ksft_skip +fi + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +trap "ip netns del $ns1; ip netns del $ns2" EXIT + +# create two netns, disable rp_filter in ns2 and +# keep IPv6 address when moving into VRF +ip netns add "$ns1" +ip netns add "$ns2" +ip netns exec "$ns2" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1 + +# a standard connection between the netns, should not trigger rp filter +ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2" +ip -net "$ns1" link set v0 up; ip -net "$ns2" link set v0 up +ip -net "$ns1" a a 192.168.23.2/24 dev v0 +ip -net "$ns2" a a 192.168.23.1/24 dev v0 +ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad + +# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0 +ip -net "$ns2" link add d0 type dummy +ip -net "$ns2" link set d0 up +ip -net "$ns1" a a 192.168.42.2/24 dev v0 +ip -net "$ns2" a a 192.168.42.1/24 dev d0 +ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad + +# firewall matches to test +[ -n "$iptables" ] && { + common='-t raw -A PREROUTING -s 192.168.0.0/16' + ip netns exec "$ns2" "$iptables" $common -m rpfilter + ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert +} +[ -n "$ip6tables" ] && { + common='-t raw -A PREROUTING -s fec0::/16' + ip netns exec "$ns2" "$ip6tables" $common -m rpfilter + ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert +} +[ -n "$nft" ] && ip netns exec "$ns2" $nft -f - </dev/null +} + +clear_counters() { + [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z + [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z + if [ -n "$nft" ]; then + ( + echo "delete table inet t"; + ip netns exec "$ns2" $nft -s list table inet t; + ) | ip netns exec "$ns2" $nft -f - + fi +} + +testrun() { + clear_counters + + # test 1: martian traffic should fail rpfilter matches + netns_ping "$ns1" -I v0 192.168.42.1 && \ + die "martian ping 192.168.42.1 succeeded" + netns_ping "$ns1" -I v0 fec0:42::1 && \ + die "martian ping fec0:42::1 succeeded" + + ipt_zero_rule "$iptables" || die "iptables matched martian" + ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" + ipt_zero_reverse_rule "$iptables" && die "iptables not matched martian" + ipt_zero_reverse_rule "$ip6tables" && die "ip6tables not matched martian" + nft_zero_rule ip || die "nft IPv4 matched martian" + nft_zero_rule ip6 || die "nft IPv6 matched martian" + + clear_counters + + # test 2: rpfilter match should pass for regular traffic + netns_ping "$ns1" 192.168.23.1 || \ + die "regular ping 192.168.23.1 failed" + netns_ping "$ns1" fec0:23::1 || \ + die "regular ping fec0:23::1 failed" + + ipt_zero_rule "$iptables" && die "iptables match not effective" + ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" + ipt_zero_reverse_rule "$iptables" || die "iptables match over-effective" + ipt_zero_reverse_rule "$ip6tables" || die "ip6tables match over-effective" + nft_zero_rule ip && die "nft IPv4 match not effective" + nft_zero_rule ip6 && die "nft IPv6 match not effective" + +} + +testrun + +# repeat test with vrf device in $ns2 +ip -net "$ns2" link add vrf0 type vrf table 10 +ip -net "$ns2" link set vrf0 up +ip -net "$ns2" link set v0 master vrf0 + +testrun + +echo "PASS: netfilter reverse path match works as intended" +exit 0 diff --git a/tools/testing/selftests/net/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c new file mode 100644 index 000000000000..21bb1cfd8a85 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/sctp_collision.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + struct sockaddr_in saddr = {}, daddr = {}; + int sd, ret, len = sizeof(daddr); + struct timeval tv = {25, 0}; + char buf[] = "hello"; + + if (argc != 6 || (strcmp(argv[1], "server") && strcmp(argv[1], "client"))) { + printf("%s \n", + argv[0]); + return -1; + } + + sd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP); + if (sd < 0) { + printf("Failed to create sd\n"); + return -1; + } + + saddr.sin_family = AF_INET; + saddr.sin_addr.s_addr = inet_addr(argv[2]); + saddr.sin_port = htons(atoi(argv[3])); + + ret = bind(sd, (struct sockaddr *)&saddr, sizeof(saddr)); + if (ret < 0) { + printf("Failed to bind to address\n"); + goto out; + } + + ret = listen(sd, 5); + if (ret < 0) { + printf("Failed to listen on port\n"); + goto out; + } + + daddr.sin_family = AF_INET; + daddr.sin_addr.s_addr = inet_addr(argv[4]); + daddr.sin_port = htons(atoi(argv[5])); + + /* make test shorter than 25s */ + ret = setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + if (ret < 0) { + printf("Failed to setsockopt SO_RCVTIMEO\n"); + goto out; + } + + if (!strcmp(argv[1], "server")) { + sleep(1); /* wait a bit for client's INIT */ + ret = connect(sd, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to connect to peer\n"); + goto out; + } + ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); + if (ret < 0) { + printf("Failed to recv msg %d\n", ret); + goto out; + } + ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to send msg %d\n", ret); + goto out; + } + printf("Server: sent! %d\n", ret); + } + + if (!strcmp(argv[1], "client")) { + usleep(300000); /* wait a bit for server's listening */ + ret = connect(sd, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to connect to peer\n"); + goto out; + } + sleep(1); /* wait a bit for server's delayed INIT_ACK to reproduce the issue */ + ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); + if (ret < 0) { + printf("Failed to send msg %d\n", ret); + goto out; + } + ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); + if (ret < 0) { + printf("Failed to recv msg %d\n", ret); + goto out; + } + printf("Client: rcvd! %d\n", ret); + } + ret = 0; +out: + close(sd); + return ret; +} diff --git a/tools/testing/selftests/net/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh new file mode 100755 index 000000000000..1802653a4728 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/xt_string.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# return code to signal skipped test +ksft_skip=4 +rc=0 + +if ! iptables --version >/dev/null 2>&1; then + echo "SKIP: Test needs iptables" + exit $ksft_skip +fi +if ! ip -V >/dev/null 2>&1; then + echo "SKIP: Test needs iproute2" + exit $ksft_skip +fi +if ! nc -h >/dev/null 2>&1; then + echo "SKIP: Test needs netcat" + exit $ksft_skip +fi + +pattern="foo bar baz" +patlen=11 +hdrlen=$((20 + 8)) # IPv4 + UDP +ns="ns-$(mktemp -u XXXXXXXX)" +trap 'ip netns del $ns' EXIT +ip netns add "$ns" +ip -net "$ns" link add d0 type dummy +ip -net "$ns" link set d0 up +ip -net "$ns" addr add 10.1.2.1/24 dev d0 + +#ip netns exec "$ns" tcpdump -npXi d0 & +#tcpdump_pid=$! +#trap 'kill $tcpdump_pid; ip netns del $ns' EXIT + +add_rule() { # (alg, from, to) + ip netns exec "$ns" \ + iptables -A OUTPUT -o d0 -m string \ + --string "$pattern" --algo $1 --from $2 --to $3 +} +showrules() { # () + ip netns exec "$ns" iptables -v -S OUTPUT | grep '^-A' +} +zerorules() { + ip netns exec "$ns" iptables -Z OUTPUT +} +countrule() { # (pattern) + showrules | grep -c -- "$*" +} +send() { # (offset) + ( for ((i = 0; i < $1 - $hdrlen; i++)); do + printf " " + done + printf "$pattern" + ) | ip netns exec "$ns" nc -w 1 -u 10.1.2.2 27374 +} + +add_rule bm 1000 1500 +add_rule bm 1400 1600 +add_rule kmp 1000 1500 +add_rule kmp 1400 1600 + +zerorules +send 0 +send $((1000 - $patlen)) +if [ $(countrule -c 0 0) -ne 4 ]; then + echo "FAIL: rules match data before --from" + showrules + ((rc--)) +fi + +zerorules +send 1000 +send $((1400 - $patlen)) +if [ $(countrule -c 2) -ne 2 ]; then + echo "FAIL: only two rules should match at low offset" + showrules + ((rc--)) +fi + +zerorules +send $((1500 - $patlen)) +if [ $(countrule -c 1) -ne 4 ]; then + echo "FAIL: all rules should match at end of packet" + showrules + ((rc--)) +fi + +zerorules +send 1495 +if [ $(countrule -c 1) -ne 1 ]; then + echo "FAIL: only kmp with proper --to should match pattern spanning fragments" + showrules + ((rc--)) +fi + +zerorules +send 1500 +if [ $(countrule -c 1) -ne 2 ]; then + echo "FAIL: two rules should match pattern at start of second fragment" + showrules + ((rc--)) +fi + +zerorules +send $((1600 - $patlen)) +if [ $(countrule -c 1) -ne 2 ]; then + echo "FAIL: two rules should match pattern at end of largest --to" + showrules + ((rc--)) +fi + +zerorules +send $((1600 - $patlen + 1)) +if [ $(countrule -c 1) -ne 0 ]; then + echo "FAIL: no rules should match pattern extending largest --to" + showrules + ((rc--)) +fi + +zerorules +send 1600 +if [ $(countrule -c 1) -ne 0 ]; then + echo "FAIL: no rule should match pattern past largest --to" + showrules + ((rc--)) +fi + +exit $rc diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/netfilter/.gitignore deleted file mode 100644 index c2229b3e40d4..000000000000 --- a/tools/testing/selftests/netfilter/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -nf-queue -connect_close -audit_logread -conntrack_dump_flush -sctp_collision diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile deleted file mode 100644 index 936c3085bb83..000000000000 --- a/tools/testing/selftests/netfilter/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for netfilter selftests - -TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ - conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ - nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ - ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ - conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \ - conntrack_sctp_collision.sh xt_string.sh \ - bridge_netfilter.sh - -HOSTPKG_CONFIG := pkg-config - -CFLAGS += $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null) -LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) - -TEST_GEN_FILES = nf-queue connect_close audit_logread sctp_collision \ - conntrack_dump_flush - -include ../lib.mk diff --git a/tools/testing/selftests/netfilter/audit_logread.c b/tools/testing/selftests/netfilter/audit_logread.c deleted file mode 100644 index a0a880fc2d9d..000000000000 --- a/tools/testing/selftests/netfilter/audit_logread.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int fd; - -#define MAX_AUDIT_MESSAGE_LENGTH 8970 -struct audit_message { - struct nlmsghdr nlh; - union { - struct audit_status s; - char data[MAX_AUDIT_MESSAGE_LENGTH]; - } u; -}; - -int audit_recv(int fd, struct audit_message *rep) -{ - struct sockaddr_nl addr; - socklen_t addrlen = sizeof(addr); - int ret; - - do { - ret = recvfrom(fd, rep, sizeof(*rep), 0, - (struct sockaddr *)&addr, &addrlen); - } while (ret < 0 && errno == EINTR); - - if (ret < 0 || - addrlen != sizeof(addr) || - addr.nl_pid != 0 || - rep->nlh.nlmsg_type == NLMSG_ERROR) /* short-cut for now */ - return -1; - - return ret; -} - -int audit_send(int fd, uint16_t type, uint32_t key, uint32_t val) -{ - static int seq = 0; - struct audit_message msg = { - .nlh = { - .nlmsg_len = NLMSG_SPACE(sizeof(msg.u.s)), - .nlmsg_type = type, - .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, - .nlmsg_seq = ++seq, - }, - .u.s = { - .mask = key, - .enabled = key == AUDIT_STATUS_ENABLED ? val : 0, - .pid = key == AUDIT_STATUS_PID ? val : 0, - } - }; - struct sockaddr_nl addr = { - .nl_family = AF_NETLINK, - }; - int ret; - - do { - ret = sendto(fd, &msg, msg.nlh.nlmsg_len, 0, - (struct sockaddr *)&addr, sizeof(addr)); - } while (ret < 0 && errno == EINTR); - - if (ret != (int)msg.nlh.nlmsg_len) - return -1; - return 0; -} - -int audit_set(int fd, uint32_t key, uint32_t val) -{ - struct audit_message rep = { 0 }; - int ret; - - ret = audit_send(fd, AUDIT_SET, key, val); - if (ret) - return ret; - - ret = audit_recv(fd, &rep); - if (ret < 0) - return ret; - return 0; -} - -int readlog(int fd) -{ - struct audit_message rep = { 0 }; - int ret = audit_recv(fd, &rep); - const char *sep = ""; - char *k, *v; - - if (ret < 0) - return ret; - - if (rep.nlh.nlmsg_type != AUDIT_NETFILTER_CFG) - return 0; - - /* skip the initial "audit(...): " part */ - strtok(rep.u.data, " "); - - while ((k = strtok(NULL, "="))) { - v = strtok(NULL, " "); - - /* these vary and/or are uninteresting, ignore */ - if (!strcmp(k, "pid") || - !strcmp(k, "comm") || - !strcmp(k, "subj")) - continue; - - /* strip the varying sequence number */ - if (!strcmp(k, "table")) - *strchrnul(v, ':') = '\0'; - - printf("%s%s=%s", sep, k, v); - sep = " "; - } - if (*sep) { - printf("\n"); - fflush(stdout); - } - return 0; -} - -void cleanup(int sig) -{ - audit_set(fd, AUDIT_STATUS_ENABLED, 0); - close(fd); - if (sig) - exit(0); -} - -int main(int argc, char **argv) -{ - struct sigaction act = { - .sa_handler = cleanup, - }; - - fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_AUDIT); - if (fd < 0) { - perror("Can't open netlink socket"); - return -1; - } - - if (sigaction(SIGTERM, &act, NULL) < 0 || - sigaction(SIGINT, &act, NULL) < 0) { - perror("Can't set signal handler"); - close(fd); - return -1; - } - - audit_set(fd, AUDIT_STATUS_ENABLED, 1); - audit_set(fd, AUDIT_STATUS_PID, getpid()); - - while (1) - readlog(fd); -} diff --git a/tools/testing/selftests/netfilter/bridge_brouter.sh b/tools/testing/selftests/netfilter/bridge_brouter.sh deleted file mode 100755 index 29f3955b9af7..000000000000 --- a/tools/testing/selftests/netfilter/bridge_brouter.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# -# This test is for bridge 'brouting', i.e. make some packets being routed -# rather than getting bridged even though they arrive on interface that is -# part of a bridge. - -# eth0 br0 eth0 -# setup is: ns1 <-> ns0 <-> ns2 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ns0 -ip netns add ns1 -ip netns add ns2 - -ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi -ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 - -ip -net ns0 link set lo up -ip -net ns0 link set veth0 up -ip -net ns0 link set veth1 up - -ip -net ns0 link add br0 type bridge -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -ip -net ns0 link set veth0 master br0 -ip -net ns0 link set veth1 master br0 -ip -net ns0 link set br0 up -ip -net ns0 addr add 10.0.0.1/24 dev br0 - -# place both in same subnet, ns1 and ns2 connected via ns0:br0 -for i in 1 2; do - ip -net ns$i link set lo up - ip -net ns$i link set eth0 up - ip -net ns$i addr add 10.0.0.1$i/24 dev eth0 -done - -test_ebtables_broute() -{ - local cipt - - # redirect is needed so the dstmac is rewritten to the bridge itself, - # ip stack won't process OTHERHOST (foreign unicast mac) packets. - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - if [ $? -ne 0 ]; then - echo "SKIP: Could not add ebtables broute redirect rule" - return $ksft_skip - fi - - # ping netns1, expected to not work (ip forwarding is off) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed" 1>&2 - return 1 - fi - - # enable forwarding on both interfaces. - # neither needs an ip address, but at least the bridge needs - # an ip address in same network segment as ns1 and ns2 (ns0 - # needs to be able to determine route for to-be-forwarded packet). - ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1 - ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1 - - sleep 1 - - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule" - ip netns exec ns0 ebtables -t broute -F - - # ping netns1, expected to work (frames are bridged) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (bridged)" 1>&2 - return 1 - fi - - ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP - - # ping netns1, expected to not work (DROP in bridge forward) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then - echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 - return 1 - fi - - # re-activate brouter - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - - ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 - return 1 - fi - - echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop" - return 0 -} - -# test basic connectivity -ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns2 from ns1" 1>&2 - ret=1 -fi - -ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns1 from ns2" 1>&2 - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: netns connectivity: ns1 and ns2 can reach each other" -fi - -test_ebtables_broute -ret=$? -for i in 0 1 2; do ip netns del ns$i;done - -exit $ret diff --git a/tools/testing/selftests/netfilter/bridge_netfilter.sh b/tools/testing/selftests/netfilter/bridge_netfilter.sh deleted file mode 100644 index 659b3ab02c8b..000000000000 --- a/tools/testing/selftests/netfilter/bridge_netfilter.sh +++ /dev/null @@ -1,188 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test bridge netfilter + conntrack, a combination that doesn't really work, -# with multicast/broadcast packets racing for hash table insertion. - -# eth0 br0 eth0 -# setup is: ns1 <->,ns0 <-> ns3 -# ns2 <-' `'-> ns4 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" -ns3="ns3-$sfx" -ns4="ns4-$sfx" - -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -for i in $(seq 0 4); do - eval ip netns add \$ns$i -done - -cleanup() { - for i in $(seq 0 4); do eval ip netns del \$ns$i;done -} - -trap cleanup EXIT - -do_ping() -{ - fromns="$1" - dstip="$2" - - ip netns exec $fromns ping -c 1 -q $dstip > /dev/null - if [ $? -ne 0 ]; then - echo "ERROR: ping from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - ret=1 - fi -} - -bcast_ping() -{ - fromns="$1" - dstip="$2" - - for i in $(seq 1 1000); do - ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "ERROR: ping -b from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - fi - done -} - -ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1} -if [ $? -ne 0 ]; then - echo "SKIP: Can't create veth device" - exit $ksft_skip -fi - -ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2 -ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3 -ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4 - -ip -net ${ns0} link set lo up - -for i in $(seq 1 4); do - ip -net ${ns0} link set veth$i up -done - -ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1 -if [ $? -ne 0 ]; then - echo "SKIP: Can't create bridge br0" - exit $ksft_skip -fi - -# make veth0,1,2 part of bridge. -for i in $(seq 1 3); do - ip -net ${ns0} link set veth$i master br0 -done - -# add a macvlan on top of the bridge. -MACVLAN_ADDR=ba:f3:13:37:42:23 -ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private -ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan0 up -ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0 - -# add a macvlan on top of veth4. -MACVLAN_ADDR=ba:f3:13:37:42:24 -ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa -ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan4 up - -# make the macvlan part of the bridge. -# veth4 is not a bridge port, only the macvlan on top of it. -ip -net ${ns0} link set macvlan4 master br0 - -ip -net ${ns0} link set br0 up -ip -net ${ns0} addr add 10.0.0.1/24 dev br0 -ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1 -ret=$? -if [ $ret -ne 0 ] ; then - echo "SKIP: bridge netfilter not available" - ret=$ksft_skip -fi - -# for testing, so namespaces will reply to ping -b probes. -ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 - -# enable conntrack in ns0 and drop broadcast packets in forward to -# avoid them from getting confirmed in the postrouting hook before -# the cloned skb is passed up the stack. -ip netns exec ${ns0} nft -f - < -#include -#include -#include -#include -#include - -#include -#include - -#define PORT 12345 -#define RUNTIME 10 - -static struct { - unsigned int timeout; - unsigned int port; -} opts = { - .timeout = RUNTIME, - .port = PORT, -}; - -static void handler(int sig) -{ - _exit(sig == SIGALRM ? 0 : 1); -} - -static void set_timeout(void) -{ - struct sigaction action = { - .sa_handler = handler, - }; - - sigaction(SIGALRM, &action, NULL); - - alarm(opts.timeout); -} - -static void do_connect(const struct sockaddr_in *dst) -{ - int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - - if (s >= 0) - fcntl(s, F_SETFL, O_NONBLOCK); - - connect(s, (struct sockaddr *)dst, sizeof(*dst)); - close(s); -} - -static void do_accept(const struct sockaddr_in *src) -{ - int c, one = 1, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - - if (s < 0) - return; - - setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); - setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); - - bind(s, (struct sockaddr *)src, sizeof(*src)); - - listen(s, 16); - - c = accept(s, NULL, NULL); - if (c >= 0) - close(c); - - close(s); -} - -static int accept_loop(void) -{ - struct sockaddr_in src = { - .sin_family = AF_INET, - .sin_port = htons(opts.port), - }; - - inet_pton(AF_INET, "127.0.0.1", &src.sin_addr); - - set_timeout(); - - for (;;) - do_accept(&src); - - return 1; -} - -static int connect_loop(void) -{ - struct sockaddr_in dst = { - .sin_family = AF_INET, - .sin_port = htons(opts.port), - }; - - inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); - - set_timeout(); - - for (;;) - do_connect(&dst); - - return 1; -} - -static void parse_opts(int argc, char **argv) -{ - int c; - - while ((c = getopt(argc, argv, "t:p:")) != -1) { - switch (c) { - case 't': - opts.timeout = atoi(optarg); - break; - case 'p': - opts.port = atoi(optarg); - break; - } - } -} - -int main(int argc, char *argv[]) -{ - pid_t p; - - parse_opts(argc, argv); - - p = fork(); - if (p < 0) - return 111; - - if (p > 0) - return accept_loop(); - - return connect_loop(); -} diff --git a/tools/testing/selftests/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/netfilter/conntrack_dump_flush.c deleted file mode 100644 index b11ea8ee6719..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_dump_flush.c +++ /dev/null @@ -1,471 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#define _GNU_SOURCE - -#include -#include -#include - -#include -#include -#include -#include -#include "../kselftest_harness.h" - -#define TEST_ZONE_ID 123 -#define NF_CT_DEFAULT_ZONE_ID 0 - -static int reply_counter; - -static int build_cta_tuple_v4(struct nlmsghdr *nlh, int type, - uint32_t src_ip, uint32_t dst_ip, - uint16_t src_port, uint16_t dst_port) -{ - struct nlattr *nest, *nest_ip, *nest_proto; - - nest = mnl_attr_nest_start(nlh, type); - if (!nest) - return -1; - - nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); - if (!nest_ip) - return -1; - mnl_attr_put_u32(nlh, CTA_IP_V4_SRC, src_ip); - mnl_attr_put_u32(nlh, CTA_IP_V4_DST, dst_ip); - mnl_attr_nest_end(nlh, nest_ip); - - nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); - if (!nest_proto) - return -1; - mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); - mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); - mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); - mnl_attr_nest_end(nlh, nest_proto); - - mnl_attr_nest_end(nlh, nest); -} - -static int build_cta_tuple_v6(struct nlmsghdr *nlh, int type, - struct in6_addr src_ip, struct in6_addr dst_ip, - uint16_t src_port, uint16_t dst_port) -{ - struct nlattr *nest, *nest_ip, *nest_proto; - - nest = mnl_attr_nest_start(nlh, type); - if (!nest) - return -1; - - nest_ip = mnl_attr_nest_start(nlh, CTA_TUPLE_IP); - if (!nest_ip) - return -1; - mnl_attr_put(nlh, CTA_IP_V6_SRC, sizeof(struct in6_addr), &src_ip); - mnl_attr_put(nlh, CTA_IP_V6_DST, sizeof(struct in6_addr), &dst_ip); - mnl_attr_nest_end(nlh, nest_ip); - - nest_proto = mnl_attr_nest_start(nlh, CTA_TUPLE_PROTO); - if (!nest_proto) - return -1; - mnl_attr_put_u8(nlh, CTA_PROTO_NUM, 6); - mnl_attr_put_u16(nlh, CTA_PROTO_SRC_PORT, htons(src_port)); - mnl_attr_put_u16(nlh, CTA_PROTO_DST_PORT, htons(dst_port)); - mnl_attr_nest_end(nlh, nest_proto); - - mnl_attr_nest_end(nlh, nest); -} - -static int build_cta_proto(struct nlmsghdr *nlh) -{ - struct nlattr *nest, *nest_proto; - - nest = mnl_attr_nest_start(nlh, CTA_PROTOINFO); - if (!nest) - return -1; - - nest_proto = mnl_attr_nest_start(nlh, CTA_PROTOINFO_TCP); - if (!nest_proto) - return -1; - mnl_attr_put_u8(nlh, CTA_PROTOINFO_TCP_STATE, TCP_CONNTRACK_ESTABLISHED); - mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, 0x0a0a); - mnl_attr_put_u16(nlh, CTA_PROTOINFO_TCP_FLAGS_REPLY, 0x0a0a); - mnl_attr_nest_end(nlh, nest_proto); - - mnl_attr_nest_end(nlh, nest); -} - -static int conntrack_data_insert(struct mnl_socket *sock, struct nlmsghdr *nlh, - uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *rplnlh; - unsigned int portid; - int err, ret; - - portid = mnl_socket_get_portid(sock); - - ret = build_cta_proto(nlh); - if (ret < 0) { - perror("build_cta_proto"); - return -1; - } - mnl_attr_put_u32(nlh, CTA_TIMEOUT, htonl(20000)); - mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); - - if (mnl_socket_sendto(sock, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - return -1; - } - - ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - if (ret < 0) { - perror("mnl_socket_recvfrom"); - return ret; - } - - ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); - if (ret < 0) { - if (errno == EEXIST) { - /* The entries are probably still there from a previous - * run. So we are good - */ - return 0; - } - perror("mnl_cb_run"); - return ret; - } - - return 0; -} - -static int conntrack_data_generate_v4(struct mnl_socket *sock, uint32_t src_ip, - uint32_t dst_ip, uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh; - struct nfgenmsg *nfh; - int ret; - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | - NLM_F_ACK | NLM_F_EXCL; - nlh->nlmsg_seq = time(NULL); - - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_INET; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - ret = build_cta_tuple_v4(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, 12345, 443); - if (ret < 0) { - perror("build_cta_tuple_v4"); - return ret; - } - ret = build_cta_tuple_v4(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, 443, 12345); - if (ret < 0) { - perror("build_cta_tuple_v4"); - return ret; - } - return conntrack_data_insert(sock, nlh, zone); -} - -static int conntrack_data_generate_v6(struct mnl_socket *sock, - struct in6_addr src_ip, - struct in6_addr dst_ip, - uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh; - struct nfgenmsg *nfh; - int ret; - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_NEW; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | - NLM_F_ACK | NLM_F_EXCL; - nlh->nlmsg_seq = time(NULL); - - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_INET6; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - ret = build_cta_tuple_v6(nlh, CTA_TUPLE_ORIG, src_ip, dst_ip, - 12345, 443); - if (ret < 0) { - perror("build_cta_tuple_v6"); - return ret; - } - ret = build_cta_tuple_v6(nlh, CTA_TUPLE_REPLY, dst_ip, src_ip, - 12345, 443); - if (ret < 0) { - perror("build_cta_tuple_v6"); - return ret; - } - return conntrack_data_insert(sock, nlh, zone); -} - -static int count_entries(const struct nlmsghdr *nlh, void *data) -{ - reply_counter++; -} - -static int conntracK_count_zone(struct mnl_socket *sock, uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh, *rplnlh; - struct nfgenmsg *nfh; - struct nlattr *nest; - unsigned int portid; - int err, ret; - - portid = mnl_socket_get_portid(sock); - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_GET; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; - nlh->nlmsg_seq = time(NULL); - - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_UNSPEC; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); - - ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); - if (ret < 0) { - perror("mnl_socket_sendto"); - return ret; - } - - reply_counter = 0; - ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - while (ret > 0) { - ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, - count_entries, NULL); - if (ret <= MNL_CB_STOP) - break; - - ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - } - if (ret < 0) { - perror("mnl_socket_recvfrom"); - return ret; - } - - return reply_counter; -} - -static int conntrack_flush_zone(struct mnl_socket *sock, uint16_t zone) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - struct nlmsghdr *nlh, *rplnlh; - struct nfgenmsg *nfh; - struct nlattr *nest; - unsigned int portid; - int err, ret; - - portid = mnl_socket_get_portid(sock); - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8) | IPCTNL_MSG_CT_DELETE; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - nlh->nlmsg_seq = time(NULL); - - nfh = mnl_nlmsg_put_extra_header(nlh, sizeof(struct nfgenmsg)); - nfh->nfgen_family = AF_UNSPEC; - nfh->version = NFNETLINK_V0; - nfh->res_id = 0; - - mnl_attr_put_u16(nlh, CTA_ZONE, htons(zone)); - - ret = mnl_socket_sendto(sock, nlh, nlh->nlmsg_len); - if (ret < 0) { - perror("mnl_socket_sendto"); - return ret; - } - - ret = mnl_socket_recvfrom(sock, buf, MNL_SOCKET_BUFFER_SIZE); - if (ret < 0) { - perror("mnl_socket_recvfrom"); - return ret; - } - - ret = mnl_cb_run(buf, ret, nlh->nlmsg_seq, portid, NULL, NULL); - if (ret < 0) { - perror("mnl_cb_run"); - return ret; - } - - return 0; -} - -FIXTURE(conntrack_dump_flush) -{ - struct mnl_socket *sock; -}; - -FIXTURE_SETUP(conntrack_dump_flush) -{ - struct in6_addr src, dst; - int ret; - - self->sock = mnl_socket_open(NETLINK_NETFILTER); - if (!self->sock) { - perror("mnl_socket_open"); - exit(EXIT_FAILURE); - } - - if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) { - perror("mnl_socket_bind"); - exit(EXIT_FAILURE); - } - - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - if (ret < 0 && errno == EPERM) - SKIP(return, "Needs to be run as root"); - else if (ret < 0 && errno == EOPNOTSUPP) - SKIP(return, "Kernel does not seem to support conntrack zones"); - - ret = conntrack_data_generate_v4(self->sock, 0xf0f0f0f0, 0xf1f1f1f1, - TEST_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = conntrack_data_generate_v4(self->sock, 0xf2f2f2f2, 0xf3f3f3f3, - TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 0); - ret = conntrack_data_generate_v4(self->sock, 0xf4f4f4f4, 0xf5f5f5f5, - TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 0); - ret = conntrack_data_generate_v4(self->sock, 0xf6f6f6f6, 0xf7f7f7f7, - NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); - - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x01000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x02000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - TEST_ZONE_ID); - EXPECT_EQ(ret, 0); - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x03000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x04000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 0); - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x05000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x06000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 0); - - src = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x07000000 - } - }}; - dst = (struct in6_addr) {{ - .__u6_addr32 = { - 0xb80d0120, - 0x00000000, - 0x00000000, - 0x08000000 - } - }}; - ret = conntrack_data_generate_v6(self->sock, src, dst, - NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); - - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_GE(ret, 2); - if (ret > 2) - SKIP(return, "kernel does not support filtering by zone"); -} - -FIXTURE_TEARDOWN(conntrack_dump_flush) -{ -} - -TEST_F(conntrack_dump_flush, test_dump_by_zone) -{ - int ret; - - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 2); -} - -TEST_F(conntrack_dump_flush, test_flush_by_zone) -{ - int ret; - - ret = conntrack_flush_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 2); -} - -TEST_F(conntrack_dump_flush, test_flush_by_zone_default) -{ - int ret; - - ret = conntrack_flush_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); - EXPECT_EQ(ret, 2); - ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); - EXPECT_EQ(ret, 0); -} - -TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh deleted file mode 100755 index 76645aaf2b58..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh +++ /dev/null @@ -1,315 +0,0 @@ -#!/bin/bash -# -# check that ICMP df-needed/pkttoobig icmp are set are set as related -# state -# -# Setup is: -# -# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2 -# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280). -# ping nsclient2 from nsclient1, checking that conntrack did set RELATED -# 'fragmentation needed' icmp packet. -# -# In addition, nsrouter1 will perform IP masquerading, i.e. also -# check the icmp errors are propagated to the correct host as per -# nat of "established" icmp-echo "connection". - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -cleanup() { - for i in 1 2;do ip netns del nsclient$i;done - for i in 1 2;do ip netns del nsrouter$i;done -} - -trap cleanup EXIT - -ipv4() { - echo -n 192.168.$1.2 -} - -ipv6 () { - echo -n dead:$1::2 -} - -check_counter() -{ - ns=$1 - name=$2 - expect=$3 - local lret=0 - - cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect") - if [ $? -ne 0 ]; then - echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns nft list counter inet filter "$name" 1>&2 - lret=1 - fi - - return $lret -} - -check_unknown() -{ - expect="packets 0 bytes 0" - for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - check_counter $n "unknown" "$expect" - if [ $? -ne 0 ] ;then - return 1 - fi - done - - return 0 -} - -for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - ip netns add $n - ip -net $n link set lo up -done - -DEV=veth0 -ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1 -DEV=veth0 -ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2 - -DEV=veth0 -ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2 - -DEV=veth0 -for i in 1 2; do - ip -net nsclient$i link set $DEV up - ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV - ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV -done - -ip -net nsrouter1 link set eth1 up -ip -net nsrouter1 link set veth0 up - -ip -net nsrouter2 link set eth1 up -ip -net nsrouter2 link set eth2 up - -ip -net nsclient1 route add default via 192.168.1.1 -ip -net nsclient1 -6 route add default via dead:1::1 - -ip -net nsclient2 route add default via 192.168.2.1 -ip -net nsclient2 route add default via dead:2::1 - -i=3 -ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1 -ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0 -ip -net nsrouter1 addr add dead:1::1/64 dev eth1 -ip -net nsrouter1 addr add dead:3::1/64 dev veth0 -ip -net nsrouter1 route add default via 192.168.3.10 -ip -net nsrouter1 -6 route add default via dead:3::10 - -ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1 -ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2 -ip -net nsrouter2 addr add dead:2::1/64 dev eth1 -ip -net nsrouter2 addr add dead:3::10/64 dev eth2 -ip -net nsrouter2 route add default via 192.168.3.1 -ip -net nsrouter2 route add default via dead:3::1 - -sleep 2 -for i in 4 6; do - ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1 - ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1 -done - -for netns in nsrouter1 nsrouter2; do -ip netns exec $netns nft -f - </dev/null -if [ $? -ne 0 ]; then - echo "ERROR: netns ip routing/connectivity broken" 1>&2 - cleanup - exit 1 -fi -ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null -if [ $? -ne 0 ]; then - echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 - cleanup - exit 1 -fi - -check_unknown -if [ $? -ne 0 ]; then - ret=1 -fi - -expect="packets 0 bytes 0" -for netns in nsrouter1 nsrouter2 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then - ret=1 - fi -done - -expect="packets 2 bytes 2076" -check_counter nsclient2 "new" "$expect" -if [ $? -ne 0 ]; then - ret=1 -fi - -ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null -if [ $? -eq 0 ]; then - echo "ERROR: ping should have failed with PMTU too big error" 1>&2 - ret=1 -fi - -# nsrouter2 should have generated the icmp error, so -# related counter should be 0 (its in forward). -expect="packets 0 bytes 0" -check_counter "nsrouter2" "related" "$expect" -if [ $? -ne 0 ]; then - ret=1 -fi - -# but nsrouter1 should have seen it, same for nsclient1. -expect="packets 1 bytes 576" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then - ret=1 - fi -done - -ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null -if [ $? -eq 0 ]; then - echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 - ret=1 -fi - -expect="packets 2 bytes 1856" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then - ret=1 - fi -done - -if [ $ret -eq 0 ];then - echo "PASS: icmp mtu error had RELATED state" -else - echo "ERROR: icmp error RELATED state test has failed" -fi - -# add 'bad' route, expect icmp REDIRECT to be generated -ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1 -ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1 - -ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null - -expect="packets 1 bytes 112" -check_counter nsclient1 "redir4" "$expect" -if [ $? -ne 0 ];then - ret=1 -fi - -ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null -expect="packets 1 bytes 192" -check_counter nsclient1 "redir6" "$expect" -if [ $? -ne 0 ];then - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: icmp redirects had RELATED state" -else - echo "ERROR: icmp redirect RELATED state test has failed" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh deleted file mode 100755 index a924e595cfd8..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Testing For SCTP COLLISION SCENARIO as Below: -# -# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] -# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] -# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] -# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] -# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] -# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] -# -# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS - -CLIENT_NS=$(mktemp -u client-XXXXXXXX) -CLIENT_IP="198.51.200.1" -CLIENT_PORT=1234 - -SERVER_NS=$(mktemp -u server-XXXXXXXX) -SERVER_IP="198.51.100.1" -SERVER_PORT=1234 - -ROUTER_NS=$(mktemp -u router-XXXXXXXX) -CLIENT_GW="198.51.200.2" -SERVER_GW="198.51.100.2" - -# setup the topo -setup() { - ip net add $CLIENT_NS - ip net add $SERVER_NS - ip net add $ROUTER_NS - ip -n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS - ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS - - ip -n $SERVER_NS link set link0 up - ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0 - ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW - - ip -n $ROUTER_NS link set link1 up - ip -n $ROUTER_NS link set link2 up - ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1 - ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2 - ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1 - - ip -n $CLIENT_NS link set link3 up - ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3 - ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW - - # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with - # tc on $SERVER_NS side - tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb - tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit - tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ - 0xff match u8 2 0xff at 32 flowid 1:1 - tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms - - # simulate the ctstate check on OVS nf_conntrack - ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP - ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP - - # use a smaller number for assoc's max_retrans to reproduce the issue - modprobe sctp - ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3 -} - -cleanup() { - ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null - ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null - ip net del "$CLIENT_NS" - ip net del "$SERVER_NS" - ip net del "$ROUTER_NS" -} - -do_test() { - ip net exec $SERVER_NS ./sctp_collision server \ - $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & - ip net exec $CLIENT_NS ./sctp_collision client \ - $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT -} - -# NOTE: one way to work around the issue is set a smaller hb_interval -# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500 - -# run the test case -trap cleanup EXIT -setup && \ -echo "Test for SCTP Collision in nf_conntrack:" && \ -do_test && echo "PASS!" -exit $? diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh deleted file mode 100755 index e7d7bf13cff5..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Check that UNREPLIED tcp conntrack will eventually timeout. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -waittime=20 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - - ip netns del $ns1 - ip netns del $ns2 -} - -ipv4() { - echo -n 192.168.$1.2 -} - -check_counter() -{ - ns=$1 - name=$2 - expect=$3 - local lret=0 - - cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect") - if [ $? -ne 0 ]; then - echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns2 nft list counter inet filter "$name" 1>&2 - lret=1 - fi - - return $lret -} - -# Create test namespaces -ip netns add $ns1 || exit 1 - -trap cleanup EXIT - -ip netns add $ns2 || exit 1 - -# Connect the namespace to the host using a veth pair -ip -net $ns1 link add name veth1 type veth peer name veth2 -ip -net $ns1 link set netns $ns2 dev veth2 - -ip -net $ns1 link set up dev lo -ip -net $ns2 link set up dev lo -ip -net $ns1 link set up dev veth1 -ip -net $ns2 link set up dev veth2 - -ip -net $ns2 addr add 10.11.11.2/24 dev veth2 -ip -net $ns2 route add default via 10.11.11.1 - -ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1 - -# add a rule inside NS so we enable conntrack -ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT - -ip -net $ns1 addr add 10.11.11.1/24 dev veth1 -ip -net $ns1 route add 10.99.99.99 via 10.11.11.2 - -# Check connectivity works -ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1 - -ip netns exec $ns2 nc -l -p 8080 < /dev/null & - -# however, conntrack entries are there - -ip netns exec $ns2 nft -f - < $ns2 to the virtual ip" -ip netns exec $ns1 bash -c 'while true ; do - nc -p 60000 10.99.99.99 80 - sleep 1 - done' & - -sleep 1 - -ip netns exec $ns2 nft -f - </dev/null | wc -l) -if [ $count -eq 0 ]; then - echo "ERROR: $ns2 did not pick up tcp connection from peer" - exit 1 -fi - -echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect" -for i in $(seq 1 $waittime); do - echo -n "." - - sleep 1 - - count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) - if [ $count -gt 0 ]; then - echo - echo "PASS: redirection took effect after $i seconds" - break - fi - - m=$((i%20)) - if [ $m -eq 0 ]; then - echo " waited for $i seconds" - fi -done - -expect="packets 1 bytes 60" -check_counter "$ns2" "redir" "$expect" -if [ $? -ne 0 ]; then - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: redirection counter has expected values" -else - echo "ERROR: no tcp connection was redirected" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/netfilter/conntrack_vrf.sh deleted file mode 100755 index 8b5ea9234588..000000000000 --- a/tools/testing/selftests/netfilter/conntrack_vrf.sh +++ /dev/null @@ -1,241 +0,0 @@ -#!/bin/sh - -# This script demonstrates interaction of conntrack and vrf. -# The vrf driver calls the netfilter hooks again, with oif/iif -# pointing at the VRF device. -# -# For ingress, this means first iteration has iifname of lower/real -# device. In this script, thats veth0. -# Second iteration is iifname set to vrf device, tvrf in this script. -# -# For egress, this is reversed: first iteration has the vrf device, -# second iteration is done with the lower/real/veth0 device. -# -# test_ct_zone_in demonstrates unexpected change of nftables -# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack -# connection on VRF rcv" -# -# It was possible to assign conntrack zone to a packet (or mark it for -# `notracking`) in the prerouting chain before conntrack, based on real iif. -# -# After the change, the zone assignment is lost and the zone is assigned based -# on the VRF master interface (in case such a rule exists). -# assignment is lost. Instead, assignment based on the `iif` matching -# Thus it is impossible to distinguish packets based on the original -# interface. -# -# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem -# that was supposed to be fixed by the commit mentioned above to make sure -# that any fix to test case 1 won't break masquerade again. - -ksft_skip=4 - -IP0=172.30.30.1 -IP1=172.30.30.2 -PFXL=30 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" - -cleanup() -{ - ip netns pids $ns0 | xargs kill 2>/dev/null - ip netns pids $ns1 | xargs kill 2>/dev/null - - ip netns del $ns0 $ns1 -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi -ip netns add "$ns1" - -trap cleanup EXIT - -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 - -ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not add veth device" - exit $ksft_skip -fi - -ip -net $ns0 li add tvrf type vrf table 9876 -if [ $? -ne 0 ];then - echo "SKIP: Could not add vrf device" - exit $ksft_skip -fi - -ip -net $ns0 li set lo up - -ip -net $ns0 li set veth0 master tvrf -ip -net $ns0 li set tvrf up -ip -net $ns0 li set veth0 up -ip -net $ns1 li set veth0 up - -ip -net $ns0 addr add $IP0/$PFXL dev veth0 -ip -net $ns1 addr add $IP1/$PFXL dev veth0 - -ip netns exec $ns1 iperf3 -s > /dev/null 2>&1& -if [ $? -ne 0 ];then - echo "SKIP: Could not start iperf3" - exit $ksft_skip -fi - -# test vrf ingress handling. -# The incoming connection should be placed in conntrack zone 1, -# as decided by the first iteration of the ruleset. -test_ct_zone_in() -{ -ip netns exec $ns0 nft -f - < /dev/null - - # should be in zone 1, not zone 2 - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) - if [ $count -eq 1 ]; then - echo "PASS: entry found in conntrack zone 1" - else - echo "FAIL: entry not found in conntrack zone 1" - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) - if [ $count -eq 1 ]; then - echo "FAIL: entry found in zone 2 instead" - else - echo "FAIL: entry not in zone 1 or 2, dumping table" - ip netns exec $ns0 conntrack -L - ip netns exec $ns0 nft list ruleset - fi - fi -} - -# add masq rule that gets evaluated w. outif set to vrf device. -# This tests the first iteration of the packet through conntrack, -# oifname is the vrf device. -test_masquerade_vrf() -{ - local qdisc=$1 - - if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc add dev tvrf root $qdisc - fi - - ip netns exec $ns0 conntrack -F 2>/dev/null - -ip netns exec $ns0 nft -f - </dev/null - if [ $? -ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device" - ret=1 - return - fi - - # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' && - ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]' - if [ $? -eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" - else - echo "FAIL: vrf rules have unexpected counter value" - ret=1 - fi - - if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc del dev tvrf root - fi -} - -# add masq rule that gets evaluated w. outif set to veth device. -# This tests the 2nd iteration of the packet through conntrack, -# oifname is the lower device (veth0 in this case). -test_masquerade_veth() -{ - ip netns exec $ns0 conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - < /dev/null - if [ $? -ne 0 ]; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device" - ret=1 - return - fi - - # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' - if [ $? -eq 0 ]; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device" - else - echo "FAIL: vrf masq rule has unexpected counter value" - ret=1 - fi -} - -test_ct_zone_in -test_masquerade_vrf "default" -test_masquerade_vrf "pfifo" -test_masquerade_veth - -exit $ret diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh deleted file mode 100755 index eb9553e4986b..000000000000 --- a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh +++ /dev/null @@ -1,207 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -# Conntrack needs to reassemble fragments in order to have complete -# packets for rule matching. Reassembly can lead to packet loss. - -# Consider the following setup: -# +--------+ +---------+ +--------+ -# |Router A|-------|Wanrouter|-------|Router B| -# | |.IPIP..| |..IPIP.| | -# +--------+ +---------+ +--------+ -# / mtu 1400 \ -# / \ -#+--------+ +--------+ -#|Client A| |Client B| -#| | | | -#+--------+ +--------+ - -# Router A and Router B use IPIP tunnel interfaces to tunnel traffic -# between Client A and Client B over WAN. Wanrouter has MTU 1400 set -# on its interfaces. - -rnd=$(mktemp -u XXXXXXXX) -rx=$(mktemp) - -r_a="ns-ra-$rnd" -r_b="ns-rb-$rnd" -r_w="ns-rw-$rnd" -c_a="ns-ca-$rnd" -c_b="ns-cb-$rnd" - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "iptables --version" "run test without iptables" -checktool "ip -Version" "run test without ip tool" -checktool "which socat" "run test without socat" -checktool "ip netns add ${r_a}" "create net namespace" - -for n in ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns add ${n} -done - -cleanup() { - for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns del ${n} - done - rm -f ${rx} -} - -trap cleanup EXIT - -test_path() { - msg="$1" - - ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null & - - sleep 1 - for i in 1 2 3; do - head -c1400 /dev/zero | tr "\000" "a" | \ - ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000 - done - - wait - - bytes=$(wc -c < ${rx}) - - if [ $bytes -eq 1400 ];then - echo "OK: PMTU $msg connection tracking" - else - echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" - exit 1 - fi -} - -# Detailed setup for Router A -# --------------------------- -# Interfaces: -# eth0: 10.2.2.1/24 -# eth1: 192.168.10.1/24 -# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1 -# Routes: -# 192.168.20.0/24 dev ipip0 (192.168.20.0/24 is subnet of Client B) -# 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) -# No iptables rules at all. - -ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w} -ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a} - -l_addr="10.2.2.1" -r_addr="10.4.4.1" -ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip - -for dev in lo veth0 veth1 ipip0; do - ip -net ${r_a} link set $dev up -done - -ip -net ${r_a} addr add 10.2.2.1/24 dev veth0 -ip -net ${r_a} addr add 192.168.10.1/24 dev veth1 - -ip -net ${r_a} route add 192.168.20.0/24 dev ipip0 -ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254 - -ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null - -# Detailed setup for Router B -# --------------------------- -# Interfaces: -# eth0: 10.4.4.1/24 -# eth1: 192.168.20.1/24 -# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1 -# Routes: -# 192.168.10.0/24 dev ipip0 (192.168.10.0/24 is subnet of Client A) -# 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) -# No iptables rules at all. - -ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w} -ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b} - -l_addr="10.4.4.1" -r_addr="10.2.2.1" - -ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip - -for dev in lo veth0 veth1 ipip0; do - ip -net ${r_b} link set $dev up -done - -ip -net ${r_b} addr add 10.4.4.1/24 dev veth0 -ip -net ${r_b} addr add 192.168.20.1/24 dev veth1 - -ip -net ${r_b} route add 192.168.10.0/24 dev ipip0 -ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254 -ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null - -# Client A -ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 -ip -net ${c_a} link set dev lo up -ip -net ${c_a} link set dev veth0 up -ip -net ${c_a} route add default via 192.168.10.1 - -# Client A -ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 -ip -net ${c_b} link set dev veth0 up -ip -net ${c_b} link set dev lo up -ip -net ${c_b} route add default via 192.168.20.1 - -# Wan -ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 -ip -net ${r_w} addr add 10.4.4.254/24 dev veth1 - -ip -net ${r_w} link set dev lo up -ip -net ${r_w} link set dev veth0 up mtu 1400 -ip -net ${r_w} link set dev veth1 up mtu 1400 - -ip -net ${r_a} link set dev veth0 mtu 1400 -ip -net ${r_b} link set dev veth0 mtu 1400 - -ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null - -# Path MTU discovery -# ------------------ -# Running tracepath from Client A to Client B shows PMTU discovery is working -# as expected: -# -# clienta:~# tracepath 192.168.20.2 -# 1?: [LOCALHOST] pmtu 1500 -# 1: 192.168.10.1 0.867ms -# 1: 192.168.10.1 0.302ms -# 2: 192.168.10.1 0.312ms pmtu 1480 -# 2: no reply -# 3: 192.168.10.1 0.510ms pmtu 1380 -# 3: 192.168.20.2 2.320ms reached -# Resume: pmtu 1380 hops 3 back 3 - -# ip netns exec ${c_a} traceroute --mtu 192.168.20.2 - -# Router A has learned PMTU (1400) to Router B from Wanrouter. -# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B -# from Router A. - -#Send large UDP packet -#--------------------- -#Now we send a 1400 bytes UDP packet from Client A to Client B: - -# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000 -test_path "without" - -# The IPv4 stack on Client A already knows the PMTU to Client B, so the -# UDP packet is sent as two fragments (1380 + 20). Router A forwards the -# fragments between eth1 and ipip0. The fragments fit into the tunnel and -# reach their destination. - -#When sending the large UDP packet again, Router A now reassembles the -#fragments before routing the packet over ipip0. The resulting IPIP -#packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is -#dropped on Router A before sending. - -ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW -test_path "with" diff --git a/tools/testing/selftests/netfilter/ipvs.sh b/tools/testing/selftests/netfilter/ipvs.sh deleted file mode 100755 index c3b8f90c497e..000000000000 --- a/tools/testing/selftests/netfilter/ipvs.sh +++ /dev/null @@ -1,228 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# End-to-end ipvs test suite -# Topology: -#--------------------------------------------------------------+ -# | | -# ns0 | ns1 | -# ----------- | ----------- ----------- | -# | veth01 | --------- | veth10 | | veth12 | | -# ----------- peer ----------- ----------- | -# | | | | -# ----------- | | | -# | br0 | |----------------- peer |--------------| -# ----------- | | | -# | | | | -# ---------- peer ---------- ----------- | -# | veth02 | --------- | veth20 | | veth21 | | -# ---------- | ---------- ----------- | -# | ns2 | -# | | -#--------------------------------------------------------------+ -# -# We assume that all network driver are loaded -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 -GREEN='\033[0;92m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -readonly port=8080 - -readonly vip_v4=207.175.44.110 -readonly cip_v4=10.0.0.2 -readonly gip_v4=10.0.0.1 -readonly dip_v4=172.16.0.1 -readonly rip_v4=172.16.0.2 -readonly sip_v4=10.0.0.3 - -readonly infile="$(mktemp)" -readonly outfile="$(mktemp)" -readonly datalen=32 - -sysipvsnet="/proc/sys/net/ipv4/vs/" -if [ ! -d $sysipvsnet ]; then - modprobe -q ip_vs - if [ $? -ne 0 ]; then - echo "skip: could not run test without ipvs module" - exit $ksft_skip - fi -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ipvsadm -v > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ipvsadm" - exit $ksft_skip -fi - -setup() { - ip netns add ns0 - ip netns add ns1 - ip netns add ns2 - - ip link add veth01 netns ns0 type veth peer name veth10 netns ns1 - ip link add veth02 netns ns0 type veth peer name veth20 netns ns2 - ip link add veth12 netns ns1 type veth peer name veth21 netns ns2 - - ip netns exec ns0 ip link set veth01 up - ip netns exec ns0 ip link set veth02 up - ip netns exec ns0 ip link add br0 type bridge - ip netns exec ns0 ip link set veth01 master br0 - ip netns exec ns0 ip link set veth02 master br0 - ip netns exec ns0 ip link set br0 up - ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0 - - ip netns exec ns1 ip link set lo up - ip netns exec ns1 ip link set veth10 up - ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10 - ip netns exec ns1 ip link set veth12 up - ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12 - - ip netns exec ns2 ip link set lo up - ip netns exec ns2 ip link set veth21 up - ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21 - ip netns exec ns2 ip link set veth20 up - ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20 - - sleep 1 - - dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none -} - -cleanup() { - for i in 0 1 2 - do - ip netns del ns$i > /dev/null 2>&1 - done - - if [ -f "${outfile}" ]; then - rm "${outfile}" - fi - if [ -f "${infile}" ]; then - rm "${infile}" - fi -} - -server_listen() { - ip netns exec ns2 nc -l -p 8080 > "${outfile}" & - server_pid=$! - sleep 0.2 -} - -client_connect() { - ip netns exec ns0 timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}" -} - -verify_data() { - wait "${server_pid}" - cmp "$infile" "$outfile" 2>/dev/null -} - -test_service() { - server_listen - client_connect - verify_data -} - - -test_dr() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - # avoid incorrect arp response - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - # avoid reverse route lookup - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -test_nat() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 ip link del veth20 - ip netns exec ns2 ip route add default via ${dip_v4} dev veth21 - - test_service -} - -test_tun() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 modprobe ipip - ip netns exec ns1 ip link set tunl0 up - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 modprobe ipip - ip netns exec ns2 ip link set tunl0 up - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 - - test_service -} - -run_tests() { - local errors= - - echo "Testing DR mode..." - cleanup - setup - test_dr - errors=$(( $errors + $? )) - - echo "Testing NAT mode..." - cleanup - setup - test_nat - errors=$(( $errors + $? )) - - echo "Testing Tunnel mode..." - cleanup - setup - test_tun - errors=$(( $errors + $? )) - - return $errors -} - -trap cleanup EXIT - -run_tests - -if [ $? -ne 0 ]; then - echo -e "$(basename $0): ${RED}FAIL${NC}" - exit 1 -fi -echo -e "$(basename $0): ${GREEN}PASS${NC}" -exit 0 diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/netfilter/nf-queue.c deleted file mode 100644 index 9e56b9d47037..000000000000 --- a/tools/testing/selftests/netfilter/nf-queue.c +++ /dev/null @@ -1,395 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -struct options { - bool count_packets; - bool gso_enabled; - int verbose; - unsigned int queue_num; - unsigned int timeout; - uint32_t verdict; - uint32_t delay_ms; -}; - -static unsigned int queue_stats[5]; -static struct options opts; - -static void help(const char *p) -{ - printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); -} - -static int parse_attr_cb(const struct nlattr *attr, void *data) -{ - const struct nlattr **tb = data; - int type = mnl_attr_get_type(attr); - - /* skip unsupported attribute in user-space */ - if (mnl_attr_type_valid(attr, NFQA_MAX) < 0) - return MNL_CB_OK; - - switch (type) { - case NFQA_MARK: - case NFQA_IFINDEX_INDEV: - case NFQA_IFINDEX_OUTDEV: - case NFQA_IFINDEX_PHYSINDEV: - case NFQA_IFINDEX_PHYSOUTDEV: - if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0) { - perror("mnl_attr_validate"); - return MNL_CB_ERROR; - } - break; - case NFQA_TIMESTAMP: - if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, - sizeof(struct nfqnl_msg_packet_timestamp)) < 0) { - perror("mnl_attr_validate2"); - return MNL_CB_ERROR; - } - break; - case NFQA_HWADDR: - if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC, - sizeof(struct nfqnl_msg_packet_hw)) < 0) { - perror("mnl_attr_validate2"); - return MNL_CB_ERROR; - } - break; - case NFQA_PAYLOAD: - break; - } - tb[type] = attr; - return MNL_CB_OK; -} - -static int queue_cb(const struct nlmsghdr *nlh, void *data) -{ - struct nlattr *tb[NFQA_MAX+1] = { 0 }; - struct nfqnl_msg_packet_hdr *ph = NULL; - uint32_t id = 0; - - (void)data; - - mnl_attr_parse(nlh, sizeof(struct nfgenmsg), parse_attr_cb, tb); - if (tb[NFQA_PACKET_HDR]) { - ph = mnl_attr_get_payload(tb[NFQA_PACKET_HDR]); - id = ntohl(ph->packet_id); - - if (opts.verbose > 0) - printf("packet hook=%u, hwproto 0x%x", - ntohs(ph->hw_protocol), ph->hook); - - if (ph->hook >= 5) { - fprintf(stderr, "Unknown hook %d\n", ph->hook); - return MNL_CB_ERROR; - } - - if (opts.verbose > 0) { - uint32_t skbinfo = 0; - - if (tb[NFQA_SKB_INFO]) - skbinfo = ntohl(mnl_attr_get_u32(tb[NFQA_SKB_INFO])); - if (skbinfo & NFQA_SKB_CSUMNOTREADY) - printf(" csumnotready"); - if (skbinfo & NFQA_SKB_GSO) - printf(" gso"); - if (skbinfo & NFQA_SKB_CSUM_NOTVERIFIED) - printf(" csumnotverified"); - puts(""); - } - - if (opts.count_packets) - queue_stats[ph->hook]++; - } - - return MNL_CB_OK + id; -} - -static struct nlmsghdr * -nfq_build_cfg_request(char *buf, uint8_t command, int queue_num) -{ - struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); - struct nfqnl_msg_config_cmd cmd = { - .command = command, - .pf = htons(AF_INET), - }; - struct nfgenmsg *nfg; - - nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; - nlh->nlmsg_flags = NLM_F_REQUEST; - - nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); - - nfg->nfgen_family = AF_UNSPEC; - nfg->version = NFNETLINK_V0; - nfg->res_id = htons(queue_num); - - mnl_attr_put(nlh, NFQA_CFG_CMD, sizeof(cmd), &cmd); - - return nlh; -} - -static struct nlmsghdr * -nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num) -{ - struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf); - struct nfqnl_msg_config_params params = { - .copy_range = htonl(range), - .copy_mode = mode, - }; - struct nfgenmsg *nfg; - - nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG; - nlh->nlmsg_flags = NLM_F_REQUEST; - - nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); - nfg->nfgen_family = AF_UNSPEC; - nfg->version = NFNETLINK_V0; - nfg->res_id = htons(queue_num); - - mnl_attr_put(nlh, NFQA_CFG_PARAMS, sizeof(params), ¶ms); - - return nlh; -} - -static struct nlmsghdr * -nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd) -{ - struct nfqnl_msg_verdict_hdr vh = { - .verdict = htonl(verd), - .id = htonl(id), - }; - struct nlmsghdr *nlh; - struct nfgenmsg *nfg; - - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_VERDICT; - nlh->nlmsg_flags = NLM_F_REQUEST; - nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg)); - nfg->nfgen_family = AF_UNSPEC; - nfg->version = NFNETLINK_V0; - nfg->res_id = htons(queue_num); - - mnl_attr_put(nlh, NFQA_VERDICT_HDR, sizeof(vh), &vh); - - return nlh; -} - -static void print_stats(void) -{ - unsigned int last, total; - int i; - - total = 0; - last = queue_stats[0]; - - for (i = 0; i < 5; i++) { - printf("hook %d packets %08u\n", i, queue_stats[i]); - last = queue_stats[i]; - total += last; - } - - printf("%u packets total\n", total); -} - -struct mnl_socket *open_queue(void) -{ - char buf[MNL_SOCKET_BUFFER_SIZE]; - unsigned int queue_num; - struct mnl_socket *nl; - struct nlmsghdr *nlh; - struct timeval tv; - uint32_t flags; - - nl = mnl_socket_open(NETLINK_NETFILTER); - if (nl == NULL) { - perror("mnl_socket_open"); - exit(EXIT_FAILURE); - } - - if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) { - perror("mnl_socket_bind"); - exit(EXIT_FAILURE); - } - - queue_num = opts.queue_num; - nlh = nfq_build_cfg_request(buf, NFQNL_CFG_CMD_BIND, queue_num); - - if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - exit(EXIT_FAILURE); - } - - nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num); - - flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0; - flags |= NFQA_CFG_F_UID_GID; - mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); - mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); - - if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - exit(EXIT_FAILURE); - } - - memset(&tv, 0, sizeof(tv)); - tv.tv_sec = opts.timeout; - if (opts.timeout && setsockopt(mnl_socket_get_fd(nl), - SOL_SOCKET, SO_RCVTIMEO, - &tv, sizeof(tv))) { - perror("setsockopt(SO_RCVTIMEO)"); - exit(EXIT_FAILURE); - } - - return nl; -} - -static void sleep_ms(uint32_t delay) -{ - struct timespec ts = { .tv_sec = delay / 1000 }; - - delay %= 1000; - - ts.tv_nsec = delay * 1000llu * 1000llu; - - nanosleep(&ts, NULL); -} - -static int mainloop(void) -{ - unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE; - struct mnl_socket *nl; - struct nlmsghdr *nlh; - unsigned int portid; - char *buf; - int ret; - - buf = malloc(buflen); - if (!buf) { - perror("malloc"); - exit(EXIT_FAILURE); - } - - nl = open_queue(); - portid = mnl_socket_get_portid(nl); - - for (;;) { - uint32_t id; - - ret = mnl_socket_recvfrom(nl, buf, buflen); - if (ret == -1) { - if (errno == ENOBUFS || errno == EINTR) - continue; - - if (errno == EAGAIN) { - errno = 0; - ret = 0; - break; - } - - perror("mnl_socket_recvfrom"); - exit(EXIT_FAILURE); - } - - ret = mnl_cb_run(buf, ret, 0, portid, queue_cb, NULL); - if (ret < 0) { - perror("mnl_cb_run"); - exit(EXIT_FAILURE); - } - - id = ret - MNL_CB_OK; - if (opts.delay_ms) - sleep_ms(opts.delay_ms); - - nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict); - if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { - perror("mnl_socket_sendto"); - exit(EXIT_FAILURE); - } - } - - mnl_socket_close(nl); - - return ret; -} - -static void parse_opts(int argc, char **argv) -{ - int c; - - while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) { - switch (c) { - case 'c': - opts.count_packets = true; - break; - case 'h': - help(argv[0]); - exit(0); - break; - case 'q': - opts.queue_num = atoi(optarg); - if (opts.queue_num > 0xffff) - opts.queue_num = 0; - break; - case 'Q': - opts.verdict = atoi(optarg); - if (opts.verdict > 0xffff) { - fprintf(stderr, "Expected destination queue number\n"); - exit(1); - } - - opts.verdict <<= 16; - opts.verdict |= NF_QUEUE; - break; - case 'd': - opts.delay_ms = atoi(optarg); - if (opts.delay_ms == 0) { - fprintf(stderr, "Expected nonzero delay (in milliseconds)\n"); - exit(1); - } - break; - case 't': - opts.timeout = atoi(optarg); - break; - case 'G': - opts.gso_enabled = false; - break; - case 'v': - opts.verbose++; - break; - } - } - - if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) { - fprintf(stderr, "Cannot use same destination and source queue\n"); - exit(1); - } -} - -int main(int argc, char *argv[]) -{ - int ret; - - opts.verdict = NF_ACCEPT; - opts.gso_enabled = true; - - parse_opts(argc, argv); - - ret = mainloop(); - if (opts.count_packets) - print_stats(); - - return ret; -} diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh deleted file mode 100755 index a1aa8f4a5828..000000000000 --- a/tools/testing/selftests/netfilter/nf_nat_edemux.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test NAT source port clash resolution -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -socatpid=0 - -cleanup() -{ - [ $socatpid -gt 0 ] && kill $socatpid - ip netns del $ns1 - ip netns del $ns2 -} - -socat -h > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without socat" - exit $ksft_skip -fi - -iptables --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without iptables" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add $ns2 - -# Connect the namespaces using a veth pair -ip link add name veth2 type veth peer name veth1 -ip link set netns $ns1 dev veth1 -ip link set netns $ns2 dev veth2 - -ip netns exec $ns1 ip link set up dev lo -ip netns exec $ns1 ip link set up dev veth1 -ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 - -ip netns exec $ns2 ip link set up dev lo -ip netns exec $ns2 ip link set up dev veth2 -ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 - -# Create a server in one namespace -ip netns exec $ns1 socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & -socatpid=$! - -# Restrict source port to just one so we don't have to exhaust -# all others. -ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" - -# add a virtual IP using DNAT -ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 - -# ... and route it to the other namespace -ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 - -sleep 1 - -# add a persistent connection from the other namespace -ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & - -sleep 1 - -# ip daddr:dport will be rewritten to 192.168.1.1 5201 -# NAT must reallocate source port 10000 because -# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use -echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null -ret=$? - -# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). -if [ $ret -eq 0 ]; then - echo "PASS: socat can connect via NAT'd address" -else - echo "FAIL: socat cannot connect via NAT'd address" -fi - -# check sport clashres. -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 - -sleep 5 | ip netns exec $ns2 socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & -cpid1=$! -sleep 1 - -# if connect succeeds, client closes instantly due to EOF on stdin. -# if connect hangs, it will time out after 5s. -echo | ip netns exec $ns2 socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & -cpid2=$! - -time_then=$(date +%s) -wait $cpid2 -rv=$? -time_now=$(date +%s) - -# Check how much time has elapsed, expectation is for -# 'cpid2' to connect and then exit (and no connect delay). -delta=$((time_now - time_then)) - -if [ $delta -lt 2 -a $rv -eq 0 ]; then - echo "PASS: could connect to service via redirected ports" -else - echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" - ret=1 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_audit.sh b/tools/testing/selftests/netfilter/nft_audit.sh deleted file mode 100755 index 99ed5bd6e840..000000000000 --- a/tools/testing/selftests/netfilter/nft_audit.sh +++ /dev/null @@ -1,245 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Check that audit logs generated for nft commands are as expected. - -SKIP_RC=4 -RC=0 - -nft --version >/dev/null 2>&1 || { - echo "SKIP: missing nft tool" - exit $SKIP_RC -} - -# Run everything in a separate network namespace -[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } - -# give other scripts a chance to finish - audit_logread sees all activity -sleep 1 - -logfile=$(mktemp) -rulefile=$(mktemp) -echo "logging into $logfile" -./audit_logread >"$logfile" & -logread_pid=$! -trap 'kill $logread_pid; rm -f $logfile $rulefile' EXIT -exec 3<"$logfile" - -do_test() { # (cmd, log) - echo -n "testing for cmd: $1 ... " - cat <&3 >/dev/null - $1 >/dev/null || exit 1 - sleep 0.1 - res=$(diff -a -u <(echo "$2") - <&3) - [ $? -eq 0 ] && { echo "OK"; return; } - echo "FAIL" - grep -v '^\(---\|+++\|@@\)' <<< "$res" - ((RC--)) -} - -nft flush ruleset - -# adding tables, chains and rules - -for table in t1 t2; do - do_test "nft add table $table" \ - "table=$table family=2 entries=1 op=nft_register_table" - - do_test "nft add chain $table c1" \ - "table=$table family=2 entries=1 op=nft_register_chain" - - do_test "nft add chain $table c2; add chain $table c3" \ - "table=$table family=2 entries=2 op=nft_register_chain" - - cmd="add rule $table c1 counter" - - do_test "nft $cmd" \ - "table=$table family=2 entries=1 op=nft_register_rule" - - do_test "nft $cmd; $cmd" \ - "table=$table family=2 entries=2 op=nft_register_rule" - - cmd="" - sep="" - for chain in c2 c3; do - for i in {1..3}; do - cmd+="$sep add rule $table $chain counter" - sep=";" - done - done - do_test "nft $cmd" \ - "table=$table family=2 entries=6 op=nft_register_rule" -done - -for ((i = 0; i < 500; i++)); do - echo "add rule t2 c3 counter accept comment \"rule $i\"" -done >$rulefile -do_test "nft -f $rulefile" \ -'table=t2 family=2 entries=500 op=nft_register_rule' - -# adding sets and elements - -settype='type inet_service; counter' -setelem='{ 22, 80, 443 }' -setblock="{ $settype; elements = $setelem; }" -do_test "nft add set t1 s $setblock" \ -"table=t1 family=2 entries=4 op=nft_register_set" - -do_test "nft add set t1 s2 $setblock; add set t1 s3 { $settype; }" \ -"table=t1 family=2 entries=5 op=nft_register_set" - -do_test "nft add element t1 s3 $setelem" \ -"table=t1 family=2 entries=3 op=nft_register_setelem" - -# adding counters - -do_test 'nft add counter t1 c1' \ -'table=t1 family=2 entries=1 op=nft_register_obj' - -do_test 'nft add counter t2 c1; add counter t2 c2' \ -'table=t2 family=2 entries=2 op=nft_register_obj' - -for ((i = 3; i <= 500; i++)); do - echo "add counter t2 c$i" -done >$rulefile -do_test "nft -f $rulefile" \ -'table=t2 family=2 entries=498 op=nft_register_obj' - -# adding/updating quotas - -do_test 'nft add quota t1 q1 { 10 bytes }' \ -'table=t1 family=2 entries=1 op=nft_register_obj' - -do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \ -'table=t2 family=2 entries=2 op=nft_register_obj' - -for ((i = 3; i <= 500; i++)); do - echo "add quota t2 q$i { 10 bytes }" -done >$rulefile -do_test "nft -f $rulefile" \ -'table=t2 family=2 entries=498 op=nft_register_obj' - -# changing the quota value triggers obj update path -do_test 'nft add quota t1 q1 { 20 bytes }' \ -'table=t1 family=2 entries=1 op=nft_register_obj' - -# resetting rules - -do_test 'nft reset rules t1 c2' \ -'table=t1 family=2 entries=3 op=nft_reset_rule' - -do_test 'nft reset rules table t1' \ -'table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule' - -do_test 'nft reset rules t2 c3' \ -'table=t2 family=2 entries=189 op=nft_reset_rule -table=t2 family=2 entries=188 op=nft_reset_rule -table=t2 family=2 entries=126 op=nft_reset_rule' - -do_test 'nft reset rules t2' \ -'table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=186 op=nft_reset_rule -table=t2 family=2 entries=188 op=nft_reset_rule -table=t2 family=2 entries=129 op=nft_reset_rule' - -do_test 'nft reset rules' \ -'table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule -table=t1 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=3 op=nft_reset_rule -table=t2 family=2 entries=180 op=nft_reset_rule -table=t2 family=2 entries=188 op=nft_reset_rule -table=t2 family=2 entries=135 op=nft_reset_rule' - -# resetting sets and elements - -elem=(22 ,80 ,443) -relem="" -for i in {1..3}; do - relem+="${elem[((i - 1))]}" - do_test "nft reset element t1 s { $relem }" \ - "table=t1 family=2 entries=$i op=nft_reset_setelem" -done - -do_test 'nft reset set t1 s' \ -'table=t1 family=2 entries=3 op=nft_reset_setelem' - -# resetting counters - -do_test 'nft reset counter t1 c1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset counters t1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset counters t2' \ -'table=t2 family=2 entries=342 op=nft_reset_obj -table=t2 family=2 entries=158 op=nft_reset_obj' - -do_test 'nft reset counters' \ -'table=t1 family=2 entries=1 op=nft_reset_obj -table=t2 family=2 entries=341 op=nft_reset_obj -table=t2 family=2 entries=159 op=nft_reset_obj' - -# resetting quotas - -do_test 'nft reset quota t1 q1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset quotas t1' \ -'table=t1 family=2 entries=1 op=nft_reset_obj' - -do_test 'nft reset quotas t2' \ -'table=t2 family=2 entries=315 op=nft_reset_obj -table=t2 family=2 entries=185 op=nft_reset_obj' - -do_test 'nft reset quotas' \ -'table=t1 family=2 entries=1 op=nft_reset_obj -table=t2 family=2 entries=314 op=nft_reset_obj -table=t2 family=2 entries=186 op=nft_reset_obj' - -# deleting rules - -readarray -t handles < <(nft -a list chain t1 c1 | \ - sed -n 's/.*counter.* handle \(.*\)$/\1/p') - -do_test "nft delete rule t1 c1 handle ${handles[0]}" \ -'table=t1 family=2 entries=1 op=nft_unregister_rule' - -cmd='delete rule t1 c1 handle' -do_test "nft $cmd ${handles[1]}; $cmd ${handles[2]}" \ -'table=t1 family=2 entries=2 op=nft_unregister_rule' - -do_test 'nft flush chain t1 c2' \ -'table=t1 family=2 entries=3 op=nft_unregister_rule' - -do_test 'nft flush table t2' \ -'table=t2 family=2 entries=509 op=nft_unregister_rule' - -# deleting chains - -do_test 'nft delete chain t2 c2' \ -'table=t2 family=2 entries=1 op=nft_unregister_chain' - -# deleting sets and elements - -do_test 'nft delete element t1 s { 22 }' \ -'table=t1 family=2 entries=1 op=nft_unregister_setelem' - -do_test 'nft delete element t1 s { 80, 443 }' \ -'table=t1 family=2 entries=2 op=nft_unregister_setelem' - -do_test 'nft flush set t1 s2' \ -'table=t1 family=2 entries=3 op=nft_unregister_setelem' - -do_test 'nft delete set t1 s2' \ -'table=t1 family=2 entries=1 op=nft_unregister_set' - -do_test 'nft delete set t1 s3' \ -'table=t1 family=2 entries=1 op=nft_unregister_set' - -exit $RC diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh deleted file mode 100755 index e908009576c7..000000000000 --- a/tools/testing/selftests/netfilter/nft_concat_range.sh +++ /dev/null @@ -1,1645 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# nft_concat_range.sh - Tests for sets with concatenation of ranged fields -# -# Copyright (c) 2019 Red Hat GmbH -# -# Author: Stefano Brivio -# -# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031 -# ^ Configuration and templates sourced with eval, counters reused in subshells - -KSELFTEST_SKIP=4 - -# Available test groups: -# - reported_issues: check for issues that were reported in the past -# - correctness: check that packets match given entries, and only those -# - concurrency: attempt races between insertion, deletion and lookup -# - timeout: check that packets match entries until they expire -# - performance: estimate matching rate, compare with rbtree and hash baselines -TESTS="reported_issues correctness concurrency timeout" -[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance" - -# Set types, defined by TYPE_ variables below -TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto - net_port_net net_mac mac_net net_mac_icmp net6_mac_icmp - net6_port_net6_port net_port_mac_proto_net" - -# Reported bugs, also described by TYPE_ variables below -BUGS="flush_remove_add reload" - -# List of possible paths to pktgen script from kernel tree for performance tests -PKTGEN_SCRIPT_PATHS=" - ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh - pktgen/pktgen_bench_xmit_mode_netif_receive.sh" - -# Definition of set types: -# display display text for test report -# type_spec nftables set type specifier -# chain_spec nftables type specifier for rules mapping to set -# dst call sequence of format_*() functions for destination fields -# src call sequence of format_*() functions for source fields -# start initial integer used to generate addresses and ports -# count count of entries to generate and match -# src_delta number summed to destination generator for source fields -# tools list of tools for correctness and timeout tests, any can be used -# proto L4 protocol of test packets -# -# race_repeat race attempts per thread, 0 disables concurrency test for type -# flood_tools list of tools for concurrency tests, any can be used -# flood_proto L4 protocol of test packets for concurrency tests -# flood_spec nftables type specifier for concurrency tests -# -# perf_duration duration of single pktgen injection test -# perf_spec nftables type specifier for performance tests -# perf_dst format_*() functions for destination fields in performance test -# perf_src format_*() functions for source fields in performance test -# perf_entries number of set entries for performance test -# perf_proto L3 protocol of test packets -TYPE_net_port=" -display net,port -type_spec ipv4_addr . inet_service -chain_spec ip daddr . udp dport -dst addr4 port -src -start 1 -count 5 -src_delta 2000 -tools sendip nc bash -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto udp -flood_spec ip daddr . udp dport - -perf_duration 5 -perf_spec ip daddr . udp dport -perf_dst addr4 port -perf_src -perf_entries 1000 -perf_proto ipv4 -" - -TYPE_port_net=" -display port,net -type_spec inet_service . ipv4_addr -chain_spec udp dport . ip daddr -dst port addr4 -src -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto udp -flood_spec udp dport . ip daddr - -perf_duration 5 -perf_spec udp dport . ip daddr -perf_dst port addr4 -perf_src -perf_entries 100 -perf_proto ipv4 -" - -TYPE_net6_port=" -display net6,port -type_spec ipv6_addr . inet_service -chain_spec ip6 daddr . udp dport -dst addr6 port -src -start 10 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp6 - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp6 -flood_spec ip6 daddr . udp dport - -perf_duration 5 -perf_spec ip6 daddr . udp dport -perf_dst addr6 port -perf_src -perf_entries 1000 -perf_proto ipv6 -" - -TYPE_port_proto=" -display port,proto -type_spec inet_service . inet_proto -chain_spec udp dport . meta l4proto -dst port proto -src -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 5 -perf_spec udp dport . meta l4proto -perf_dst port proto -perf_src -perf_entries 30000 -perf_proto ipv4 -" - -TYPE_net6_port_mac=" -display net6,port,mac -type_spec ipv6_addr . inet_service . ether_addr -chain_spec ip6 daddr . udp dport . ether saddr -dst addr6 port -src mac -start 10 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp6 - -race_repeat 0 - -perf_duration 5 -perf_spec ip6 daddr . udp dport . ether daddr -perf_dst addr6 port mac -perf_src -perf_entries 10 -perf_proto ipv6 -" - -TYPE_net6_port_mac_proto=" -display net6,port,mac,proto -type_spec ipv6_addr . inet_service . ether_addr . inet_proto -chain_spec ip6 daddr . udp dport . ether saddr . meta l4proto -dst addr6 port -src mac proto -start 10 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp6 - -race_repeat 0 - -perf_duration 5 -perf_spec ip6 daddr . udp dport . ether daddr . meta l4proto -perf_dst addr6 port mac proto -perf_src -perf_entries 1000 -perf_proto ipv6 -" - -TYPE_net_port_net=" -display net,port,net -type_spec ipv4_addr . inet_service . ipv4_addr -chain_spec ip daddr . udp dport . ip saddr -dst addr4 port -src addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp -flood_spec ip daddr . udp dport . ip saddr - -perf_duration 0 -" - -TYPE_net6_port_net6_port=" -display net6,port,net6,port -type_spec ipv6_addr . inet_service . ipv6_addr . inet_service -chain_spec ip6 daddr . udp dport . ip6 saddr . udp sport -dst addr6 port -src addr6 port -start 10 -count 5 -src_delta 2000 -tools sendip socat nc -proto udp6 - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp6 -flood_spec ip6 daddr . tcp dport . ip6 saddr . tcp sport - -perf_duration 0 -" - -TYPE_net_port_mac_proto_net=" -display net,port,mac,proto,net -type_spec ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr -chain_spec ip daddr . udp dport . ether saddr . meta l4proto . ip saddr -dst addr4 port -src mac proto addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net_mac=" -display net,mac -type_spec ipv4_addr . ether_addr -chain_spec ip daddr . ether saddr -dst addr4 -src mac -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 5 -perf_spec ip daddr . ether daddr -perf_dst addr4 mac -perf_src -perf_entries 1000 -perf_proto ipv4 -" - -TYPE_mac_net=" -display mac,net -type_spec ether_addr . ipv4_addr -chain_spec ether saddr . ip saddr -dst -src mac addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net_mac_icmp=" -display net,mac - ICMP -type_spec ipv4_addr . ether_addr -chain_spec ip daddr . ether saddr -dst addr4 -src mac -start 1 -count 5 -src_delta 2000 -tools ping -proto icmp - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net6_mac_icmp=" -display net6,mac - ICMPv6 -type_spec ipv6_addr . ether_addr -chain_spec ip6 daddr . ether saddr -dst addr6 -src mac -start 10 -count 50 -src_delta 2000 -tools ping -proto icmp6 - -race_repeat 0 - -perf_duration 0 -" - -TYPE_net_port_proto_net=" -display net,port,proto,net -type_spec ipv4_addr . inet_service . inet_proto . ipv4_addr -chain_spec ip daddr . udp dport . meta l4proto . ip saddr -dst addr4 port proto -src addr4 -start 1 -count 5 -src_delta 2000 -tools sendip socat nc -proto udp - -race_repeat 3 -flood_tools iperf3 iperf netperf -flood_proto tcp -flood_spec ip daddr . tcp dport . meta l4proto . ip saddr - -perf_duration 0 -" - -# Definition of tests for bugs reported in the past: -# display display text for test report -TYPE_flush_remove_add=" -display Add two elements, flush, re-add -" - -TYPE_reload=" -display net,mac with reload -type_spec ipv4_addr . ether_addr -chain_spec ip daddr . ether saddr -dst addr4 -src mac -start 1 -count 1 -src_delta 2000 -tools sendip socat nc bash -proto udp - -race_repeat 0 - -perf_duration 0 -" - -# Set template for all tests, types and rules are filled in depending on test -set_template=' -flush ruleset - -table inet filter { - counter test { - packets 0 bytes 0 - } - - set test { - type ${type_spec} - flags interval,timeout - } - - chain input { - type filter hook prerouting priority 0; policy accept; - ${chain_spec} @test counter name \"test\" - } -} - -table netdev perf { - counter test { - packets 0 bytes 0 - } - - counter match { - packets 0 bytes 0 - } - - set test { - type ${type_spec} - flags interval - } - - set norange { - type ${type_spec} - } - - set noconcat { - type ${type_spec%% *} - flags interval - } - - chain test { - type filter hook ingress device veth_a priority 0; - } -} -' - -err_buf= -info_buf= - -# Append string to error buffer -err() { - err_buf="${err_buf}${1} -" -} - -# Append string to information buffer -info() { - info_buf="${info_buf}${1} -" -} - -# Flush error buffer to stdout -err_flush() { - printf "%s" "${err_buf}" - err_buf= -} - -# Flush information buffer to stdout -info_flush() { - printf "%s" "${info_buf}" - info_buf= -} - -# Setup veth pair: this namespace receives traffic, B generates it -setup_veth() { - ip netns add B - ip link add veth_a type veth peer name veth_b || return 1 - - ip link set veth_a up - ip link set veth_b netns B - - ip -n B link set veth_b up - - ip addr add dev veth_a 10.0.0.1 - ip route add default dev veth_a - - ip -6 addr add fe80::1/64 dev veth_a nodad - ip -6 addr add 2001:db8::1/64 dev veth_a nodad - ip -6 route add default dev veth_a - - ip -n B route add default dev veth_b - - ip -6 -n B addr add fe80::2/64 dev veth_b nodad - ip -6 -n B addr add 2001:db8::2/64 dev veth_b nodad - ip -6 -n B route add default dev veth_b - - B() { - ip netns exec B "$@" >/dev/null 2>&1 - } - - sleep 2 -} - -# Fill in set template and initialise set -setup_set() { - eval "echo \"${set_template}\"" | nft -f - -} - -# Check that at least one of the needed tools is available -check_tools() { - [ -z "${tools}" ] && return 0 - - __tools= - for tool in ${tools}; do - if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \ - ! nc -u -w0 1.1.1.1 1 2>/dev/null; then - # Some GNU netcat builds might not support IPv6 - __tools="${__tools} netcat-openbsd" - continue - fi - __tools="${__tools} ${tool}" - - command -v "${tool}" >/dev/null && return 0 - done - err "need one of:${__tools}, skipping" && return 1 -} - -# Set up function to send ICMP packets -setup_send_icmp() { - send_icmp() { - B ping -c1 -W1 "${dst_addr4}" >/dev/null 2>&1 - } -} - -# Set up function to send ICMPv6 packets -setup_send_icmp6() { - if command -v ping6 >/dev/null; then - send_icmp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - B ping6 -q -c1 -W1 "${dst_addr6}" - } - else - send_icmp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - B ping -q -6 -c1 -W1 "${dst_addr6}" - } - fi -} - -# Set up function to send single UDP packets on IPv4 -setup_send_udp() { - if command -v sendip >/dev/null; then - send_udp() { - [ -n "${src_port}" ] && src_port="-us ${src_port}" - [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" - [ -n "${src_addr4}" ] && src_addr4="-is ${src_addr4}" - - # shellcheck disable=SC2086 # sendip needs split options - B sendip -p ipv4 -p udp ${src_addr4} ${src_port} \ - ${dst_port} "${dst_addr4}" - - src_port= - dst_port= - src_addr4= - } - elif command -v socat -v >/dev/null; then - send_udp() { - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}" dev veth_b - __socatbind=",bind=${src_addr4}" - if [ -n "${src_port}" ];then - __socatbind="${__socatbind}:${src_port}" - fi - fi - - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - [ -z "${dst_port}" ] && dst_port=12345 - - echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}" - - src_addr4= - src_port= - } - elif command -v nc >/dev/null; then - if nc -u -w0 1.1.1.1 1 2>/dev/null; then - # OpenBSD netcat - nc_opt="-w0" - else - # GNU netcat - nc_opt="-q0" - fi - - send_udp() { - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}" dev veth_b - __src_addr4="-s ${src_addr4}" - fi - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \ - "${src_port}" "${dst_addr4}" "${dst_port}" - - src_addr4= - src_port= - } - elif [ -z "$(bash -c 'type -p')" ]; then - send_udp() { - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - B ip route add default dev veth_b - fi - - B bash -c "echo > /dev/udp/${dst_addr4}/${dst_port}" - - if [ -n "${src_addr4}" ]; then - B ip addr del "${src_addr4}/16" dev veth_b - fi - src_addr4= - } - else - return 1 - fi -} - -# Set up function to send single UDP packets on IPv6 -setup_send_udp6() { - if command -v sendip >/dev/null; then - send_udp6() { - [ -n "${src_port}" ] && src_port="-us ${src_port}" - [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}" - if [ -n "${src_addr6}" ]; then - src_addr6="-6s ${src_addr6}" - else - src_addr6="-6s 2001:db8::2" - fi - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - B sendip -p ipv6 -p udp ${src_addr6} ${src_port} \ - ${dst_port} "${dst_addr6}" - - src_port= - dst_port= - src_addr6= - } - elif command -v socat -v >/dev/null; then - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - __socatbind6= - - if [ -n "${src_addr6}" ]; then - if [ -n "${src_addr6} != "${src_addr6_added} ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - - src_addr6_added=${src_addr6} - fi - - __socatbind6=",bind=[${src_addr6}]" - - if [ -n "${src_port}" ] ;then - __socatbind6="${__socatbind6}:${src_port}" - fi - fi - - echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}" - } - elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then - # GNU netcat might not work with IPv6, try next tool - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - else - src_addr6="2001:db8::2" - fi - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - # shellcheck disable=SC2086 # this needs split options - echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \ - ${dst_addr6} ${dst_port} - - src_addr6= - src_port= - } - elif [ -z "$(bash -c 'type -p')" ]; then - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - B ip addr add "${src_addr6}" dev veth_b nodad - B bash -c "echo > /dev/udp/${dst_addr6}/${dst_port}" - ip -6 addr del "${dst_addr6}" dev veth_a 2>/dev/null - } - else - return 1 - fi -} - -# Set up function to send TCP traffic on IPv4 -setup_flood_tcp() { - if command -v iperf3 >/dev/null; then - flood_tcp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_port="--cport ${src_port}" - fi - B ip route add default dev veth_b 2>/dev/null - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ - ${src_addr4} -l16 -t 1000 - - src_addr4= - src_port= - dst_port= - } - elif command -v iperf >/dev/null; then - flood_tcp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 2>/dev/null - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_addr4="${src_addr4}:${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ - -l20 -t 1000 - - src_addr4= - src_port= - dst_port= - } - elif command -v netperf >/dev/null; then - flood_tcp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="10.0.0.2" - fi - if [ -n "${src_port}" ]; then - dst_port="${dst_port},${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - netserver -4 ${dst_port} -L "${dst_addr4}" \ - >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B netperf -4 -H "${dst_addr4}" ${dst_port} \ - -L "${src_addr4}" -l 1000 -t TCP_STREAM - - src_addr4= - src_port= - dst_port= - } - else - return 1 - fi -} - -# Set up function to send TCP traffic on IPv6 -setup_flood_tcp6() { - if command -v iperf3 >/dev/null; then - flood_tcp6() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - src_addr6="-B ${src_addr6}" - else - src_addr6="-B 2001:db8::2" - fi - if [ -n "${src_port}" ]; then - src_port="--cport ${src_port}" - fi - B ip route add default dev veth_b - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf3 -c "${dst_addr6}" ${dst_port} \ - ${src_port} ${src_addr6} -l16 -t 1000 - - src_addr6= - src_port= - dst_port= - } - elif command -v iperf >/dev/null; then - flood_tcp6() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - src_addr6="-B ${src_addr6}" - else - src_addr6="-B 2001:db8::2" - fi - if [ -n "${src_port}" ]; then - src_addr6="${src_addr6}:${src_port}" - fi - B ip route add default dev veth_b - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf -c "${dst_addr6}" -V ${dst_port} \ - ${src_addr6} -l1 -t 1000 - - src_addr6= - src_port= - dst_port= - } - elif command -v netperf >/dev/null; then - flood_tcp6() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - else - src_addr6="2001:db8::2" - fi - if [ -n "${src_port}" ]; then - dst_port="${dst_port},${src_port}" - fi - B ip route add default dev veth_b - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - netserver -6 ${dst_port} -L "${dst_addr6}" \ - >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B netperf -6 -H "${dst_addr6}" ${dst_port} \ - -L "${src_addr6}" -l 1000 -t TCP_STREAM - - src_addr6= - src_port= - dst_port= - } - else - return 1 - fi -} - -# Set up function to send UDP traffic on IPv4 -setup_flood_udp() { - if command -v iperf3 >/dev/null; then - flood_udp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 2>/dev/null - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_port="--cport ${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf3 -s -DB "${dst_addr4}" ${dst_port} - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ - ${dst_port} ${src_port} ${src_addr4} - - src_addr4= - src_port= - dst_port= - } - elif command -v iperf >/dev/null; then - flood_udp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - src_addr4="-B ${src_addr4}" - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="-B 10.0.0.2" - fi - if [ -n "${src_port}" ]; then - src_addr4="${src_addr4}:${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ - ${dst_port} ${src_addr4} - - src_addr4= - src_port= - dst_port= - } - elif command -v netperf >/dev/null; then - flood_udp() { - [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}/16" dev veth_b - else - B ip addr add dev veth_b 10.0.0.2 - src_addr4="10.0.0.2" - fi - if [ -n "${src_port}" ]; then - dst_port="${dst_port},${src_port}" - fi - B ip route add default dev veth_b - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - - # shellcheck disable=SC2086 # this needs split options - netserver -4 ${dst_port} -L "${dst_addr4}" \ - >/dev/null 2>&1 - sleep 2 - - # shellcheck disable=SC2086 # this needs split options - B netperf -4 -H "${dst_addr4}" ${dst_port} \ - -L "${src_addr4}" -l 1000 -t UDP_STREAM - - src_addr4= - src_port= - dst_port= - } - else - return 1 - fi -} - -# Find pktgen script and set up function to start pktgen injection -setup_perf() { - for pktgen_script_path in ${PKTGEN_SCRIPT_PATHS} __notfound; do - command -v "${pktgen_script_path}" >/dev/null && break - done - [ "${pktgen_script_path}" = "__notfound" ] && return 1 - - perf_ipv4() { - ${pktgen_script_path} -s80 \ - -i veth_a -d "${dst_addr4}" -p "${dst_port}" \ - -m "${dst_mac}" \ - -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & - perf_pid=$! - } - perf_ipv6() { - IP6=6 ${pktgen_script_path} -s100 \ - -i veth_a -d "${dst_addr6}" -p "${dst_port}" \ - -m "${dst_mac}" \ - -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null & - perf_pid=$! - } -} - -# Clean up before each test -cleanup() { - nft reset counter inet filter test >/dev/null 2>&1 - nft flush ruleset >/dev/null 2>&1 - ip link del dummy0 2>/dev/null - ip route del default 2>/dev/null - ip -6 route del default 2>/dev/null - ip netns del B 2>/dev/null - ip link del veth_a 2>/dev/null - timeout= - killall iperf3 2>/dev/null - killall iperf 2>/dev/null - killall netperf 2>/dev/null - killall netserver 2>/dev/null - rm -f ${tmp} - sleep 2 -} - -# Entry point for setup functions -setup() { - if [ "$(id -u)" -ne 0 ]; then - echo " need to run as root" - exit ${KSELFTEST_SKIP} - fi - - cleanup - check_tools || return 1 - for arg do - if ! eval setup_"${arg}"; then - err " ${arg} not supported" - return 1 - fi - done -} - -# Format integer into IPv4 address, summing 10.0.0.5 (arbitrary) to it -format_addr4() { - a=$((${1} + 16777216 * 10 + 5)) - printf "%i.%i.%i.%i" \ - "$((a / 16777216))" "$((a % 16777216 / 65536))" \ - "$((a % 65536 / 256))" "$((a % 256))" -} - -# Format integer into IPv6 address, summing 2001:db8:: to it -format_addr6() { - printf "2001:db8::%04x:%04x" "$((${1} / 65536))" "$((${1} % 65536))" -} - -# Format integer into EUI-48 address, summing 00:01:00:00:00:00 to it -format_mac() { - printf "00:01:%02x:%02x:%02x:%02x" \ - "$((${1} / 16777216))" "$((${1} % 16777216 / 65536))" \ - "$((${1} % 65536 / 256))" "$((${1} % 256))" -} - -# Format integer into port, avoid 0 port -format_port() { - printf "%i" "$((${1} % 65534 + 1))" -} - -# Drop suffixed '6' from L4 protocol, if any -format_proto() { - printf "%s" "${proto}" | tr -d 6 -} - -# Format destination and source fields into nft concatenated type -format() { - __start= - __end= - __expr="{ " - - for f in ${dst}; do - [ "${__expr}" != "{ " ] && __expr="${__expr} . " - - __start="$(eval format_"${f}" "${start}")" - __end="$(eval format_"${f}" "${end}")" - - if [ "${f}" = "proto" ]; then - __expr="${__expr}${__start}" - else - __expr="${__expr}${__start}-${__end}" - fi - done - for f in ${src}; do - [ "${__expr}" != "{ " ] && __expr="${__expr} . " - - __start="$(eval format_"${f}" "${srcstart}")" - __end="$(eval format_"${f}" "${srcend}")" - - if [ "${f}" = "proto" ]; then - __expr="${__expr}${__start}" - else - __expr="${__expr}${__start}-${__end}" - fi - done - - if [ -n "${timeout}" ]; then - echo "${__expr} timeout ${timeout}s }" - else - echo "${__expr} }" - fi -} - -# Format destination and source fields into nft type, start element only -format_norange() { - __expr="{ " - - for f in ${dst}; do - [ "${__expr}" != "{ " ] && __expr="${__expr} . " - - __expr="${__expr}$(eval format_"${f}" "${start}")" - done - for f in ${src}; do - __expr="${__expr} . $(eval format_"${f}" "${start}")" - done - - echo "${__expr} }" -} - -# Format first destination field into nft type -format_noconcat() { - for f in ${dst}; do - __start="$(eval format_"${f}" "${start}")" - __end="$(eval format_"${f}" "${end}")" - - if [ "${f}" = "proto" ]; then - echo "{ ${__start} }" - else - echo "{ ${__start}-${__end} }" - fi - return - done -} - -# Add single entry to 'test' set in 'inet filter' table -add() { - if ! nft add element inet filter test "${1}"; then - err "Failed to add ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Format and output entries for sets in 'netdev perf' table -add_perf() { - if [ "${1}" = "test" ]; then - echo "add element netdev perf test $(format)" - elif [ "${1}" = "norange" ]; then - echo "add element netdev perf norange $(format_norange)" - elif [ "${1}" = "noconcat" ]; then - echo "add element netdev perf noconcat $(format_noconcat)" - fi -} - -# Add single entry to 'norange' set in 'netdev perf' table -add_perf_norange() { - if ! nft add element netdev perf norange "${1}"; then - err "Failed to add ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Add single entry to 'noconcat' set in 'netdev perf' table -add_perf_noconcat() { - if ! nft add element netdev perf noconcat "${1}"; then - err "Failed to add ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Delete single entry from set -del() { - if ! nft delete element inet filter test "${1}"; then - err "Failed to delete ${1} given ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Return packet count from 'test' counter in 'inet filter' table -count_packets() { - found=0 - for token in $(nft list counter inet filter test); do - [ ${found} -eq 1 ] && echo "${token}" && return - [ "${token}" = "packets" ] && found=1 - done -} - -# Return packet count from 'test' counter in 'netdev perf' table -count_perf_packets() { - found=0 - for token in $(nft list counter netdev perf test); do - [ ${found} -eq 1 ] && echo "${token}" && return - [ "${token}" = "packets" ] && found=1 - done -} - -# Set MAC addresses, send traffic according to specifier -flood() { - ip link set veth_a address "$(format_mac "${1}")" - ip -n B link set veth_b address "$(format_mac "${2}")" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval flood_\$proto -} - -# Set MAC addresses, start pktgen injection -perf() { - dst_mac="$(format_mac "${1}")" - ip link set veth_a address "${dst_mac}" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval perf_\$perf_proto -} - -# Set MAC addresses, send single packet, check that it matches, reset counter -send_match() { - ip link set veth_a address "$(format_mac "${1}")" - ip -n B link set veth_b address "$(format_mac "${2}")" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval send_\$proto - if [ "$(count_packets)" != "1" ]; then - err "${proto} packet to:" - err " $(for f in ${dst}; do - eval format_\$f "${1}"; printf ' '; done)" - err "from:" - err " $(for f in ${src}; do - eval format_\$f "${2}"; printf ' '; done)" - err "should have matched ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi - nft reset counter inet filter test >/dev/null -} - -# Set MAC addresses, send single packet, check that it doesn't match -send_nomatch() { - ip link set veth_a address "$(format_mac "${1}")" - ip -n B link set veth_b address "$(format_mac "${2}")" - - for f in ${dst}; do - eval dst_"$f"=\$\(format_\$f "${1}"\) - done - for f in ${src}; do - eval src_"$f"=\$\(format_\$f "${2}"\) - done - eval send_\$proto - if [ "$(count_packets)" != "0" ]; then - err "${proto} packet to:" - err " $(for f in ${dst}; do - eval format_\$f "${1}"; printf ' '; done)" - err "from:" - err " $(for f in ${src}; do - eval format_\$f "${2}"; printf ' '; done)" - err "should not have matched ruleset:" - err "$(nft -a list ruleset)" - return 1 - fi -} - -# Correctness test template: -# - add ranged element, check that packets match it -# - check that packets outside range don't match it -# - remove some elements, check that packets don't match anymore -test_correctness() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} - - range_size=1 - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - - # Avoid negative or zero-sized port ranges - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - fi - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_match "${j}" $((j + src_delta)) || return 1 - done - send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 - - # Delete elements now and then - if [ $((i % 3)) -eq 0 ]; then - del "$(format)" || return 1 - for j in $(seq ${start} \ - $((range_size / 2 + 1)) ${end}); do - send_nomatch "${j}" $((j + src_delta)) \ - || return 1 - done - fi - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done -} - -# Concurrency test template: -# - add all the elements -# - start a thread for each physical thread that: -# - adds all the elements -# - flushes the set -# - adds all the elements -# - flushes the entire ruleset -# - adds the set back -# - adds all the elements -# - delete all the elements -test_concurrency() { - proto=${flood_proto} - tools=${flood_tools} - chain_spec=${flood_spec} - setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP} - - range_size=1 - cstart=${start} - flood_pids= - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - - flood "${i}" $((i + src_delta)) & flood_pids="${flood_pids} $!" - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - sleep 10 - - pids= - for c in $(seq 1 "$(nproc)"); do ( - for r in $(seq 1 "${race_repeat}"); do - range_size=1 - - # $start needs to be local to this subshell - # shellcheck disable=SC2030 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" 2>/dev/null - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - nft flush inet filter test 2>/dev/null - - range_size=1 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" 2>/dev/null - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - nft flush ruleset - setup set 2>/dev/null - - range_size=1 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" 2>/dev/null - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - range_size=1 - start=${cstart} - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - del "$(format)" 2>/dev/null - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - done - ) & pids="${pids} $!" - done - - # shellcheck disable=SC2046,SC2086 # word splitting wanted here - wait $(for pid in ${pids}; do echo ${pid}; done) - # shellcheck disable=SC2046,SC2086 - kill $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null - # shellcheck disable=SC2046,SC2086 - wait $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null - - return 0 -} - -# Timeout test template: -# - add all the elements with 3s timeout while checking that packets match -# - wait 3s after the last insertion, check that packets don't match any entry -test_timeout() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} - - timeout=3 - range_size=1 - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_match "${j}" $((j + src_delta)) || return 1 - done - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - sleep 3 - for i in $(seq ${start} $((start + count))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_nomatch "${j}" $((j + src_delta)) || return 1 - done - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done -} - -# Performance test template: -# - add concatenated ranged entries -# - add non-ranged concatenated entries (for hash set matching rate baseline) -# - add ranged entries with first field only (for rbhash baseline) -# - start pktgen injection directly on device rx path of this namespace -# - measure drop only rate, hash and rbtree baselines, then matching rate -test_performance() { - chain_spec=${perf_spec} - dst="${perf_dst}" - src="${perf_src}" - setup veth perf set || return ${KSELFTEST_SKIP} - - first=${start} - range_size=1 - for set in test norange noconcat; do - start=${first} - for i in $(seq ${start} $((start + perf_entries))); do - end=$((start + range_size)) - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - elif [ ${start} -eq ${end} ]; then - end=$((start + 1)) - fi - - add_perf ${set} - - start=$((end + range_size)) - done > "${tmp}" - nft -f "${tmp}" - done - - perf $((end - 1)) ${srcstart} - - sleep 2 - - nft add rule netdev perf test counter name \"test\" drop - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - info " baseline (drop from netdev hook): ${pps}pps" - handle="$(nft -a list chain netdev perf test | grep counter)" - handle="${handle##* }" - nft delete rule netdev perf test handle "${handle}" - - nft add rule "netdev perf test ${chain_spec} @norange \ - counter name \"test\" drop" - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - info " baseline hash (non-ranged entries): ${pps}pps" - handle="$(nft -a list chain netdev perf test | grep counter)" - handle="${handle##* }" - nft delete rule netdev perf test handle "${handle}" - - nft add rule "netdev perf test ${chain_spec%%. *} @noconcat \ - counter name \"test\" drop" - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - info " baseline rbtree (match on first field only): ${pps}pps" - handle="$(nft -a list chain netdev perf test | grep counter)" - handle="${handle##* }" - nft delete rule netdev perf test handle "${handle}" - - nft add rule "netdev perf test ${chain_spec} @test \ - counter name \"test\" drop" - nft reset counter netdev perf test >/dev/null 2>&1 - sleep "${perf_duration}" - pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))" - p5="$(printf %5s "${perf_entries}")" - info " set with ${p5} full, ranged entries: ${pps}pps" - kill "${perf_pid}" -} - -test_bug_flush_remove_add() { - set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' - elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' - elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' - for i in `seq 1 100`; do - nft add table t ${set_cmd} || return ${KSELFTEST_SKIP} - nft add element t s ${elem1} 2>/dev/null || return 1 - nft flush set t s 2>/dev/null || return 1 - nft add element t s ${elem2} 2>/dev/null || return 1 - done - nft flush ruleset -} - -# - add ranged element, check that packets match it -# - reload the set, check packets still match -test_bug_reload() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} - rstart=${start} - - range_size=1 - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - - # Avoid negative or zero-sized port ranges - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - fi - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - add "$(format)" || return 1 - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - # check kernel does allocate pcpu sctrach map - # for reload with no elemet add/delete - ( echo flush set inet filter test ; - nft list set inet filter test ) | nft -f - - - start=${rstart} - range_size=1 - - for i in $(seq "${start}" $((start + count))); do - end=$((start + range_size)) - - # Avoid negative or zero-sized port ranges - if [ $((end / 65534)) -gt $((start / 65534)) ]; then - start=${end} - end=$((end + 1)) - fi - srcstart=$((start + src_delta)) - srcend=$((end + src_delta)) - - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do - send_match "${j}" $((j + src_delta)) || return 1 - done - - range_size=$((range_size + 1)) - start=$((end + range_size)) - done - - nft flush ruleset -} - -test_reported_issues() { - eval test_bug_"${subtest}" -} - -# Run everything in a separate network namespace -[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } -tmp="$(mktemp)" -trap cleanup EXIT - -# Entry point for test runs -passed=0 -for name in ${TESTS}; do - printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')" - if [ "${name}" = "reported_issues" ]; then - SUBTESTS="${BUGS}" - else - SUBTESTS="${TYPES}" - fi - - for subtest in ${SUBTESTS}; do - eval desc=\$TYPE_"${subtest}" - IFS=' -' - for __line in ${desc}; do - # shellcheck disable=SC2086 - eval ${__line%% *}=\"${__line##* }\"; - done - IFS=' -' - - if [ "${name}" = "concurrency" ] && \ - [ "${race_repeat}" = "0" ]; then - continue - fi - if [ "${name}" = "performance" ] && \ - [ "${perf_duration}" = "0" ]; then - continue - fi - - printf " %-60s " "${display}" - eval test_"${name}" - ret=$? - - if [ $ret -eq 0 ]; then - printf "[ OK ]\n" - info_flush - passed=$((passed + 1)) - elif [ $ret -eq 1 ]; then - printf "[FAIL]\n" - err_flush - exit 1 - elif [ $ret -eq ${KSELFTEST_SKIP} ]; then - printf "[SKIP]\n" - err_flush - fi - done -done - -[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0 diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh deleted file mode 100755 index faa7778d7bd1..000000000000 --- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# -# This tests connection tracking helper assignment: -# 1. can attach ftp helper to a connection from nft ruleset. -# 2. auto-assign still works. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -testipv6=1 - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -which nc >/dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without netcat tool" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set veth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set veth0 up - -ip -net ${ns1} addr add 10.0.1.1/24 dev veth0 -ip -net ${ns1} addr add dead:1::1/64 dev veth0 - -ip -net ${ns2} addr add 10.0.1.2/24 dev veth0 -ip -net ${ns2} addr add dead:1::2/64 dev veth0 - -load_ruleset_family() { - local family=$1 - local ns=$2 - -ip netns exec ${ns} nft -f - < /dev/null |grep -q 'helper=ftp' - if [ $? -ne 0 ] ; then - if [ $autoassign -eq 0 ] ;then - echo "FAIL: ${netns} did not show attached helper $message" 1>&2 - ret=1 - else - echo "PASS: ${netns} did not show attached helper $message" 1>&2 - fi - else - if [ $autoassign -eq 0 ] ;then - echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 - else - echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 - ret=1 - fi - fi - - return 0 -} - -test_helper() -{ - local port=$1 - local autoassign=$2 - - if [ $autoassign -eq 0 ] ;then - msg="set via ruleset" - else - msg="auto-assign" - fi - - sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ip $msg" $port $autoassign - check_for_helper "$ns2" "ip $msg" $port $autoassign - - wait - - if [ $testipv6 -eq 0 ] ;then - return 0 - fi - - ip netns exec ${ns1} conntrack -F 2> /dev/null - ip netns exec ${ns2} conntrack -F 2> /dev/null - - sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null & - - sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null & - sleep 1 - - check_for_helper "$ns1" "ipv6 $msg" $port - check_for_helper "$ns2" "ipv6 $msg" $port - - wait -} - -load_ruleset_family ip ${ns1} -if [ $? -ne 0 ];then - echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 - exit 1 -fi - -load_ruleset_family ip6 ${ns1} -if [ $? -ne 0 ];then - echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 - testipv6=0 -fi - -load_ruleset_family inet ${ns2} -if [ $? -ne 0 ];then - echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 - load_ruleset_family ip ${ns2} - if [ $? -ne 0 ];then - echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 - exit 1 - fi - - if [ $testipv6 -eq 1 ] ;then - load_ruleset_family ip6 ${ns2} - if [ $? -ne 0 ];then - echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 - exit 1 - fi - fi -fi - -test_helper 2121 0 -ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -test_helper 21 1 - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh deleted file mode 100755 index dff476e45e77..000000000000 --- a/tools/testing/selftests/netfilter/nft_fib.sh +++ /dev/null @@ -1,273 +0,0 @@ -#!/bin/bash -# -# This tests the fib expression. -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) - -cleanup() -{ - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -trap cleanup EXIT - -dmesg | grep -q ' nft_rpfilter: ' -if [ $? -eq 0 ]; then - dmesg -c | grep ' nft_rpfilter: ' - echo "WARN: a previous test run has failed" 1>&2 -fi - -sysctl -q net.netfilter.nf_log_all_netns=1 -ip netns add ${ns1} -ip netns add ${ns2} - -load_ruleset() { - local netns=$1 - -ip netns exec ${netns} nft -f /dev/stdin <&2 - ip netns exec ${ns} nft list table inet filter - return 1 - fi - - if [ $want -gt 0 ]; then - echo "PASS: fib expression did drop packets for $address" - fi - - return 0 -} - -load_ruleset ${nsrouter} -load_ruleset ${ns1} -load_ruleset ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -test_ping() { - local daddr4=$1 - local daddr6=$2 - - ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null - ret=$? - if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 - return 1 - fi - - ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null - ret=$? - if [ $ret -ne 0 ];then - check_drops - echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 - return 1 - fi - - return 0 -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null - -sleep 3 - -test_ping 10.0.2.1 dead:2::1 || exit 1 -check_drops || exit 1 - -test_ping 10.0.2.99 dead:2::99 || exit 1 -check_drops || exit 1 - -echo "PASS: fib expression did not cause unwanted packet drops" - -ip netns exec ${nsrouter} nft flush table inet filter - -ip -net ${ns1} route del default -ip -net ${ns1} -6 route del default - -ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr del dead:1::99/64 dev eth0 - -ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr add dead:2::99/64 dev eth0 - -ip -net ${ns1} route add default via 10.0.2.1 -ip -net ${ns1} -6 route add default via dead:2::1 - -ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 - -# switch to ruleset that doesn't log, this time -# its expected that this does drop the packets. -load_ruleset_count ${nsrouter} - -# ns1 has a default route, but nsrouter does not. -# must not check return value, ping to 1.1.1.1 will -# fail. -check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 -check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 - -ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null -check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 - -sleep 2 -ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null -check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 - -# delete all rules -ip netns exec ${ns1} nft flush ruleset -ip netns exec ${ns2} nft flush ruleset -ip netns exec ${nsrouter} nft flush ruleset - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 - -ip -net ${ns1} addr del 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr del dead:2::99/64 dev eth0 - -ip -net ${nsrouter} addr del dead:2::1/64 dev veth0 - -# ... pbr ruleset for the router, check iif+oif. -load_pbr_ruleset ${nsrouter} -if [ $? -ne 0 ] ; then - echo "SKIP: Could not load fib forward ruleset" - exit $ksft_skip -fi - -ip -net ${nsrouter} rule add from all table 128 -ip -net ${nsrouter} rule add from all iif veth0 table 129 -ip -net ${nsrouter} route add table 128 to 10.0.1.0/24 dev veth0 -ip -net ${nsrouter} route add table 129 to 10.0.2.0/24 dev veth1 - -# drop main ipv4 table -ip -net ${nsrouter} -4 rule delete table main - -test_ping 10.0.2.99 dead:2::99 -if [ $? -ne 0 ] ; then - ip -net ${nsrouter} nft list ruleset - echo "FAIL: fib mismatch in pbr setup" - exit 1 -fi - -echo "PASS: fib expression forward check with policy based routing" -exit 0 diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh deleted file mode 100755 index a32f490f7539..000000000000 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ /dev/null @@ -1,672 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This tests basic flowtable functionality. -# Creates following default topology: -# -# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000) -# Router1 is the one doing flow offloading, Router2 has no special -# purpose other than having a link that is smaller than either Originator -# and responder, i.e. TCPMSS announced values are too large and will still -# result in fragmentation and/or PMTU discovery. -# -# You can check with different Orgininator/Link/Responder MTU eg: -# nft_flowtable.sh -o8000 -l1500 -r2000 -# - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsr1="nsr1-$sfx" -nsr2="nsr2-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -nsin="" -ns1out="" -ns2out="" - -log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "which nc" "run test without nc (netcat)" -checktool "ip netns add $nsr1" "create net namespace $nsr1" - -ip netns add $ns1 -ip netns add $ns2 -ip netns add $nsr2 - -cleanup() { - ip netns del $ns1 - ip netns del $ns2 - ip netns del $nsr1 - ip netns del $nsr2 - - rm -f "$nsin" "$ns1out" "$ns2out" - - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns -} - -trap cleanup EXIT - -sysctl -q net.netfilter.nf_log_all_netns=1 - -ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 - -ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 - -for dev in lo veth0 veth1; do - ip -net $nsr1 link set $dev up - ip -net $nsr2 link set $dev up -done - -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 - -ip -net $nsr2 addr add 10.0.2.1/24 dev veth1 -ip -net $nsr2 addr add dead:2::1/64 dev veth1 - -# set different MTUs so we need to push packets coming from ns1 (large MTU) -# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), -# or to do PTMU discovery (send ICMP error back to originator). -# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers -# is NOT the lowest link mtu. - -omtu=9000 -lmtu=1500 -rmtu=2000 - -usage(){ - echo "nft_flowtable.sh [OPTIONS]" - echo - echo "MTU options" - echo " -o originator" - echo " -l link" - echo " -r responder" - exit 1 -} - -while getopts "o:l:r:" o -do - case $o in - o) omtu=$OPTARG;; - l) lmtu=$OPTARG;; - r) rmtu=$OPTARG;; - *) usage;; - esac -done - -if ! ip -net $nsr1 link set veth0 mtu $omtu; then - exit 1 -fi - -ip -net $ns1 link set eth0 mtu $omtu - -if ! ip -net $nsr2 link set veth1 mtu $rmtu; then - exit 1 -fi - -ip -net $ns2 link set eth0 mtu $rmtu - -# transfer-net between nsr1 and nsr2. -# these addresses are not used for connections. -ip -net $nsr1 addr add 192.168.10.1/24 dev veth1 -ip -net $nsr1 addr add fee1:2::1/64 dev veth1 - -ip -net $nsr2 addr add 192.168.10.2/24 dev veth0 -ip -net $nsr2 addr add fee1:2::2/64 dev veth0 - -for i in 0 1; do - ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null - ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null -done - -for ns in $ns1 $ns2;do - ip -net $ns link set lo up - ip -net $ns link set eth0 up - - if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then - echo "ERROR: Check Originator/Responder values (problem during address addition)" - exit 1 - fi - # don't set ip DF bit for first two tests - ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null -done - -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns2 addr add dead:2::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $ns2 route add default via dead:2::1 - -ip -net $nsr1 route add default via 192.168.10.2 -ip -net $nsr2 route add default via 192.168.10.1 - -ip netns exec $nsr1 nft -f - < /dev/null; then - echo "ERROR: $ns1 cannot reach ns2" 1>&2 - exit 1 -fi - -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then - echo "ERROR: $ns2 cannot reach $ns1" 1>&2 - exit 1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" -fi - -nsin=$(mktemp) -ns1out=$(mktemp) -ns2out=$(mktemp) - -make_file() -{ - name=$1 - - SIZE=$((RANDOM % (1024 * 128))) - SIZE=$((SIZE + (1024 * 8))) - TSIZE=$((SIZE * 1024)) - - dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null - - SIZE=$((RANDOM % 1024)) - SIZE=$((SIZE + 128)) - TSIZE=$((TSIZE + SIZE)) - dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null -} - -check_counters() -{ - local what=$1 - local ok=1 - - local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets) - local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets) - - local orig_cnt=${orig#*bytes} - local repl_cnt=${repl#*bytes} - - local fs=$(du -sb $nsin) - local max_orig=${fs%%/*} - local max_repl=$((max_orig/4)) - - if [ $orig_cnt -gt $max_orig ];then - echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 - ret=1 - ok=0 - fi - - if [ $repl_cnt -gt $max_repl ];then - echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 - ret=1 - ok=0 - fi - - if [ $ok -eq 1 ]; then - echo "PASS: $what" - fi -} - -check_dscp() -{ - local what=$1 - local ok=1 - - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets) - - local pc4=${counter%*bytes*} - local pc4=${pc4#*packets} - - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets) - local pc4z=${counter%*bytes*} - local pc4z=${pc4z#*packets} - - case "$what" in - "dscp_none") - if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - "dscp_fwd") - if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - "dscp_ingress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - "dscp_egress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 - ret=1 - ok=0 - fi - ;; - *) - echo "FAIL: Unknown DSCP check" 1>&2 - ret=1 - ok=0 - esac - - if [ $ok -eq 1 ] ;then - echo "PASS: $what: dscp packet counters match" - fi -} - -check_transfer() -{ - in=$1 - out=$2 - what=$3 - - if ! cmp "$in" "$out" > /dev/null 2>&1; then - echo "FAIL: file mismatch for $what" 1>&2 - ls -l "$in" - ls -l "$out" - return 1 - fi - - return 0 -} - -test_tcp_forwarding_ip() -{ - local nsa=$1 - local nsb=$2 - local dstip=$3 - local dstport=$4 - local lret=0 - - ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" & - lpid=$! - - sleep 1 - ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & - cpid=$! - - sleep 1 - - prev="$(ls -l $ns1out $ns2out)" - sleep 1 - - while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do - sleep 1; - prev="$(ls -l $ns1out $ns2out)" - done - - if test -d /proc/"$lpid"/; then - kill $lpid - fi - - if test -d /proc/"$cpid"/; then - kill $cpid - fi - - wait $lpid - wait $cpid - - if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then - lret=1 - fi - - if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then - lret=1 - fi - - return $lret -} - -test_tcp_forwarding() -{ - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - - return $? -} - -test_tcp_forwarding_set_dscp() -{ - check_dscp "dscp_none" - -ip netns exec $nsr1 nft -f - <&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - -# delete default route, i.e. ns2 won't be able to reach ns1 and -# will depend on ns1 being masqueraded in nsr1. -# expect ns1 has nsr1 address. -ip -net $ns2 route del default via 10.0.2.1 -ip -net $ns2 route del default via dead:2::1 -ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 - -# Second test: -# Same, but with NAT enabled. Same as in first test: we expect normal forward path -# to handle most packets. -ip netns exec $nsr1 nft -f - <&2 - exit 0 -fi - -if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then - echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - -# Third test: -# Same as second test, but with PMTU discovery enabled. This -# means that we expect the fastpath to handle packets as soon -# as the endpoints adjust the packet size. -ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null - -# reset counters. -# With pmtu in-place we'll also check that nft counters -# are lower than file size and packets were forwarded via flowtable layer. -# For earlier tests (large mtus), packets cannot be handled via flowtable -# (except pure acks and other small packets). -ip netns exec $nsr1 nft reset counters table inet filter >/dev/null - -if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then - echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 - ip netns exec $nsr1 nft list ruleset -fi - -# Another test: -# Add bridge interface br0 to Router1, with NAT enabled. -ip -net $nsr1 link add name br0 type bridge -ip -net $nsr1 addr flush dev veth0 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set veth0 master br0 -ip -net $nsr1 addr add 10.0.1.1/24 dev br0 -ip -net $nsr1 addr add dead:1::1/64 dev br0 -ip -net $nsr1 link set up dev br0 - -ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null - -# br0 with NAT enabled. -ip netns exec $nsr1 nft -f - <&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - - -# Another test: -# Add bridge interface br0 to Router1, with NAT and VLAN. -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set down dev veth0 -ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set up dev veth0.10 -ip -net $nsr1 link set veth0.10 master br0 - -ip -net $ns1 addr flush dev eth0 -ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10 -ip -net $ns1 link set eth0 up -ip -net $ns1 link set eth0.10 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0.10 - -if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then - echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 - ip netns exec $nsr1 nft list ruleset - ret=1 -fi - -# restore test topology (remove bridge and VLAN) -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set veth0 down -ip -net $nsr1 link set veth0.10 down -ip -net $nsr1 link delete veth0.10 type vlan -ip -net $nsr1 link delete br0 type bridge -ip -net $ns1 addr flush dev eth0.10 -ip -net $ns1 link set eth0.10 down -ip -net $ns1 link set eth0 down -ip -net $ns1 link delete eth0.10 type vlan - -# restore address in ns1 and nsr1 -ip -net $ns1 link set eth0 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns1 route add default via dead:1::1 -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 -ip -net $nsr1 link set up dev veth0 - -KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) -KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) -SPI1=$RANDOM -SPI2=$RANDOM - -if [ $SPI1 -eq $SPI2 ]; then - SPI2=$((SPI2+1)) -fi - -do_esp() { - local ns=$1 - local me=$2 - local remote=$3 - local lnet=$4 - local rnet=$5 - local spi_out=$6 - local spi_in=$7 - - ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet - ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet - - # to encrypt packets as they go out (includes forwarded packets that need encapsulation) - ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow - # to fwd decrypted packets after esp processing: - ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow - -} - -do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 - -do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 - -ip netns exec $nsr1 nft delete table ip nat - -# restore default routes -ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns2 route add default via dead:2::1 - -if test_tcp_forwarding $ns1 $ns2; then - check_counters "ipsec tunnel mode for ns1/ns2" -else - echo "FAIL: ipsec tunnel mode for ns1/ns2" - ip netns exec $nsr1 nft list ruleset 1>&2 - ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh deleted file mode 100755 index f33154c04d34..000000000000 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/bin/bash - -# check iif/iifname/oifgroup/iiftype match. - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" - -if ! nft --version > /dev/null 2>&1; then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -cleanup() -{ - ip netns del "$ns0" -} - -ip netns add "$ns0" -ip -net "$ns0" link set lo up -ip -net "$ns0" addr add 127.0.0.1 dev lo - -trap cleanup EXIT - -currentyear=$(date +%Y) -lastyear=$((currentyear-1)) -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - -check_lo_counters "2" true - -check_one_counter oskuidcounter "1" true -check_one_counter oskgidcounter "1" true -check_one_counter imarkcounter "1" true -check_one_counter omarkcounter "1" true -check_one_counter ilastyearcounter "0" true - -if [ $ret -eq 0 ];then - echo "OK: nftables meta iif/oif counters at expected values" -else - exit $ret -fi - -#First CPU execution and counter -taskset -p 01 $$ > /dev/null -ip netns exec "$ns0" nft reset counters > /dev/null -ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null -check_one_counter icpu0counter "2" true - -if [ $ret -eq 0 ];then - echo "OK: nftables meta cpu counter at expected values" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh deleted file mode 100755 index dd40d9f6f259..000000000000 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ /dev/null @@ -1,1224 +0,0 @@ -#!/bin/bash -# -# This test is for basic NAT functionality: snat, dnat, redirect, masquerade. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 -test_inet_nat=true - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" - -cleanup() -{ - for i in 0 1 2; do ip netns del ns$i-"$sfx";done -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi - -trap cleanup EXIT - -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi - -ip netns add "$ns2" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns2" - exit $ksft_skip -fi - -ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2" - -ip -net "$ns0" link set lo up -ip -net "$ns0" link set veth0 up -ip -net "$ns0" addr add 10.0.1.1/24 dev veth0 -ip -net "$ns0" addr add dead:1::1/64 dev veth0 - -ip -net "$ns0" link set veth1 up -ip -net "$ns0" addr add 10.0.2.1/24 dev veth1 -ip -net "$ns0" addr add dead:2::1/64 dev veth1 - -for i in 1 2; do - ip -net ns$i-$sfx link set lo up - ip -net ns$i-$sfx link set eth0 up - ip -net ns$i-$sfx addr add 10.0.$i.99/24 dev eth0 - ip -net ns$i-$sfx route add default via 10.0.$i.1 - ip -net ns$i-$sfx addr add dead:$i::99/64 dev eth0 - ip -net ns$i-$sfx route add default via dead:$i::1 -done - -bad_counter() -{ - local ns=$1 - local counter=$2 - local expect=$3 - local tag=$4 - - echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2 - ip netns exec $ns nft list counter inet filter $counter 1>&2 -} - -check_counters() -{ - ns=$1 - local lret=0 - - cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1" - lret=1 - fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2" - lret=1 - fi - - expect="packets 1 bytes 104" - cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in6 "$expect" "check_counters 3" - lret=1 - fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out6 "$expect" "check_counters 4" - lret=1 - fi - - return $lret -} - -check_ns0_counters() -{ - local ns=$1 - local lret=0 - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0in6 "packets 0 bytes 0" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2" - lret=1 - fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 " - lret=1 - fi - - for dir in "in" "out" ; do - expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4" - lret=1 - fi - - expect="packets 1 bytes 104" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5" - lret=1 - fi - done - - return $lret -} - -reset_counters() -{ - for i in 0 1 2;do - ip netns exec ns$i-$sfx nft reset counters inet > /dev/null - done -} - -test_local_dnat6() -{ - local family=$1 - local lret=0 - local IPF="" - - if [ $family = "inet" ];then - IPF="ip6" - fi - -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ]; then - lret=1 - echo "ERROR: ping6 failed" - return $lret - fi - - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1" - lret=1 - fi - done - - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2" - lret=1 - fi - done - - # expect 0 count in ns1 - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3" - lret=1 - fi - done - - # expect 1 packet in ns2 - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4" - lret=1 - fi - done - - test $lret -eq 0 && echo "PASS: ipv6 ping to $ns1 was $family NATted to $ns2" - ip netns exec "$ns0" nft flush chain ip6 nat output - - return $lret -} - -test_local_dnat() -{ - local family=$1 - local lret=0 - local IPF="" - - if [ $family = "inet" ];then - IPF="ip" - fi - -ip netns exec "$ns0" nft -f /dev/stdin </dev/null -table $family nat { - chain output { - type nat hook output priority 0; policy accept; - ip daddr 10.0.1.99 dnat $IPF to 10.0.2.99 - } -} -EOF - if [ $? -ne 0 ]; then - if [ $family = "inet" ];then - echo "SKIP: inet nat tests" - test_inet_nat=false - return $ksft_skip - fi - echo "SKIP: Could not add add $family dnat hook" - return $ksft_skip - fi - - # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then - lret=1 - echo "ERROR: ping failed" - return $lret - fi - - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1" - lret=1 - fi - done - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2" - lret=1 - fi - done - - # expect 0 count in ns1 - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3" - lret=1 - fi - done - - # expect 1 packet in ns2 - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4" - lret=1 - fi - done - - test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2" - - ip netns exec "$ns0" nft flush chain $family nat output - - reset_counters - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then - lret=1 - echo "ERROR: ping failed" - return $lret - fi - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5" - lret=1 - fi - done - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6" - lret=1 - fi - done - - # expect 1 count in ns1 - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7" - lret=1 - fi - done - - # expect 0 packet in ns2 - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8" - lret=1 - fi - done - - test $lret -eq 0 && echo "PASS: ping to $ns1 OK after $family nat output chain flush" - - return $lret -} - -test_local_dnat_portonly() -{ - local family=$1 - local daddr=$2 - local lret=0 - local sr_s - local sr_r - -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" - return 1 - lret=1 - fi - - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2" - lret=1 - fi - done - - reset_counters - -# add masquerading rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" - lret=1 - fi - - # ns1 should have seen packets from ns0, due to masquerade - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4" - lret=1 - fi - done - - # ns1 should not have seen packets from ns2, due to masquerade - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6" - lret=1 - fi - done - - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)" - lret=1 - fi - - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then - echo "ERROR: Could not flush $family nat postrouting" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IPv6 masquerade $natflags for $ns2" - - return $lret -} - -test_masquerade() -{ - local family=$1 - local natflags=$2 - local lret=0 - - ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from "$ns2" $natflags" - lret=1 - fi - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2" - lret=1 - fi - done - - reset_counters - -# add masquerading rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" - lret=1 - fi - - # ns1 should have seen packets from ns0, due to masquerade - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4" - lret=1 - fi - done - - # ns1 should not have seen packets from ns2, due to masquerade - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6" - lret=1 - fi - done - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)" - lret=1 - fi - - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then - echo "ERROR: Could not flush $family nat postrouting" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IP masquerade $natflags for $ns2" - - return $lret -} - -test_redirect6() -{ - local family=$1 - local lret=0 - - ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" - lret=1 - fi - - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2" - lret=1 - fi - done - - reset_counters - -# add redirect rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect" - lret=1 - fi - - # ns1 should have seen no packets from ns2, due to redirection - expect="packets 0 bytes 0" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3" - lret=1 - fi - done - - # ns0 should have seen packets from ns2, due to masquerade - expect="packets 1 bytes 104" - for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4" - lret=1 - fi - done - - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then - echo "ERROR: Could not delete $family nat table" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IPv6 redirection for $ns2" - - return $lret -} - -test_redirect() -{ - local family=$1 - local lret=0 - - ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2" - lret=1 - fi - - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1" - lret=1 - fi - - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2" - lret=1 - fi - done - - reset_counters - -# add redirect rule -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect" - lret=1 - fi - - # ns1 should have seen no packets from ns2, due to redirection - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3" - lret=1 - fi - done - - # ns0 should have seen packets from ns2, due to masquerade - expect="packets 1 bytes 84" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4" - lret=1 - fi - done - - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then - echo "ERROR: Could not delete $family nat table" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: $family IP redirection for $ns2" - - return $lret -} - -# test port shadowing. -# create two listening services, one on router (ns0), one -# on client (ns2), which is masqueraded from ns1 point of view. -# ns2 sends udp packet coming from service port to ns1, on a highport. -# Later, if n1 uses same highport to connect to ns0:service, packet -# might be port-forwarded to ns2 instead. - -# second argument tells if we expect the 'fake-entry' to take effect -# (CLIENT) or not (ROUTER). -test_port_shadow() -{ - local test=$1 - local expect=$2 - local daddrc="10.0.1.99" - local daddrs="10.0.1.1" - local result="" - local logmsg="" - - # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. - echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405 - - echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 & - sc_r=$! - - echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport & - sc_c=$! - - sleep 0.3 - - # ns1 tries to connect to ns0:1405. With default settings this should connect - # to client, it matches the conntrack entry created above. - - result=$(echo "data" | ip netns exec "$ns1" timeout 1 socat - UDP:"$daddrs":1405,sourceport=41404) - - if [ "$result" = "$expect" ] ;then - echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}" - else - echo "ERROR: portshadow test $test: got reply from \"$result\", not $expect as intended" - ret=1 - fi - - kill $sc_r $sc_c 2>/dev/null - - # flush udp entries for next test round, if any - ip netns exec "$ns0" conntrack -F >/dev/null 2>&1 -} - -# This prevents port shadow of router service via packet filter, -# packets claiming to originate from service port from internal -# network are dropped. -test_port_shadow_filter() -{ - local family=$1 - -ip netns exec "$ns0" nft -f /dev/stdin </dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: Could not run nat port shadowing test without conntrack tool" - return - fi - - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: Could not run nat port shadowing test without socat tool" - return - fi - - ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" - return 1 - fi - -ip netns exec "$ns0" nft -f /dev/stdin < /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" - lret=1 - fi - - # ns1 should have seen packets from .2.2, due to stateless rewrite. - expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" - lret=1 - fi - - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" - lret=1 - fi - done - - # ns1 should not have seen packets from ns2, due to masquerade - expect="packets 0 bytes 0" - for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" - lret=1 - fi - - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" - lret=1 - fi - done - - reset_counters - - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: Could not run stateless nat frag test without socat tool" - if [ $lret -eq 0 ]; then - return $ksft_skip - fi - - ip netns exec "$ns0" nft delete table ip stateless - return $lret - fi - - local tmpfile=$(mktemp) - dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null - - local outfile=$(mktemp) - ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & - sc_r=$! - - sleep 1 - # re-do with large ping -> ip fragmentation - ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null - if [ $? -ne 0 ] ; then - echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 - lret=1 - fi - - wait - - cmp "$tmpfile" "$outfile" - if [ $? -ne 0 ]; then - ls -l "$tmpfile" "$outfile" - echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 - lret=1 - fi - - rm -f "$tmpfile" "$outfile" - - # ns1 should have seen packets from 2.2, due to stateless rewrite. - expect="packets 3 bytes 4164" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" - lret=1 - fi - - ip netns exec "$ns0" nft delete table ip stateless - if [ $? -ne 0 ]; then - echo "ERROR: Could not delete table ip stateless" 1>&2 - lret=1 - fi - - test $lret -eq 0 && echo "PASS: IP statless for $ns2" - - return $lret -} - -# ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 -for i in 0 1 2; do -ip netns exec ns$i-$sfx nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s)" 1>&2 - ret=1 - fi - - ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 - ret=1 - fi - check_counters ns$i-$sfx - if [ $? -ne 0 ]; then - ret=1 - fi - - check_ns0_counters ns$i - if [ $? -ne 0 ]; then - ret=1 - fi - reset_counters -done - -if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2" -fi - -reset_counters -test_local_dnat ip -test_local_dnat6 ip6 - -reset_counters -test_local_dnat_portonly inet 10.0.1.99 - -reset_counters -$test_inet_nat && test_local_dnat inet -$test_inet_nat && test_local_dnat6 inet - -for flags in "" "fully-random"; do -reset_counters -test_masquerade ip $flags -test_masquerade6 ip6 $flags -reset_counters -$test_inet_nat && test_masquerade inet $flags -$test_inet_nat && test_masquerade6 inet $flags -done - -reset_counters -test_redirect ip -test_redirect6 ip6 -reset_counters -$test_inet_nat && test_redirect inet -$test_inet_nat && test_redirect6 inet - -test_port_shadowing -test_stateless_nat_ip - -if [ $ret -ne 0 ];then - echo -n "FAIL: " - nft --version -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/netfilter/nft_nat_zones.sh deleted file mode 100755 index b9ab37380f33..000000000000 --- a/tools/testing/selftests/netfilter/nft_nat_zones.sh +++ /dev/null @@ -1,309 +0,0 @@ -#!/bin/bash -# -# Test connection tracking zone and NAT source port reallocation support. -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -# Don't increase too much, 2000 clients should work -# just fine but script can then take several minutes with -# KASAN/debug builds. -maxclients=100 - -have_iperf=1 -ret=0 - -# client1---. -# veth1-. -# | -# NAT Gateway --veth0--> Server -# | | -# veth2-' | -# client2---' | -# .... | -# clientX----vethX---' - -# All clients share identical IP address. -# NAT Gateway uses policy routing and conntrack zones to isolate client -# namespaces. Each client connects to Server, each with colliding tuples: -# clientsaddr:10000 -> serveraddr:dport -# NAT Gateway is supposed to do port reallocation for each of the -# connections. - -sfx=$(mktemp -u "XXXXXXXX") -gw="ns-gw-$sfx" -cl1="ns-cl1-$sfx" -cl2="ns-cl2-$sfx" -srv="ns-srv-$sfx" - -v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) -v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) -v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) -v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null) -v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null) -v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) - -cleanup() -{ - ip netns del $gw - ip netns del $srv - for i in $(seq 1 $maxclients); do - ip netns del ns-cl$i-$sfx 2>/dev/null - done - - sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -iperf3 -v >/dev/null 2>&1 -if [ $? -ne 0 ];then - have_iperf=0 -fi - -ip netns add "$gw" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $gw" - exit $ksft_skip -fi -ip -net "$gw" link set lo up - -trap cleanup EXIT - -ip netns add "$srv" -if [ $? -ne 0 ];then - echo "SKIP: Could not create server netns $srv" - exit $ksft_skip -fi - -ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" -ip -net "$gw" link set veth0 up -ip -net "$srv" link set lo up -ip -net "$srv" link set eth0 up - -sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null -sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null -sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null -sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null -sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null -sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null - -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" - - ip netns add "$cl" - if [ $? -ne 0 ];then - echo "SKIP: Could not create client netns $cl" - exit $ksft_skip - fi - ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 - if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip - fi -done - -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" - echo netns exec "$cl" ip link set lo up - echo netns exec "$cl" ip link set eth0 up - echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 - echo netns exec "$gw" ip link set veth$i up - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 - - # clients have same IP addresses. - echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 - echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 - echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 - echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 - - # NB: same addresses on client-facing interfaces. - echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i - echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i - - # gw: policy routing - echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) - echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) - echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) - echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) - echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) -done | ip -batch /dev/stdin - -ip -net "$gw" addr add 10.3.0.1/24 dev veth0 -ip -net "$gw" addr add dead:3::1/64 dev veth0 - -ip -net "$srv" addr add 10.3.0.99/24 dev eth0 -ip -net "$srv" addr add dead:3::99/64 dev eth0 - -ip netns exec $gw nft -f /dev/stdin< /dev/null -ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null - -# useful for debugging: allows to use 'ping' from clients to gateway. -ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null -ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null - -for i in $(seq 1 $maxclients); do - cl="ns-cl$i-$sfx" - ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & - if [ $? -ne 0 ]; then - echo FAIL: Ping failure from $cl 1>&2 - ret=1 - break - fi -done - -wait - -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" - if [ $? -ne 0 ];then - ret=1 - echo "FAIL: counter icmp mismatch for veth$i" 1>&2 - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 - break - fi -done - -ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" -if [ $? -ne 0 ];then - ret=1 - echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" - ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 -fi - -if [ $ret -eq 0 ]; then - echo "PASS: ping test from all $maxclients namespaces" -fi - -if [ $have_iperf -eq 0 ];then - echo "SKIP: iperf3 not installed" - if [ $ret -ne 0 ];then - exit $ret - fi - exit $ksft_skip -fi - -ip netns exec $srv iperf3 -s > /dev/null 2>&1 & -iperfpid=$! -sleep 1 - -for i in $(seq 1 $maxclients); do - if [ $ret -ne 0 ]; then - break - fi - cl="ns-cl$i-$sfx" - ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null - if [ $? -ne 0 ]; then - echo FAIL: Failure to connect for $cl 1>&2 - ip netns exec $gw conntrack -S 1>&2 - ret=1 - fi -done -if [ $ret -eq 0 ];then - echo "PASS: iperf3 connections for all $maxclients net namespaces" -fi - -kill $iperfpid -wait - -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null - if [ $? -ne 0 ];then - ret=1 - echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 - break - fi -done -if [ $ret -eq 0 ];then - echo "PASS: Found client connection for all $maxclients net namespaces" -fi - -ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null -if [ $? -ne 0 ];then - ret=1 - echo "FAIL: cannot find return entry on veth0" 1>&2 -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh deleted file mode 100755 index e12729753351..000000000000 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ /dev/null @@ -1,449 +0,0 @@ -#!/bin/bash -# -# This tests nf_queue: -# 1. can process packets from all hooks -# 2. support running nfqueue from more than one base chain -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 - -cleanup() -{ - ip netns pids ${ns1} | xargs kill 2>/dev/null - ip netns pids ${ns2} | xargs kill 2>/dev/null - ip netns pids ${nsrouter} | xargs kill 2>/dev/null - - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} - rm -f "$TMPFILE0" - rm -f "$TMPFILE1" - rm -f "$TMPFILE2" "$TMPFILE3" -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi - -TMPFILE0=$(mktemp) -TMPFILE1=$(mktemp) -TMPFILE2=$(mktemp) -TMPFILE3=$(mktemp) -trap cleanup EXIT - -ip netns add ${ns1} -ip netns add ${ns2} - -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: No virtual ethernet pair device support in kernel" - exit $ksft_skip -fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} - -ip -net ${nsrouter} link set lo up -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 - -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 - -ip -net ${ns1} link set lo up -ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up -ip -net ${ns2} link set eth0 up - -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 - -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 - -load_ruleset() { - local name=$1 - local prio=$2 - -ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_ping_router() { - ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null - if [ $? -ne 0 ];then - return 1 - fi - - return 0 -} - -test_queue_blackhole() { - local proto=$1 - -ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null - lret=$? - elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null - lret=$? - else - lret=111 - fi - - # queue without bypass keyword should drop traffic if no listener exists. - if [ $lret -eq 0 ];then - echo "FAIL: $proto expected failure, got $lret" 1>&2 - exit 1 - fi - - ip netns exec ${nsrouter} nft delete table $proto blackh - if [ $? -ne 0 ] ;then - echo "FAIL: $proto: Could not delete blackh table" - exit 1 - fi - - echo "PASS: $proto: statement with no listener results in packet drop" -} - -test_queue() -{ - local expected=$1 - local last="" - - # spawn nf-queue listeners - ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" & - sleep 1 - test_ping - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - test_ping_router - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2 - exit $ret - fi - - wait - ret=$? - - for file in $TMPFILE0 $TMPFILE1; do - last=$(tail -n1 "$file") - if [ x"$last" != x"$expected packets total" ]; then - echo "FAIL: Expected $expected packets total, but got $last" 1>&2 - cat "$file" 1>&2 - - ip netns exec ${nsrouter} nft list ruleset - exit 1 - fi - done - - echo "PASS: Expected and received $last" -} - -test_tcp_forward() -{ - ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout & - local nfqpid=$! - - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - sleep 1 - ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null & - - rm -f "$tmpfile" - - wait $rpid - wait $lpid - [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" -} - -test_tcp_localhost() -{ - tmpfile=$(mktemp) || exit 1 - - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback" - wait 2>/dev/null -} - -test_tcp_localhost_connectclose() -{ - tmpfile=$(mktemp) || exit 1 - - ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & - - ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 - rm -f "$tmpfile" - - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" - wait 2>/dev/null -} - -test_tcp_localhost_requeue() -{ -ip netns exec ${nsrouter} nft -f /dev/stdin </dev/null & - local rpid=$! - - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" & - - # nfqueue 1 will be called via output hook. But this time, - # re-queue the packet to nfqueue program on queue 2. - ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & - - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" - - wait - - if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then - echo "FAIL: lost packets during requeue?!" 1>&2 - return - fi - - echo "PASS: tcp via loopback and re-queueing" -} - -test_icmp_vrf() { - ip -net $ns1 link add tvrf type vrf table 9876 - if [ $? -ne 0 ];then - echo "SKIP: Could not add vrf device" - return - fi - - ip -net $ns1 li set eth0 master tvrf - ip -net $ns1 li set tvrf up - - ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 -ip netns exec ${ns1} nft -f /dev/stdin < /dev/null - - for n in output post; do - for d in tvrf eth0; do - ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1" - if [ $? -ne 0 ] ; then - echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 - ip netns exec ${ns1} nft list ruleset - ret=1 - return - fi - done - done - - wait $nfqpid - [ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf" - wait 2>/dev/null -} - -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - -load_ruleset "filter" 0 - -sleep 3 - -test_ping -ret=$? -if [ $ret -eq 0 ];then - # queue bypass works (rules were skipped, no listener) - echo "PASS: ${ns1} can reach ${ns2}" -else - echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 - exit $ret -fi - -test_queue_blackhole ip -test_queue_blackhole ip6 - -# dummy ruleset to add base chains between the -# queueing rules. We don't want the second reinject -# to re-execute the old hooks. -load_counter_ruleset 10 - -# we are hooking all: prerouting/input/forward/output/postrouting. -# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so: -# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). -# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. -# so we expect that userspace program receives 10 packets. -test_queue 10 - -# same. We queue to a second program as well. -load_ruleset "filter2" 20 -test_queue 20 - -test_tcp_forward -test_tcp_localhost -test_tcp_localhost_connectclose -test_tcp_localhost_requeue -test_icmp_vrf - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_synproxy.sh b/tools/testing/selftests/netfilter/nft_synproxy.sh deleted file mode 100755 index b62933b680d6..000000000000 --- a/tools/testing/selftests/netfilter/nft_synproxy.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -rnd=$(mktemp -u XXXXXXXX) -nsr="nsr-$rnd" # synproxy machine -ns1="ns1-$rnd" # iperf client -ns2="ns2-$rnd" # iperf server - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "iperf3 --version" "run test without iperf3" -checktool "ip netns add $nsr" "create net namespace" - -modprobe -q nf_conntrack - -ip netns add $ns1 -ip netns add $ns2 - -cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - ip netns del $ns1 - ip netns del $ns2 - - ip netns del $nsr -} - -trap cleanup EXIT - -ip link add veth0 netns $nsr type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr type veth peer name eth0 netns $ns2 - -for dev in lo veth0 veth1; do -ip -net $nsr link set $dev up -done - -ip -net $nsr addr add 10.0.1.1/24 dev veth0 -ip -net $nsr addr add 10.0.2.1/24 dev veth1 - -ip netns exec $nsr sysctl -q net.ipv4.conf.veth0.forwarding=1 -ip netns exec $nsr sysctl -q net.ipv4.conf.veth1.forwarding=1 -ip netns exec $nsr sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 - -for n in $ns1 $ns2; do - ip -net $n link set lo up - ip -net $n link set eth0 up -done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 - -# test basic connectivity -if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then - echo "ERROR: $ns1 cannot reach $ns2" 1>&2 - exit 1 -fi - -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then - echo "ERROR: $ns2 cannot reach $ns1" 1>&2 - exit 1 -fi - -ip netns exec $ns2 iperf3 -s > /dev/null 2>&1 & -# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & - -sleep 1 - -ip netns exec $nsr nft -f - < /dev/null - -if [ $? -ne 0 ]; then - echo "FAIL: iperf3 returned an error" 1>&2 - ret=$? - ip netns exec $nsr nft list ruleset -else - echo "PASS: synproxy connection successful" -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/nft_trans_stress.sh b/tools/testing/selftests/netfilter/nft_trans_stress.sh deleted file mode 100755 index 2ffba45a78bf..000000000000 --- a/tools/testing/selftests/netfilter/nft_trans_stress.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash -# -# This test is for stress-testing the nf_tables config plane path vs. -# packet path processing: Make sure we never release rules that are -# still visible to other cpus. -# -# set -e - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -testns=testns-$(mktemp -u "XXXXXXXX") -tmp="" - -tables="foo bar baz quux" -global_ret=0 -eret=0 -lret=0 - -cleanup() { - ip netns pids "$testns" | xargs kill 2>/dev/null - ip netns del "$testns" - - rm -f "$tmp" -} - -check_result() -{ - local r=$1 - local OK="PASS" - - if [ $r -ne 0 ] ;then - OK="FAIL" - global_ret=$r - fi - - echo "$OK: nft $2 test returned $r" - - eret=0 -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -trap cleanup EXIT -tmp=$(mktemp) - -for table in $tables; do - echo add table inet "$table" >> "$tmp" - echo flush table inet "$table" >> "$tmp" - - echo "add chain inet $table INPUT { type filter hook input priority 0; }" >> "$tmp" - echo "add chain inet $table OUTPUT { type filter hook output priority 0; }" >> "$tmp" - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - echo "add chain inet $table $chain" >> "$tmp" - done - - for c in $(seq 1 400); do - chain=$(printf "chain%03u" "$c") - for BASE in INPUT OUTPUT; do - echo "add rule inet $table $BASE counter jump $chain" >> "$tmp" - done - echo "add rule inet $table $chain counter return" >> "$tmp" - done -done - -ip netns add "$testns" -ip -netns "$testns" link set lo up - -lscpu | grep ^CPU\(s\): | ( read cpu cpunum ; -cpunum=$((cpunum-1)) -for i in $(seq 0 $cpunum);do - mask=$(printf 0x%x $((1<<$i))) - ip netns exec "$testns" taskset $mask ping -4 127.0.0.1 -fq > /dev/null & - ip netns exec "$testns" taskset $mask ping -6 ::1 -fq > /dev/null & -done) - -sleep 1 - -ip netns exec "$testns" nft -f "$tmp" -for i in $(seq 1 10) ; do ip netns exec "$testns" nft -f "$tmp" & done - -for table in $tables;do - randsleep=$((RANDOM%2)) - sleep $randsleep - ip netns exec "$testns" nft delete table inet $table - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "reload" - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp" - echo "insert rule inet foo INPUT meta nftrace set 1" - echo "insert rule inet foo OUTPUT meta nftrace set 1" - ) | ip netns exec "$testns" nft -f /dev/stdin - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi - - (echo "flush ruleset"; cat "$tmp" - ) | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=$lret - fi -done - -check_result $eret "add/delete with nftrace enabled" - -echo "insert rule inet foo INPUT meta nftrace set 1" >> $tmp -echo "insert rule inet foo OUTPUT meta nftrace set 1" >> $tmp - -for i in $(seq 1 10) ; do - (echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin - - lret=$? - if [ $lret -ne 0 ]; then - eret=1 - fi -done - -check_result $lret "add/delete with nftrace enabled" - -exit $global_ret diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh deleted file mode 100755 index 5a8db0b48928..000000000000 --- a/tools/testing/selftests/netfilter/nft_zones_many.sh +++ /dev/null @@ -1,163 +0,0 @@ -#!/bin/bash - -# Test insertion speed for packets with identical addresses/ports -# that are all placed in distinct conntrack zones. - -sfx=$(mktemp -u "XXXXXXXX") -ns="ns-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -zones=2000 -have_ct_tool=0 -ret=0 - -cleanup() -{ - ip netns del $ns -} - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - -checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "socat -V" "run test without socat tool" -checktool "ip netns add $ns" "create net namespace" - -trap cleanup EXIT - -conntrack -V > /dev/null 2>&1 -if [ $? -eq 0 ];then - have_ct_tool=1 -fi - -ip -net "$ns" link set lo up - -test_zones() { - local max_zones=$1 - -ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 -ip netns exec $ns nft -f /dev/stdin</dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 - if [ $? -ne 0 ] ;then - ret=1 - break - fi - - stop=$(date +%s%3N) - local duration=$((stop-start)) - echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" - done - - if [ $have_ct_tool -eq 1 ]; then - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) - - if [ $count -eq $max_zones ]; then - echo "PASS: inserted $count entries from packet path in $duration ms total" - else - ip netns exec $ns conntrack -S 1>&2 - echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" - ret=1 - fi - fi - - if [ $ret -ne 0 ];then - echo "FAIL: insert $max_zones entries from packet path" 1>&2 - fi -} - -test_conntrack_tool() { - local max_zones=$1 - - ip netns exec $ns conntrack -F >/dev/null 2>/dev/null - - local outerstart=$(date +%s%3N) - local start=$(date +%s%3N) - local stop=$start - local i=0 - while [ $i -lt $max_zones ]; do - i=$((i + 1)) - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ - --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 - if [ $? -ne 0 ];then - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ - --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null - echo "FAIL: conntrack -I returned an error" - ret=1 - break - fi - - if [ $((i%1000)) -eq 0 ];then - stop=$(date +%s%3N) - - local duration=$((stop-start)) - echo "PASS: added 1000 entries in $duration ms (now $i total)" - start=$stop - fi - done - - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) - - if [ $count -eq $max_zones ]; then - echo "PASS: inserted $count entries via ctnetlink in $duration ms" - else - ip netns exec $ns conntrack -S 1>&2 - echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" - ret=1 - fi -} - -test_zones $zones - -if [ $have_ct_tool -eq 1 ];then - test_conntrack_tool $zones -else - echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" - if [ $ret -eq 0 ];then - exit $ksft_skip - fi -fi - -exit $ret diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/netfilter/rpath.sh deleted file mode 100755 index 5289c8447a41..000000000000 --- a/tools/testing/selftests/netfilter/rpath.sh +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# return code to signal skipped test -ksft_skip=4 - -# search for legacy iptables (it uses the xtables extensions -if iptables-legacy --version >/dev/null 2>&1; then - iptables='iptables-legacy' -elif iptables --version >/dev/null 2>&1; then - iptables='iptables' -else - iptables='' -fi - -if ip6tables-legacy --version >/dev/null 2>&1; then - ip6tables='ip6tables-legacy' -elif ip6tables --version >/dev/null 2>&1; then - ip6tables='ip6tables' -else - ip6tables='' -fi - -if nft --version >/dev/null 2>&1; then - nft='nft' -else - nft='' -fi - -if [ -z "$iptables$ip6tables$nft" ]; then - echo "SKIP: Test needs iptables, ip6tables or nft" - exit $ksft_skip -fi - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -trap "ip netns del $ns1; ip netns del $ns2" EXIT - -# create two netns, disable rp_filter in ns2 and -# keep IPv6 address when moving into VRF -ip netns add "$ns1" -ip netns add "$ns2" -ip netns exec "$ns2" sysctl -q net.ipv4.conf.all.rp_filter=0 -ip netns exec "$ns2" sysctl -q net.ipv4.conf.default.rp_filter=0 -ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1 - -# a standard connection between the netns, should not trigger rp filter -ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2" -ip -net "$ns1" link set v0 up; ip -net "$ns2" link set v0 up -ip -net "$ns1" a a 192.168.23.2/24 dev v0 -ip -net "$ns2" a a 192.168.23.1/24 dev v0 -ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad -ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad - -# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0 -ip -net "$ns2" link add d0 type dummy -ip -net "$ns2" link set d0 up -ip -net "$ns1" a a 192.168.42.2/24 dev v0 -ip -net "$ns2" a a 192.168.42.1/24 dev d0 -ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad -ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad - -# firewall matches to test -[ -n "$iptables" ] && { - common='-t raw -A PREROUTING -s 192.168.0.0/16' - ip netns exec "$ns2" "$iptables" $common -m rpfilter - ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert -} -[ -n "$ip6tables" ] && { - common='-t raw -A PREROUTING -s fec0::/16' - ip netns exec "$ns2" "$ip6tables" $common -m rpfilter - ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert -} -[ -n "$nft" ] && ip netns exec "$ns2" $nft -f - </dev/null -} - -clear_counters() { - [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z - [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z - if [ -n "$nft" ]; then - ( - echo "delete table inet t"; - ip netns exec "$ns2" $nft -s list table inet t; - ) | ip netns exec "$ns2" $nft -f - - fi -} - -testrun() { - clear_counters - - # test 1: martian traffic should fail rpfilter matches - netns_ping "$ns1" -I v0 192.168.42.1 && \ - die "martian ping 192.168.42.1 succeeded" - netns_ping "$ns1" -I v0 fec0:42::1 && \ - die "martian ping fec0:42::1 succeeded" - - ipt_zero_rule "$iptables" || die "iptables matched martian" - ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" - ipt_zero_reverse_rule "$iptables" && die "iptables not matched martian" - ipt_zero_reverse_rule "$ip6tables" && die "ip6tables not matched martian" - nft_zero_rule ip || die "nft IPv4 matched martian" - nft_zero_rule ip6 || die "nft IPv6 matched martian" - - clear_counters - - # test 2: rpfilter match should pass for regular traffic - netns_ping "$ns1" 192.168.23.1 || \ - die "regular ping 192.168.23.1 failed" - netns_ping "$ns1" fec0:23::1 || \ - die "regular ping fec0:23::1 failed" - - ipt_zero_rule "$iptables" && die "iptables match not effective" - ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" - ipt_zero_reverse_rule "$iptables" || die "iptables match over-effective" - ipt_zero_reverse_rule "$ip6tables" || die "ip6tables match over-effective" - nft_zero_rule ip && die "nft IPv4 match not effective" - nft_zero_rule ip6 && die "nft IPv6 match not effective" - -} - -testrun - -# repeat test with vrf device in $ns2 -ip -net "$ns2" link add vrf0 type vrf table 10 -ip -net "$ns2" link set vrf0 up -ip -net "$ns2" link set v0 master vrf0 - -testrun - -echo "PASS: netfilter reverse path match works as intended" -exit 0 diff --git a/tools/testing/selftests/netfilter/sctp_collision.c b/tools/testing/selftests/netfilter/sctp_collision.c deleted file mode 100644 index 21bb1cfd8a85..000000000000 --- a/tools/testing/selftests/netfilter/sctp_collision.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include - -int main(int argc, char *argv[]) -{ - struct sockaddr_in saddr = {}, daddr = {}; - int sd, ret, len = sizeof(daddr); - struct timeval tv = {25, 0}; - char buf[] = "hello"; - - if (argc != 6 || (strcmp(argv[1], "server") && strcmp(argv[1], "client"))) { - printf("%s \n", - argv[0]); - return -1; - } - - sd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP); - if (sd < 0) { - printf("Failed to create sd\n"); - return -1; - } - - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = inet_addr(argv[2]); - saddr.sin_port = htons(atoi(argv[3])); - - ret = bind(sd, (struct sockaddr *)&saddr, sizeof(saddr)); - if (ret < 0) { - printf("Failed to bind to address\n"); - goto out; - } - - ret = listen(sd, 5); - if (ret < 0) { - printf("Failed to listen on port\n"); - goto out; - } - - daddr.sin_family = AF_INET; - daddr.sin_addr.s_addr = inet_addr(argv[4]); - daddr.sin_port = htons(atoi(argv[5])); - - /* make test shorter than 25s */ - ret = setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); - if (ret < 0) { - printf("Failed to setsockopt SO_RCVTIMEO\n"); - goto out; - } - - if (!strcmp(argv[1], "server")) { - sleep(1); /* wait a bit for client's INIT */ - ret = connect(sd, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to connect to peer\n"); - goto out; - } - ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); - if (ret < 0) { - printf("Failed to recv msg %d\n", ret); - goto out; - } - ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to send msg %d\n", ret); - goto out; - } - printf("Server: sent! %d\n", ret); - } - - if (!strcmp(argv[1], "client")) { - usleep(300000); /* wait a bit for server's listening */ - ret = connect(sd, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to connect to peer\n"); - goto out; - } - sleep(1); /* wait a bit for server's delayed INIT_ACK to reproduce the issue */ - ret = sendto(sd, buf, strlen(buf) + 1, 0, (struct sockaddr *)&daddr, len); - if (ret < 0) { - printf("Failed to send msg %d\n", ret); - goto out; - } - ret = recvfrom(sd, buf, sizeof(buf), 0, (struct sockaddr *)&daddr, &len); - if (ret < 0) { - printf("Failed to recv msg %d\n", ret); - goto out; - } - printf("Client: rcvd! %d\n", ret); - } - ret = 0; -out: - close(sd); - return ret; -} diff --git a/tools/testing/selftests/netfilter/settings b/tools/testing/selftests/netfilter/settings deleted file mode 100644 index 6091b45d226b..000000000000 --- a/tools/testing/selftests/netfilter/settings +++ /dev/null @@ -1 +0,0 @@ -timeout=120 diff --git a/tools/testing/selftests/netfilter/xt_string.sh b/tools/testing/selftests/netfilter/xt_string.sh deleted file mode 100755 index 1802653a4728..000000000000 --- a/tools/testing/selftests/netfilter/xt_string.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# return code to signal skipped test -ksft_skip=4 -rc=0 - -if ! iptables --version >/dev/null 2>&1; then - echo "SKIP: Test needs iptables" - exit $ksft_skip -fi -if ! ip -V >/dev/null 2>&1; then - echo "SKIP: Test needs iproute2" - exit $ksft_skip -fi -if ! nc -h >/dev/null 2>&1; then - echo "SKIP: Test needs netcat" - exit $ksft_skip -fi - -pattern="foo bar baz" -patlen=11 -hdrlen=$((20 + 8)) # IPv4 + UDP -ns="ns-$(mktemp -u XXXXXXXX)" -trap 'ip netns del $ns' EXIT -ip netns add "$ns" -ip -net "$ns" link add d0 type dummy -ip -net "$ns" link set d0 up -ip -net "$ns" addr add 10.1.2.1/24 dev d0 - -#ip netns exec "$ns" tcpdump -npXi d0 & -#tcpdump_pid=$! -#trap 'kill $tcpdump_pid; ip netns del $ns' EXIT - -add_rule() { # (alg, from, to) - ip netns exec "$ns" \ - iptables -A OUTPUT -o d0 -m string \ - --string "$pattern" --algo $1 --from $2 --to $3 -} -showrules() { # () - ip netns exec "$ns" iptables -v -S OUTPUT | grep '^-A' -} -zerorules() { - ip netns exec "$ns" iptables -Z OUTPUT -} -countrule() { # (pattern) - showrules | grep -c -- "$*" -} -send() { # (offset) - ( for ((i = 0; i < $1 - $hdrlen; i++)); do - printf " " - done - printf "$pattern" - ) | ip netns exec "$ns" nc -w 1 -u 10.1.2.2 27374 -} - -add_rule bm 1000 1500 -add_rule bm 1400 1600 -add_rule kmp 1000 1500 -add_rule kmp 1400 1600 - -zerorules -send 0 -send $((1000 - $patlen)) -if [ $(countrule -c 0 0) -ne 4 ]; then - echo "FAIL: rules match data before --from" - showrules - ((rc--)) -fi - -zerorules -send 1000 -send $((1400 - $patlen)) -if [ $(countrule -c 2) -ne 2 ]; then - echo "FAIL: only two rules should match at low offset" - showrules - ((rc--)) -fi - -zerorules -send $((1500 - $patlen)) -if [ $(countrule -c 1) -ne 4 ]; then - echo "FAIL: all rules should match at end of packet" - showrules - ((rc--)) -fi - -zerorules -send 1495 -if [ $(countrule -c 1) -ne 1 ]; then - echo "FAIL: only kmp with proper --to should match pattern spanning fragments" - showrules - ((rc--)) -fi - -zerorules -send 1500 -if [ $(countrule -c 1) -ne 2 ]; then - echo "FAIL: two rules should match pattern at start of second fragment" - showrules - ((rc--)) -fi - -zerorules -send $((1600 - $patlen)) -if [ $(countrule -c 1) -ne 2 ]; then - echo "FAIL: two rules should match pattern at end of largest --to" - showrules - ((rc--)) -fi - -zerorules -send $((1600 - $patlen + 1)) -if [ $(countrule -c 1) -ne 0 ]; then - echo "FAIL: no rules should match pattern extending largest --to" - showrules - ((rc--)) -fi - -zerorules -send 1600 -if [ $(countrule -c 1) -ne 0 ]; then - echo "FAIL: no rule should match pattern past largest --to" - showrules - ((rc--)) -fi - -exit $rc -- cgit v1.2.3 From 94831b130dedc0498e08f402d44e89bd1b9386b5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:07 +0200 Subject: selftests: netfilter: bridge_brouter.sh: move to lib.sh infra Doing so gets us dynamically generated netns names. Also: * do not assume rp_filter is disabled, if its on script failed * reduce timeout (-W) for "expected to fail" ping commands * don't print PASS line for basic sanity ping * shellcheck cleanups Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-3-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/bridge_brouter.sh | 128 +++++++++------------ 1 file changed, 52 insertions(+), 76 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/bridge_brouter.sh b/tools/testing/selftests/net/netfilter/bridge_brouter.sh index 29f3955b9af7..2549b6590693 100755 --- a/tools/testing/selftests/net/netfilter/bridge_brouter.sh +++ b/tools/testing/selftests/net/netfilter/bridge_brouter.sh @@ -5,142 +5,118 @@ # part of a bridge. # eth0 br0 eth0 -# setup is: ns1 <-> ns0 <-> ns2 +# setup is: ns1 <-> nsbr <-> ns2 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 +source lib.sh -ebtables -V > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ebtables -V > /dev/null 2>&1;then echo "SKIP: Could not run test without ebtables" exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi +cleanup() { + cleanup_all_ns +} -ip netns add ns0 -ip netns add ns1 -ip netns add ns2 +trap cleanup EXIT -ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 -if [ $? -ne 0 ]; then +setup_ns nsbr ns1 ns2 + +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$nsbr" sysctl -q net.ipv4.conf.all.rp_filter=0 +if ! ip link add veth0 netns "$nsbr" type veth peer name eth0 netns "$ns1"; then echo "SKIP: Can't create veth device" exit $ksft_skip fi -ip link add veth1 netns ns0 type veth peer name eth0 netns ns2 - -ip -net ns0 link set lo up -ip -net ns0 link set veth0 up -ip -net ns0 link set veth1 up +ip link add veth1 netns "$nsbr" type veth peer name eth0 netns "$ns2" -ip -net ns0 link add br0 type bridge -if [ $? -ne 0 ]; then +if ! ip -net "$nsbr" link add br0 type bridge; then echo "SKIP: Can't create bridge br0" exit $ksft_skip fi -ip -net ns0 link set veth0 master br0 -ip -net ns0 link set veth1 master br0 -ip -net ns0 link set br0 up -ip -net ns0 addr add 10.0.0.1/24 dev br0 +ip -net "$nsbr" link set veth0 up +ip -net "$nsbr" link set veth1 up + +ip -net "$nsbr" link set veth0 master br0 +ip -net "$nsbr" link set veth1 master br0 +ip -net "$nsbr" link set br0 up +ip -net "$nsbr" addr add 10.0.0.1/24 dev br0 -# place both in same subnet, ns1 and ns2 connected via ns0:br0 -for i in 1 2; do - ip -net ns$i link set lo up - ip -net ns$i link set eth0 up - ip -net ns$i addr add 10.0.0.1$i/24 dev eth0 -done +# place both in same subnet, ${ns1} and ${ns2} connected via ${nsbr}:br0 +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up +ip -net "$ns1" addr add 10.0.0.11/24 dev eth0 +ip -net "$ns2" addr add 10.0.0.12/24 dev eth0 test_ebtables_broute() { - local cipt - # redirect is needed so the dstmac is rewritten to the bridge itself, # ip stack won't process OTHERHOST (foreign unicast mac) packets. - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - if [ $? -ne 0 ]; then + if ! ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP; then echo "SKIP: Could not add ebtables broute redirect rule" return $ksft_skip fi - # ping netns1, expected to not work (ip forwarding is off) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=0 + + # ping net${ns1}, expected to not work (ip forwarding is off) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then echo "ERROR: ping works, should have failed" 1>&2 return 1 fi # enable forwarding on both interfaces. # neither needs an ip address, but at least the bridge needs - # an ip address in same network segment as ns1 and ns2 (ns0 + # an ip address in same network segment as ${ns1} and ${ns2} (${nsbr} # needs to be able to determine route for to-be-forwarded packet). - ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1 - ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1 - - sleep 1 + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=1 + ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth1.forwarding=1 - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2 return 1 fi - echo "PASS: ns1/ns2 connectivity with active broute rule" - ip netns exec ns0 ebtables -t broute -F + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule" + ip netns exec "$nsbr" ebtables -t broute -F - # ping netns1, expected to work (frames are bridged) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null - if [ $? -ne 0 ]; then + # ping net${ns1}, expected to work (frames are bridged) + if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then echo "ERROR: ping did not work, but it should (bridged)" 1>&2 return 1 fi - ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP + ip netns exec "$nsbr" ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP - # ping netns1, expected to not work (DROP in bridge forward) - ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1 - if [ $? -eq 0 ]; then + # ping net${ns1}, expected to not work (DROP in bridge forward) + if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2 return 1 fi # re-activate brouter - ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP + ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP - ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.0.11 > /dev/null; then echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2 return 1 fi - echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop" + echo "PASS: ${ns1}/${ns2} connectivity with active broute rule and bridge forward drop" return 0 } # test basic connectivity -ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns2 from ns1" 1>&2 - ret=1 +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.0.12 > /dev/null; then + echo "ERROR: Could not reach ${ns2} from ${ns1}" 1>&2 + exit 1 fi -ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null -if [ $? -ne 0 ]; then - echo "ERROR: Could not reach ns1 from ns2" 1>&2 - ret=1 -fi - -if [ $ret -eq 0 ];then - echo "PASS: netns connectivity: ns1 and ns2 can reach each other" +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.0.11 > /dev/null; then + echo "ERROR: Could not reach ${ns1} from ${ns2}" 1>&2 + exit 1 fi test_ebtables_broute -ret=$? -for i in 0 1 2; do ip netns del ns$i;done - -exit $ret +exit $? -- cgit v1.2.3 From 1286e106dd6f7df20f6cc2372198235accab865d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:08 +0200 Subject: selftests: netfilter: br_netfilter.sh: move to lib.sh infra Also, fix two issues reported by Pablo Neira: 1. Must modprobe br_netfilter in case its not loaded, else sysctl cannot be set. 2. ping for netns4 fails if rp_filter is enabled in bridge netns, so set all and default to 0. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-4-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/br_netfilter.sh | 131 +++++++++------------ 1 file changed, 55 insertions(+), 76 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh index 659b3ab02c8b..ea3afd6d401f 100755 --- a/tools/testing/selftests/net/netfilter/br_netfilter.sh +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -1,55 +1,40 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 # -# Test bridge netfilter + conntrack, a combination that doesn't really work, -# with multicast/broadcast packets racing for hash table insertion. +# Test for legacy br_netfilter module combined with connection tracking, +# a combination that doesn't really work. +# Multicast/broadcast packets race for hash table insertion. # eth0 br0 eth0 # setup is: ns1 <->,ns0 <-> ns3 # ns2 <-' `'-> ns4 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" -ns3="ns3-$sfx" -ns4="ns4-$sfx" +source lib.sh -ebtables -V > /dev/null 2>&1 +nft --version > /dev/null 2>&1 if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ebtables" + echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -for i in $(seq 0 4); do - eval ip netns add \$ns$i -done - cleanup() { - for i in $(seq 0 4); do eval ip netns del \$ns$i;done + cleanup_all_ns } trap cleanup EXIT +setup_ns ns0 ns1 ns2 ns3 ns4 + +ret=0 + do_ping() { fromns="$1" dstip="$2" - ip netns exec $fromns ping -c 1 -q $dstip > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$fromns" ping -c 1 -q "$dstip" > /dev/null; then echo "ERROR: ping from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset + ip netns exec "$ns0" nft list ruleset ret=1 fi } @@ -59,75 +44,75 @@ bcast_ping() fromns="$1" dstip="$2" - for i in $(seq 1 1000); do - ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1 - if [ $? -ne 0 ]; then + for i in $(seq 1 500); do + if ! ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then echo "ERROR: ping -b from $fromns to $dstip" - ip netns exec ${ns0} nft list ruleset - fi + ip netns exec "$ns0" nft list ruleset + ret=1 + break + fi done } -ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1} -if [ $? -ne 0 ]; then +ip netns exec "$ns0" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q net.ipv4.conf.default.rp_filter=0 + +if ! ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns1"; then echo "SKIP: Can't create veth device" exit $ksft_skip fi -ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2 -ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3 -ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4 - -ip -net ${ns0} link set lo up +ip link add veth2 netns "$ns0" type veth peer name eth0 netns "$ns2" +ip link add veth3 netns "$ns0" type veth peer name eth0 netns "$ns3" +ip link add veth4 netns "$ns0" type veth peer name eth0 netns "$ns4" for i in $(seq 1 4); do - ip -net ${ns0} link set veth$i up + ip -net "$ns0" link set "veth$i" up done -ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1 -if [ $? -ne 0 ]; then +if ! ip -net "$ns0" link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1; then echo "SKIP: Can't create bridge br0" exit $ksft_skip fi # make veth0,1,2 part of bridge. for i in $(seq 1 3); do - ip -net ${ns0} link set veth$i master br0 + ip -net "$ns0" link set "veth$i" master br0 done # add a macvlan on top of the bridge. MACVLAN_ADDR=ba:f3:13:37:42:23 -ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private -ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan0 up -ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0 +ip -net "$ns0" link add link br0 name macvlan0 type macvlan mode private +ip -net "$ns0" link set macvlan0 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan0 up +ip -net "$ns0" addr add 10.23.0.1/24 dev macvlan0 # add a macvlan on top of veth4. MACVLAN_ADDR=ba:f3:13:37:42:24 -ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa -ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR} -ip -net ${ns0} link set macvlan4 up +ip -net "$ns0" link add link veth4 name macvlan4 type macvlan mode passthru +ip -net "$ns0" link set macvlan4 address ${MACVLAN_ADDR} +ip -net "$ns0" link set macvlan4 up # make the macvlan part of the bridge. # veth4 is not a bridge port, only the macvlan on top of it. -ip -net ${ns0} link set macvlan4 master br0 +ip -net "$ns0" link set macvlan4 master br0 -ip -net ${ns0} link set br0 up -ip -net ${ns0} addr add 10.0.0.1/24 dev br0 -ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1 -ret=$? -if [ $ret -ne 0 ] ; then +ip -net "$ns0" link set br0 up +ip -net "$ns0" addr add 10.0.0.1/24 dev br0 + +modprobe -q br_netfilter +if ! ip netns exec "$ns0" sysctl -q net.bridge.bridge-nf-call-iptables=1; then echo "SKIP: bridge netfilter not available" ret=$ksft_skip fi # for testing, so namespaces will reply to ping -b probes. -ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 +ip netns exec "$ns0" sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0 # enable conntrack in ns0 and drop broadcast packets in forward to # avoid them from getting confirmed in the postrouting hook before # the cloned skb is passed up the stack. -ip netns exec ${ns0} nft -f - < Date: Fri, 12 Apr 2024 01:36:09 +0200 Subject: selftests: netfilter: conntrack_icmp_related.sh: move to lib.sh infra Only relevant change is that netns names have random suffix names, i.e. its safe to run this in parallel with other tests. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-5-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../net/netfilter/conntrack_icmp_related.sh | 179 ++++++++------------- 1 file changed, 71 insertions(+), 108 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh index 76645aaf2b58..c63d840ead61 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh @@ -14,35 +14,32 @@ # check the icmp errors are propagated to the correct host as per # nat of "established" icmp-echo "connection". -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 +source lib.sh -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1;then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - cleanup() { - for i in 1 2;do ip netns del nsclient$i;done - for i in 1 2;do ip netns del nsrouter$i;done + cleanup_all_ns } trap cleanup EXIT -ipv4() { - echo -n 192.168.$1.2 -} +setup_ns nsclient1 nsclient2 nsrouter1 nsrouter2 + +ret=0 + +add_addr() +{ + ns=$1 + dev=$2 + i=$3 -ipv6 () { - echo -n dead:$1::2 + ip -net "$ns" link set "$dev" up + ip -net "$ns" addr add "192.168.$i.2/24" dev "$dev" + ip -net "$ns" addr add "dead:$i::2/64" dev "$dev" nodad } check_counter() @@ -52,10 +49,9 @@ check_counter() expect=$3 local lret=0 - cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns nft list counter inet filter "$name" 1>&2 + ip netns exec "$ns" nft list counter inet filter "$name" 1>&2 lret=1 fi @@ -65,9 +61,8 @@ check_counter() check_unknown() { expect="packets 0 bytes 0" - for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - check_counter $n "unknown" "$expect" - if [ $? -ne 0 ] ;then + for n in ${nsclient1} ${nsclient2} ${nsrouter1} ${nsrouter2}; do + if ! check_counter "$n" "unknown" "$expect"; then return 1 fi done @@ -75,61 +70,48 @@ check_unknown() return 0 } -for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do - ip netns add $n - ip -net $n link set lo up -done - -DEV=veth0 -ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1 DEV=veth0 -ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2 +ip link add "$DEV" netns "$nsclient1" type veth peer name eth1 netns "$nsrouter1" +ip link add "$DEV" netns "$nsclient2" type veth peer name eth1 netns "$nsrouter2" +ip link add "$DEV" netns "$nsrouter1" type veth peer name eth2 netns "$nsrouter2" -DEV=veth0 -ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2 +add_addr "$nsclient1" $DEV 1 +add_addr "$nsclient2" $DEV 2 -DEV=veth0 -for i in 1 2; do - ip -net nsclient$i link set $DEV up - ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV - ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV -done - -ip -net nsrouter1 link set eth1 up -ip -net nsrouter1 link set veth0 up +ip -net "$nsrouter1" link set eth1 up +ip -net "$nsrouter1" link set $DEV up -ip -net nsrouter2 link set eth1 up -ip -net nsrouter2 link set eth2 up +ip -net "$nsrouter2" link set eth1 mtu 1280 up +ip -net "$nsrouter2" link set eth2 up -ip -net nsclient1 route add default via 192.168.1.1 -ip -net nsclient1 -6 route add default via dead:1::1 +ip -net "$nsclient1" route add default via 192.168.1.1 +ip -net "$nsclient1" -6 route add default via dead:1::1 -ip -net nsclient2 route add default via 192.168.2.1 -ip -net nsclient2 route add default via dead:2::1 +ip -net "$nsclient2" route add default via 192.168.2.1 +ip -net "$nsclient2" route add default via dead:2::1 +ip -net "$nsclient2" link set veth0 mtu 1280 -i=3 -ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1 -ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0 -ip -net nsrouter1 addr add dead:1::1/64 dev eth1 -ip -net nsrouter1 addr add dead:3::1/64 dev veth0 -ip -net nsrouter1 route add default via 192.168.3.10 -ip -net nsrouter1 -6 route add default via dead:3::10 +ip -net "$nsrouter1" addr add 192.168.1.1/24 dev eth1 +ip -net "$nsrouter1" addr add 192.168.3.1/24 dev veth0 +ip -net "$nsrouter1" addr add dead:1::1/64 dev eth1 nodad +ip -net "$nsrouter1" addr add dead:3::1/64 dev veth0 nodad +ip -net "$nsrouter1" route add default via 192.168.3.10 +ip -net "$nsrouter1" -6 route add default via dead:3::10 -ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1 -ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2 -ip -net nsrouter2 addr add dead:2::1/64 dev eth1 -ip -net nsrouter2 addr add dead:3::10/64 dev eth2 -ip -net nsrouter2 route add default via 192.168.3.1 -ip -net nsrouter2 route add default via dead:3::1 +ip -net "$nsrouter2" addr add 192.168.2.1/24 dev eth1 +ip -net "$nsrouter2" addr add 192.168.3.10/24 dev eth2 +ip -net "$nsrouter2" addr add dead:2::1/64 dev eth1 nodad +ip -net "$nsrouter2" addr add dead:3::10/64 dev eth2 nodad +ip -net "$nsrouter2" route add default via 192.168.3.1 +ip -net "$nsrouter2" route add default via dead:3::1 -sleep 2 for i in 4 6; do - ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1 - ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec "$nsrouter1" sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec "$nsrouter2" sysctl -q net.ipv$i.conf.all.forwarding=1 done -for netns in nsrouter1 nsrouter2; do -ip netns exec $netns nft -f - </dev/null -if [ $? -ne 0 ]; then +if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q -M "do" 192.168.2.2 >/dev/null; then echo "ERROR: netns ip routing/connectivity broken" 1>&2 - cleanup exit 1 fi -ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null -if [ $? -ne 0 ]; then +if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q dead:2::2 >/dev/null; then echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 - cleanup exit 1 fi -check_unknown -if [ $? -ne 0 ]; then +if ! check_unknown; then ret=1 fi expect="packets 0 bytes 0" -for netns in nsrouter1 nsrouter2 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in "$nsrouter1" "$nsrouter2" "$nsclient1";do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done expect="packets 2 bytes 2076" -check_counter nsclient2 "new" "$expect" -if [ $? -ne 0 ]; then +if ! check_counter "$nsclient2" "new" "$expect"; then ret=1 fi -ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null -if [ $? -eq 0 ]; then +if ip netns exec "$nsclient1" ping -W 0.5 -q -c 1 -s 1300 -M "do" 192.168.2.2 > /dev/null; then echo "ERROR: ping should have failed with PMTU too big error" 1>&2 ret=1 fi @@ -253,30 +222,26 @@ fi # nsrouter2 should have generated the icmp error, so # related counter should be 0 (its in forward). expect="packets 0 bytes 0" -check_counter "nsrouter2" "related" "$expect" -if [ $? -ne 0 ]; then +if ! check_counter "$nsrouter2" "related" "$expect"; then ret=1 fi # but nsrouter1 should have seen it, same for nsclient1. expect="packets 1 bytes 576" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in ${nsrouter1} ${nsclient1};do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done -ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null -if [ $? -eq 0 ]; then +if ip netns exec "${nsclient1}" ping6 -W 0.5 -c 1 -s 1300 dead:2::2 > /dev/null; then echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 ret=1 fi expect="packets 2 bytes 1856" -for netns in nsrouter1 nsclient1;do - check_counter "$netns" "related" "$expect" - if [ $? -ne 0 ]; then +for netns in "${nsrouter1}" "${nsclient1}";do + if ! check_counter "$netns" "related" "$expect"; then ret=1 fi done @@ -288,21 +253,19 @@ else fi # add 'bad' route, expect icmp REDIRECT to be generated -ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1 -ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1 +ip netns exec "${nsclient1}" ip route add 192.168.1.42 via 192.168.1.1 +ip netns exec "${nsclient1}" ip route add dead:1::42 via dead:1::1 -ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null +ip netns exec "$nsclient1" ping -W 1 -q -i 0.5 -c 2 192.168.1.42 > /dev/null expect="packets 1 bytes 112" -check_counter nsclient1 "redir4" "$expect" -if [ $? -ne 0 ];then +if ! check_counter "$nsclient1" "redir4" "$expect"; then ret=1 fi -ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null +ip netns exec "$nsclient1" ping -W 1 -c 1 dead:1::42 > /dev/null expect="packets 1 bytes 192" -check_counter nsclient1 "redir6" "$expect" -if [ $? -ne 0 ];then +if ! check_counter "$nsclient1" "redir6" "$expect"; then ret=1 fi -- cgit v1.2.3 From 6f864d391b299bdc2154162b50c39c04414dc38c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:10 +0200 Subject: selftests: netfilter: conntrack_tcp_unreplied.sh: move to lib.sh infra Replace nc with socat. Too many different implementations of nc are around with incompatible options ("nc: cannot use -p and -l"). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-6-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../net/netfilter/conntrack_tcp_unreplied.sh | 124 +++++++++------------ 1 file changed, 55 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh index e7d7bf13cff5..1f862c089028 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh @@ -4,37 +4,29 @@ # Check that UNREPLIED tcp conntrack will eventually timeout. # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -waittime=20 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" +source lib.sh -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1;then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" +if ! conntrack --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without conntrack tool" exit $ksft_skip fi +ret=0 + cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null - ip netns del $ns1 - ip netns del $ns2 + cleanup_all_ns } ipv4() { - echo -n 192.168.$1.2 + echo -n 192.168."$1".2 } check_counter() @@ -44,51 +36,53 @@ check_counter() expect=$3 local lret=0 - cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "$name" | grep -q "$expect"; then echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2 - ip netns exec $ns2 nft list counter inet filter "$name" 1>&2 + ip netns exec "$ns2" nft list counter inet filter "$name" 1>&2 lret=1 fi return $lret } -# Create test namespaces -ip netns add $ns1 || exit 1 - trap cleanup EXIT -ip netns add $ns2 || exit 1 +# Create test namespaces +setup_ns ns1 ns2 # Connect the namespace to the host using a veth pair -ip -net $ns1 link add name veth1 type veth peer name veth2 -ip -net $ns1 link set netns $ns2 dev veth2 +ip -net "$ns1" link add name veth1 type veth peer name veth2 +ip -net "$ns1" link set netns "$ns2" dev veth2 -ip -net $ns1 link set up dev lo -ip -net $ns2 link set up dev lo -ip -net $ns1 link set up dev veth1 -ip -net $ns2 link set up dev veth2 +ip -net "$ns1" link set up dev lo +ip -net "$ns2" link set up dev lo +ip -net "$ns1" link set up dev veth1 +ip -net "$ns2" link set up dev veth2 -ip -net $ns2 addr add 10.11.11.2/24 dev veth2 -ip -net $ns2 route add default via 10.11.11.1 +ip -net "$ns2" addr add 10.11.11.2/24 dev veth2 +ip -net "$ns2" route add default via 10.11.11.1 -ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1 +ip netns exec "$ns2" sysctl -q net.ipv4.conf.veth2.forwarding=1 # add a rule inside NS so we enable conntrack -ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT +ip netns exec "$ns1" nft -f - </dev/null || exit 1 - -ip netns exec $ns2 nc -l -p 8080 < /dev/null & +ip netns exec "$ns1" ping -q -c 2 10.11.11.2 >/dev/null || exit 1 -# however, conntrack entries are there +ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT & -ip netns exec $ns2 nft -f - < $ns2 to the virtual ip" -ip netns exec $ns1 bash -c 'while true ; do - nc -p 60000 10.99.99.99 80 - sleep 1 +ip netns exec "$ns1" bash -c 'for i in $(seq 1 $BUSYWAIT_TIMEOUT) ; do + socat -u STDIN TCP:10.99.99.99:80 < /dev/null + sleep 0.1 done' & -sleep 1 - -ip netns exec $ns2 nft -f - </dev/null | wc -l) -if [ $count -eq 0 ]; then +count=$(ip netns exec "$ns2" conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) +if [ "$count" -eq 0 ]; then echo "ERROR: $ns2 did not pick up tcp connection from peer" exit 1 fi -echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect" -for i in $(seq 1 $waittime); do - echo -n "." - - sleep 1 - - count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) - if [ $count -gt 0 ]; then - echo - echo "PASS: redirection took effect after $i seconds" - break +wait_for_redirect() +{ + count=$(ip netns exec "$ns2" conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) + if [ "$count" -gt 0 ]; then + return 0 fi - m=$((i%20)) - if [ $m -eq 0 ]; then - echo " waited for $i seconds" - fi -done + return 1 +} +echo "INFO: NAT redirect added in ns $ns2, waiting for $BUSYWAIT_TIMEOUT ms for nat to take effect" + +busywait $BUSYWAIT_TIMEOUT wait_for_redirect +ret=$? expect="packets 1 bytes 60" -check_counter "$ns2" "redir" "$expect" -if [ $? -ne 0 ]; then +if ! check_counter "$ns2" "redir" "$expect"; then ret=1 fi -- cgit v1.2.3 From 9785517a22456002b5ee8a9e8c74ee7348016bb6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:11 +0200 Subject: selftests: netfilter: conntrack_sctp_collision.sh: move to lib.sh infra While at it, address warnings generated by shellcheck and fix following minor issues: - some distros place netem in 'extra' modules package, so add a skip check for netem-attach failure. - tc prints a warning for the 100mbit class: "Warning: sch_htb: quantum of class 10001 is big. Consider r2q change." Silence this by increasing the divisor. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-7-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../net/netfilter/conntrack_sctp_collision.sh | 66 +++++++++++----------- 1 file changed, 32 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh index a924e595cfd8..d860f7d9744b 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh @@ -12,69 +12,67 @@ # # TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS -CLIENT_NS=$(mktemp -u client-XXXXXXXX) +source lib.sh + CLIENT_IP="198.51.200.1" CLIENT_PORT=1234 -SERVER_NS=$(mktemp -u server-XXXXXXXX) SERVER_IP="198.51.100.1" SERVER_PORT=1234 -ROUTER_NS=$(mktemp -u router-XXXXXXXX) CLIENT_GW="198.51.200.2" SERVER_GW="198.51.100.2" # setup the topo setup() { - ip net add $CLIENT_NS - ip net add $SERVER_NS - ip net add $ROUTER_NS - ip -n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS - ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS + setup_ns CLIENT_NS SERVER_NS ROUTER_NS + ip -n "$SERVER_NS" link add link0 type veth peer name link1 netns "$ROUTER_NS" + ip -n "$CLIENT_NS" link add link3 type veth peer name link2 netns "$ROUTER_NS" - ip -n $SERVER_NS link set link0 up - ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0 - ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW + ip -n "$SERVER_NS" link set link0 up + ip -n "$SERVER_NS" addr add $SERVER_IP/24 dev link0 + ip -n "$SERVER_NS" route add $CLIENT_IP dev link0 via $SERVER_GW - ip -n $ROUTER_NS link set link1 up - ip -n $ROUTER_NS link set link2 up - ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1 - ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2 - ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1 + ip -n "$ROUTER_NS" link set link1 up + ip -n "$ROUTER_NS" link set link2 up + ip -n "$ROUTER_NS" addr add $SERVER_GW/24 dev link1 + ip -n "$ROUTER_NS" addr add $CLIENT_GW/24 dev link2 + ip net exec "$ROUTER_NS" sysctl -wq net.ipv4.ip_forward=1 - ip -n $CLIENT_NS link set link3 up - ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3 - ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW + ip -n "$CLIENT_NS" link set link3 up + ip -n "$CLIENT_NS" addr add $CLIENT_IP/24 dev link3 + ip -n "$CLIENT_NS" route add $SERVER_IP dev link3 via $CLIENT_GW # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with # tc on $SERVER_NS side - tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb - tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit - tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ + tc -n "$SERVER_NS" qdisc add dev link0 root handle 1: htb r2q 64 + tc -n "$SERVER_NS" class add dev link0 parent 1: classid 1:1 htb rate 100mbit + tc -n "$SERVER_NS" filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ 0xff match u8 2 0xff at 32 flowid 1:1 - tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms + if ! tc -n "$SERVER_NS" qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms; then + echo "SKIP: Cannot add netem qdisc" + exit $ksft_skip + fi # simulate the ctstate check on OVS nf_conntrack - ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP - ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP + ip net exec "$ROUTER_NS" iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP + ip net exec "$ROUTER_NS" iptables -A INPUT -p sctp -j DROP # use a smaller number for assoc's max_retrans to reproduce the issue - modprobe sctp - ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3 + modprobe -q sctp + ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3 } cleanup() { - ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null - ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null - ip net del "$CLIENT_NS" - ip net del "$SERVER_NS" - ip net del "$ROUTER_NS" + ip net exec "$CLIENT_NS" pkill sctp_collision >/dev/null 2>&1 + ip net exec "$SERVER_NS" pkill sctp_collision >/dev/null 2>&1 + cleanup_all_ns } do_test() { - ip net exec $SERVER_NS ./sctp_collision server \ + ip net exec "$SERVER_NS" ./sctp_collision server \ $SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT & - ip net exec $CLIENT_NS ./sctp_collision client \ + ip net exec "$CLIENT_NS" ./sctp_collision client \ $CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT } -- cgit v1.2.3 From 954398b4d83759244656d37b4b626e912dc0969c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:12 +0200 Subject: selftests: netfilter: conntrack_vrf.sh: move to lib.sh infra swap test for "ip" with "conntrack", former is already accounted for via setup_ns helper. Also switch to bash. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-8-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/conntrack_vrf.sh | 101 ++++++++------------- 1 file changed, 39 insertions(+), 62 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh index 8b5ea9234588..f7417004ec71 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # This script demonstrates interaction of conntrack and vrf. # The vrf driver calls the netfilter hooks again, with oif/iif @@ -28,84 +28,65 @@ # that was supposed to be fixed by the commit mentioned above to make sure # that any fix to test case 1 won't break masquerade again. -ksft_skip=4 +source lib.sh IP0=172.30.30.1 IP1=172.30.30.2 PFXL=30 ret=0 -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" - cleanup() { ip netns pids $ns0 | xargs kill 2>/dev/null ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns del $ns0 $ns1 + cleanup_all_ns } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1;then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" +if ! conntrack --version > /dev/null 2>&1;then + echo "SKIP: Could not run test without conntrack tool" exit $ksft_skip fi -ip netns add "$ns1" trap cleanup EXIT -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 -ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0 +setup_ns ns0 ns1 -ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0 + +if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: Could not add veth device" exit $ksft_skip fi -ip -net $ns0 li add tvrf type vrf table 9876 -if [ $? -ne 0 ];then +if ! ip -net "$ns0" li add tvrf type vrf table 9876; then echo "SKIP: Could not add vrf device" exit $ksft_skip fi -ip -net $ns0 li set lo up - -ip -net $ns0 li set veth0 master tvrf -ip -net $ns0 li set tvrf up -ip -net $ns0 li set veth0 up -ip -net $ns1 li set veth0 up +ip -net "$ns0" li set veth0 master tvrf +ip -net "$ns0" li set tvrf up +ip -net "$ns0" li set veth0 up +ip -net "$ns1" li set veth0 up -ip -net $ns0 addr add $IP0/$PFXL dev veth0 -ip -net $ns1 addr add $IP1/$PFXL dev veth0 +ip -net "$ns0" addr add $IP0/$PFXL dev veth0 +ip -net "$ns1" addr add $IP1/$PFXL dev veth0 -ip netns exec $ns1 iperf3 -s > /dev/null 2>&1& -if [ $? -ne 0 ];then - echo "SKIP: Could not start iperf3" - exit $ksft_skip -fi +ip netns exec "$ns1" iperf3 -s > /dev/null 2>&1 & # test vrf ingress handling. # The incoming connection should be placed in conntrack zone 1, # as decided by the first iteration of the ruleset. test_ct_zone_in() { -ip netns exec $ns0 nft -f - < /dev/null + ip netns exec "$ns1" ping -W 1 -c 1 -I veth0 "$IP0" > /dev/null # should be in zone 1, not zone 2 - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) - if [ $count -eq 1 ]; then + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l) + if [ "$count" -eq 1 ]; then echo "PASS: entry found in conntrack zone 1" else echo "FAIL: entry not found in conntrack zone 1" - count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) - if [ $count -eq 1 ]; then + count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l) + if [ "$count" -eq 1 ]; then echo "FAIL: entry found in zone 2 instead" else echo "FAIL: entry not in zone 1 or 2, dumping table" - ip netns exec $ns0 conntrack -L - ip netns exec $ns0 nft list ruleset + ip netns exec "$ns0" conntrack -L + ip netns exec "$ns0" nft list ruleset fi fi } @@ -153,12 +134,12 @@ test_masquerade_vrf() local qdisc=$1 if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc add dev tvrf root $qdisc + tc -net "$ns0" qdisc add dev tvrf root "$qdisc" fi - ip netns exec $ns0 conntrack -F 2>/dev/null + ip netns exec "$ns0" conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - </dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null; then echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' && - ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]' - if [ $? -eq 0 ]; then + ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 2' && + if ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" else echo "FAIL: vrf rules have unexpected counter value" @@ -197,7 +176,7 @@ EOF fi if [ "$qdisc" != "default" ]; then - tc -net $ns0 qdisc del dev tvrf root + tc -net "$ns0" qdisc del dev tvrf root fi } @@ -206,8 +185,8 @@ EOF # oifname is the lower device (veth0 in this case). test_masquerade_veth() { - ip netns exec $ns0 conntrack -F 2>/dev/null -ip netns exec $ns0 nft -f - </dev/null +ip netns exec "$ns0" nft -f - < /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null; then echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' - if [ $? -eq 0 ]; then + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 2'; then echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device" else echo "FAIL: vrf masq rule has unexpected counter value" -- cgit v1.2.3 From 0413156eec28e45d12e70d382d1f890b7d2f1a53 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:13 +0200 Subject: selftests: netfilter: conntrack_ipip_mtu.sh" move to lib.sh infra Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-9-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/conntrack_ipip_mtu.sh | 37 ++++++++-------------- 1 file changed, 14 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh index eb9553e4986b..f87ca4c59d3b 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh @@ -1,8 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh # Conntrack needs to reassemble fragments in order to have complete # packets for rule matching. Reassembly can lead to packet loss. @@ -23,15 +22,8 @@ ksft_skip=4 # between Client A and Client B over WAN. Wanrouter has MTU 1400 set # on its interfaces. -rnd=$(mktemp -u XXXXXXXX) rx=$(mktemp) -r_a="ns-ra-$rnd" -r_b="ns-rb-$rnd" -r_w="ns-rw-$rnd" -c_a="ns-ca-$rnd" -c_b="ns-cb-$rnd" - checktool (){ if ! $1 > /dev/null 2>&1; then echo "SKIP: Could not $2" @@ -40,29 +32,31 @@ checktool (){ } checktool "iptables --version" "run test without iptables" -checktool "ip -Version" "run test without ip tool" -checktool "which socat" "run test without socat" -checktool "ip netns add ${r_a}" "create net namespace" +checktool "socat -h" "run test without socat" -for n in ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns add ${n} -done +setup_ns r_a r_b r_w c_a c_b cleanup() { - for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do - ip netns del ${n} - done + cleanup_all_ns rm -f ${rx} } trap cleanup EXIT +listener_ready() +{ + ns="$1" + port="$2" + ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port" +} + test_path() { msg="$1" ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null & - sleep 1 + busywait $BUSYWAIT_TIMEOUT listener_ready "$c_b" 5000 + for i in 1 2 3; do head -c1400 /dev/zero | tr "\000" "a" | \ ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000 @@ -129,7 +123,7 @@ r_addr="10.2.2.1" ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip -for dev in lo veth0 veth1 ipip0; do +for dev in veth0 veth1 ipip0; do ip -net ${r_b} link set $dev up done @@ -142,21 +136,18 @@ ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Client A ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 -ip -net ${c_a} link set dev lo up ip -net ${c_a} link set dev veth0 up ip -net ${c_a} route add default via 192.168.10.1 # Client A ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 ip -net ${c_b} link set dev veth0 up -ip -net ${c_b} link set dev lo up ip -net ${c_b} route add default via 192.168.20.1 # Wan ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 ip -net ${r_w} addr add 10.4.4.254/24 dev veth1 -ip -net ${r_w} link set dev lo up ip -net ${r_w} link set dev veth0 up mtu 1400 ip -net ${r_w} link set dev veth1 up mtu 1400 -- cgit v1.2.3 From 10e2ed3fcdf4834ed02beae1860ea2555b484a40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:14 +0200 Subject: selftests: netfilter: place checktool helper in lib.sh ... so it doesn't have to be repeated everywhere. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-10-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/br_netfilter.sh | 6 +----- tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh | 7 ------- tools/testing/selftests/net/netfilter/lib.sh | 7 +++++++ 3 files changed, 8 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh index ea3afd6d401f..1084faf88f0b 100755 --- a/tools/testing/selftests/net/netfilter/br_netfilter.sh +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -11,11 +11,7 @@ source lib.sh -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi +checktool "nft --version" "run test without nft tool" cleanup() { cleanup_all_ns diff --git a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh index f87ca4c59d3b..ac0dff0f80d7 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh @@ -24,13 +24,6 @@ source lib.sh rx=$(mktemp) -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - checktool "iptables --version" "run test without iptables" checktool "socat -h" "run test without socat" diff --git a/tools/testing/selftests/net/netfilter/lib.sh b/tools/testing/selftests/net/netfilter/lib.sh index eb109eb527db..bedd35298e15 100644 --- a/tools/testing/selftests/net/netfilter/lib.sh +++ b/tools/testing/selftests/net/netfilter/lib.sh @@ -1,3 +1,10 @@ net_netfilter_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") source "$net_netfilter_dir/../lib.sh" + +checktool (){ + if ! $1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} -- cgit v1.2.3 From 87ce7d79075f2eed67327512379112818c8a0303 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:15 +0200 Subject: selftests: netfilter: ipvs.sh: move to lib.sh infra The setup_ns helper makes the netns names random, so replace nsX with $nsX everywhere. Replace nc with socat, otherwise script fails on my system due to incompatible nc versions ("nc: cannot use -p and -l"). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-11-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/ipvs.sh | 153 ++++++++++++-------------- 1 file changed, 68 insertions(+), 85 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh index c3b8f90c497e..4ceee9fb3949 100755 --- a/tools/testing/selftests/net/netfilter/ipvs.sh +++ b/tools/testing/selftests/net/netfilter/ipvs.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # End-to-end ipvs test suite @@ -24,8 +24,8 @@ # We assume that all network driver are loaded # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh + ret=0 GREEN='\033[0;92m' RED='\033[0;31m' @@ -46,53 +46,39 @@ readonly datalen=32 sysipvsnet="/proc/sys/net/ipv4/vs/" if [ ! -d $sysipvsnet ]; then - modprobe -q ip_vs - if [ $? -ne 0 ]; then + if ! modprobe -q ip_vs; then echo "skip: could not run test without ipvs module" exit $ksft_skip fi fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ipvsadm -v > /dev/null 2>&1 -if [ $? -ne 0 ]; then - echo "SKIP: Could not run test without ipvsadm" - exit $ksft_skip -fi +checktool "ipvsadm -v" "run test without ipvsadm" +checktool "socat -h" "run test without socat" setup() { - ip netns add ns0 - ip netns add ns1 - ip netns add ns2 - - ip link add veth01 netns ns0 type veth peer name veth10 netns ns1 - ip link add veth02 netns ns0 type veth peer name veth20 netns ns2 - ip link add veth12 netns ns1 type veth peer name veth21 netns ns2 - - ip netns exec ns0 ip link set veth01 up - ip netns exec ns0 ip link set veth02 up - ip netns exec ns0 ip link add br0 type bridge - ip netns exec ns0 ip link set veth01 master br0 - ip netns exec ns0 ip link set veth02 master br0 - ip netns exec ns0 ip link set br0 up - ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0 - - ip netns exec ns1 ip link set lo up - ip netns exec ns1 ip link set veth10 up - ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10 - ip netns exec ns1 ip link set veth12 up - ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12 - - ip netns exec ns2 ip link set lo up - ip netns exec ns2 ip link set veth21 up - ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21 - ip netns exec ns2 ip link set veth20 up - ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20 + setup_ns ns0 ns1 ns2 + + ip link add veth01 netns "${ns0}" type veth peer name veth10 netns "${ns1}" + ip link add veth02 netns "${ns0}" type veth peer name veth20 netns "${ns2}" + ip link add veth12 netns "${ns1}" type veth peer name veth21 netns "${ns2}" + + ip netns exec "${ns0}" ip link set veth01 up + ip netns exec "${ns0}" ip link set veth02 up + ip netns exec "${ns0}" ip link add br0 type bridge + ip netns exec "${ns0}" ip link set veth01 master br0 + ip netns exec "${ns0}" ip link set veth02 master br0 + ip netns exec "${ns0}" ip link set br0 up + ip netns exec "${ns0}" ip addr add "${cip_v4}/24" dev br0 + + ip netns exec "${ns1}" ip link set veth10 up + ip netns exec "${ns1}" ip addr add "${gip_v4}/24" dev veth10 + ip netns exec "${ns1}" ip link set veth12 up + ip netns exec "${ns1}" ip addr add "${dip_v4}/24" dev veth12 + + ip netns exec "${ns2}" ip link set veth21 up + ip netns exec "${ns2}" ip addr add "${rip_v4}/24" dev veth21 + ip netns exec "${ns2}" ip link set veth20 up + ip netns exec "${ns2}" ip addr add "${sip_v4}/24" dev veth20 sleep 1 @@ -100,10 +86,7 @@ setup() { } cleanup() { - for i in 0 1 2 - do - ip netns del ns$i > /dev/null 2>&1 - done + cleanup_all_ns if [ -f "${outfile}" ]; then rm "${outfile}" @@ -114,13 +97,13 @@ cleanup() { } server_listen() { - ip netns exec ns2 nc -l -p 8080 > "${outfile}" & + ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" & server_pid=$! sleep 0.2 } client_connect() { - ip netns exec ns0 timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}" + ip netns exec "${ns0}" timeout 2 socat -u -4 STDIN TCP:"${vip_v4}":"${port}" < "${infile}" } verify_data() { @@ -136,58 +119,58 @@ test_service() { test_dr() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -t "${vip_v4}:${port}" -r "${rip_v4}:${port}" + ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1 # avoid incorrect arp response - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2 # avoid reverse route lookup - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1 test_service } test_nat() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -m -t "${vip_v4}:${port}" -r "${rip_v4}:${port}" + ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1 - ip netns exec ns2 ip link del veth20 - ip netns exec ns2 ip route add default via ${dip_v4} dev veth21 + ip netns exec "${ns2}" ip link del veth20 + ip netns exec "${ns2}" ip route add default via "${dip_v4}" dev veth21 test_service } test_tun() { - ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0 - - ip netns exec ns1 modprobe ipip - ip netns exec ns1 ip link set tunl0 up - ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0 - ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0 - ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr - ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port} - ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1 - - ip netns exec ns2 modprobe ipip - ip netns exec ns2 ip link set tunl0 up - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2 - ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 - ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0 - ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1 + ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0 + + ip netns exec "${ns1}" modprobe -q ipip + ip netns exec "${ns1}" ip link set tunl0 up + ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=0 + ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.all.send_redirects=0 + ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.default.send_redirects=0 + ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr + ip netns exec "${ns1}" ipvsadm -a -i -t "${vip_v4}:${port}" -r ${rip_v4}:${port} + ip netns exec "${ns1}" ip addr add ${vip_v4}/32 dev lo:1 + + ip netns exec "${ns2}" modprobe -q ipip + ip netns exec "${ns2}" ip link set tunl0 up + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.tunl0.rp_filter=0 + ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0 + ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1 test_service } -- cgit v1.2.3 From f51fe0256135d39a642c83785efaaa91acdef8d6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:16 +0200 Subject: selftests: netfilter: nf_nat_edemux.sh: move to lib.sh infra While at it, use checktool helper. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-12-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nf_nat_edemux.sh | 82 +++++++--------------- 1 file changed, 26 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh index a1aa8f4a5828..1014551dd769 100755 --- a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh +++ b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh @@ -4,88 +4,60 @@ # Test NAT source port clash resolution # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh ret=0 - -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" socatpid=0 cleanup() { - [ $socatpid -gt 0 ] && kill $socatpid - ip netns del $ns1 - ip netns del $ns2 -} - -socat -h > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without socat" - exit $ksft_skip -fi - -iptables --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without iptables" - exit $ksft_skip -fi + [ "$socatpid" -gt 0 ] && kill "$socatpid" -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi + cleanup_all_ns +} -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi +checktool "socat -h" "run test without socat" +checktool "iptables --version" "run test without iptables" trap cleanup EXIT -ip netns add $ns2 +setup_ns ns1 ns2 # Connect the namespaces using a veth pair ip link add name veth2 type veth peer name veth1 -ip link set netns $ns1 dev veth1 -ip link set netns $ns2 dev veth2 +ip link set netns "$ns1" dev veth1 +ip link set netns "$ns2" dev veth2 -ip netns exec $ns1 ip link set up dev lo -ip netns exec $ns1 ip link set up dev veth1 -ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 +ip netns exec "$ns1" ip link set up dev lo +ip netns exec "$ns1" ip link set up dev veth1 +ip netns exec "$ns1" ip addr add 192.168.1.1/24 dev veth1 -ip netns exec $ns2 ip link set up dev lo -ip netns exec $ns2 ip link set up dev veth2 -ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 +ip netns exec "$ns2" ip link set up dev lo +ip netns exec "$ns2" ip link set up dev veth2 +ip netns exec "$ns2" ip addr add 192.168.1.2/24 dev veth2 # Create a server in one namespace -ip netns exec $ns1 socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & +ip netns exec "$ns1" socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 & socatpid=$! # Restrict source port to just one so we don't have to exhaust # all others. -ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" +ip netns exec "$ns2" sysctl -q net.ipv4.ip_local_port_range="10000 10000" # add a virtual IP using DNAT -ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 +ip netns exec "$ns2" iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 # ... and route it to the other namespace -ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 - -sleep 1 +ip netns exec "$ns2" ip route add 10.96.0.1 via 192.168.1.1 # add a persistent connection from the other namespace -ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & +ip netns exec "$ns2" socat -t 10 - TCP:192.168.1.1:5201 > /dev/null & sleep 1 # ip daddr:dport will be rewritten to 192.168.1.1 5201 # NAT must reallocate source port 10000 because # 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use -echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null +echo test | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null ret=$? # Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). @@ -96,16 +68,14 @@ else fi # check sport clashres. -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 -ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 +ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201 +ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201 -sleep 5 | ip netns exec $ns2 socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & -cpid1=$! -sleep 1 +sleep 5 | ip netns exec "$ns2" socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null & # if connect succeeds, client closes instantly due to EOF on stdin. # if connect hangs, it will time out after 5s. -echo | ip netns exec $ns2 socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & +echo | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null & cpid2=$! time_then=$(date +%s) @@ -117,7 +87,7 @@ time_now=$(date +%s) # 'cpid2' to connect and then exit (and no connect delay). delta=$((time_now - time_then)) -if [ $delta -lt 2 -a $rv -eq 0 ]; then +if [ $delta -lt 2 ] && [ $rv -eq 0 ]; then echo "PASS: could connect to service via redirected ports" else echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)" -- cgit v1.2.3 From fa03bb7c8c0113c5d408c32eefc280feaed77acc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:17 +0200 Subject: selftests: netfilter: nft_conntrack_helper.sh: test to lib.sh infra prefer socat over nc, nc has too many incompatible versions around. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-13-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../net/netfilter/nft_conntrack_helper.sh | 132 +++++++++------------ 1 file changed, 53 insertions(+), 79 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh index faa7778d7bd1..abcaa7337197 100755 --- a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh +++ b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh @@ -5,72 +5,48 @@ # 2. auto-assign still works. # # Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 + +source lib.sh + ret=0 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" testipv6=1 +checktool "socat -h" "run test without socat" +checktool "conntrack --version" "run test without conntrack" +checktool "nft --version" "run test without nft" + cleanup() { - ip netns del ${ns1} - ip netns del ${ns2} -} - -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi + ip netns pids "$ns1" | xargs kill 2>/dev/null -which nc >/dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without netcat tool" - exit $ksft_skip -fi + ip netns del "$ns1" + ip netns del "$ns2" +} trap cleanup EXIT -ip netns add ${ns1} -ip netns add ${ns2} +setup_ns ns1 ns2 -ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ip link add veth0 netns "$ns1" type veth peer name veth0 netns "$ns2" > /dev/null 2>&1;then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi -ip -net ${ns1} link set lo up -ip -net ${ns1} link set veth0 up +ip -net "$ns1" link set veth0 up +ip -net "$ns2" link set veth0 up -ip -net ${ns2} link set lo up -ip -net ${ns2} link set veth0 up +ip -net "$ns1" addr add 10.0.1.1/24 dev veth0 +ip -net "$ns1" addr add dead:1::1/64 dev veth0 nodad -ip -net ${ns1} addr add 10.0.1.1/24 dev veth0 -ip -net ${ns1} addr add dead:1::1/64 dev veth0 - -ip -net ${ns2} addr add 10.0.1.2/24 dev veth0 -ip -net ${ns2} addr add dead:1::2/64 dev veth0 +ip -net "$ns2" addr add 10.0.1.2/24 dev veth0 +ip -net "$ns2" addr add dead:1::2/64 dev veth0 nodad load_ruleset_family() { local family=$1 local ns=$2 -ip netns exec ${ns} nft -f - < /dev/null |grep -q 'helper=ftp' - if [ $? -ne 0 ] ; then - if [ $autoassign -eq 0 ] ;then + if ! ip netns exec "$netns" conntrack -L -f $family -p tcp --dport "$port" 2> /dev/null |grep -q 'helper=ftp';then + if [ "$autoassign" -eq 0 ] ;then echo "FAIL: ${netns} did not show attached helper $message" 1>&2 ret=1 else echo "PASS: ${netns} did not show attached helper $message" 1>&2 fi else - if [ $autoassign -eq 0 ] ;then + if [ "$autoassign" -eq 0 ] ;then echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 else echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 @@ -120,69 +95,68 @@ check_for_helper() return 0 } +listener_ready() +{ + ns="$1" + port="$2" + proto="$3" + ss -N "$ns" -lnt -o "sport = :$port" | grep -q "$port" +} + test_helper() { local port=$1 local autoassign=$2 - if [ $autoassign -eq 0 ] ;then + if [ "$autoassign" -eq 0 ] ;then msg="set via ruleset" else msg="auto-assign" fi - sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & + ip netns exec "$ns2" socat -t 3 -u -4 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" "$port" "-4" - sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & - sleep 1 + ip netns exec "$ns1" socat -u -4 STDIN TCP:10.0.1.2:"$port" < /dev/null > /dev/null - check_for_helper "$ns1" "ip $msg" $port $autoassign - check_for_helper "$ns2" "ip $msg" $port $autoassign - - wait + check_for_helper "$ns1" "ip $msg" "$port" "$autoassign" + check_for_helper "$ns2" "ip $msg" "$port" "$autoassign" if [ $testipv6 -eq 0 ] ;then return 0 fi - ip netns exec ${ns1} conntrack -F 2> /dev/null - ip netns exec ${ns2} conntrack -F 2> /dev/null - - sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null & + ip netns exec "$ns1" conntrack -F 2> /dev/null + ip netns exec "$ns2" conntrack -F 2> /dev/null - sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null & - sleep 1 + ip netns exec "$ns2" socat -t 3 -u -6 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null & + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" "$port" "-6" - check_for_helper "$ns1" "ipv6 $msg" $port - check_for_helper "$ns2" "ipv6 $msg" $port + ip netns exec "$ns1" socat -t 3 -u -6 STDIN TCP:"[dead:1::2]":"$port" < /dev/null > /dev/null - wait + check_for_helper "$ns1" "ipv6 $msg" "$port" + check_for_helper "$ns2" "ipv6 $msg" "$port" } -load_ruleset_family ip ${ns1} -if [ $? -ne 0 ];then +if ! load_ruleset_family ip "$ns1"; then echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2 exit 1 fi -load_ruleset_family ip6 ${ns1} -if [ $? -ne 0 ];then +if ! load_ruleset_family ip6 "$ns1"; then echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2 testipv6=0 fi -load_ruleset_family inet ${ns2} -if [ $? -ne 0 ];then +if ! load_ruleset_family inet "${ns2}"; then echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2 - load_ruleset_family ip ${ns2} - if [ $? -ne 0 ];then + if ! load_ruleset_family ip "${ns2}"; then echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2 exit 1 fi - if [ $testipv6 -eq 1 ] ;then - load_ruleset_family ip6 ${ns2} - if [ $? -ne 0 ];then + if [ "$testipv6" -eq 1 ] ;then + if ! load_ruleset_family ip6 "$ns2"; then echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2 exit 1 fi @@ -190,8 +164,8 @@ if [ $? -ne 0 ];then fi test_helper 2121 0 -ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' -ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec "$ns1" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec "$ns2" sysctl -qe 'net.netfilter.nf_conntrack_helper=1' test_helper 21 1 exit $ret -- cgit v1.2.3 From 6bc0709bf1115765bfed820af852a4593938079b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:18 +0200 Subject: selftests: netfilter: nft_fib.sh: move to lib.sh infra Also lower ping interval, wait times (helpers get called several times) and set nodad for ipv6 addresses: 20s down to 4s. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-14-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_fib.sh | 71 +++++++----------------- 1 file changed, 19 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh index dff476e45e77..04d6dc886b8a 100755 --- a/tools/testing/selftests/net/netfilter/nft_fib.sh +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -3,43 +3,25 @@ # This tests the fib expression. # # Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 + +source lib.sh + ret=0 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" timeout=4 log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) cleanup() { - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} + cleanup_all_ns [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi +checktool "nft --version" "run test without nft" -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi - -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi +setup_ns nsrouter ns1 ns2 trap cleanup EXIT @@ -50,8 +32,6 @@ if [ $? -eq 0 ]; then fi sysctl -q net.netfilter.nf_log_all_netns=1 -ip netns add ${ns1} -ip netns add ${ns2} load_ruleset() { local netns=$1 @@ -95,8 +75,7 @@ EOF } check_drops() { - dmesg | grep -q ' nft_rpfilter: ' - if [ $? -eq 0 ]; then + if dmesg | grep -q ' nft_rpfilter: ';then dmesg | grep ' nft_rpfilter: ' echo "FAIL: rpfilter did drop packets" return 1 @@ -130,35 +109,30 @@ load_ruleset ${nsrouter} load_ruleset ${ns1} load_ruleset ${ns2} -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} -ip -net ${nsrouter} link set lo up ip -net ${nsrouter} link set veth0 up ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 +ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 nodad ip -net ${nsrouter} link set veth1 up ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 +ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 nodad -ip -net ${ns1} link set lo up ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up ip -net ${ns2} link set eth0 up ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 nodad ip -net ${ns1} route add default via 10.0.1.1 ip -net ${ns1} route add default via dead:1::1 ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 +ip -net ${ns2} addr add dead:2::99/64 dev eth0 nodad ip -net ${ns2} route add default via 10.0.2.1 ip -net ${ns2} route add default via dead:2::1 @@ -166,17 +140,13 @@ test_ping() { local daddr4=$1 local daddr6=$2 - ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null - ret=$? - if [ $ret -ne 0 ];then + if ! ip netns exec "$ns1" ping -c 1 -q "$daddr4" > /dev/null; then check_drops echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 return 1 fi - ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null - ret=$? - if [ $ret -ne 0 ];then + if ! ip netns exec "$ns1" ping -c 1 -q "$daddr6" > /dev/null; then check_drops echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 return 1 @@ -191,8 +161,6 @@ ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null -sleep 3 - test_ping 10.0.2.1 dead:2::1 || exit 1 check_drops || exit 1 @@ -210,12 +178,12 @@ ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 ip -net ${ns1} addr del dead:1::99/64 dev eth0 ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr add dead:2::99/64 dev eth0 +ip -net "$ns1" addr add dead:2::99/64 dev eth0 nodad ip -net ${ns1} route add default via 10.0.2.1 ip -net ${ns1} -6 route add default via dead:2::1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth0 nodad # switch to ruleset that doesn't log, this time # its expected that this does drop the packets. @@ -227,11 +195,10 @@ load_ruleset_count ${nsrouter} check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 -ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null +ip netns exec "$ns1" ping -W 0.5 -c 1 -q 1.1.1.1 > /dev/null check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 -sleep 2 -ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null +ip netns exec "$ns1" ping -W 0.5 -i 0.1 -c 3 -q 1c3::c01d > /dev/null check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 # delete all rules @@ -240,7 +207,7 @@ ip netns exec ${ns2} nft flush ruleset ip netns exec ${nsrouter} nft flush ruleset ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad ip -net ${ns1} addr del 10.0.2.99/24 dev eth0 ip -net ${ns1} addr del dead:2::99/64 dev eth0 -- cgit v1.2.3 From 53e9426204a0dbc2d651f1088e0f7eaf19c57793 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:19 +0200 Subject: selftests: netfilter: nft_flowtable.sh: move test to lib.sh infra Use socat, the different nc implementations have too much variance wrt. supported options. Avoid sleeping until listener is up, use busywait helper for this, this also greatly reduces test duration. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-15-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_flowtable.sh | 108 +++++++-------------- 1 file changed, 36 insertions(+), 72 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a32f490f7539..d765c65c31f3 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -14,14 +14,8 @@ # nft_flowtable.sh -o8000 -l1500 -r2000 # -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsr1="nsr1-$sfx" -nsr2="nsr2-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh + ret=0 nsin="" @@ -30,27 +24,16 @@ ns2out="" log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} - checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" -checktool "which nc" "run test without nc (netcat)" -checktool "ip netns add $nsr1" "create net namespace $nsr1" +checktool "socat -h" "run test without socat" -ip netns add $ns1 -ip netns add $ns2 -ip netns add $nsr2 +setup_ns ns1 ns2 nsr1 nsr2 cleanup() { - ip netns del $ns1 - ip netns del $ns2 - ip netns del $nsr1 - ip netns del $nsr2 + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + + cleanup_all_ns rm -f "$nsin" "$ns1out" "$ns2out" @@ -66,16 +49,16 @@ ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 -for dev in lo veth0 veth1; do - ip -net $nsr1 link set $dev up - ip -net $nsr2 link set $dev up +for dev in veth0 veth1; do + ip -net "$nsr1" link set "$dev" up + ip -net "$nsr2" link set "$dev" up done -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad -ip -net $nsr2 addr add 10.0.2.1/24 dev veth1 -ip -net $nsr2 addr add dead:2::1/64 dev veth1 +ip -net "$nsr2" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsr2" addr add dead:2::1/64 dev veth1 nodad # set different MTUs so we need to push packets coming from ns1 (large MTU) # to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), @@ -121,11 +104,11 @@ ip -net $ns2 link set eth0 mtu $rmtu # transfer-net between nsr1 and nsr2. # these addresses are not used for connections. -ip -net $nsr1 addr add 192.168.10.1/24 dev veth1 -ip -net $nsr1 addr add fee1:2::1/64 dev veth1 +ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1 +ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad -ip -net $nsr2 addr add 192.168.10.2/24 dev veth0 -ip -net $nsr2 addr add fee1:2::2/64 dev veth0 +ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 +ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad for i in 0 1; do ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null @@ -148,8 +131,8 @@ ip -net $ns1 addr add 10.0.1.99/24 dev eth0 ip -net $ns2 addr add 10.0.2.99/24 dev eth0 ip -net $ns1 route add default via 10.0.1.1 ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 -ip -net $ns2 addr add dead:2::99/64 dev eth0 +ip -net $ns1 addr add dead:1::99/64 dev eth0 nodad +ip -net $ns2 addr add dead:2::99/64 dev eth0 nodad ip -net $ns1 route add default via dead:1::1 ip -net $ns2 route add default via dead:2::1 @@ -219,10 +202,6 @@ if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then exit 1 fi -if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" -fi - nsin=$(mktemp) ns1out=$(mktemp) ns2out=$(mktemp) @@ -345,6 +324,11 @@ check_transfer() return 0 } +listener_ready() +{ + ss -N "$nsb" -lnt -o "sport = :12345" | grep -q 12345 +} + test_tcp_forwarding_ip() { local nsa=$1 @@ -353,33 +337,14 @@ test_tcp_forwarding_ip() local dstport=$4 local lret=0 - ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" & + timeout 10 ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & lpid=$! - sleep 1 - ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & - cpid=$! - - sleep 1 - - prev="$(ls -l $ns1out $ns2out)" - sleep 1 + busywait 1000 listener_ready - while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do - sleep 1; - prev="$(ls -l $ns1out $ns2out)" - done - - if test -d /proc/"$lpid"/; then - kill $lpid - fi - - if test -d /proc/"$cpid"/; then - kill $cpid - fi + timeout 10 ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" wait $lpid - wait $cpid if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then lret=1 @@ -550,7 +515,7 @@ ip -net $nsr1 addr flush dev veth0 ip -net $nsr1 link set up dev veth0 ip -net $nsr1 link set veth0 master br0 ip -net $nsr1 addr add 10.0.1.1/24 dev br0 -ip -net $nsr1 addr add dead:1::1/64 dev br0 +ip -net $nsr1 addr add dead:1::1/64 dev br0 nodad ip -net $nsr1 link set up dev br0 ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null @@ -593,7 +558,7 @@ ip -net $ns1 link set eth0 up ip -net $ns1 link set eth0.10 up ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0.10 +ip -net $ns1 addr add dead:1::99/64 dev eth0.10 nodad if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 @@ -616,10 +581,10 @@ ip -net $ns1 link delete eth0.10 type vlan ip -net $ns1 link set eth0 up ip -net $ns1 addr add 10.0.1.99/24 dev eth0 ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 +ip -net $ns1 addr add dead:1::99/64 dev eth0 nodad ip -net $ns1 route add default via dead:1::1 ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 +ip -net $nsr1 addr add dead:1::1/64 dev veth0 nodad ip -net $nsr1 link set up dev veth0 KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) @@ -647,7 +612,6 @@ do_esp() { ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow # to fwd decrypted packets after esp processing: ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow - } do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 @@ -661,12 +625,12 @@ ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 ip -net $ns2 route add default via 10.0.2.1 ip -net $ns2 route add default via dead:2::1 -if test_tcp_forwarding $ns1 $ns2; then +if test_tcp_forwarding "$ns1" "$ns2"; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" - ip netns exec $nsr1 nft list ruleset 1>&2 - ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2 + ip netns exec "$nsr1" nft list ruleset 1>&2 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 fi exit $ret -- cgit v1.2.3 From 49af681bcab41ac80745fdf7632816c79e123b22 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Apr 2024 01:36:20 +0200 Subject: selftests: netfilter: nft_nat.sh: move to lib.sh infra Use busywait helper to wait until socat listener is up to avoid "sleep" calls. This reduces script execution time slighty (12s to 7s). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240411233624.8129-16-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_nat.sh | 480 ++++++++++------------- 1 file changed, 206 insertions(+), 274 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh index dd40d9f6f259..9e39de26455f 100755 --- a/tools/testing/selftests/net/netfilter/nft_nat.sh +++ b/tools/testing/selftests/net/netfilter/nft_nat.sh @@ -3,77 +3,60 @@ # This test is for basic NAT functionality: snat, dnat, redirect, masquerade. # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh + ret=0 test_inet_nat=true -sfx=$(mktemp -u "XXXXXXXX") -ns0="ns0-$sfx" -ns1="ns1-$sfx" -ns2="ns2-$sfx" +checktool "nft --version" "run test without nft tool" +checktool "socat -h" "run test without socat" cleanup() { - for i in 0 1 2; do ip netns del ns$i-"$sfx";done -} + ip netns pids "$ns0" | xargs kill 2>/dev/null + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi + rm -f "$INFILE" "$OUTFILE" -ip netns add "$ns0" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns0" - exit $ksft_skip -fi + cleanup_all_ns +} trap cleanup EXIT -ip netns add "$ns1" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns1" - exit $ksft_skip -fi +INFILE=$(mktemp) +OUTFILE=$(mktemp) -ip netns add "$ns2" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $ns2" - exit $ksft_skip -fi +setup_ns ns0 ns1 ns2 -ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1;then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2" -ip -net "$ns0" link set lo up ip -net "$ns0" link set veth0 up ip -net "$ns0" addr add 10.0.1.1/24 dev veth0 -ip -net "$ns0" addr add dead:1::1/64 dev veth0 +ip -net "$ns0" addr add dead:1::1/64 dev veth0 nodad ip -net "$ns0" link set veth1 up ip -net "$ns0" addr add 10.0.2.1/24 dev veth1 -ip -net "$ns0" addr add dead:2::1/64 dev veth1 - -for i in 1 2; do - ip -net ns$i-$sfx link set lo up - ip -net ns$i-$sfx link set eth0 up - ip -net ns$i-$sfx addr add 10.0.$i.99/24 dev eth0 - ip -net ns$i-$sfx route add default via 10.0.$i.1 - ip -net ns$i-$sfx addr add dead:$i::99/64 dev eth0 - ip -net ns$i-$sfx route add default via dead:$i::1 -done +ip -net "$ns0" addr add dead:2::1/64 dev veth1 nodad + +do_config() +{ + ns="$1" + subnet="$2" + + ip -net "$ns" link set eth0 up + ip -net "$ns" addr add "10.0.$subnet.99/24" dev eth0 + ip -net "$ns" route add default via "10.0.$subnet.1" + ip -net "$ns" addr add "dead:$subnet::99/64" dev eth0 nodad + ip -net "$ns" route add default via "dead:$subnet::1" +} + +do_config "$ns1" 1 +do_config "$ns2" 2 bad_counter() { @@ -83,7 +66,7 @@ bad_counter() local tag=$4 echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2 - ip netns exec $ns nft list counter inet filter $counter 1>&2 + ip netns exec "$ns" nft list counter inet filter "$counter" 1>&2 } check_counters() @@ -91,26 +74,23 @@ check_counters() ns=$1 local lret=0 - cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1" + if ! ip netns exec "$ns" nft list counter inet filter ns0in | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0in "packets 1 bytes 84" "check_counters 1" lret=1 fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2" + + if ! ip netns exec "$ns" nft list counter inet filter ns0out | grep -q "packets 1 bytes 84";then + bad_counter "$ns" ns0out "packets 1 bytes 84" "check_counters 2" lret=1 fi expect="packets 1 bytes 104" - cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0in6 "$expect" "check_counters 3" + if ! ip netns exec "$ns" nft list counter inet filter ns0in6 | grep -q "$expect";then + bad_counter "$ns" ns0in6 "$expect" "check_counters 3" lret=1 fi - cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter $ns ns0out6 "$expect" "check_counters 4" + if ! ip netns exec "$ns" nft list counter inet filter ns0out6 | grep -q "$expect";then + bad_counter "$ns" ns0out6 "$expect" "check_counters 4" lret=1 fi @@ -122,41 +102,35 @@ check_ns0_counters() local ns=$1 local lret=0 - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0in6 "packets 0 bytes 0" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0";then bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 " lret=1 fi for dir in "in" "out" ; do expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}" "$expect" "check_ns0_counters 4" lret=1 fi expect="packets 1 bytes 104" - cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5" + if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}6" | grep -q "$expect";then + bad_counter "$ns0" "$ns${dir}6" "$expect" "check_ns0_counters 5" lret=1 fi done @@ -166,8 +140,8 @@ check_ns0_counters() reset_counters() { - for i in 0 1 2;do - ip netns exec ns$i-$sfx nft reset counters inet > /dev/null + for i in "$ns0" "$ns1" "$ns2" ;do + ip netns exec "$i" nft reset counters inet > /dev/null done } @@ -177,7 +151,7 @@ test_local_dnat6() local lret=0 local IPF="" - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then IPF="ip6" fi @@ -195,8 +169,7 @@ EOF fi # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null;then lret=1 echo "ERROR: ping6 failed" return $lret @@ -204,8 +177,7 @@ EOF expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1" lret=1 fi @@ -213,8 +185,7 @@ EOF expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2" lret=1 fi @@ -223,8 +194,7 @@ EOF # expect 0 count in ns1 expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3" lret=1 fi @@ -233,8 +203,7 @@ EOF # expect 1 packet in ns2 expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4" lret=1 fi @@ -252,7 +221,7 @@ test_local_dnat() local lret=0 local IPF="" - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then IPF="ip" fi @@ -265,7 +234,7 @@ table $family nat { } EOF if [ $? -ne 0 ]; then - if [ $family = "inet" ];then + if [ "$family" = "inet" ];then echo "SKIP: inet nat tests" test_inet_nat=false return $ksft_skip @@ -275,8 +244,7 @@ EOF fi # ping netns1, expect rewrite to netns2 - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then lret=1 echo "ERROR: ping failed" return $lret @@ -284,18 +252,16 @@ EOF expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_local_dnat 1" lret=1 fi done expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2" + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns2$dir" "$expect" "test_local_dnat 2" lret=1 fi done @@ -303,9 +269,8 @@ EOF # expect 0 count in ns1 expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3" + if ! ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_local_dnat 3" lret=1 fi done @@ -313,20 +278,18 @@ EOF # expect 1 packet in ns2 expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4" + if ! ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect";then + bad_counter "$ns2" "ns0$dir" "$expect" "test_local_dnat 4" lret=1 fi done test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2" - ip netns exec "$ns0" nft flush chain $family nat output + ip netns exec "$ns0" nft flush chain "$family" nat output reset_counters - ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then lret=1 echo "ERROR: ping failed" return $lret @@ -334,16 +297,14 @@ EOF expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5" lret=1 fi done expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6" lret=1 fi @@ -352,8 +313,7 @@ EOF # expect 1 count in ns1 expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7" lret=1 fi @@ -362,8 +322,7 @@ EOF # expect 0 packet in ns2 expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8" lret=1 fi @@ -374,13 +333,19 @@ EOF return $lret } +listener_ready() +{ + local ns="$1" + local port="$2" + local proto="$3" + ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port" +} + test_local_dnat_portonly() { local family=$1 local daddr=$2 local lret=0 - local sr_s - local sr_r ip netns exec "$ns0" nft -f /dev/stdin < /dev/null - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" return 1 - lret=1 fi expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade6 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade6 2" lret=1 fi done @@ -462,8 +422,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" lret=1 fi @@ -471,14 +430,12 @@ EOF # ns1 should have seen packets from ns0, due to masquerade expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4" lret=1 fi @@ -487,27 +444,23 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade6 6" lret=1 fi done - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)" lret=1 fi - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting;then echo "ERROR: Could not flush $family nat postrouting" 1>&2 lret=1 fi @@ -526,23 +479,20 @@ test_masquerade() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping $ns1 from "$ns2" $natflags" + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then + echo "ERROR: cannot ping $ns1 from $ns2 $natflags" lret=1 fi expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 2" lret=1 fi done @@ -563,8 +513,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags" lret=1 fi @@ -572,15 +521,13 @@ EOF # ns1 should have seen packets from ns0, due to masquerade expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3" + if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 3" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4" + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 4" lret=1 fi done @@ -588,27 +535,23 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 5" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6" + if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then + bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade 6" lret=1 fi done - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)" lret=1 fi - ip netns exec "$ns0" nft flush chain $family nat postrouting - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting; then echo "ERROR: Could not flush $family nat postrouting" 1>&2 lret=1 fi @@ -625,22 +568,19 @@ test_redirect6() ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" lret=1 fi expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2" lret=1 fi @@ -662,8 +602,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect" lret=1 fi @@ -671,8 +610,7 @@ EOF # ns1 should have seen no packets from ns2, due to redirection expect="packets 0 bytes 0" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3" lret=1 fi @@ -681,15 +619,13 @@ EOF # ns0 should have seen packets from ns2, due to masquerade expect="packets 1 bytes 104" for dir in "in6" "out6" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4" lret=1 fi done - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table "$family" nat;then echo "ERROR: Could not delete $family nat table" 1>&2 lret=1 fi @@ -707,22 +643,19 @@ test_redirect() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2" lret=1 fi expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then - bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1" + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then + bad_counter "$ns1" "$ns2$dir" "$expect" "test_redirect 1" lret=1 fi - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2" lret=1 fi @@ -744,8 +677,7 @@ EOF return $ksft_skip fi - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect" lret=1 fi @@ -754,8 +686,7 @@ EOF expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3" lret=1 fi @@ -764,15 +695,13 @@ EOF # ns0 should have seen packets from ns2, due to masquerade expect="packets 1 bytes 84" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4" lret=1 fi done - ip netns exec "$ns0" nft delete table $family nat - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table "$family" nat;then echo "ERROR: Could not delete $family nat table" 1>&2 lret=1 fi @@ -803,13 +732,13 @@ test_port_shadow() # make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405. echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405 - echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 & - sc_r=$! + echo ROUTER | ip netns exec "$ns0" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405 2>/dev/null & + local sc_r=$! + echo CLIENT | ip netns exec "$ns2" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405,reuseport 2>/dev/null & + local sc_c=$! - echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport & - sc_c=$! - - sleep 0.3 + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1405 "-u" + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" 1405 "-u" # ns1 tries to connect to ns0:1405. With default settings this should connect # to client, it matches the conntrack entry created above. @@ -846,7 +775,7 @@ table $family filter { EOF test_port_shadow "port-filter" "ROUTER" - ip netns exec "$ns0" nft delete table $family filter + ip netns exec "$ns0" nft delete table "$family" filter } # This prevents port shadow of router service via notrack. @@ -868,7 +797,7 @@ table $family raw { EOF test_port_shadow "port-notrack" "ROUTER" - ip netns exec "$ns0" nft delete table $family raw + ip netns exec "$ns0" nft delete table "$family" raw } # This prevents port shadow of router service via sport remap. @@ -886,21 +815,19 @@ table $family pat { EOF test_port_shadow "pat" "ROUTER" - ip netns exec "$ns0" nft delete table $family pat + ip netns exec "$ns0" nft delete table "$family" pat } test_port_shadowing() { local family="ip" - conntrack -h >/dev/null 2>&1 - if [ $? -ne 0 ];then + if ! conntrack -h >/dev/null 2>&1;then echo "SKIP: Could not run nat port shadowing test without conntrack tool" return fi - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then + if ! socat -h > /dev/null 2>&1;then echo "SKIP: Could not run nat port shadowing test without socat tool" return fi @@ -946,8 +873,7 @@ test_stateless_nat_ip() ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules" return 1 fi @@ -981,23 +907,20 @@ EOF reset_counters - ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null; then echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules" lret=1 fi # ns1 should have seen packets from .2.2, due to stateless rewrite. expect="packets 1 bytes 84" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then bad_counter "$ns1" ns0insl "$expect" "test_stateless 1" lret=1 fi for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2" lret=1 fi @@ -1006,14 +929,12 @@ EOF # ns1 should not have seen packets from ns2, due to masquerade expect="packets 0 bytes 0" for dir in "in" "out" ; do - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect";then bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3" lret=1 fi - cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect";then bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4" lret=1 fi @@ -1021,8 +942,7 @@ EOF reset_counters - socat -h > /dev/null 2>&1 - if [ $? -ne 0 ];then + if ! socat -h > /dev/null 2>&1;then echo "SKIP: Could not run stateless nat frag test without socat tool" if [ $lret -eq 0 ]; then return $ksft_skip @@ -1032,42 +952,36 @@ EOF return $lret fi - local tmpfile=$(mktemp) - dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null + dd if=/dev/urandom of="$INFILE" bs=4096 count=1 2>/dev/null - local outfile=$(mktemp) - ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null & - sc_r=$! + ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:"$OUTFILE" < /dev/null 2>/dev/null & + + busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 4233 "-u" - sleep 1 # re-do with large ping -> ip fragmentation - ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns2" timeout 3 socat -u STDIN UDP4-SENDTO:"10.0.1.99:4233" < "$INFILE" > /dev/null;then echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2 lret=1 fi wait - cmp "$tmpfile" "$outfile" - if [ $? -ne 0 ]; then - ls -l "$tmpfile" "$outfile" + if ! cmp "$INFILE" "$OUTFILE";then + ls -l "$INFILE" "$OUTFILE" echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2 lret=1 fi - rm -f "$tmpfile" "$outfile" + :> "$OUTFILE" # ns1 should have seen packets from 2.2, due to stateless rewrite. expect="packets 3 bytes 4164" - cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect") - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then bad_counter "$ns1" ns0insl "$expect" "test_stateless 5" lret=1 fi - ip netns exec "$ns0" nft delete table ip stateless - if [ $? -ne 0 ]; then + if ! ip netns exec "$ns0" nft delete table ip stateless; then echo "ERROR: Could not delete table ip stateless" 1>&2 lret=1 fi @@ -1078,8 +992,8 @@ EOF } # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 -for i in 0 1 2; do -ip netns exec ns$i-$sfx nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s)" 1>&2 - ret=1 - fi - - ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null - if [ $? -ne 0 ];then - echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 - ret=1 - fi - check_counters ns$i-$sfx - if [ $? -ne 0 ]; then - ret=1 - fi - - check_ns0_counters ns$i - if [ $? -ne 0 ]; then - ret=1 - fi - reset_counters -done +ping_basic() +{ + i="$1" + if ! ip netns exec "$ns0" ping -c 1 -q 10.0."$i".99 > /dev/null;then + echo "ERROR: Could not reach other namespace(s)" 1>&2 + ret=1 + fi + + if ! ip netns exec "$ns0" ping -c 1 -q dead:"$i"::99 > /dev/null;then + echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2 + ret=1 + fi +} + +test_basic_conn() +{ + local nsexec + name="$1" + + nsexec=$(eval echo \$"$1") + + ping_basic 1 + ping_basic 2 + + if ! check_counters "$nsexec";then + return 1 + fi + + if ! check_ns0_counters "$name";then + return 1 + fi + + reset_counters + return 0 +} + +if ! test_basic_conn "ns1" ; then + echo "ERROR: basic test for ns1 failed" 1>&2 + exit 1 +fi +if ! test_basic_conn "ns2"; then + echo "ERROR: basic test for ns1 failed" 1>&2 +fi if [ $ret -eq 0 ];then echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2" -- cgit v1.2.3 From 0adab2b6b7336fb6ee3c6456a432dad3b1d25647 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 14 Apr 2024 01:45:09 +0200 Subject: tools/nolibc: add support for uname(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All supported kernels are assumed to use struct new_utsname. This is validated in test_uname(). uname(2) can for example be used in ksft_min_kernel_version() from the kernels selftest framework. Link: https://lore.kernel.org/lkml/20240412123536.GA32444@redhat.com/ Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau --- tools/include/nolibc/sys.h | 27 ++++++++++++++++++ tools/testing/selftests/nolibc/nolibc-test.c | 42 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) (limited to 'tools') diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index dda9dffd1d74..7b82bc3cf107 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -22,6 +22,7 @@ #include /* for statx() */ #include #include +#include #include "arch.h" #include "errno.h" @@ -1139,6 +1140,32 @@ int umount2(const char *path, int flags) } +/* + * int uname(struct utsname *buf); + */ + +struct utsname { + char sysname[65]; + char nodename[65]; + char release[65]; + char version[65]; + char machine[65]; + char domainname[65]; +}; + +static __attribute__((unused)) +int sys_uname(struct utsname *buf) +{ + return my_syscall1(__NR_uname, buf); +} + +static __attribute__((unused)) +int uname(struct utsname *buf) +{ + return __sysret(sys_uname(buf)); +} + + /* * int unlink(const char *path); */ diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index d373fc14706c..94bb6e11c16f 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -780,6 +781,45 @@ int test_stat_timestamps(void) return 0; } +int test_uname(void) +{ + struct utsname buf; + char osrelease[sizeof(buf.release)]; + ssize_t r; + int fd; + + memset(&buf.domainname, 'P', sizeof(buf.domainname)); + + if (uname(&buf)) + return 1; + + if (strncmp("Linux", buf.sysname, sizeof(buf.sysname))) + return 1; + + fd = open("/proc/sys/kernel/osrelease", O_RDONLY); + if (fd == -1) + return 1; + + r = read(fd, osrelease, sizeof(osrelease)); + if (r == -1) + return 1; + + close(fd); + + if (osrelease[r - 1] == '\n') + r--; + + /* Validate one of the later fields to ensure field sizes are correct */ + if (strncmp(osrelease, buf.release, r)) + return 1; + + /* Ensure the field domainname is set, it is missing from struct old_utsname */ + if (strnlen(buf.domainname, sizeof(buf.domainname)) == sizeof(buf.domainname)) + return 1; + + return 0; +} + int test_mmap_munmap(void) { int ret, fd, i, page_size; @@ -985,6 +1025,8 @@ int run_syscall(int min, int max) CASE_TEST(stat_fault); EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break; CASE_TEST(stat_timestamps); EXPECT_SYSZR(1, test_stat_timestamps()); break; CASE_TEST(symlink_root); EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break; + CASE_TEST(uname); EXPECT_SYSZR(proc, test_uname()); break; + CASE_TEST(uname_fault); EXPECT_SYSER(1, uname(NULL), -1, EFAULT); break; CASE_TEST(unlink_root); EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break; CASE_TEST(unlink_blah); EXPECT_SYSER(1, unlink("/proc/self/blah"), -1, ENOENT); break; CASE_TEST(wait_child); EXPECT_SYSER(1, wait(&tmp), -1, ECHILD); break; -- cgit v1.2.3 From 72ba6cba0a6e9f06c09187ddbbdc9f80ee93ffb3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 12 Apr 2024 07:14:32 -0700 Subject: tools: ynl: don't return None for dumps YNL currently reports None for empty dump: $ cli.py ...netdev.yaml --dump page-pool-get None This doesn't matter for the CLI but when writing YNL based tests having to deal with either list or None is annoying. Limit the None conversion to non-dump ops: $ cli.py ...netdev.yaml --dump page-pool-get [] Reviewed-by: Donald Hunter Link: https://lore.kernel.org/r/20240412141436.828666-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 0ba5f6fb8747..a67f7b6fef92 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -995,9 +995,11 @@ class YnlFamily(SpecFamily): rsp_msg.update(self._decode_struct(decoded.raw, op.fixed_header)) rsp.append(rsp_msg) + if dump: + return rsp if not rsp: return None - if not dump and len(rsp) == 1: + if len(rsp) == 1: return rsp[0] return rsp -- cgit v1.2.3 From eeb409bde964df1956297b6775ff5f53dd98d556 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 12 Apr 2024 07:14:33 -0700 Subject: selftests: net: print report check location in python tests Developing Python tests is a bit annoying because when test fails we only print the fail message and no info about which exact check led to it. Print the location (the first line of this example is new): # At /root/ksft-net-drv/./net/nl_netdev.py line 38: # Check failed 0 != 10 not ok 3 nl_netdev.page_pool_check Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20240412141436.828666-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index c7210525981c..b4b0bfff68b0 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 import builtins +import inspect from .consts import KSFT_MAIN_NAME KSFT_RESULT = None @@ -18,32 +19,34 @@ def ksft_pr(*objs, **kwargs): print("#", *objs, **kwargs) +def _fail(*args): + global KSFT_RESULT + KSFT_RESULT = False + + frame = inspect.stack()[2] + ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":") + ksft_pr(*args) + + def ksft_eq(a, b, comment=""): global KSFT_RESULT if a != b: - KSFT_RESULT = False - ksft_pr("Check failed", a, "!=", b, comment) + _fail("Check failed", a, "!=", b, comment) def ksft_true(a, comment=""): - global KSFT_RESULT if not a: - KSFT_RESULT = False - ksft_pr("Check failed", a, "does not eval to True", comment) + _fail("Check failed", a, "does not eval to True", comment) def ksft_in(a, b, comment=""): - global KSFT_RESULT if a not in b: - KSFT_RESULT = False - ksft_pr("Check failed", a, "not in", b, comment) + _fail("Check failed", a, "not in", b, comment) def ksft_ge(a, b, comment=""): - global KSFT_RESULT if a < b: - KSFT_RESULT = False - ksft_pr("Check failed", a, "<", b, comment) + _fail("Check failed", a, "<", b, comment) def ktap_result(ok, cnt=1, case="", comment=""): -- cgit v1.2.3 From 99583b970b9073ea258235e6c794fd515df19c61 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 12 Apr 2024 07:14:34 -0700 Subject: selftests: net: print full exception on failure Instead of a summary line print the full exception. This makes debugging Python tests much easier. Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20240412141436.828666-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index b4b0bfff68b0..793e4761645e 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -2,6 +2,7 @@ import builtins import inspect +import traceback from .consts import KSFT_MAIN_NAME KSFT_RESULT = None @@ -85,7 +86,8 @@ def ksft_run(cases, args=()): totals['xfail'] += 1 continue except Exception as e: - for line in str(e).split('\n'): + tb = traceback.format_exc() + for line in tb.strip().split('\n'): ksft_pr("Exception|", line) ktap_result(False, cnt, case) totals['fail'] += 1 -- cgit v1.2.3 From 8554d6e39b6ad967e1debe98550a0c56aaf8c8ea Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 12 Apr 2024 07:14:35 -0700 Subject: selftests: net: support use of NetdevSimDev under "with" in python Using "with" on an entire driver test env is supported already, but it's also useful to use "with" on an individual nsim. Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20240412141436.828666-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/nsim.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py index 97457aca7e08..94aa32f59fdb 100644 --- a/tools/testing/selftests/net/lib/py/nsim.py +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -84,6 +84,17 @@ class NetdevSimDev: for port_index in range(port_count): self.nsims.append(self._make_port(port_index, ifnames[port_index])) + self.removed = False + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.remove() + def _make_port(self, port_index, ifname): return NetdevSim(self, port_index, ifname, self.ns) @@ -112,7 +123,9 @@ class NetdevSimDev: raise Exception("netdevices did not appear within timeout") def remove(self): - self.ctrl_write("del_device", "%u" % (self.addr, )) + if not self.removed: + self.ctrl_write("del_device", "%u" % (self.addr, )) + self.removed = True def remove_nsim(self, nsim): self.nsims.remove(nsim) -- cgit v1.2.3 From 05fa5c31b9882b175d5e5a8e310f31b1a9b019a3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 12 Apr 2024 07:14:36 -0700 Subject: selftests: net: exercise page pool reporting via netlink Add a Python test for the basic ops. # ./net/nl_netdev.py KTAP version 1 1..3 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20240412141436.828666-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 12 +++++ tools/testing/selftests/net/lib/py/nsim.py | 1 + tools/testing/selftests/net/nl_netdev.py | 76 +++++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 793e4761645e..3769b9197213 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -2,6 +2,7 @@ import builtins import inspect +import time import traceback from .consts import KSFT_MAIN_NAME @@ -50,6 +51,17 @@ def ksft_ge(a, b, comment=""): _fail("Check failed", a, "<", b, comment) +def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""): + end = time.monotonic() + deadline + while True: + if cond(): + return + if time.monotonic() > end: + _fail("Waiting for condition timed out", comment) + return + time.sleep(sleep) + + def ktap_result(ok, cnt=1, case="", comment=""): res = "" if not ok: diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py index 94aa32f59fdb..06896cdf7c18 100644 --- a/tools/testing/selftests/net/lib/py/nsim.py +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -28,6 +28,7 @@ class NetdevSim: self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) ret = ip("-j link show dev %s" % ifname, ns=ns) self.dev = json.loads(ret.stdout)[0] + self.ifindex = self.dev["ifindex"] def dfs_write(self, path, val): self.nsimdev.dfs_write(f'ports/{self.port_index}/' + path, val) diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index 2b8b488fb419..6909b1760739 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_pr, ksft_eq, ksft_ge, NetdevFamily +import time +from lib.py import ksft_run, ksft_pr, ksft_eq, ksft_ge, ksft_busy_wait +from lib.py import NetdevFamily, NetdevSimDev, ip def empty_check(nf) -> None: @@ -15,9 +17,79 @@ def lo_check(nf) -> None: ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0) +def page_pool_check(nf) -> None: + with NetdevSimDev() as nsimdev: + nsim = nsimdev.nsims[0] + + def up(): + ip(f"link set dev {nsim.ifname} up") + + def down(): + ip(f"link set dev {nsim.ifname} down") + + def get_pp(): + pp_list = nf.page_pool_get({}, dump=True) + return [pp for pp in pp_list if pp.get("ifindex") == nsim.ifindex] + + # No page pools when down + down() + ksft_eq(len(get_pp()), 0) + + # Up, empty page pool appears + up() + pp_list = get_pp() + ksft_ge(len(pp_list), 0) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 0) + + # Down, it disappears, again + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 0) + + # Up, allocate a page + up() + nsim.dfs_write("pp_hold", "y") + pp_list = nf.page_pool_get({}, dump=True) + refs = sum([pp["inflight"] for pp in pp_list if pp.get("ifindex") == nsim.ifindex]) + ksft_ge(refs, 1) + + # Now let's leak a page + down() + pp_list = get_pp() + ksft_eq(len(pp_list), 1) + refs = sum([pp["inflight"] for pp in pp_list]) + ksft_eq(refs, 1) + attached = [pp for pp in pp_list if "detach-time" not in pp] + ksft_eq(len(attached), 0) + + # New pp can get created, and we'll have two + up() + pp_list = get_pp() + attached = [pp for pp in pp_list if "detach-time" not in pp] + detached = [pp for pp in pp_list if "detach-time" in pp] + ksft_eq(len(attached), 1) + ksft_eq(len(detached), 1) + + # Free the old page and the old pp is gone + nsim.dfs_write("pp_hold", "n") + # Freeing check is once a second so we may need to retry + ksft_busy_wait(lambda: len(get_pp()) == 1, deadline=2) + + # And down... + down() + ksft_eq(len(get_pp()), 0) + + # Last, leave the page hanging for destroy, nothing to check + # we're trying to exercise the orphaning path in the kernel + up() + nsim.dfs_write("pp_hold", "y") + + def main() -> None: nf = NetdevFamily() - ksft_run([empty_check, lo_check], args=(nf, )) + ksft_run([empty_check, lo_check, page_pool_check], + args=(nf, )) if __name__ == "__main__": -- cgit v1.2.3 From 39988fdc126b5148f102fc2afbc9fd92d8249f53 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2024 19:51:34 -0700 Subject: torture: Scale --do-kvfree test time Currently, the torture.sh --do-kvfree testing is hard-coded to ten minutes, ignoring the --duration argument. This commit therefore scales this test duration the same as for the rcutorture tests. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 13875ee7b050..990d24696fd3 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -559,7 +559,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" -- cgit v1.2.3 From a4022a332f437ae5b10921d66058ce98a2db2c20 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:04 +0200 Subject: selftests: net: Unify code of busywait() and slowwait() Bodies of busywait() and slowwait() functions are almost identical. Extract the common code into a helper, loopy_wait, and convert busywait() and slowwait() into trivial wrappers. Moreover, the fact that slowwait() uses seconds for units is really not intuitive, and the comment does not help much. Instead make the unit part of the name of the argument to further clarify what units are expected. Cc: Hangbin Liu Signed-off-by: Petr Machata Reviewed-by: Benjamin Poirier Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 22 ++-------------------- tools/testing/selftests/net/lib.sh | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 4103ed7afcde..658e4e7bf4b9 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -95,27 +95,9 @@ source "$net_forwarding_dir/../lib.sh" # timeout in seconds slowwait() { - local timeout=$1; shift - - local start_time="$(date -u +%s)" - while true - do - local out - out=$("$@") - local ret=$? - if ((!ret)); then - echo -n "$out" - return 0 - fi + local timeout_sec=$1; shift - local current_time="$(date -u +%s)" - if ((current_time - start_time > timeout)); then - echo -n "$out" - return 1 - fi - - sleep 0.1 - done + loopy_wait "sleep 0.1" "$((timeout_sec * 1000))" "$@" } ############################################################################## diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index b7f7b8695165..c868c0aec121 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -58,9 +58,10 @@ ksft_exit_status_merge() $ksft_xfail $ksft_pass $ksft_skip $ksft_fail } -busywait() +loopy_wait() { - local timeout=$1; shift + local sleep_cmd=$1; shift + local timeout_ms=$1; shift local start_time="$(date -u +%s%3N)" while true @@ -74,13 +75,22 @@ busywait() fi local current_time="$(date -u +%s%3N)" - if ((current_time - start_time > timeout)); then + if ((current_time - start_time > timeout_ms)); then echo -n "$out" return 1 fi + + $sleep_cmd done } +busywait() +{ + local timeout_ms=$1; shift + + loopy_wait : "$timeout_ms" "$@" +} + cleanup_ns() { local ns="" -- cgit v1.2.3 From 2291752fae3d2d773f2d29c159d383138411d06f Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:05 +0200 Subject: selftests: forwarding: lib.sh: Validate NETIFS The variable should contain at least NUM_NETIFS interfaces, stored as keys named "p$i", for i in `seq $NUM_NETIFS`. Signed-off-by: Petr Machata Reviewed-by: Benjamin Poirier Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 658e4e7bf4b9..3cbbc2fd4d7d 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -273,11 +273,6 @@ if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then require_command mreceive fi -if [[ ! -v NUM_NETIFS ]]; then - echo "SKIP: importer does not define \"NUM_NETIFS\"" - exit $ksft_skip -fi - ############################################################################## # Command line options handling @@ -296,6 +291,23 @@ done ############################################################################## # Network interfaces configuration +if [[ ! -v NUM_NETIFS ]]; then + echo "SKIP: importer does not define \"NUM_NETIFS\"" + exit $ksft_skip +fi + +if (( NUM_NETIFS > ${#NETIFS[@]} )); then + echo "SKIP: Importer requires $NUM_NETIFS NETIFS, but only ${#NETIFS[@]} are defined (${NETIFS[@]})" + exit $ksft_skip +fi + +for i in $(seq ${#NETIFS[@]}); do + if [[ ! ${NETIFS[p$i]} ]]; then + echo "SKIP: NETIFS[p$i] not given" + exit $ksft_skip + fi +done + create_netif_veth() { local i -- cgit v1.2.3 From 492976136bb9fac0ebda28a163561eea8c2896d5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:06 +0200 Subject: selftests: forwarding: bail_on_lldpad() should SKIP $ksft_skip is used to mark selftests that have tooling issues. The fact that LLDPad is running, but shouldn't, is one such issue. Therefore have bail_on_lldpad() bail with $ksft_skip. Signed-off-by: Petr Machata Reviewed-by: Benjamin Poirier Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 3cbbc2fd4d7d..7913c6ee418d 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -2138,6 +2138,8 @@ bail_on_lldpad() { local reason1="$1"; shift local reason2="$1"; shift + local caller=${FUNCNAME[1]} + local src=${BASH_SOURCE[1]} if systemctl is-active --quiet lldpad; then @@ -2158,7 +2160,8 @@ bail_on_lldpad() an environment variable ALLOW_LLDPAD to a non-empty string. EOF - exit 1 + log_test_skip $src:$caller + exit $EXIT_STATUS else return fi -- cgit v1.2.3 From 042db639bf33aee118846a87223acebe6700137d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:07 +0200 Subject: selftests: drivers: hw: Fix ethtool_rmon When rx-pktsNtoM reports a range that involves very low-valued range, such as 0-64, the calculated length of the packet will be -4, because FCS is subtracted from the value. mausezahn then confuses the value for an option and bails out. As a result, the test dumps many mausezahn error messages. Instead, cap the value at 0. mausezahn will use an appropriate minimum packet length. Cc: Vladimir Oltean Cc: Tobias Waldekranz Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh index e2a1c10d3503..8f60c1685ad4 100755 --- a/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh @@ -44,6 +44,7 @@ bucket_test() # Mausezahn does not include FCS bytes in its length - but the # histogram counters do len=$((len - ETH_FCS_LEN)) + len=$((len > 0 ? len : 0)) before=$(ethtool --json -S $iface --groups rmon | \ jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val") -- cgit v1.2.3 From f359d44a4e83eba434f00c4363e4b97928fd5661 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:08 +0200 Subject: selftests: drivers: hw: ethtool.sh: Adjust output Some log_test calls are done in a loop, and lead to the same log output. This might prove tricky to deduplicate for automated tools. Instead, roll the unique information from log_info to log_test, and drop the log_info. This also leads to more compact and clearer output. This change prompts rewording the messages so that they are not excessively long. Some check_err messages do not indicate what the issue actually is, so reword them to say it's a "ping with", like is the case in some other instances in this test. Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/hw/ethtool.sh | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh index bb12d5d70949..fa6953de6b6d 100755 --- a/tools/testing/selftests/drivers/net/hw/ethtool.sh +++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh @@ -65,9 +65,8 @@ same_speeds_autoneg_off() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "speed $speed autoneg off" - log_test "force of same speed autoneg off" - log_info "speed = $speed" + check_err $? "ping with speed $speed autoneg off" + log_test "force speed $speed on both ends" done ethtool -s $h2 autoneg on @@ -112,9 +111,8 @@ combination_of_neg_on_and_off() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "h1-speed=$speed autoneg off, h2 autoneg on" - log_test "one side with autoneg off and another with autoneg on" - log_info "force speed = $speed" + check_err $? "ping with h1-speed=$speed autoneg off, h2 autoneg on" + log_test "force speed $speed vs. autoneg" done ethtool -s $h1 autoneg on @@ -207,10 +205,9 @@ advertise_subset_of_speeds() setup_wait_dev_with_timeout $h1 setup_wait_dev_with_timeout $h2 ping_do $h1 192.0.2.2 - check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" + check_err $? "ping with h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)" - log_test "advertise subset of speeds" - log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise" + log_test "advertise $speed_1_to_advertise vs. $speed_2_to_advertise" done ethtool -s $h2 autoneg on -- cgit v1.2.3 From bfc42940682b809e1f51491d8d9f82b7638fcbde Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:09 +0200 Subject: selftests: drivers: hw: Include tc_common.sh in hw_stats_l3 The tests use the constant TC_HIT_TIMEOUT when waiting on the counter values. However it does not include tc_common.sh where the counter is specified. The test has been robust in our testing, which means the counter is bumped quickly enough that the updated value is available already on the first iteration. Nevertheless it's not correct. Include tc_common.sh as appropriate. Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh | 1 + tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh | 1 + 2 files changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh index 7dfc50366c99..67fafefc80be 100755 --- a/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh @@ -50,6 +50,7 @@ ALL_TESTS=" NUM_NETIFS=4 lib_dir=$(dirname "$0") source "$lib_dir"/../../../net/forwarding/lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh h1_create() { diff --git a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh index ab8d04855af5..a94d92e1abce 100755 --- a/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh +++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh @@ -15,6 +15,7 @@ NUM_NETIFS=6 lib_dir=$(dirname "$0") source "$lib_dir"/../../../net/forwarding/lib.sh source "$lib_dir"/../../../net/forwarding/ipip_lib.sh +source "$lib_dir"/../../../net/forwarding/tc_common.sh setup_prepare() { -- cgit v1.2.3 From 8d612ed4b55435812fe1cfb73be1dff4a4b9ce07 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Fri, 12 Apr 2024 19:03:10 +0200 Subject: selftests: mlxsw: ethtool_lanes: Wait for lanes parameter dump explicitly The ethtool dump includes the lanes parameter only when the port is up. Therefore, the ethtool_lanes.sh test waits for ports to come before testing the lanes parameter. In some cases, the test considers the port as up, but the lanes parameter is not yet dumped although assumed to be, resulting in ethtool_lanes.sh test failure. To avoid that, ensure that the lanes parameter is indeed dumped by waiting for it explicitly, before preforming the test cases. Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh index 91891b9418d7..877cd6df94a1 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh @@ -24,8 +24,8 @@ setup_prepare() busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 check_err $? "ports did not come up" - local lanes_exist=$(ethtool $swp1 | grep 'Lanes:') - if [[ -z $lanes_exist ]]; then + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + if [[ $? -ne 0 ]]; then log_test "SKIP: driver does not support lanes setting" exit 1 fi @@ -122,8 +122,9 @@ autoneg() ethtool_set $swp1 speed $max_speed lanes $lanes ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? "ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? "Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "$lanes lanes is autonegotiated" @@ -160,8 +161,9 @@ autoneg_force_mode() ethtool_set $swp2 speed $max_speed lanes $lanes autoneg off ip link set dev $swp1 up ip link set dev $swp2 up - busywait "$TIMEOUT" wait_for_port_up ethtool $swp2 - check_err $? "ports did not come up" + + busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:" + check_err $? "Lanes parameter is not presented on time" check_lanes $swp1 $lanes $max_speed log_test "Autoneg off, $lanes lanes detected during force mode" -- cgit v1.2.3 From ba7d1e99b1935b06429b531aafe18d4e3a04cf9d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:11 +0200 Subject: selftests: forwarding: router_mpath_nh: Add a diagram This test lacks a topology diagram, making the setup not obvious. Add one. Cc: David Ahern Signed-off-by: Petr Machata Reviewed-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- .../selftests/net/forwarding/router_mpath_nh.sh | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 3f0f5dc95542..2ba44247c60a 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ +# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 -- cgit v1.2.3 From b51a94b2d59d8d5f34a5eba1a6b3e733d5cb4355 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:12 +0200 Subject: selftests: forwarding: router_mpath_nh_res: Add a diagram This test lacks a topology diagram, making the setup not obvious. Add one. Signed-off-by: Petr Machata Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- .../net/forwarding/router_mpath_nh_res.sh | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index 4b483d24ad00..cd9e346436fc 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -1,6 +1,41 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.2/24 | | +# | 2001:db8:1::2/64 | | +# +-------------------|-----+ +# | +# +-------------------|----------------------+ +# | | R1 | +# | $rp11 + | +# | 192.0.2.1/24 | +# | 2001:db8:1::1/64 | +# | | +# | + $rp12 + $rp13 | +# | | 169.254.2.12/24 | 169.254.3.13/24 | +# | | fe80:2::12/64 | fe80:3::13/64 | +# +--|--------------------|------------------+ +# | | +# +--|--------------------|------------------+ +# | + $rp22 + $rp23 | +# | 169.254.2.22/24 169.254.3.23/24 | +# | fe80:2::22/64 fe80:3::23/64 | +# | | +# | $rp21 + | +# | 198.51.100.1/24 | | +# | 2001:db8:2::1/64 | R2 | +# +-------------------|----------------------+ +# | +# +-------------------|-----+ +# | | | +# | $h2 + | +# | 198.51.100.2/24 | +# | 2001:db8:2::2/64 H2 | +# +-------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 -- cgit v1.2.3 From 74ddac073cfe5f47c1509b665ec9b30df1fcd71c Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 12 Apr 2024 19:03:13 +0200 Subject: selftests: forwarding: router_nh: Add a diagram This test lacks a topology diagram, making the setup not obvious. Add one. Signed-off-by: Petr Machata Reviewed-by: Hangbin Liu Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/router_nh.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_nh.sh b/tools/testing/selftests/net/forwarding/router_nh.sh index f3a53738bdcc..92904b01eae9 100755 --- a/tools/testing/selftests/net/forwarding/router_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_nh.sh @@ -1,6 +1,20 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# +-------------------------+ +-------------------------+ +# | H1 | | H2 | +# | $h1 + | | $h2 + | +# | 192.0.2.2/24 | | | 198.51.100.2/24 | | +# | 2001:db8:1::2/64 | | | 2001:db8:2::2/64 | | +# +-------------------|-----+ +-------------------|-----+ +# | | +# +-------------------|----------------------------|-----+ +# | R1 | | | +# | $rp1 + $rp2 + | +# | 192.0.2.1/24 198.51.100.1/24 | +# | 2001:db8:1::1/64 2001:db8:2::1/64 | +# +------------------------------------------------------+ + ALL_TESTS=" ping_ipv4 ping_ipv6 -- cgit v1.2.3 From 2bc3cf575a162a2ca9c98262a63e95cc2b619de7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 12 Apr 2024 11:33:07 -0700 Subject: perf annotate-data: Improve debug message with location info To verify it found the correct variable, let's add the location expression to the debug message. $ perf --debug type-profile annotate --data-type ... ----------------------------------------------------------- find data type for 0xaf0(reg15) at schedule+0xeb CU for kernel/sched/core.c (die:0x1180523) frame base: cfa=0 fbreg=6 found "rq" in scope=3/4 (die: 0x11b6a00) type_offset=0xaf0 variable location: reg15 type='struct rq' size=0xfc0 (die:0x11892e2) ----------------------------------------------------------- find data type for 0x7bc(reg3) at tcp_get_info+0x62 CU for net/ipv4/tcp.c (die:0x7b5f516) frame base: cfa=0 fbreg=6 offset: 1980 is bigger than size: 760 check variable "sk" failed (die: 0x7b92b2c) variable location: reg3 type='struct sock' size=0x2f8 (die:0x7b63c3a) ----------------------------------------------------------- ... The first case is fine. It looked up a data type in r15 with offset of 0xaf0 at schedule+0xeb. It found the CU die and the frame base info and the variable "rq" was found in the scope 3/4. Its location is the r15 register and the type size is 0xfc0 which includes 0xaf0. But the second case is not good. It looked up a data type in rbx (reg3) with offset 0x7bc. It found a CU and the frame base which is good so far. And it also found a variable "sk" but the access offset is bigger than the type size (1980 vs. 760 or 0x7bc vs. 0x2f8). The variable has the right location (reg3) but I need to figure out why it accesses beyond what it's supposed to. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240412183310.2518474-2-namhyung@kernel.org [ Fix the build on 32-bit by casting Dwarf_Word to (long) in pr_debug_location() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 99 ++++++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 1cd857400038..e53d66c46c54 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -46,6 +46,7 @@ static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) { struct strbuf sb; char *str; + Dwarf_Word size = 0; if (!debug_type_profile && verbose < 3) return; @@ -72,13 +73,67 @@ static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) break; } + dwarf_aggregate_size(die, &size); + strbuf_init(&sb, 32); die_get_typename_from_type(die, &sb); str = strbuf_detach(&sb, NULL); - pr_info(" type=%s (die:%lx)\n", str, (long)dwarf_dieoffset(die)); + pr_info(" type='%s' size=%#lx (die:%#lx)\n", + str, (long)size, (long)dwarf_dieoffset(die)); free(str); } +static void pr_debug_location(Dwarf_Die *die, u64 pc, int reg) +{ + ptrdiff_t off = 0; + Dwarf_Attribute attr; + Dwarf_Addr base, start, end; + Dwarf_Op *ops; + size_t nops; + + if (!debug_type_profile && verbose < 3) + return; + + if (dwarf_attr(die, DW_AT_location, &attr) == NULL) + return; + + while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { + if (reg != DWARF_REG_PC && end < pc) + continue; + if (reg != DWARF_REG_PC && start > pc) + break; + + pr_info(" variable location: "); + switch (ops->atom) { + case DW_OP_reg0 ...DW_OP_reg31: + pr_info("reg%d\n", ops->atom - DW_OP_reg0); + break; + case DW_OP_breg0 ...DW_OP_breg31: + pr_info("base=reg%d, offset=%#lx\n", + ops->atom - DW_OP_breg0, (long)ops->number); + break; + case DW_OP_regx: + pr_info("reg%ld\n", (long)ops->number); + break; + case DW_OP_bregx: + pr_info("base=reg%ld, offset=%#lx\n", + (long)ops->number, (long)ops->number2); + break; + case DW_OP_fbreg: + pr_info("use frame base, offset=%#lx\n", (long)ops->number); + break; + case DW_OP_addr: + pr_info("address=%#lx\n", (long)ops->number); + break; + default: + pr_info("unknown: code=%#x, number=%#lx\n", + ops->atom, (long)ops->number); + break; + } + break; + } +} + /* * Type information in a register, valid when @ok is true. * The @caller_saved registers are invalidated after a function call. @@ -1404,7 +1459,7 @@ again: found = find_data_type_insn(dloc, reg, &basic_blocks, var_types, cu_die, type_die); if (found > 0) { - pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x", + pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n", dloc->op->offset, reg, dloc->type_offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; @@ -1440,16 +1495,16 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) char buf[64]; if (dloc->op->multi_regs) - snprintf(buf, sizeof(buf), " or reg%d", dloc->op->reg2); + snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); else if (dloc->op->reg1 == DWARF_REG_PC) - snprintf(buf, sizeof(buf), " (PC)"); + snprintf(buf, sizeof(buf), "PC"); else - buf[0] = '\0'; + snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); pr_debug_dtp("-----------------------------------------------------------\n"); - pr_debug_dtp("%s [%"PRIx64"] for reg%d%s offset=%#x in %s\n", - __func__, dloc->ip - dloc->ms->sym->start, - dloc->op->reg1, buf, dloc->op->offset, dloc->ms->sym->name); + pr_debug_dtp("find data type for %#x(%s) at %s+%#"PRIx64"\n", + dloc->op->offset, buf, dloc->ms->sym->name, + dloc->ip - dloc->ms->sym->start); /* * IP is a relative instruction address from the start of the map, as @@ -1468,14 +1523,15 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) reg = loc->reg1; offset = loc->offset; - pr_debug_dtp("CU die offset: %#lx\n", (long)dwarf_dieoffset(&cu_die)); + pr_debug_dtp("CU for %s (die:%#lx)\n", + dwarf_diename(&cu_die), (long)dwarf_dieoffset(&cu_die)); if (reg == DWARF_REG_PC) { if (get_global_var_type(&cu_die, dloc, dloc->ip, dloc->var_addr, &offset, type_die)) { dloc->type_offset = offset; - pr_debug_dtp("found PC-rel by addr=%#"PRIx64" offset=%#x", + pr_debug_dtp("found by addr=%#"PRIx64" type_offset=%#x\n", dloc->var_addr, offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; @@ -1537,13 +1593,22 @@ retry: pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", dwarf_diename(&var_die), i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); - if (reg == DWARF_REG_PC) - pr_debug_dtp("%#x(PC) offset=%#x", loc->offset, offset); - else if (reg == DWARF_REG_FB || is_fbreg) - pr_debug_dtp("%#x(reg%d) stack fb_offset=%#x offset=%#x", - loc->offset, reg, fb_offset, offset); - else - pr_debug_dtp("%#x(reg%d)", loc->offset, reg); + if (reg == DWARF_REG_PC) { + pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n", + dloc->var_addr, offset); + } else if (reg == DWARF_REG_FB || is_fbreg) { + pr_debug_dtp("stack_offset=%#x type_offset=%#x\n", + fb_offset, offset); + } else { + pr_debug_dtp("type_offset=%#x\n", offset); + } + pr_debug_location(&var_die, pc, reg); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + } else { + pr_debug_dtp("check variable \"%s\" failed (die: %#lx)\n", + dwarf_diename(&var_die), + (long)dwarf_dieoffset(&var_die)); + pr_debug_location(&var_die, pc, reg); pr_debug_type_name(type_die, TSR_KIND_TYPE); } dloc->type_offset = offset; -- cgit v1.2.3 From 645af3fb62bf12911ce1fc79efa676dae9a8289b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 12 Apr 2024 11:33:08 -0700 Subject: perf dwarf-aux: Check pointer offset when checking variables In match_var_offset(), it checks the offset range with the target type only for non-pointer types. But it also needs to check the pointer types with the target type. This is because there can be more than one pointer variable located in the same register. Let's look at the following example. It's looking up a variable for reg3 at tcp_get_info+0x62. It found "sk" variable but it wasn't the right one since it accesses beyond the target type (struct 'sock' in this case) size. ----------------------------------------------------------- find data type for 0x7bc(reg3) at tcp_get_info+0x62 CU for net/ipv4/tcp.c (die:0x7b5f516) frame base: cfa=0 fbreg=6 offset: 1980 is bigger than size: 760 check variable "sk" failed (die: 0x7b92b2c) variable location: reg3 type='struct sock' size=0x2f8 (die:0x7b63c3a) Actually there was another variable "tp" in the function and it's located at the same (reg3) because it's just type-casted like below. void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); ... The 'struct tcp_sock' contains the 'struct sock' at offset 0 so it can just use the same address as a pointer to tcp_sock. That means it should match variables correctly by checking the offset and size. Actually it cannot distinguish if the offset was smaller than the size of the original struct sock. But I think it's fine as they are the same at that part. So let's check the target type size and retry if it doesn't match. Now it succeeded to find the correct variable. ----------------------------------------------------------- find data type for 0x7bc(reg3) at tcp_get_info+0x62 CU for net/ipv4/tcp.c (die:0x7b5f516) frame base: cfa=0 fbreg=6 found "tp" in scope=1/1 (die: 0x7b92b16) type_offset=0x7bc variable location: reg3 type='struct tcp_sock' size=0xa68 (die:0x7b81380) Fixes: bc10db8eb8955fbc ("perf annotate-data: Support stack variables") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240412183310.2518474-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 7dad99ee3ff3..b361fd7ebd56 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1361,7 +1361,7 @@ struct find_var_data { #define DWARF_OP_DIRECT_REGS 32 static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data, - u64 addr_offset, u64 addr_type) + u64 addr_offset, u64 addr_type, bool is_pointer) { Dwarf_Die type_die; Dwarf_Word size; @@ -1375,6 +1375,12 @@ static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data, if (die_get_real_type(die_mem, &type_die) == NULL) return false; + if (is_pointer && dwarf_tag(&type_die) == DW_TAG_pointer_type) { + /* Get the target type of the pointer */ + if (die_get_real_type(&type_die, &type_die) == NULL) + return false; + } + if (dwarf_aggregate_size(&type_die, &size) < 0) return false; @@ -1442,31 +1448,38 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) if (data->is_fbreg && ops->atom == DW_OP_fbreg && data->offset >= (int)ops->number && check_allowed_ops(ops, nops) && - match_var_offset(die_mem, data, data->offset, ops->number)) + match_var_offset(die_mem, data, data->offset, ops->number, + /*is_pointer=*/false)) return DIE_FIND_CB_END; /* Only match with a simple case */ if (data->reg < DWARF_OP_DIRECT_REGS) { /* pointer variables saved in a register 0 to 31 */ if (ops->atom == (DW_OP_reg0 + data->reg) && - check_allowed_ops(ops, nops)) + check_allowed_ops(ops, nops) && + match_var_offset(die_mem, data, data->offset, 0, + /*is_pointer=*/true)) return DIE_FIND_CB_END; /* Local variables accessed by a register + offset */ if (ops->atom == (DW_OP_breg0 + data->reg) && check_allowed_ops(ops, nops) && - match_var_offset(die_mem, data, data->offset, ops->number)) + match_var_offset(die_mem, data, data->offset, ops->number, + /*is_pointer=*/false)) return DIE_FIND_CB_END; } else { /* pointer variables saved in a register 32 or above */ if (ops->atom == DW_OP_regx && ops->number == data->reg && - check_allowed_ops(ops, nops)) + check_allowed_ops(ops, nops) && + match_var_offset(die_mem, data, data->offset, 0, + /*is_pointer=*/true)) return DIE_FIND_CB_END; /* Local variables accessed by a register + offset */ if (ops->atom == DW_OP_bregx && data->reg == ops->number && check_allowed_ops(ops, nops) && - match_var_offset(die_mem, data, data->offset, ops->number2)) + match_var_offset(die_mem, data, data->offset, ops->number2, + /*is_poitner=*/false)) return DIE_FIND_CB_END; } } @@ -1528,7 +1541,8 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) continue; if (check_allowed_ops(ops, nops) && - match_var_offset(die_mem, data, data->addr, ops->number)) + match_var_offset(die_mem, data, data->addr, ops->number, + /*is_pointer=*/false)) return DIE_FIND_CB_END; } return DIE_FIND_CB_SIBLING; -- cgit v1.2.3 From 0519fadbbe3b1ed396d944911c1ff3a276701474 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 12 Apr 2024 11:33:09 -0700 Subject: perf dwarf-aux: Check variable address range properly In match_var_offset(), it just checked the end address of the variable with the given offset because it assumed the register holds a pointer to the data type and the offset starts from the base. But I found some cases that the stack pointer (rsp = reg7) register is used to pointer a stack variable while the frame base is maintained by a different register (rbp = reg6). In that case, it cannot simply use the stack pointer as it cannot guarantee that it points to the frame base. So it needs to check both boundaries of the variable location. Before: ----------------------------------------------------------- find data type for 0x7c(reg7) at tcp_getsockopt+0xb62 CU for net/ipv4/tcp.c (die:0x7b5f516) frame base: cfa=0 fbreg=6 no pointer or no type check variable "tss" failed (die: 0x7b95801) variable location: base reg7, offset=0x110 type='struct scm_timestamping_internal' size=0x30 (die:0x7b8c126) So the current code just checks register number for the non-PC and non-FB registers and assuming it has offset 0. But this variable has offset 0x110 so it should not match to this. After: ----------------------------------------------------------- find data type for 0x7c(reg7) at tcp_getsockopt+0xb62 CU for net/ipv4/tcp.c (die:0x7b5f516) frame base: cfa=0 fbreg=6 no pointer or no type check variable "zc" failed (die: 0x7b9580a) variable location: base=reg7, offset=0x40 type='struct tcp_zerocopy_receive' size=0x40 (die:7b947f4) Now it find the correct variable "zc". It was located at reg7 + 0x40 and the size if 0x40 which means it should cover [0x40, 0x80). And the access was for reg7 + 0x7c so it found the right one. But it still failed to use the variable and it would be handled in the next patch. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240412183310.2518474-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index b361fd7ebd56..40cfbdfe2d75 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1372,6 +1372,9 @@ static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data, return true; } + if (addr_offset < addr_type) + return false; + if (die_get_real_type(die_mem, &type_die) == NULL) return false; @@ -1446,7 +1449,6 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) /* Local variables accessed using frame base register */ if (data->is_fbreg && ops->atom == DW_OP_fbreg && - data->offset >= (int)ops->number && check_allowed_ops(ops, nops) && match_var_offset(die_mem, data, data->offset, ops->number, /*is_pointer=*/false)) @@ -1537,9 +1539,6 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) if (ops->atom != DW_OP_addr) continue; - if (data->addr < ops->number) - continue; - if (check_allowed_ops(ops, nops) && match_var_offset(die_mem, data, data->addr, ops->number, /*is_pointer=*/false)) -- cgit v1.2.3 From a5a00497b9dfefbf6872f387bc7692919e1785d3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 12 Apr 2024 11:33:10 -0700 Subject: perf annotate-data: Handle RSP if it's not the FB register In some cases, the stack pointer on x86 (rsp = reg7) is used to point variables on stack but it's not the frame base register. Then it should handle the register like normal registers (IOW not to access the other stack variables using offset calculation) but it should not assume it would have a pointer. Before: ----------------------------------------------------------- find data type for 0x7c(reg7) at tcp_getsockopt+0xb62 CU for net/ipv4/tcp.c (die:0x7b5f516) frame base: cfa=0 fbreg=6 no pointer or no type check variable "zc" failed (die: 0x7b9580a) variable location: base=reg7, offset=0x40 type='struct tcp_zerocopy_receive' size=0x40 (die:0x7b947f4) After: ----------------------------------------------------------- find data type for 0x7c(reg7) at tcp_getsockopt+0xb62 CU for net/ipv4/tcp.c (die:0x7b5f516) frame base: cfa=0 fbreg=6 found "zc" in scope=3/3 (die: 0x7b957fc) type_offset=0x3c variable location: base=reg7, offset=0x40 type='struct tcp_zerocopy_receive' size=0x40 (die:0x7b947f4) Note that the type-offset was properly calculated to 0x3c as the variable starts at 0x40. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240412183310.2518474-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index e53d66c46c54..12d5faff3b7a 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -25,6 +25,9 @@ #include "symbol_conf.h" #include "thread.h" +/* register number of the stack pointer */ +#define X86_REG_SP 7 + enum type_state_kind { TSR_KIND_INVALID = 0, TSR_KIND_TYPE, @@ -197,7 +200,7 @@ static void init_type_state(struct type_state *state, struct arch *arch) state->regs[10].caller_saved = true; state->regs[11].caller_saved = true; state->ret_reg = 0; - state->stack_reg = 7; + state->stack_reg = X86_REG_SP; } } @@ -382,10 +385,18 @@ static bool find_cu_die(struct debuginfo *di, u64 pc, Dwarf_Die *cu_die) } /* The type info will be saved in @type_die */ -static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset, - bool is_pointer) +static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, + Dwarf_Die *type_die, int reg, int offset, bool is_fbreg) { Dwarf_Word size; + bool is_pointer = true; + + if (reg == DWARF_REG_PC) + is_pointer = false; + else if (reg == dloc->fbreg || is_fbreg) + is_pointer = false; + else if (arch__is(dloc->arch, "x86") && reg == X86_REG_SP) + is_pointer = false; /* Get the type of the variable */ if (die_get_real_type(var_die, type_die) == NULL) { @@ -607,7 +618,6 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, { u64 pc; int offset; - bool is_pointer = false; const char *var_name = NULL; struct global_var_entry *gvar; Dwarf_Die var_die; @@ -623,7 +633,8 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, /* Try to get the variable by address first */ if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) && - check_variable(&var_die, type_die, offset, is_pointer) == 0) { + check_variable(dloc, &var_die, type_die, DWARF_REG_PC, offset, + /*is_fbreg=*/false) == 0) { var_name = dwarf_diename(&var_die); *var_offset = offset; goto ok; @@ -636,7 +647,8 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, /* Try to get the name of global variable */ if (die_find_variable_at(cu_die, var_name, pc, &var_die) && - check_variable(&var_die, type_die, *var_offset, is_pointer) == 0) + check_variable(dloc, &var_die, type_die, DWARF_REG_PC, *var_offset, + /*is_fbreg=*/false) == 0) goto ok; return false; @@ -1587,8 +1599,7 @@ retry: } /* Found a variable, see if it's correct */ - ret = check_variable(&var_die, type_die, offset, - reg != DWARF_REG_PC && !is_fbreg); + ret = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); if (ret == 0) { pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", dwarf_diename(&var_die), i+1, nr_scopes, -- cgit v1.2.3 From 986e7663f98ec7441d9d948263ec0dda71e7479f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 13 Apr 2024 02:14:26 +0100 Subject: bpftool: Update documentation where progs/maps can be passed by name When using references to BPF programs, bpftool supports passing programs by name on the command line. The manual pages for "bpftool prog" and "bpftool map" (for prog_array updates) mention it, but we have a few additional subcommands that support referencing programs by name but do not mention it in their documentation. Let's update the pages for subcommands "btf", "cgroup", and "net". Similarly, we can reference maps by name when passing them to "bpftool prog load", so we update the page for "bpftool prog" as well. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413011427.14402-2-qmo@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-net.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index f66781f20af2..eaba24320fb2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -30,7 +30,7 @@ BTF COMMANDS | *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } | *FORMAT* := { **raw** | **c** } | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index b2610d169e60..e8185596a759 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -30,7 +30,7 @@ CGROUP COMMANDS | **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* | **bpftool** **cgroup help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | | **cgroup_inet_sock_create** | **cgroup_sock_ops** | | **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index f8e65869f8b4..348812881297 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -28,7 +28,7 @@ NET COMMANDS | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* | **bpftool** **net help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } DESCRIPTION diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 8e730cfb2589..d6304e01afe0 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -39,7 +39,7 @@ PROG COMMANDS | **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* | **bpftool** **prog help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *TYPE* := { | **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | -- cgit v1.2.3 From ad2d22b617b7c0ca2cff4da6dc063183822484bb Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 13 Apr 2024 02:14:27 +0100 Subject: bpftool: Address minor issues in bash completion This commit contains a series of clean-ups and fixes for bpftool's bash completion file: - Make sure all local variables are declared as such. - Make sure variables are initialised before being read. - Update ELF section ("maps" -> ".maps") for looking up map names in object files. - Fix call to _init_completion. - Move definition for MAP_TYPE and PROG_TYPE higher up in the scope to avoid defining them multiple times, reuse MAP_TYPE where relevant. - Simplify completion for "duration" keyword in "bpftool prog profile". - Fix completion for "bpftool struct_ops register" and "bpftool link (pin|detach)" where we would repeatedly suggest file names instead of suggesting just one name. - Fix completion for "bpftool iter pin ... map MAP" to account for the "map" keyword. - Add missing "detach" suggestion for "bpftool link". Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413011427.14402-3-qmo@kernel.org --- tools/bpf/bpftool/bash-completion/bpftool | 61 +++++++++++++------------------ 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 6e4f7ce6bc01..04afe2ac2228 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -106,19 +106,19 @@ _bpftool_get_link_ids() _bpftool_get_obj_map_names() { - local obj + local obj maps obj=$1 - maps=$(objdump -j maps -t $obj 2>/dev/null | \ - command awk '/g . maps/ {print $NF}') + maps=$(objdump -j .maps -t $obj 2>/dev/null | \ + command awk '/g . .maps/ {print $NF}') COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) ) } _bpftool_get_obj_map_idxs() { - local obj + local obj nmaps obj=$1 @@ -136,7 +136,7 @@ _sysfs_get_netdevs() # Retrieve type of the map that we are operating on. _bpftool_map_guess_map_type() { - local keyword ref + local keyword idx ref="" for (( idx=3; idx < ${#words[@]}-1; idx++ )); do case "${words[$((idx-2))]}" in lookup|update) @@ -255,8 +255,9 @@ _bpftool_map_update_get_name() _bpftool() { - local cur prev words objword json=0 - _init_completion || return + local cur prev words cword comp_args + local json=0 + _init_completion -- "$@" || return # Deal with options if [[ ${words[cword]} == -* ]]; then @@ -293,7 +294,7 @@ _bpftool() esac # Remove all options so completions don't have to deal with them. - local i + local i pprev for (( i=1; i < ${#words[@]}; )); do if [[ ${words[i]::1} == - ]] && [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then @@ -307,7 +308,7 @@ _bpftool() prev=${words[cword - 1]} pprev=${words[cword - 2]} - local object=${words[1]} command=${words[2]} + local object=${words[1]} if [[ -z $object || $cword -eq 1 ]]; then case $cur in @@ -324,8 +325,12 @@ _bpftool() esac fi + local command=${words[2]} [[ $command == help ]] && return 0 + local MAP_TYPE='id pinned name' + local PROG_TYPE='id pinned tag name' + # Completion depends on object and command in use case $object in prog) @@ -346,8 +351,6 @@ _bpftool() ;; esac - local PROG_TYPE='id pinned tag name' - local MAP_TYPE='id pinned name' local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ itlb_misses dtlb_misses' case $command in @@ -457,7 +460,7 @@ _bpftool() obj=${words[3]} if [[ ${words[-4]} == "map" ]]; then - COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) return 0 fi if [[ ${words[-3]} == "map" ]]; then @@ -541,20 +544,9 @@ _bpftool() COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) ) return 0 ;; - 6) - case $prev in - duration) - return 0 - ;; - *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) - return 0 - ;; - esac - return 0 - ;; *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + [[ $prev == duration ]] && return 0 + _bpftool_once_attr "$METRIC_TYPE" return 0 ;; esac @@ -612,7 +604,7 @@ _bpftool() return 0 ;; register) - _filedir + [[ $prev == $command ]] && _filedir return 0 ;; *) @@ -638,9 +630,12 @@ _bpftool() pinned) _filedir ;; - *) + map) _bpftool_one_of_list $MAP_TYPE ;; + *) + _bpftool_once_attr 'map' + ;; esac return 0 ;; @@ -652,7 +647,6 @@ _bpftool() esac ;; map) - local MAP_TYPE='id pinned name' case $command in show|list|dump|peek|pop|dequeue|freeze) case $prev in @@ -793,13 +787,11 @@ _bpftool() # map, depending on the type of the map to update. case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) - local MAP_TYPE='id pinned name' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 ;; prog_array) - local PROG_TYPE='id pinned tag name' COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ -- "$cur" ) ) return 0 @@ -821,7 +813,7 @@ _bpftool() esac _bpftool_once_attr 'key' - local UPDATE_FLAGS='any exist noexist' + local UPDATE_FLAGS='any exist noexist' idx for (( idx=3; idx < ${#words[@]}-1; idx++ )); do if [[ ${words[idx]} == 'value' ]]; then # 'value' is present, but is not the last @@ -893,7 +885,6 @@ _bpftool() esac ;; btf) - local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' case $command in dump) @@ -1033,7 +1024,6 @@ _bpftool() local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ grep '^cgroup_')" local ATTACH_FLAGS='multi override' - local PROG_TYPE='id pinned tag name' # Check for $prev = $command first if [ $prev = $command ]; then _filedir @@ -1086,7 +1076,6 @@ _bpftool() esac ;; net) - local PROG_TYPE='id pinned tag name' local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' case $command in show|list) @@ -1193,14 +1182,14 @@ _bpftool() pin|detach) if [[ $prev == "$command" ]]; then COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) - else + elif [[ $pprev == "$command" ]]; then _filedir fi return 0 ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help pin detach show list' -- "$cur" ) ) ;; esac ;; -- cgit v1.2.3 From 232d79aaa7816aa09766962616c768d349947f17 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Apr 2024 17:45:51 -0700 Subject: selftests: drv-net: add stdout to the command failed exception ping prints all the info to stdout. To make debug easier capture stdout in the Exception raised when command unexpectedly fails. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240416004556.1618804-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index f0d425731fd4..19612348c30d 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -33,7 +33,8 @@ class cmd: if self.proc.returncode != 0 and fail: if len(stderr) > 0 and stderr[-1] == "\n": stderr = stderr[:-1] - raise Exception("Command failed: %s\n%s" % (self.proc.args, stderr)) + raise Exception("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % + (self.proc.args, stdout, stderr)) def ip(args, json=None, ns=None): -- cgit v1.2.3 From 438ce84bae90b197314e781a20d8a043187b6e59 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Apr 2024 17:45:52 -0700 Subject: selftests: drv-net: add config for netdevsim Real driver testing will obviously require enabling more options, but will require more manual setup in the first place. For CIs running purely software tests we need to enable netdevsim. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240416004556.1618804-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/config | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/config (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config new file mode 100644 index 000000000000..f6a58ce8a230 --- /dev/null +++ b/tools/testing/selftests/drivers/net/config @@ -0,0 +1,2 @@ +CONFIG_IPV6=y +CONFIG_NETDEVSIM=m -- cgit v1.2.3 From 9ef1ed26a67b817fd08faf851468ef040db7d280 Mon Sep 17 00:00:00 2001 From: Yujie Liu Date: Tue, 16 Apr 2024 13:32:10 +0800 Subject: selftests: fix netfilter path in Makefile Netfilter tests have been moved to a subdir under selftests/net by patch series [1]. Fix the path in selftests/Makefile accordingly. This helps fix the following error: tools/testing/selftests$ make ... make[1]: Entering directory 'tools/testing/selftests' make[1]: *** netfilter: No such file or directory. Stop. make[1]: Leaving directory 'tools/testing/selftests' [1] https://lore.kernel.org/all/20240411233624.8129-1-fw@strlen.de/ Reported-by: kernel test robot Signed-off-by: Yujie Liu Signed-off-by: David S. Miller --- tools/testing/selftests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 6dab886d6f7a..c785b6256a45 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -64,7 +64,7 @@ TARGETS += net/hsr TARGETS += net/mptcp TARGETS += net/openvswitch TARGETS += net/tcp_ao -TARGETS += netfilter +TARGETS += net/netfilter TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd -- cgit v1.2.3 From 9213e52970a5997c9eb176c7afcc6ec67b1b1e6f Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Sat, 13 Apr 2024 23:12:57 +0200 Subject: libbpf: Fix misaligned array closing bracket In btf_dump_array_data(), libbpf will call btf_dump_dump_type_data() for each element. For an array of characters, each element will be processed the following way: - btf_dump_dump_type_data() is called to print the character - btf_dump_data_pfx() prefixes the current line with the proper number of indentations - btf_dump_int_data() is called to print the character - After the last character is printed, btf_dump_dump_type_data() calls btf_dump_data_pfx() before writing the closing bracket However, for an array containing characters, btf_dump_int_data() won't print any '\0' and subsequent characters. This leads to situations where the line prefix is written, no character is added, then the prefix is written again before adding the closing bracket: (struct sk_metadata){ .str_array = (__u8[14])[ 'H', 'e', 'l', 'l', 'o', ], This change solves this issue by printing the '\0' character, which has two benefits: - The bracket closing the array is properly aligned - It's clear from a user point of view that libbpf uses '\0' as a terminator for arrays of characters. Signed-off-by: Quentin Deslandes Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413211258.134421-2-qde@naccy.de --- tools/lib/bpf/btf_dump.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 4d9f30bf7f01..6a37e8517435 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1929,6 +1929,7 @@ static int btf_dump_int_data(struct btf_dump *d, if (d->typed_dump->is_array_terminated) break; if (*(char *)data == '\0') { + btf_dump_type_values(d, "'\\0'"); d->typed_dump->is_array_terminated = true; break; } -- cgit v1.2.3 From e739e01d8df8bf26dbc9dcaeaee3c8c55e8ffa71 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Sat, 13 Apr 2024 23:12:58 +0200 Subject: libbpf: Fix dump of subsequent char arrays When dumping a character array, libbpf will watch for a '\0' and set is_array_terminated=true if found. This prevents libbpf from printing the remaining characters of the array, treating it as a nul-terminated string. However, once this flag is set, it's never reset, leading to subsequent characters array not being printed properly: .str_multi = (__u8[2][16])[ [ 'H', 'e', 'l', ], ], This patch saves the is_array_terminated flag and restores its default (false) value before looping over the elements of an array, then restores it afterward. This way, libbpf's behavior is unchanged when dumping the characters of an array, but subsequent arrays are printed properly: .str_multi = (__u8[2][16])[ [ 'H', 'e', 'l', ], [ 'l', 'o', ], ], Signed-off-by: Quentin Deslandes Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413211258.134421-3-qde@naccy.de --- tools/lib/bpf/btf_dump.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 6a37e8517435..5dbca76b953f 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -2032,6 +2032,7 @@ static int btf_dump_array_data(struct btf_dump *d, __u32 i, elem_type_id; __s64 elem_size; bool is_array_member; + bool is_array_terminated; elem_type_id = array->type; elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); @@ -2067,12 +2068,15 @@ static int btf_dump_array_data(struct btf_dump *d, */ is_array_member = d->typed_dump->is_array_member; d->typed_dump->is_array_member = true; + is_array_terminated = d->typed_dump->is_array_terminated; + d->typed_dump->is_array_terminated = false; for (i = 0; i < array->nelems; i++, data += elem_size) { if (d->typed_dump->is_array_terminated) break; btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); } d->typed_dump->is_array_member = is_array_member; + d->typed_dump->is_array_terminated = is_array_terminated; d->typed_dump->depth--; btf_dump_data_pfx(d); btf_dump_type_values(d, "]"); -- cgit v1.2.3 From 0993d724674a9d905ebdc9b9cd0b64e30523c5d6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Apr 2024 11:17:16 -0700 Subject: perf hist: Move histogram related code to hist.h It's strange that sort.h has the definition of struct hist_entry. As sort.h already includes hist.h, let's move the data structure to hist.h. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240411181718.2367948-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.h | 191 +++++++++++++++++++++++++++++++++++++++++++++-- tools/perf/util/sort.h | 188 ---------------------------------------------- tools/perf/util/values.h | 1 + 3 files changed, 184 insertions(+), 196 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 4a0aea0c9e00..8f072f3749eb 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -4,21 +4,22 @@ #include #include -#include "evsel.h" +#include "callchain.h" #include "color.h" #include "events_stats.h" +#include "evsel.h" +#include "map_symbol.h" #include "mutex.h" +#include "sample.h" +#include "spark.h" +#include "stat.h" -struct hist_entry; -struct hist_entry_ops; struct addr_location; -struct map_symbol; struct mem_info; struct kvm_info; struct branch_info; struct branch_stack; struct block_info; -struct symbol; struct ui_progress; enum hist_filter { @@ -150,6 +151,159 @@ extern const struct hist_iter_ops hist_iter_branch; extern const struct hist_iter_ops hist_iter_mem; extern const struct hist_iter_ops hist_iter_cumulative; +struct res_sample { + u64 time; + int cpu; + int tid; +}; + +struct he_stat { + u64 period; + u64 period_sys; + u64 period_us; + u64 period_guest_sys; + u64 period_guest_us; + u32 nr_events; +}; + +struct namespace_id { + u64 dev; + u64 ino; +}; + +struct hist_entry_diff { + bool computed; + union { + /* PERF_HPP__DELTA */ + double period_ratio_delta; + + /* PERF_HPP__RATIO */ + double period_ratio; + + /* HISTC_WEIGHTED_DIFF */ + s64 wdiff; + + /* PERF_HPP_DIFF__CYCLES */ + s64 cycles; + }; + struct stats stats; + unsigned long svals[NUM_SPARKS]; +}; + +struct hist_entry_ops { + void *(*new)(size_t size); + void (*free)(void *ptr); +}; + +/** + * struct hist_entry - histogram entry + * + * @row_offset - offset from the first callchain expanded to appear on screen + * @nr_rows - rows expanded in callchain, recalculated on folding/unfolding + */ +struct hist_entry { + struct rb_node rb_node_in; + struct rb_node rb_node; + union { + struct list_head node; + struct list_head head; + } pairs; + struct he_stat stat; + struct he_stat *stat_acc; + struct map_symbol ms; + struct thread *thread; + struct comm *comm; + struct namespace_id cgroup_id; + u64 cgroup; + u64 ip; + u64 transaction; + s32 socket; + s32 cpu; + u64 code_page_size; + u64 weight; + u64 ins_lat; + u64 p_stage_cyc; + u8 cpumode; + u8 depth; + int mem_type_off; + struct simd_flags simd_flags; + + /* We are added by hists__add_dummy_entry. */ + bool dummy; + bool leaf; + + char level; + u8 filtered; + + u16 callchain_size; + union { + /* + * Since perf diff only supports the stdio output, TUI + * fields are only accessed from perf report (or perf + * top). So make it a union to reduce memory usage. + */ + struct hist_entry_diff diff; + struct /* for TUI */ { + u16 row_offset; + u16 nr_rows; + bool init_have_children; + bool unfolded; + bool has_children; + bool has_no_entry; + }; + }; + char *srcline; + char *srcfile; + struct symbol *parent; + struct branch_info *branch_info; + long time; + struct hists *hists; + struct mem_info *mem_info; + struct block_info *block_info; + struct kvm_info *kvm_info; + void *raw_data; + u32 raw_size; + int num_res; + struct res_sample *res_samples; + void *trace_output; + struct perf_hpp_list *hpp_list; + struct hist_entry *parent_he; + struct hist_entry_ops *ops; + struct annotated_data_type *mem_type; + union { + /* this is for hierarchical entry structure */ + struct { + struct rb_root_cached hroot_in; + struct rb_root_cached hroot_out; + }; /* non-leaf entries */ + struct rb_root sorted_chain; /* leaf entry has callchains */ + }; + struct callchain_root callchain[0]; /* must be last member */ +}; + +static __pure inline bool hist_entry__has_callchains(struct hist_entry *he) +{ + return he->callchain_size != 0; +} + +static inline bool hist_entry__has_pairs(struct hist_entry *he) +{ + return !list_empty(&he->pairs.node); +} + +static inline struct hist_entry *hist_entry__next_pair(struct hist_entry *he) +{ + if (hist_entry__has_pairs(he)) + return list_entry(he->pairs.node.next, struct hist_entry, pairs.node); + return NULL; +} + +static inline void hist_entry__add_pair(struct hist_entry *pair, + struct hist_entry *he) +{ + list_add_tail(&pair->pairs.node, &he->pairs.head); +} + struct hist_entry *hists__add_entry(struct hists *hists, struct addr_location *al, struct symbol *parent, @@ -186,6 +340,8 @@ int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size, struct hists *hists); int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp, struct perf_hpp_fmt *fmt, int printed); +int hist_entry__sym_snprintf(struct hist_entry *he, char *bf, size_t size, + unsigned int width); void hist_entry__delete(struct hist_entry *he); typedef int (*hists__resort_cb_t)(struct hist_entry *he, void *arg); @@ -238,6 +394,20 @@ void hists__match(struct hists *leader, struct hists *other); int hists__link(struct hists *leader, struct hists *other); int hists__unlink(struct hists *hists); +static inline float hist_entry__get_percent_limit(struct hist_entry *he) +{ + u64 period = he->stat.period; + u64 total_period = hists__total_period(he->hists); + + if (unlikely(total_period == 0)) + return 0; + + if (symbol_conf.cumulate_callchain) + period = he->stat_acc->period; + + return period * 100.0 / total_period; +} + struct hists_evsel { struct evsel evsel; struct hists hists; @@ -460,15 +630,20 @@ struct hist_browser_timer { int refresh; }; -struct res_sample; - enum rstype { A_NORMAL, A_ASM, A_SOURCE }; -struct block_hist; +struct block_hist { + struct hists block_hists; + struct perf_hpp_list block_list; + struct perf_hpp_fmt block_fmt; + int block_idx; + bool valid; + struct hist_entry he; +}; #ifdef HAVE_SLANG_SUPPORT #include "../ui/keysyms.h" diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 6f6b4189a389..690892a92cf3 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -3,19 +3,9 @@ #define __PERF_SORT_H #include #include -#include -#include -#include "map_symbol.h" -#include "symbol_conf.h" -#include "callchain.h" -#include "values.h" #include "hist.h" -#include "stat.h" -#include "spark.h" struct option; -struct thread; -struct annotated_data_type; extern regex_t parent_regex; extern const char *sort_order; @@ -39,175 +29,6 @@ extern struct sort_entry sort_type; extern const char default_mem_sort_order[]; extern bool chk_double_cl; -struct res_sample { - u64 time; - int cpu; - int tid; -}; - -struct he_stat { - u64 period; - u64 period_sys; - u64 period_us; - u64 period_guest_sys; - u64 period_guest_us; - u32 nr_events; -}; - -struct namespace_id { - u64 dev; - u64 ino; -}; - -struct hist_entry_diff { - bool computed; - union { - /* PERF_HPP__DELTA */ - double period_ratio_delta; - - /* PERF_HPP__RATIO */ - double period_ratio; - - /* HISTC_WEIGHTED_DIFF */ - s64 wdiff; - - /* PERF_HPP_DIFF__CYCLES */ - s64 cycles; - }; - struct stats stats; - unsigned long svals[NUM_SPARKS]; -}; - -struct hist_entry_ops { - void *(*new)(size_t size); - void (*free)(void *ptr); -}; - -/** - * struct hist_entry - histogram entry - * - * @row_offset - offset from the first callchain expanded to appear on screen - * @nr_rows - rows expanded in callchain, recalculated on folding/unfolding - */ -struct hist_entry { - struct rb_node rb_node_in; - struct rb_node rb_node; - union { - struct list_head node; - struct list_head head; - } pairs; - struct he_stat stat; - struct he_stat *stat_acc; - struct map_symbol ms; - struct thread *thread; - struct comm *comm; - struct namespace_id cgroup_id; - u64 cgroup; - u64 ip; - u64 transaction; - s32 socket; - s32 cpu; - u64 code_page_size; - u64 weight; - u64 ins_lat; - u64 p_stage_cyc; - u8 cpumode; - u8 depth; - int mem_type_off; - struct simd_flags simd_flags; - - /* We are added by hists__add_dummy_entry. */ - bool dummy; - bool leaf; - - char level; - u8 filtered; - - u16 callchain_size; - union { - /* - * Since perf diff only supports the stdio output, TUI - * fields are only accessed from perf report (or perf - * top). So make it a union to reduce memory usage. - */ - struct hist_entry_diff diff; - struct /* for TUI */ { - u16 row_offset; - u16 nr_rows; - bool init_have_children; - bool unfolded; - bool has_children; - bool has_no_entry; - }; - }; - char *srcline; - char *srcfile; - struct symbol *parent; - struct branch_info *branch_info; - long time; - struct hists *hists; - struct mem_info *mem_info; - struct block_info *block_info; - struct kvm_info *kvm_info; - void *raw_data; - u32 raw_size; - int num_res; - struct res_sample *res_samples; - void *trace_output; - struct perf_hpp_list *hpp_list; - struct hist_entry *parent_he; - struct hist_entry_ops *ops; - struct annotated_data_type *mem_type; - union { - /* this is for hierarchical entry structure */ - struct { - struct rb_root_cached hroot_in; - struct rb_root_cached hroot_out; - }; /* non-leaf entries */ - struct rb_root sorted_chain; /* leaf entry has callchains */ - }; - struct callchain_root callchain[0]; /* must be last member */ -}; - -static __pure inline bool hist_entry__has_callchains(struct hist_entry *he) -{ - return he->callchain_size != 0; -} - -int hist_entry__sym_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width); - -static inline bool hist_entry__has_pairs(struct hist_entry *he) -{ - return !list_empty(&he->pairs.node); -} - -static inline struct hist_entry *hist_entry__next_pair(struct hist_entry *he) -{ - if (hist_entry__has_pairs(he)) - return list_entry(he->pairs.node.next, struct hist_entry, pairs.node); - return NULL; -} - -static inline void hist_entry__add_pair(struct hist_entry *pair, - struct hist_entry *he) -{ - list_add_tail(&pair->pairs.node, &he->pairs.head); -} - -static inline float hist_entry__get_percent_limit(struct hist_entry *he) -{ - u64 period = he->stat.period; - u64 total_period = hists__total_period(he->hists); - - if (unlikely(total_period == 0)) - return 0; - - if (symbol_conf.cumulate_callchain) - period = he->stat_acc->period; - - return period * 100.0 / total_period; -} - enum sort_mode { SORT_MODE__NORMAL, SORT_MODE__BRANCH, @@ -299,15 +120,6 @@ struct sort_entry { u8 se_width_idx; }; -struct block_hist { - struct hists block_hists; - struct perf_hpp_list block_list; - struct perf_hpp_fmt block_fmt; - int block_idx; - bool valid; - struct hist_entry he; -}; - extern struct sort_entry sort_thread; struct evlist; diff --git a/tools/perf/util/values.h b/tools/perf/util/values.h index 8c41f22f42cf..791c1ad606c2 100644 --- a/tools/perf/util/values.h +++ b/tools/perf/util/values.h @@ -2,6 +2,7 @@ #ifndef __PERF_VALUES_H #define __PERF_VALUES_H +#include #include struct perf_read_values { -- cgit v1.2.3 From 6fcf1e65253c52a7584817cbd3393c63d5660467 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Apr 2024 11:17:17 -0700 Subject: perf hist: Add weight fields to hist entry stats Like period and sample numbers, it'd be better to track weight values and display them in the output rather than having them as sort keys. This patch just adds a few more fields to save the weights in a hist entry. It'll be displayed as new output fields in the later patch. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240411181718.2367948-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 12 ++++++++++-- tools/perf/util/hist.h | 3 +++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index fa359180ebf8..9d43f8ae412d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -308,6 +308,9 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) dest->period_us += src->period_us; dest->period_guest_sys += src->period_guest_sys; dest->period_guest_us += src->period_guest_us; + dest->weight1 += src->weight1; + dest->weight2 += src->weight2; + dest->weight3 += src->weight3; dest->nr_events += src->nr_events; } @@ -315,7 +318,9 @@ static void he_stat__decay(struct he_stat *he_stat) { he_stat->period = (he_stat->period * 7) / 8; he_stat->nr_events = (he_stat->nr_events * 7) / 8; - /* XXX need decay for weight too? */ + he_stat->weight1 = (he_stat->weight1 * 7) / 8; + he_stat->weight2 = (he_stat->weight2 * 7) / 8; + he_stat->weight3 = (he_stat->weight3 * 7) / 8; } static void hists__delete_entry(struct hists *hists, struct hist_entry *he); @@ -614,7 +619,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, cmp = hist_entry__cmp(he, entry); if (!cmp) { if (sample_self) { - he_stat__add_period(&he->stat, period); + he_stat__add_stat(&he->stat, &entry->stat); hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) @@ -731,6 +736,9 @@ __hists__add_entry(struct hists *hists, .stat = { .nr_events = 1, .period = sample->period, + .weight1 = sample->weight, + .weight2 = sample->ins_lat, + .weight3 = sample->p_stage_cyc, }, .parent = sym_parent, .filtered = symbol__parent_filter(sym_parent) | al->filtered, diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 8f072f3749eb..f34f101c36c2 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -163,6 +163,9 @@ struct he_stat { u64 period_us; u64 period_guest_sys; u64 period_guest_us; + u64 weight1; + u64 weight2; + u64 weight3; u32 nr_events; }; -- cgit v1.2.3 From 7043dc5286a8c082d449e2257f9f87d7f764e7d4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Apr 2024 11:17:18 -0700 Subject: perf report: Add weight[123] output fields Add weight1, weight2 and weight3 fields to -F/--fields and their aliases like 'ins_lat', 'p_stage_cyc' and 'retire_lat'. Note that they are in the sort keys too but the difference is that output fields will sum up the weight values and display the average. In the sort key, users can see the distribution of weight value and I think it's confusing we have local vs. global weight for the same weight. For example, I experiment with mem-loads events to get the weights. On my laptop, it seems only weight1 field is supported. $ perf mem record -- perf test -w noploop Let's look at the noploop function only. It has 7 samples. $ perf script -F event,ip,sym,weight | grep noploop # event weight ip sym cpu/mem-loads,ldlat=30/P: 43 55b3c122bffc noploop cpu/mem-loads,ldlat=30/P: 48 55b3c122bffc noploop cpu/mem-loads,ldlat=30/P: 38 55b3c122bffc noploop <--- same weight cpu/mem-loads,ldlat=30/P: 38 55b3c122bffc noploop <--- same weight cpu/mem-loads,ldlat=30/P: 59 55b3c122bffc noploop cpu/mem-loads,ldlat=30/P: 33 55b3c122bffc noploop cpu/mem-loads,ldlat=30/P: 38 55b3c122bffc noploop <--- same weight When you use the 'weight' sort key, it'd show entries with a separate weight value separately. Also note that the first entry has 3 samples with weight value 38, so they are displayed together and the weight value is the sum of 3 samples (114 = 38 * 3). $ perf report -n -s +weight | grep -e Weight -e noploop # Overhead Samples Command Shared Object Symbol Weight 0.53% 3 perf perf [.] noploop 114 0.18% 1 perf perf [.] noploop 59 0.18% 1 perf perf [.] noploop 48 0.18% 1 perf perf [.] noploop 43 0.18% 1 perf perf [.] noploop 33 If you use 'local_weight' sort key, you can see the actual weight. $ perf report -n -s +local_weight | grep -e Weight -e noploop # Overhead Samples Command Shared Object Symbol Local Weight 0.53% 3 perf perf [.] noploop 38 0.18% 1 perf perf [.] noploop 59 0.18% 1 perf perf [.] noploop 48 0.18% 1 perf perf [.] noploop 43 0.18% 1 perf perf [.] noploop 33 But when you use the -F/--field option instead, you can see the average weight for the while noploop function (as it won't group samples by weight value and use the default 'comm,dso,sym' sort keys). $ perf report -n -F +weight | grep -e Weight -e noploop Warning: --fields weight shows the average value unlike in the --sort key. # Overhead Samples Weight1 Command Shared Object Symbol 1.23% 7 42.4 perf perf [.] noploop The weight1 field shows the average value: (38 * 3 + 59 + 48 + 43 + 33) / 7 = 42.4 Also it'd show the warning that 'weight' field has the average value. Using 'weight1' can remove the warning. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240411181718.2367948-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 9 +++- tools/perf/ui/hist.c | 92 +++++++++++++++++++++++++------- tools/perf/util/hist.h | 15 +++++- tools/perf/util/sort.c | 28 ++++++---- tools/perf/util/sort.h | 2 +- 5 files changed, 115 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index d8b863e01fe0..d2b1593ef700 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -121,6 +121,9 @@ OPTIONS - type: Data type of sample memory access. - typeoff: Offset in the data type of sample memory access. - symoff: Offset in the symbol. + - weight1: Average value of event specific weight (1st field of weight_struct). + - weight2: Average value of event specific weight (2nd field of weight_struct). + - weight3: Average value of event specific weight (3rd field of weight_struct). By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) @@ -198,7 +201,11 @@ OPTIONS --fields=:: Specify output field - multiple keys can be specified in CSV format. Following fields are available: - overhead, overhead_sys, overhead_us, overhead_children, sample and period. + overhead, overhead_sys, overhead_us, overhead_children, sample, period, + weight1, weight2, weight3, ins_lat, p_stage_cyc and retire_lat. The + last 3 names are alias for the corresponding weights. When the weight + fields are used, they will show the average value of the weight. + Also it can contain any sort key(s). By default, every sort keys not specified in -F will be appended diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 2bf959d08354..685ba2a54fd8 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -25,7 +25,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, const char *fmt, int len, - hpp_snprint_fn print_fn, bool fmt_percent) + hpp_snprint_fn print_fn, enum perf_hpp_fmt_type fmtype) { int ret; struct hists *hists = he->hists; @@ -33,7 +33,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, char *buf = hpp->buf; size_t size = hpp->size; - if (fmt_percent) { + if (fmtype == PERF_HPP_FMT_TYPE__PERCENT) { double percent = 0.0; u64 total = hists__total_period(hists); @@ -41,8 +41,16 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, percent = 100.0 * get_field(he) / total; ret = hpp__call_print_fn(hpp, print_fn, fmt, len, percent); - } else + } else if (fmtype == PERF_HPP_FMT_TYPE__AVERAGE) { + double average = 0; + + if (he->stat.nr_events) + average = 1.0 * get_field(he) / he->stat.nr_events; + + ret = hpp__call_print_fn(hpp, print_fn, fmt, len, average); + } else { ret = hpp__call_print_fn(hpp, print_fn, fmt, len, get_field(he)); + } if (evsel__is_group_event(evsel)) { int prev_idx, idx_delta; @@ -54,6 +62,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, list_for_each_entry(pair, &he->pairs.head, pairs.node) { u64 period = get_field(pair); u64 total = hists__total_period(pair->hists); + int nr_samples = pair->stat.nr_events; if (!total) continue; @@ -66,7 +75,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, * zero-fill group members in the middle which * have no sample */ - if (fmt_percent) { + if (fmtype != PERF_HPP_FMT_TYPE__RAW) { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, 0.0); } else { @@ -75,9 +84,14 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, } } - if (fmt_percent) { + if (fmtype == PERF_HPP_FMT_TYPE__PERCENT) { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, 100.0 * period / total); + } else if (fmtype == PERF_HPP_FMT_TYPE__AVERAGE) { + double avg = nr_samples ? (period / nr_samples) : 0; + + ret += hpp__call_print_fn(hpp, print_fn, fmt, + len, avg); } else { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, period); @@ -92,7 +106,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, /* * zero-fill group members at last which have no sample */ - if (fmt_percent) { + if (fmtype != PERF_HPP_FMT_TYPE__RAW) { ret += hpp__call_print_fn(hpp, print_fn, fmt, len, 0.0); } else { @@ -114,33 +128,35 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent) + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype) { int len = fmt->user_len ?: fmt->len; if (symbol_conf.field_sep) { return __hpp__fmt(hpp, he, get_field, fmtstr, 1, - print_fn, fmt_percent); + print_fn, fmtype); } - if (fmt_percent) + if (fmtype == PERF_HPP_FMT_TYPE__PERCENT) len -= 2; /* 2 for a space and a % sign */ else len -= 1; - return __hpp__fmt(hpp, he, get_field, fmtstr, len, print_fn, fmt_percent); + return __hpp__fmt(hpp, he, get_field, fmtstr, len, print_fn, fmtype); } int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent) + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype) { if (!symbol_conf.cumulate_callchain) { int len = fmt->user_len ?: fmt->len; return snprintf(hpp->buf, hpp->size, " %*s", len - 1, "N/A"); } - return hpp__fmt(fmt, hpp, he, get_field, fmtstr, print_fn, fmt_percent); + return hpp__fmt(fmt, hpp, he, get_field, fmtstr, print_fn, fmtype); } static int field_cmp(u64 field_a, u64 field_b) @@ -350,7 +366,7 @@ static int hpp__color_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%", \ - hpp_color_scnprintf, true); \ + hpp_color_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_ENTRY_PERCENT_FN(_type, _field) \ @@ -358,7 +374,7 @@ static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%", \ - hpp_entry_scnprintf, true); \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_SORT_FN(_type, _field) \ @@ -378,7 +394,7 @@ static int hpp__color_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", \ - hpp_color_scnprintf, true); \ + hpp_color_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_ENTRY_ACC_PERCENT_FN(_type, _field) \ @@ -386,7 +402,7 @@ static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", \ - hpp_entry_scnprintf, true); \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__PERCENT); \ } #define __HPP_SORT_ACC_FN(_type, _field) \ @@ -406,7 +422,7 @@ static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ struct perf_hpp *hpp, struct hist_entry *he) \ { \ return hpp__fmt(fmt, hpp, he, he_get_raw_##_field, " %*"PRIu64, \ - hpp_entry_scnprintf, false); \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__RAW); \ } #define __HPP_SORT_RAW_FN(_type, _field) \ @@ -416,6 +432,26 @@ static int64_t hpp__sort_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \ return __hpp__sort(a, b, he_get_raw_##_field); \ } +#define __HPP_ENTRY_AVERAGE_FN(_type, _field) \ +static u64 he_get_##_field(struct hist_entry *he) \ +{ \ + return he->stat._field; \ +} \ + \ +static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \ + struct perf_hpp *hpp, struct hist_entry *he) \ +{ \ + return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.1f", \ + hpp_entry_scnprintf, PERF_HPP_FMT_TYPE__AVERAGE); \ +} + +#define __HPP_SORT_AVERAGE_FN(_type, _field) \ +static int64_t hpp__sort_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \ + struct hist_entry *a, struct hist_entry *b) \ +{ \ + return __hpp__sort(a, b, he_get_##_field); \ +} + #define HPP_PERCENT_FNS(_type, _field) \ __HPP_COLOR_PERCENT_FN(_type, _field) \ @@ -431,6 +467,10 @@ __HPP_SORT_ACC_FN(_type, _field) __HPP_ENTRY_RAW_FN(_type, _field) \ __HPP_SORT_RAW_FN(_type, _field) +#define HPP_AVERAGE_FNS(_type, _field) \ +__HPP_ENTRY_AVERAGE_FN(_type, _field) \ +__HPP_SORT_AVERAGE_FN(_type, _field) + HPP_PERCENT_FNS(overhead, period) HPP_PERCENT_FNS(overhead_sys, period_sys) HPP_PERCENT_FNS(overhead_us, period_us) @@ -441,6 +481,10 @@ HPP_PERCENT_ACC_FNS(overhead_acc, period) HPP_RAW_FNS(samples, nr_events) HPP_RAW_FNS(period, period) +HPP_AVERAGE_FNS(weight1, weight1) +HPP_AVERAGE_FNS(weight2, weight2) +HPP_AVERAGE_FNS(weight3, weight3) + static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, struct hist_entry *a __maybe_unused, struct hist_entry *b __maybe_unused) @@ -510,7 +554,10 @@ struct perf_hpp_fmt perf_hpp__format[] = { HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us, OVERHEAD_GUEST_US), HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc, OVERHEAD_ACC), HPP__PRINT_FNS("Samples", samples, SAMPLES), - HPP__PRINT_FNS("Period", period, PERIOD) + HPP__PRINT_FNS("Period", period, PERIOD), + HPP__PRINT_FNS("Weight1", weight1, WEIGHT1), + HPP__PRINT_FNS("Weight2", weight2, WEIGHT2), + HPP__PRINT_FNS("Weight3", weight3, WEIGHT3), }; struct perf_hpp_list perf_hpp_list = { @@ -526,6 +573,7 @@ struct perf_hpp_list perf_hpp_list = { #undef HPP_PERCENT_FNS #undef HPP_PERCENT_ACC_FNS #undef HPP_RAW_FNS +#undef HPP_AVERAGE_FNS #undef __HPP_HEADER_FN #undef __HPP_WIDTH_FN @@ -534,9 +582,11 @@ struct perf_hpp_list perf_hpp_list = { #undef __HPP_COLOR_ACC_PERCENT_FN #undef __HPP_ENTRY_ACC_PERCENT_FN #undef __HPP_ENTRY_RAW_FN +#undef __HPP_ENTRY_AVERAGE_FN #undef __HPP_SORT_FN #undef __HPP_SORT_ACC_FN #undef __HPP_SORT_RAW_FN +#undef __HPP_SORT_AVERAGE_FN static void fmt_free(struct perf_hpp_fmt *fmt) { @@ -785,6 +835,12 @@ void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists) fmt->len = 12; break; + case PERF_HPP__WEIGHT1: + case PERF_HPP__WEIGHT2: + case PERF_HPP__WEIGHT3: + fmt->len = 8; + break; + default: break; } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index f34f101c36c2..5260822b9773 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -550,6 +550,9 @@ enum { PERF_HPP__OVERHEAD_ACC, PERF_HPP__SAMPLES, PERF_HPP__PERIOD, + PERF_HPP__WEIGHT1, + PERF_HPP__WEIGHT2, + PERF_HPP__WEIGHT3, PERF_HPP__MAX_INDEX }; @@ -596,16 +599,24 @@ void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists); void perf_hpp__set_user_width(const char *width_list_str); void hists__reset_column_width(struct hists *hists); +enum perf_hpp_fmt_type { + PERF_HPP_FMT_TYPE__RAW, + PERF_HPP_FMT_TYPE__PERCENT, + PERF_HPP_FMT_TYPE__AVERAGE, +}; + typedef u64 (*hpp_field_fn)(struct hist_entry *he); typedef int (*hpp_callback_fn)(struct perf_hpp *hpp, bool front); typedef int (*hpp_snprint_fn)(struct perf_hpp *hpp, const char *fmt, ...); int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent); + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype); int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he, hpp_field_fn get_field, - const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent); + const char *fmtstr, hpp_snprint_fn print_fn, + enum perf_hpp_fmt_type fmtype); static inline void advance_hpp(struct perf_hpp *hpp, int inc) { diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 92a1bd695e8a..add8601c57fd 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2441,6 +2441,13 @@ static struct hpp_dimension hpp_sort_dimensions[] = { DIM(PERF_HPP__OVERHEAD_ACC, "overhead_children"), DIM(PERF_HPP__SAMPLES, "sample"), DIM(PERF_HPP__PERIOD, "period"), + DIM(PERF_HPP__WEIGHT1, "weight1"), + DIM(PERF_HPP__WEIGHT2, "weight2"), + DIM(PERF_HPP__WEIGHT3, "weight3"), + /* aliases for weight_struct */ + DIM(PERF_HPP__WEIGHT2, "ins_lat"), + DIM(PERF_HPP__WEIGHT3, "retire_lat"), + DIM(PERF_HPP__WEIGHT3, "p_stage_cyc"), }; #undef DIM @@ -3743,26 +3750,29 @@ void sort__setup_elide(FILE *output) } } -int output_field_add(struct perf_hpp_list *list, char *tok) +int output_field_add(struct perf_hpp_list *list, const char *tok) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { - struct sort_dimension *sd = &common_sort_dimensions[i]; + for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) { + struct hpp_dimension *hd = &hpp_sort_dimensions[i]; - if (!sd->name || strncasecmp(tok, sd->name, strlen(tok))) + if (strncasecmp(tok, hd->name, strlen(tok))) continue; - return __sort_dimension__add_output(list, sd); + if (!strcasecmp(tok, "weight")) + ui__warning("--fields weight shows the average value unlike in the --sort key.\n"); + + return __hpp_dimension__add_output(list, hd); } - for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) { - struct hpp_dimension *hd = &hpp_sort_dimensions[i]; + for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { + struct sort_dimension *sd = &common_sort_dimensions[i]; - if (strncasecmp(tok, hd->name, strlen(tok))) + if (!sd->name || strncasecmp(tok, sd->name, strlen(tok))) continue; - return __hpp_dimension__add_output(list, hd); + return __sort_dimension__add_output(list, sd); } for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) { diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 690892a92cf3..0bd0ee3ae76b 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -141,7 +141,7 @@ void reset_dimensions(void); int sort_dimension__add(struct perf_hpp_list *list, const char *tok, struct evlist *evlist, int level); -int output_field_add(struct perf_hpp_list *list, char *tok); +int output_field_add(struct perf_hpp_list *list, const char *tok); int64_t sort__iaddr_cmp(struct hist_entry *left, struct hist_entry *right); int64_t -- cgit v1.2.3 From 6b718ac6874c2233b8dec369a8a377d6c5b638e6 Mon Sep 17 00:00:00 2001 From: Chaitanya S Prakash Date: Mon, 8 Apr 2024 11:52:28 +0530 Subject: perf tools: Enable configs required for test_uprobe_from_different_cu.sh Test "perf probe of function from different CU" fails due to certain configs not being enabled. Building the kernel with CONFIG_KPROBE_EVENTS=y and CONFIG_UPROBE_EVENTS=y fixes the issue. As CONFIG_KPROBE_EVENTS is dependent on CONFIG_KPROBES, enable it as well. Some platforms enable these configs as a part of their defconfig, so this change is only required for the ones that don't do so. Reviewed-by: Masami Hiramatsu Signed-off-by: Chaitanya S Prakash Cc: Anshuman Khandual Cc: James Clark Link: https://lore.kernel.org/r/20240408062230.1949882-1-ChaitanyaS.Prakash@arm.com Link: https://lore.kernel.org/r/20240408062230.1949882-7-ChaitanyaS.Prakash@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/config-fragments/config | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/config-fragments/config b/tools/perf/tests/config-fragments/config index c340b3195fca..4fca12851016 100644 --- a/tools/perf/tests/config-fragments/config +++ b/tools/perf/tests/config-fragments/config @@ -9,3 +9,6 @@ CONFIG_GENERIC_TRACER=y CONFIG_FTRACE=y CONFIG_FTRACE_SYSCALLS=y CONFIG_BRANCH_PROFILE_NONE=y +CONFIG_KPROBES=y +CONFIG_KPROBE_EVENTS=y +CONFIG_UPROBE_EVENTS=y -- cgit v1.2.3 From 6fc6d7f59376c7cda4d866159b29fe96d96afe83 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Apr 2024 12:05:34 -0700 Subject: selftests: adopt BPF's approach to quieter builds selftest build is fairly noisy, it's easy to miss warnings. It's standard practice to add alternative messages in the Makefile. I was grepping for existing solutions, and found that bpf already has the right knobs. Move them to lib.mk and adopt in net. Convert the basic rules in lib.mk. Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240411190534.444918-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/bpf/Makefile | 13 +------------ tools/testing/selftests/lib.mk | 17 ++++++++++++++++- tools/testing/selftests/net/Makefile | 9 ++++++--- 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b0be07f29dde..f06c527eee34 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -135,18 +135,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi -# Emit succinct information message describing current building step -# $1 - generic step name (e.g., CC, LINK, etc); -# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; -# $3 - target (assumed to be file); only file name will be emitted; -# $4 - optional extra arg, emitted as-is, if provided. -ifeq ($(V),1) -Q = -msg = -else -Q = @ -msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; -MAKEFLAGS += --no-print-directory +ifneq ($(V),1) submake_extras := feature_display=0 endif diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index da2cade3bab0..aeeac5f83492 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -44,6 +44,20 @@ endif selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST)))) top_srcdir = $(selfdir)/../../.. +# msg: emit succinct information message describing current building step +# $1 - generic step name (e.g., CC, LINK, etc); +# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; +# $3 - target (assumed to be file); only file name will be emitted; +# $4 - optional extra arg, emitted as-is, if provided. +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; +MAKEFLAGS += --no-print-directory +endif + ifeq ($(KHDR_INCLUDES),) KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include endif @@ -176,7 +190,8 @@ endif ifeq ($(OVERRIDE_TARGETS),) LOCAL_HDRS += $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h $(OUTPUT)/%:%.c $(LOCAL_HDRS) - $(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@ $(OUTPUT)/%.o:%.S $(COMPILE.S) $^ -o $@ diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index a3c781cb8367..7e7f243d0ab2 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -125,7 +125,8 @@ BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a MAKE_DIRS := $(BUILD_DIR)/libbpf $(MAKE_DIRS): - mkdir -p $@ + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ # Get Clang's default includes on this system, as opposed to those seen by # '--target=bpf'. This fixes "missing" files on some architectures/distros, @@ -149,13 +150,15 @@ BPF_PROG_OBJS := $(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o \ $(OUTPUT)/sample_map_ret0.bpf.o $(OUTPUT)/sample_ret0.bpf.o $(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) - $(CLANG) -O2 -g --target=bpf $(CCINCLUDE) $(CLANG_SYS_INCLUDES) \ + $(call msg,BPF_PROG,,$@) + $(Q)$(CLANG) -O2 -g --target=bpf $(CCINCLUDE) $(CLANG_SYS_INCLUDES) \ -c $< -o $@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(APIDIR)/linux/bpf.h \ | $(BUILD_DIR)/libbpf - $(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + $(call msg,MAKE,,$@) + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ EXTRA_CFLAGS='-g -O0' \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers -- cgit v1.2.3 From 3fde60afe1f84746c1177861bd27b3ebb00cb8f5 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 16 Apr 2024 11:09:13 +0200 Subject: selftests: openvswitch: Fix escape chars in regexp. Character sequences starting with `\` are interpreted by python as escaped Unicode characters. However, they have other meaning in regular expressions (e.g: "\d"). It seems Python >= 3.12 starts emitting a SyntaxWarning when these escaped sequences are not recognized as valid Unicode characters. An example of these warnings: tools/testing/selftests/net/openvswitch/ovs-dpctl.py:505: SyntaxWarning: invalid escape sequence '\d' Fix all the warnings by flagging literals as raw strings. Signed-off-by: Adrian Moreno Reviewed-by: Aaron Conole Link: https://lore.kernel.org/r/20240416090913.2028475-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/ovs-dpctl.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 5e0e539a323d..1dd057afd3fb 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -489,7 +489,7 @@ class ovsactions(nla): actstr, reason = parse_extract_field( actstr, "drop(", - "([0-9]+)", + r"([0-9]+)", lambda x: int(x, 0), False, None, @@ -502,9 +502,9 @@ class ovsactions(nla): actstr = actstr[len("drop"): ] return (totallen - len(actstr)) - elif parse_starts_block(actstr, "^(\d+)", False, True): + elif parse_starts_block(actstr, r"^(\d+)", False, True): actstr, output = parse_extract_field( - actstr, None, "(\d+)", lambda x: int(x), False, "0" + actstr, None, r"(\d+)", lambda x: int(x), False, "0" ) self["attrs"].append(["OVS_ACTION_ATTR_OUTPUT", output]) parsed = True @@ -512,7 +512,7 @@ class ovsactions(nla): actstr, recircid = parse_extract_field( actstr, "recirc(", - "([0-9a-fA-Fx]+)", + r"([0-9a-fA-Fx]+)", lambda x: int(x, 0), False, 0, @@ -588,17 +588,17 @@ class ovsactions(nla): actstr = actstr[3:] actstr, ip_block_min = parse_extract_field( - actstr, "=", "([0-9a-fA-F\.]+)", str, False + actstr, "=", r"([0-9a-fA-F\.]+)", str, False ) actstr, ip_block_max = parse_extract_field( - actstr, "-", "([0-9a-fA-F\.]+)", str, False + actstr, "-", r"([0-9a-fA-F\.]+)", str, False ) actstr, proto_min = parse_extract_field( - actstr, ":", "(\d+)", int, False + actstr, ":", r"(\d+)", int, False ) actstr, proto_max = parse_extract_field( - actstr, "-", "(\d+)", int, False + actstr, "-", r"(\d+)", int, False ) if t is not None: -- cgit v1.2.3 From 655614ea2bd3b5774cdb95c7630d8327bf221934 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Apr 2024 16:11:39 -0700 Subject: selftests: net: fix counting totals when some checks fail Totals currently only pay attention to exceptions, if check fails (say ksft_eq()) the test case will be counted as pass: # At /ksft/drivers/net/./ping.py line 18: # Check failed 1 != 2 not ok 1 ping.test_v4 ok 2 ping.test_v6 ok 3 ping.test_tcp # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 ^^^^^^^^^^^^^ Pay attention to the result. Fixes: b86761ff6374 ("selftests: net: add scaffolding for Netlink tests in Python") Link: https://lore.kernel.org/r/20240417231146.2435572-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 3769b9197213..640dfbf47702 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -106,7 +106,10 @@ def ksft_run(cases, args=()): continue ktap_result(KSFT_RESULT, cnt, case) - totals['pass'] += 1 + if KSFT_RESULT: + totals['pass'] += 1 + else: + totals['fail'] += 1 print( f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0" -- cgit v1.2.3 From 4fa6bd4b33ac9b914e3d1bb95848239ab8fdde1a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Apr 2024 16:11:40 -0700 Subject: selftests: net: set the exit code correctly in Python tests Test cases need to exit with non-zero status if they failed, we currently don't do that: # KTAP version 1 # 1..3 # # At /root/ksft-net-drv/drivers/net/./ping.py line 18: # # Check failed 1 != 2 # not ok 1 ping.test_v4 # ok 2 ping.test_v6 # ok 3 ping.test_tcp # # Totals: pass:2 fail:1 xfail:0 xpass:0 skip:0 error:0 ok 1 selftests: drivers/net: ping.py ^^^^ It's a bit tempting to make the exit part of ksft_run(), but that only works well for very trivial setups. We can revisit this later, if people forget to call ksft_exit(). Link: https://lore.kernel.org/r/20240417231146.2435572-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 4 +++- tools/testing/selftests/net/lib/py/ksft.py | 10 ++++++++++ tools/testing/selftests/net/nl_netdev.py | 4 +++- 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 5a9d4e56b28b..947df3eb681f 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_in, ksft_true, KsftSkipEx, KsftXfailEx +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_in, ksft_true, KsftSkipEx, KsftXfailEx from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv @@ -80,6 +81,7 @@ def main() -> None: with NetDrvEnv(__file__) as cfg: ksft_run([check_pause, check_fec, pkt_byte_sum], args=(cfg, )) + ksft_exit() if __name__ == "__main__": diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 640dfbf47702..25f2572fa540 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -2,11 +2,13 @@ import builtins import inspect +import sys import time import traceback from .consts import KSFT_MAIN_NAME KSFT_RESULT = None +KSFT_RESULT_ALL = True class KsftSkipEx(Exception): @@ -63,6 +65,9 @@ def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""): def ktap_result(ok, cnt=1, case="", comment=""): + global KSFT_RESULT_ALL + KSFT_RESULT_ALL = KSFT_RESULT_ALL and ok + res = "" if not ok: res += "not " @@ -114,3 +119,8 @@ def ksft_run(cases, args=()): print( f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0" ) + + +def ksft_exit(): + global KSFT_RESULT_ALL + sys.exit(0 if KSFT_RESULT_ALL else 1) diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index 6909b1760739..93d9d914529b 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -2,7 +2,8 @@ # SPDX-License-Identifier: GPL-2.0 import time -from lib.py import ksft_run, ksft_pr, ksft_eq, ksft_ge, ksft_busy_wait +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_eq, ksft_ge, ksft_busy_wait from lib.py import NetdevFamily, NetdevSimDev, ip @@ -90,6 +91,7 @@ def main() -> None: nf = NetdevFamily() ksft_run([empty_check, lo_check, page_pool_check], args=(nf, )) + ksft_exit() if __name__ == "__main__": -- cgit v1.2.3 From eb4d27cf9aef3e6c9bcaf8fa1a1cadc2433d847b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 16 Apr 2024 10:00:13 -0700 Subject: perf docs: Document bpf event modifier Document that 'b' is used as a modifier to make an event use a BPF counter. Fixes: 01bd8efcec444468 ("perf stat: Introduce ':b' modifier") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Thomas Richter Link: https://lore.kernel.org/r/20240416170014.985191-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 3b12595193c9..6bf2468f59d3 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -71,6 +71,7 @@ counted. The following modifiers exist: D - pin the event to the PMU W - group is weak and will fallback to non-group if not schedulable, e - group or event are exclusive and do not share the PMU + b - use BPF aggregration (see perf stat --bpf-counters) The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: -- cgit v1.2.3 From d9bd1d4264baddf7ab8baae86e91674d369f22de Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 16 Apr 2024 10:00:14 -0700 Subject: perf test bpf-counters: Add test for BPF event modifier Refactor test to better enable sharing of logic, to give an idea of progress and introduce test functions. Add test of measuring both cycles and cycles:b simultaneously. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Song Liu Cc: Thomas Richter Link: https://lore.kernel.org/r/20240416170014.985191-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_bpf_counters.sh | 75 ++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh index 2d9209874774..61f8149d854e 100755 --- a/tools/perf/tests/shell/stat_bpf_counters.sh +++ b/tools/perf/tests/shell/stat_bpf_counters.sh @@ -4,21 +4,59 @@ set -e +workload="perf bench sched messaging -g 1 -l 100 -t" + # check whether $2 is within +/- 20% of $1 compare_number() { - first_num=$1 - second_num=$2 - - # upper bound is first_num * 120% - upper=$(expr $first_num + $first_num / 5 ) - # lower bound is first_num * 80% - lower=$(expr $first_num - $first_num / 5 ) - - if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then - echo "The difference between $first_num and $second_num are greater than 20%." - exit 1 - fi + first_num=$1 + second_num=$2 + + # upper bound is first_num * 120% + upper=$(expr $first_num + $first_num / 5 ) + # lower bound is first_num * 80% + lower=$(expr $first_num - $first_num / 5 ) + + if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then + echo "The difference between $first_num and $second_num are greater than 20%." + exit 1 + fi +} + +check_counts() +{ + base_cycles=$1 + bpf_cycles=$2 + + if [ "$base_cycles" = "&1 | awk '/cycles/ {print $1}') + bpf_cycles=$(perf stat --no-big-num --bpf-counters -e cycles -- $workload 2>&1 | awk '/cycles/ {print $1}') + check_counts $base_cycles $bpf_cycles + compare_number $base_cycles $bpf_cycles + echo "[Success]" +} + +test_bpf_modifier() +{ + printf "Testing bpf event modifier " + stat_output=$(perf stat --no-big-num -e cycles/name=base_cycles/,cycles/name=bpf_cycles/b -- $workload 2>&1) + base_cycles=$(echo "$stat_output"| awk '/base_cycles/ {print $1}') + bpf_cycles=$(echo "$stat_output"| awk '/bpf_cycles/ {print $1}') + check_counts $base_cycles $bpf_cycles + compare_number $base_cycles $bpf_cycles + echo "[Success]" } # skip if --bpf-counters is not supported @@ -30,16 +68,7 @@ if ! perf stat -e cycles --bpf-counters true > /dev/null 2>&1; then exit 2 fi -base_cycles=$(perf stat --no-big-num -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}') -if [ "$base_cycles" = "&1 | awk '/cycles/ {print $1}') -if [ "$bpf_cycles" = " Date: Mon, 8 Apr 2024 14:40:22 -0700 Subject: perf vendor events arm64: AmpereOne/AmpereOneX: Mark L1D_CACHE_INVAL impacted by errata L1D_CACHE_INVAL overcounts in certain situations. See AC03_CPU_41 and AC04_CPU_1 for more details. Mark the event impacted by the errata. Reviewed-by: James Clark Signed-off-by: Ilkka Koskinen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20240408214022.541839-1-ilkka@os.amperecomputing.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json | 4 +++- tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json index 7a2b7b200f14..ac75f12e27bf 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json @@ -9,7 +9,9 @@ "ArchStdEvent": "L1D_CACHE_REFILL_RD" }, { - "ArchStdEvent": "L1D_CACHE_INVAL" + "ArchStdEvent": "L1D_CACHE_INVAL", + "Errata": "Errata AC03_CPU_41", + "BriefDescription": "L1D cache invalidate. Impacted by errata -" }, { "ArchStdEvent": "L1D_TLB_REFILL_RD" diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json index c50d8e930b05..f4bfe7083a6b 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json @@ -9,7 +9,9 @@ "ArchStdEvent": "L1D_CACHE_REFILL_RD" }, { - "ArchStdEvent": "L1D_CACHE_INVAL" + "ArchStdEvent": "L1D_CACHE_INVAL", + "Errata": "Errata AC04_CPU_1", + "BriefDescription": "L1D cache invalidate. Impacted by errata -" }, { "ArchStdEvent": "L1D_TLB_REFILL_RD" -- cgit v1.2.3 From b828a23a75a012b9242166b78ff1c3ff2189f436 Mon Sep 17 00:00:00 2001 From: Chen Pei Date: Mon, 15 Apr 2024 17:55:32 +0800 Subject: perf genelf: Fix compiling with libelf on rv32 When cross-compiling perf with libelf, the following error occurred: In file included from tests/genelf.c:14: tests/../util/genelf.h:50:2: error: #error "unsupported architecture" 50 | #error "unsupported architecture" | ^~~~~ tests/../util/genelf.h:59:5: warning: "GEN_ELF_CLASS" is not defined, evaluates to 0 [-Wundef] 59 | #if GEN_ELF_CLASS == ELFCLASS64 Fix this by adding GEN-ELF-ARCH and GEN-ELF-CLASS definitions for rv32. Reviewed-by: Ian Rogers Signed-off-by: Chen Pei Cc: Albert Ou Cc: Palmer Dabbelt Cc: Paul Walmsley Link: https://lore.kernel.org/r/20240415095532.4930-1-cp0613@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/genelf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h index 5f18d20ea903..4e2e4f40e134 100644 --- a/tools/perf/util/genelf.h +++ b/tools/perf/util/genelf.h @@ -43,6 +43,9 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent #elif defined(__riscv) && __riscv_xlen == 64 #define GEN_ELF_ARCH EM_RISCV #define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__riscv) && __riscv_xlen == 32 +#define GEN_ELF_ARCH EM_RISCV +#define GEN_ELF_CLASS ELFCLASS32 #elif defined(__loongarch__) #define GEN_ELF_ARCH EM_LOONGARCH #define GEN_ELF_CLASS ELFCLASS64 -- cgit v1.2.3 From 10b6ee3b597b1b1b4dc390aaf9d589664af31df9 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 26 Mar 2024 11:37:49 +0000 Subject: perf test shell arm_coresight: Increase buffer size for Coresight basic tests These tests record in a mode that includes kernel trace but look for samples of a userspace process. This makes them sensitive to any kernel compilation options that increase the amount of time spent in the kernel. If the trace buffer is completely filled before userspace is reached then the test will fail. Double the buffer size to fix this. The other tests in the same file aren't sensitive to this for various reasons, for example the iterate devices test filters by userspace trace only. But in order to keep coverage of all the modes, increase the buffer size rather than filtering by userspace for the basic tests. Fixes: d1efa4a0a696e487 ("perf cs-etm: Add separate decode paths for timeless and per-thread modes") Reviewed-by: Anshuman Khandual Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Link: https://lore.kernel.org/r/20240326113749.257250-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_arm_coresight.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_arm_coresight.sh b/tools/perf/tests/shell/test_arm_coresight.sh index 65dd85207125..3302ea0b9672 100755 --- a/tools/perf/tests/shell/test_arm_coresight.sh +++ b/tools/perf/tests/shell/test_arm_coresight.sh @@ -188,7 +188,7 @@ arm_cs_etm_snapshot_test() { arm_cs_etm_basic_test() { echo "Recording trace with '$*'" - perf record -o ${perfdata} "$@" -- ls > /dev/null 2>&1 + perf record -o ${perfdata} "$@" -m,8M -- ls > /dev/null 2>&1 perf_script_branch_samples ls && perf_report_branch_samples ls && -- cgit v1.2.3 From 03f2357017c37d68e73d7d8d77abfcb72e12bc86 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Fri, 12 Apr 2024 14:07:41 -0700 Subject: perf stat: Add new field in stat_config to enable hardware aware grouping Hardware counter and event information could be used to help creating event groups that better utilize hardware counters and improve multiplexing. Reviewed-by: Ian Rogers Signed-off-by: Weilin Wang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240412210756.309828-2-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 +++++ tools/perf/util/metricgroup.c | 3 +++ tools/perf/util/metricgroup.h | 1 + tools/perf/util/stat.h | 1 + 4 files changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 65388c57bb5d..65a3dd7ffac3 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2085,6 +2085,7 @@ static int add_default_attributes(void) stat_config.metric_no_threshold, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events); } @@ -2118,6 +2119,7 @@ static int add_default_attributes(void) stat_config.metric_no_threshold, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events); } @@ -2152,6 +2154,7 @@ static int add_default_attributes(void) /*metric_no_threshold=*/true, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events) < 0) return -1; } @@ -2193,6 +2196,7 @@ static int add_default_attributes(void) /*metric_no_threshold=*/true, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events) < 0) return -1; @@ -2727,6 +2731,7 @@ int cmd_stat(int argc, const char **argv) stat_config.metric_no_threshold, stat_config.user_requested_cpu_list, stat_config.system_wide, + stat_config.hardware_aware_grouping, &stat_config.metric_events); zfree(&metrics); diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 6ec083af14a1..9be406524617 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1690,12 +1690,15 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, bool metric_no_threshold, const char *user_requested_cpu_list, bool system_wide, + bool hardware_aware_grouping, struct rblist *metric_events) { const struct pmu_metrics_table *table = pmu_metrics_table__find(); if (!table) return -EINVAL; + if (hardware_aware_grouping) + pr_debug("Use hardware aware grouping instead of traditional metric grouping method\n"); return parse_groups(perf_evlist, pmu, str, metric_no_group, metric_no_merge, metric_no_threshold, user_requested_cpu_list, system_wide, diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index d5325c6ec8e1..779f6ede1b51 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -77,6 +77,7 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, bool metric_no_threshold, const char *user_requested_cpu_list, bool system_wide, + bool hardware_aware_grouping, struct rblist *metric_events); int metricgroup__parse_groups_test(struct evlist *evlist, const struct pmu_metrics_table *table, diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index d6e5c8787ba2..fd7a187551bd 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -87,6 +87,7 @@ struct perf_stat_config { bool metric_no_group; bool metric_no_merge; bool metric_no_threshold; + bool hardware_aware_grouping; bool stop_read_counter; bool iostat_run; char *user_requested_cpu_list; -- cgit v1.2.3 From a529bec023d7d14d9fb7d5b456921630e63edc6b Mon Sep 17 00:00:00 2001 From: Dima Kogan Date: Mon, 15 Apr 2024 21:55:10 -0700 Subject: perf probe-event: Un-hardcode sizeof(buf) In several places we had char buf[64]; ... snprintf(buf, 64, ...); This patch changes it to char buf[64]; ... snprintf(buf, sizeof(buf), ...); so the "64" is only stated once. Signed-off-by: Dima Kogan Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20240416045533.162692-2-dima@secretsauce.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 5c12459e9765..c1dbad8a83dc 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -236,7 +236,7 @@ static int convert_exec_to_group(const char *exec, char **result) } } - ret = e_snprintf(buf, 64, "%s_%s", PERFPROBE_GROUP, ptr1); + ret = e_snprintf(buf, sizeof(buf), "%s_%s", PERFPROBE_GROUP, ptr1); if (ret < 0) goto out; @@ -2867,7 +2867,7 @@ static int probe_trace_event__set_name(struct probe_trace_event *tev, group = PERFPROBE_GROUP; /* Get an unused new event name */ - ret = get_new_event_name(buf, 64, event, namelist, + ret = get_new_event_name(buf, sizeof(buf), event, namelist, tev->point.retprobe, allow_suffix); if (ret < 0) return ret; -- cgit v1.2.3 From c15ed4442981ccf8e7e0d885f5cb500400dde9d7 Mon Sep 17 00:00:00 2001 From: Dima Kogan Date: Mon, 15 Apr 2024 21:55:11 -0700 Subject: perf probe-event: Better error message for a too-long probe name This is a common failure mode when probing userspace C++ code (where the mangling adds significant length to the symbol names). Prior to this patch, only a very generic error message is produced, making the user guess at what the issue is. Signed-off-by: Dima Kogan Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20240416045533.162692-3-dima@secretsauce.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index c1dbad8a83dc..8a73c9464b70 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2758,7 +2758,7 @@ static int get_new_event_name(char *buf, size_t len, const char *base, /* Try no suffix number */ ret = e_snprintf(buf, len, "%s%s", nbase, ret_event ? "__return" : ""); if (ret < 0) { - pr_debug("snprintf() failed: %d\n", ret); + pr_warning("snprintf() failed: %d; the event name nbase='%s' is too long\n", ret, nbase); goto out; } if (!strlist__has_entry(namelist, buf)) -- cgit v1.2.3 From 61ba075d9911d2a6db181fbb5602762a24d5d0a9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Apr 2024 18:15:33 -0300 Subject: Revert "tools headers: Remove almost unused copy of uapi/stat.h, add few conditional defines" This reverts commit a672af9139a843eb7a48fd7846bb6df8f81b5f86. By now it is not used for building tools/perf, but Stephen Rothwell reported that when building on a O= directory that had been built with torvalds/master and this perf build command line: $ make -C tools/perf -f Makefile.perf -s -O -j60 O=/home/sfr/next/perf NO_BPF_SKEL=1 If we then merge perf-tools-next, as he did for linux-next, then we end up with a build failure for libbpf: PERF_VERSION = 6.9.rc3.g42c4635c8dee make[3]: *** No rule to make target '/home/sfr/next/next/tools/include/uapi/linux/stat.h', needed by '/home/sfr/next/perf/libbpf/staticobjs/libbpf.o'. Stop. make[2]: *** [Makefile:157: /home/sfr/next/perf/libbpf/staticobjs/libbpf-in.o] Error 2 make[1]: *** [Makefile.perf:892: /home/sfr/next/perf/libbpf/libbpf.a] Error 2 make[1]: *** Waiting for unfinished jobs.... make: *** [Makefile.perf:264: sub-make] Error 2 This needs to be further investigated to figure out how to check if libbpf really needs something that is in that tools/include/uapi/linux/stat.h file and if not to remove that file in a way that we don't break the build in any situation, to avoid requiring doing a 'make clean'. Reported-by: Stephen Rothwell Tested-by: Stephen Rothwell # PowerPC le incermental build Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240413124340.4d48c6d8@canb.auug.org.au Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/stat.h | 195 ++++++++++++++++++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 2 files changed, 196 insertions(+) create mode 100644 tools/include/uapi/linux/stat.h (limited to 'tools') diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h new file mode 100644 index 000000000000..2f2ee82d5517 --- /dev/null +++ b/tools/include/uapi/linux/stat.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_STAT_H +#define _UAPI_LINUX_STAT_H + +#include + +#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) + +#define S_IFMT 00170000 +#define S_IFSOCK 0140000 +#define S_IFLNK 0120000 +#define S_IFREG 0100000 +#define S_IFBLK 0060000 +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFIFO 0010000 +#define S_ISUID 0004000 +#define S_ISGID 0002000 +#define S_ISVTX 0001000 + +#define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) +#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) +#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) +#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) +#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK) +#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO) +#define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK) + +#define S_IRWXU 00700 +#define S_IRUSR 00400 +#define S_IWUSR 00200 +#define S_IXUSR 00100 + +#define S_IRWXG 00070 +#define S_IRGRP 00040 +#define S_IWGRP 00020 +#define S_IXGRP 00010 + +#define S_IRWXO 00007 +#define S_IROTH 00004 +#define S_IWOTH 00002 +#define S_IXOTH 00001 + +#endif + +/* + * Timestamp structure for the timestamps in struct statx. + * + * tv_sec holds the number of seconds before (negative) or after (positive) + * 00:00:00 1st January 1970 UTC. + * + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. + * + * __reserved is held in case we need a yet finer resolution. + */ +struct statx_timestamp { + __s64 tv_sec; + __u32 tv_nsec; + __s32 __reserved; +}; + +/* + * Structures for the extended file attribute retrieval system call + * (statx()). + * + * The caller passes a mask of what they're specifically interested in as a + * parameter to statx(). What statx() actually got will be indicated in + * st_mask upon return. + * + * For each bit in the mask argument: + * + * - if the datum is not supported: + * + * - the bit will be cleared, and + * + * - the datum will be set to an appropriate fabricated value if one is + * available (eg. CIFS can take a default uid and gid), otherwise + * + * - the field will be cleared; + * + * - otherwise, if explicitly requested: + * + * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is + * set or if the datum is considered out of date, and + * + * - the field will be filled in and the bit will be set; + * + * - otherwise, if not requested, but available in approximate form without any + * effort, it will be filled in anyway, and the bit will be set upon return + * (it might not be up to date, however, and no attempt will be made to + * synchronise the internal state first); + * + * - otherwise the field and the bit will be cleared before returning. + * + * Items in STATX_BASIC_STATS may be marked unavailable on return, but they + * will have values installed for compatibility purposes so that stat() and + * co. can be emulated in userspace. + */ +struct statx { + /* 0x00 */ + __u32 stx_mask; /* What results were written [uncond] */ + __u32 stx_blksize; /* Preferred general I/O size [uncond] */ + __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* 0x10 */ + __u32 stx_nlink; /* Number of hard links */ + __u32 stx_uid; /* User ID of owner */ + __u32 stx_gid; /* Group ID of owner */ + __u16 stx_mode; /* File mode */ + __u16 __spare0[1]; + /* 0x20 */ + __u64 stx_ino; /* Inode number */ + __u64 stx_size; /* File size */ + __u64 stx_blocks; /* Number of 512-byte blocks allocated */ + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* 0x40 */ + struct statx_timestamp stx_atime; /* Last access time */ + struct statx_timestamp stx_btime; /* File creation time */ + struct statx_timestamp stx_ctime; /* Last attribute change time */ + struct statx_timestamp stx_mtime; /* Last data modification time */ + /* 0x80 */ + __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_minor; + __u32 stx_dev_major; /* ID of device containing file [uncond] */ + __u32 stx_dev_minor; + /* 0x90 */ + __u64 stx_mnt_id; + __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ + __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ + /* 0xa0 */ + __u64 __spare3[12]; /* Spare space for future expansion */ + /* 0x100 */ +}; + +/* + * Flags to be stx_mask + * + * Query request/result mask for statx() and struct statx::stx_mask. + * + * These bits should be set in the mask argument of statx() to request + * particular items when calling statx(). + */ +#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */ +#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */ +#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */ +#define STATX_UID 0x00000008U /* Want/got stx_uid */ +#define STATX_GID 0x00000010U /* Want/got stx_gid */ +#define STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define STATX_INO 0x00000100U /* Want/got stx_ino */ +#define STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ +#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ + +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ + +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + +/* + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. + * + * These give information about the features or the state of a file that might + * be of use to ordinary userspace programs such as GUIs or ls rather than + * specialised tools. + * + * Note that the flags marked [I] correspond to the FS_IOC_SETFLAGS flags + * semantically. Where possible, the numerical value is picked to correspond + * also. Note that the DAX attribute indicates that the file is in the CPU + * direct access state. It does not correspond to the per-inode flag that + * some filesystems support. + * + */ +#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */ +#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */ +#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ +#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ +#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ +#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ +#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ +#define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ + + +#endif /* _UAPI_LINUX_STAT_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 95a3a404bc6f..9c84fd049070 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -16,6 +16,7 @@ FILES=( "include/uapi/linux/in.h" "include/uapi/linux/perf_event.h" "include/uapi/linux/seccomp.h" + "include/uapi/linux/stat.h" "include/linux/bits.h" "include/vdso/bits.h" "include/linux/const.h" -- cgit v1.2.3 From 9c598a83b7eab7b05002911fd25b0be7b996ce6d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:07 +0800 Subject: selftests/bpf: Add start_server_addr helper In order to pair up with connect_to_addr(), this patch adds a new helper start_server_addr(), which is a wrapper of __start_server(). It accepts an argument 'addr' of 'struct sockaddr_storage' type instead of a string type argument like start_server(), and a network_helper_opts argument as the last one. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/2f01d48fa026467926738debe554ac452c19b86f.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 14 ++++++++++++-- tools/testing/selftests/bpf/network_helpers.h | 2 ++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index dc1fd7af9c7a..28fe8367451b 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -52,6 +52,8 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; +static const struct network_helper_opts default_opts; + int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; @@ -185,6 +187,16 @@ close_fds: return NULL; } +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts) +{ + if (!opts) + opts = &default_opts; + + return __start_server(type, 0, (struct sockaddr *)addr, len, + opts->timeout_ms, 0); +} + void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { @@ -278,8 +290,6 @@ error_close: return -1; } -static const struct network_helper_opts default_opts; - int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 70f4e4c92733..414ea50bb3fc 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -53,6 +53,8 @@ int start_mptcp_server(int family, const char *addr, __u16 port, int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens); +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); void free_fds(int *fds, unsigned int nr_close_fds); int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type); int connect_to_fd(int server_fd, int timeout_ms); -- cgit v1.2.3 From 9851382fb3697efb2f4234d70596bd845b3af535 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:08 +0800 Subject: selftests/bpf: Use start_server_addr in cls_redirect Include network_helpers.h in prog_tests/cls_redirect.c, use the newly added public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/13f336cb4c6680175d50bb963d9532e11528c758.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/cls_redirect.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 2a55f717fc07..68cb93106658 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -10,6 +10,7 @@ #include #include +#include "network_helpers.h" #include "progs/test_cls_redirect.h" #include "test_cls_redirect.skel.h" @@ -35,23 +36,6 @@ struct tuple { struct addr_port dst; }; -static int start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto err; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - static int connect_to_server(const struct sockaddr *addr, socklen_t len, int type) { @@ -98,7 +82,7 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, socklen_t slen = sizeof(ss); struct sockaddr *sa = (struct sockaddr *)&ss; - *server = start_server(addr, len, type); + *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (*server < 0) return false; -- cgit v1.2.3 From a2e4979536c440838794ee292955ce9ed55ad181 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:09 +0800 Subject: selftests/bpf: Use start_server_addr in sk_assign Include network_helpers.h in prog_tests/sk_assign.c, use the newly added public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. The code that sets SO_RCVTIMEO timeout as timeo_sec (3s) can be dropped, since start_server_addr() sets default timeout as 3s. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/2af706ffbad63b4f7eaf93a426ed1076eadf1a05.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 29 +++------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 1374b626a985..b066b6b88d7c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -15,6 +15,7 @@ #include #include "test_progs.h" +#include "network_helpers.h" #define BIND_PORT 1234 #define CONNECT_PORT 4321 @@ -73,30 +74,6 @@ configure_stack(void) return true; } -static int -start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto close_out; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static int connect_to_server(const struct sockaddr *addr, socklen_t len, int type) { @@ -310,7 +287,9 @@ void test_sk_assign(void) continue; prepare_addr(test->addr, test->family, BIND_PORT, false); addr = (const struct sockaddr *)test->addr; - server = start_server(addr, test->len, test->type); + server = start_server_addr(test->type, + (const struct sockaddr_storage *)addr, + test->len, NULL); if (server == -1) goto close; -- cgit v1.2.3 From db9994d022ecd42006354609f6e4e3f57e5a4b03 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:10 +0800 Subject: selftests/bpf: Update arguments of connect_to_addr Move the third argument "int type" of connect_to_addr() to the first one which is closer to how the socket syscall is doing it. And add a network_helper_opts argument as the fourth one. Then change its usages in sock_addr.c too. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/088ea8a95055f93409c5f57d12f0e58d43059ac4.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 13 ++++++++++--- tools/testing/selftests/bpf/network_helpers.h | 3 ++- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 28fe8367451b..9d63d2ac13d8 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -270,17 +270,24 @@ static int connect_fd_to_addr(int fd, return 0; } -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t addrlen, int type) +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->ss_family, type, 0); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create client socket"); return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, false)) + if (settimeo(fd, opts->timeout_ms)) + goto error_close; + + if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) goto error_close; return fd; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 414ea50bb3fc..aef297dfa6ca 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -56,7 +56,8 @@ int *start_reuseport_server(int family, int type, const char *addr_str, int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); void free_fds(int *fds, unsigned int nr_close_fds); -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type); +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 5fd617718991..61668e0f11b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -328,7 +328,7 @@ static void test_bind(struct sock_addr_test *test) goto cleanup; /* Try to connect to server just in case */ - client = connect_to_addr(&expected_addr, expected_addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &expected_addr, expected_addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -357,7 +357,7 @@ static void test_connect(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -538,7 +538,7 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; -- cgit v1.2.3 From 805b4d90c0df79a88907623b2528954a0125313d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:11 +0800 Subject: selftests/bpf: Use connect_to_addr in cls_redirect This patch uses public helper connect_to_addr() exported in network_helpers.h instead of the local defined function connect_to_server() in prog_tests/cls_redirect.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/4a03ac92d2d392f8721f398fa449a83ac75577bc.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/cls_redirect.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 68cb93106658..34b59f6baca1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -36,22 +36,6 @@ struct tuple { struct addr_port dst; }; -static int connect_to_server(const struct sockaddr *addr, socklen_t len, - int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(connect(fd, addr, len))) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) { const struct sockaddr_in6 *in6; @@ -89,7 +73,7 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, if (CHECK_FAIL(getsockname(*server, sa, &slen))) goto close_server; - *conn = connect_to_server(sa, slen, type); + *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL); if (*conn < 0) goto close_server; -- cgit v1.2.3 From 63a51820d29bdfcf52abed2db2d32b4f6843988e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:12 +0800 Subject: selftests/bpf: Use connect_to_addr in sk_assign This patch uses public helper connect_to_addr() exported in network_helpers.h instead of the local defined function connect_to_server() in prog_tests/sk_assign.c. This can avoid duplicate code. The code that sets SO_SNDTIMEO timeout as timeo_sec (3s) can be dropped, since connect_to_addr() sets default timeout as 3s. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/98fdd384872bda10b2adb052e900a2212c9047b9.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 26 +--------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index b066b6b88d7c..0b9bd1d6f7cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -23,8 +23,6 @@ #define NS_SELF "/proc/self/ns/net" #define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" -static const struct timeval timeo_sec = { .tv_sec = 3 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int stop, duration; static bool @@ -74,28 +72,6 @@ configure_stack(void) return true; } -static int -connect_to_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = -1; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(connect(fd, addr, len))) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static in_port_t get_port(int fd) { @@ -138,7 +114,7 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) in_port_t port; int ret = 1; - client = connect_to_server(addr, len, type); + client = connect_to_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (client == -1) { perror("Cannot connect to server"); goto out; -- cgit v1.2.3 From 03a1a62f3a3c16677a44b81b0268e0c226a4deb7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:29 +0200 Subject: selftests: netfilter: nft_queue.sh: move to lib.sh infra - switch to socat, like other tests - use buswait helper to test once listener netns is ready - do not generate multiple input test files, only generate one and use cleanup hook to remove it, like other temporary files. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-2-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_queue.sh | 95 ++++++++-------------- 1 file changed, 34 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index 2eb65887e570..9aee4169d198 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -3,16 +3,10 @@ # This tests nf_queue: # 1. can process packets from all hooks # 2. support running nfqueue from more than one base chain -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 -sfx=$(mktemp -u "XXXXXXXX") -ns1="ns1-$sfx" -ns2="ns2-$sfx" -nsrouter="nsrouter-$sfx" -timeout=4 +source lib.sh +ret=0 +timeout=2 cleanup() { @@ -20,9 +14,9 @@ cleanup() ip netns pids ${ns2} | xargs kill 2>/dev/null ip netns pids ${nsrouter} | xargs kill 2>/dev/null - ip netns del ${ns1} - ip netns del ${ns2} - ip netns del ${nsrouter} + cleanup_all_ns + + rm -f "$TMPINPUT" rm -f "$TMPFILE0" rm -f "$TMPFILE1" rm -f "$TMPFILE2" "$TMPFILE3" @@ -34,26 +28,17 @@ if [ $? -ne 0 ];then exit $ksft_skip fi -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi +trap cleanup EXIT -ip netns add ${nsrouter} -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace" - exit $ksft_skip -fi +setup_ns ns1 ns2 nsrouter TMPFILE0=$(mktemp) TMPFILE1=$(mktemp) TMPFILE2=$(mktemp) TMPFILE3=$(mktemp) -trap cleanup EXIT -ip netns add ${ns1} -ip netns add ${ns2} +TMPINPUT=$(mktemp) +dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$TMPINPUT ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 if [ $? -ne 0 ];then @@ -62,28 +47,24 @@ if [ $? -ne 0 ];then fi ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} -ip -net ${nsrouter} link set lo up ip -net ${nsrouter} link set veth0 up ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 +ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 nodad ip -net ${nsrouter} link set veth1 up ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 +ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 nodad -ip -net ${ns1} link set lo up ip -net ${ns1} link set eth0 up - -ip -net ${ns2} link set lo up ip -net ${ns2} link set eth0 up ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 nodad ip -net ${ns1} route add default via 10.0.1.1 ip -net ${ns1} route add default via dead:1::1 ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 +ip -net ${ns2} addr add dead:2::99/64 dev eth0 nodad ip -net ${ns2} route add default via 10.0.2.1 ip -net ${ns2} route add default via dead:2::1 @@ -161,7 +142,7 @@ test_ping() { ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null if [ $? -ne 0 ];then - return 1 + return 2 fi return 0 @@ -170,12 +151,12 @@ test_ping() { test_ping_router() { ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null if [ $? -ne 0 ];then - return 1 + return 3 fi ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null if [ $? -ne 0 ];then - return 1 + return 4 fi return 0 @@ -257,40 +238,40 @@ test_queue() echo "PASS: Expected and received $last" } +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :12345" | grep -q 12345 +} + test_tcp_forward() { ip netns exec ${nsrouter} ./nf_queue -q 2 -t $timeout & local nfqpid=$! - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + timeout 5 ip netns exec ${ns2} socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! - sleep 1 - ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null & + busywait $BUSYWAIT_TIMEOUT listener_ready ${ns2} - rm -f "$tmpfile" + ip netns exec ${ns1} socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null wait $rpid - wait $lpid + [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" } test_tcp_localhost() { - tmpfile=$(mktemp) || exit 1 - - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$TMPINPUT + timeout 5 ip netns exec ${nsrouter} socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! ip netns exec ${nsrouter} ./nf_queue -q 3 -t $timeout & local nfqpid=$! - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" + busywait $BUSYWAIT_TIMEOUT listener_ready ${nsrouter} + + ip netns exec ${nsrouter} socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null wait $rpid [ $? -eq 0 ] && echo "PASS: tcp via loopback" @@ -299,15 +280,12 @@ test_tcp_localhost() test_tcp_localhost_connectclose() { - tmpfile=$(mktemp) || exit 1 - ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & ip netns exec ${nsrouter} ./nf_queue -q 3 -t $timeout & local nfqpid=$! sleep 1 - rm -f "$tmpfile" wait $rpid [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" @@ -329,9 +307,7 @@ table inet filter { } } EOF - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile - ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + timeout 5 ip netns exec ${nsrouter} socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! ip netns exec ${nsrouter} ./nf_queue -c -q 1 -t $timeout > "$TMPFILE2" & @@ -340,9 +316,8 @@ EOF # re-queue the packet to nfqueue program on queue 2. ip netns exec ${nsrouter} ./nf_queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & - sleep 1 - ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null - rm -f "$tmpfile" + busywait $BUSYWAIT_TIMEOUT listener_ready ${nsrouter} + ip netns exec ${nsrouter} socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null wait @@ -409,8 +384,6 @@ ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null load_ruleset "filter" 0 -sleep 3 - test_ping ret=$? if [ $ret -eq 0 ];then -- cgit v1.2.3 From cebb352269e7cc6ea339fd78815e496f9156b71d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:30 +0200 Subject: selftests: netfilter: nft_queue.sh: shellcheck cleanups No functional change intended. Disable frequent shellcheck warnings wrt. "unreachable" code, those helpers get called indirectly from busywait helper. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-3-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_queue.sh | 211 ++++++++++----------- 1 file changed, 103 insertions(+), 108 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index 9aee4169d198..8538f08c64c2 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -3,6 +3,8 @@ # This tests nf_queue: # 1. can process packets from all hooks # 2. support running nfqueue from more than one base chain +# +# shellcheck disable=SC2162,SC2317 source lib.sh ret=0 @@ -10,9 +12,9 @@ timeout=2 cleanup() { - ip netns pids ${ns1} | xargs kill 2>/dev/null - ip netns pids ${ns2} | xargs kill 2>/dev/null - ip netns pids ${nsrouter} | xargs kill 2>/dev/null + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null + ip netns pids "$nsrouter" | xargs kill 2>/dev/null cleanup_all_ns @@ -22,11 +24,7 @@ cleanup() rm -f "$TMPFILE2" "$TMPFILE3" } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi +checktool "nft --version" "test without nft tool" trap cleanup EXIT @@ -38,41 +36,40 @@ TMPFILE2=$(mktemp) TMPFILE3=$(mktemp) TMPINPUT=$(mktemp) -dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$TMPINPUT +dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" -ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 nodad +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 nodad +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad -ip -net ${ns1} link set eth0 up -ip -net ${ns2} link set eth0 up +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 nodad -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 nodad -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 load_ruleset() { local name=$1 local prio=$2 -ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null - if [ $? -ne 0 ];then + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then return 1 fi - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null - if [ $? -ne 0 ];then + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then return 2 fi @@ -149,13 +144,11 @@ test_ping() { } test_ping_router() { - ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null - if [ $? -ne 0 ];then + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then return 3 fi - ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null - if [ $? -ne 0 ];then + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then return 4 fi @@ -165,7 +158,7 @@ test_ping_router() { test_queue_blackhole() { local proto=$1 -ip netns exec ${nsrouter} nft -f /dev/stdin < /dev/null + if [ "$proto" = "ip" ] ;then + ip netns exec "$ns1" ping -W 2 -c 1 -q 10.0.2.99 > /dev/null lret=$? - elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null + elif [ "$proto" = "ip6" ]; then + ip netns exec "$ns1" ping -W 2 -c 1 -q dead:2::99 > /dev/null lret=$? else lret=111 fi # queue without bypass keyword should drop traffic if no listener exists. - if [ $lret -eq 0 ];then + if [ "$lret" -eq 0 ];then echo "FAIL: $proto expected failure, got $lret" 1>&2 exit 1 fi - ip netns exec ${nsrouter} nft delete table $proto blackh - if [ $? -ne 0 ] ;then + if ! ip netns exec "$nsrouter" nft delete table "$proto" blackh; then echo "FAIL: $proto: Could not delete blackh table" exit 1 fi @@ -198,26 +190,41 @@ EOF echo "PASS: $proto: statement with no listener results in packet drop" } +nf_queue_wait() +{ + local procfile="/proc/self/net/netfilter/nfnetlink_queue" + local netns id + + netns="$1" + id="$2" + + # if this file doesn't exist, nfnetlink_module isn't loaded. + # rather than loading it ourselves, wait for kernel module autoload + # completion, nfnetlink should do so automatically because nf_queue + # helper program, spawned in the background, asked for this functionality. + test -f "$procfile" && + ip netns exec "$netns" cat "$procfile" | grep -q "^ *$id " +} + test_queue() { - local expected=$1 + local expected="$1" local last="" # spawn nf_queue listeners - ip netns exec ${nsrouter} ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" & - sleep 1 - test_ping - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2 + ip netns exec "$nsrouter" ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" & + + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 0 + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1 + + if ! test_ping;then + echo "FAIL: netns routing/connectivity with active listener on queues 0 and 1: $ret" 1>&2 exit $ret fi - test_ping_router - ret=$? - if [ $ret -ne 0 ];then - echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2 + if ! test_ping_router;then + echo "FAIL: netns router unreachable listener on queue 0 and 1: $ret" 1>&2 exit $ret fi @@ -228,9 +235,7 @@ test_queue() last=$(tail -n1 "$file") if [ x"$last" != x"$expected packets total" ]; then echo "FAIL: Expected $expected packets total, but got $last" 1>&2 - cat "$file" 1>&2 - - ip netns exec ${nsrouter} nft list ruleset + ip netns exec "$nsrouter" nft list ruleset exit 1 fi done @@ -245,56 +250,50 @@ listener_ready() test_tcp_forward() { - ip netns exec ${nsrouter} ./nf_queue -q 2 -t $timeout & + ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" & local nfqpid=$! - timeout 5 ip netns exec ${ns2} socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! - busywait $BUSYWAIT_TIMEOUT listener_ready ${ns2} - - ip netns exec ${ns1} socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" - wait $rpid + ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null - [ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain" + wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain" } test_tcp_localhost() { - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$TMPINPUT - timeout 5 ip netns exec ${nsrouter} socat -u TCP-LISTEN:12345 STDOUT >/dev/null & + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" + timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! - ip netns exec ${nsrouter} ./nf_queue -q 3 -t $timeout & + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & local nfqpid=$! - busywait $BUSYWAIT_TIMEOUT listener_ready ${nsrouter} + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" - ip netns exec ${nsrouter} socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback" + wait "$rpid" && echo "PASS: tcp via loopback" wait 2>/dev/null } test_tcp_localhost_connectclose() { - ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & + ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & - ip netns exec ${nsrouter} ./nf_queue -q 3 -t $timeout & - local nfqpid=$! - - sleep 1 + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 - wait $rpid - [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" + wait && echo "PASS: tcp via loopback with connect/close" wait 2>/dev/null } test_tcp_localhost_requeue() { -ip netns exec ${nsrouter} nft -f /dev/stdin </dev/null & + timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! - ip netns exec ${nsrouter} ./nf_queue -c -q 1 -t $timeout > "$TMPFILE2" & + ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t "$timeout" > "$TMPFILE2" & # nfqueue 1 will be called via output hook. But this time, # re-queue the packet to nfqueue program on queue 2. - ip netns exec ${nsrouter} ./nf_queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & + ip netns exec "$nsrouter" ./nf_queue -G -d 150 -c -q 0 -Q 1 -t "$timeout" > "$TMPFILE3" & - busywait $BUSYWAIT_TIMEOUT listener_ready ${nsrouter} - ip netns exec ${nsrouter} socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null wait @@ -330,17 +329,16 @@ EOF } test_icmp_vrf() { - ip -net $ns1 link add tvrf type vrf table 9876 - if [ $? -ne 0 ];then + if ! ip -net "$ns1" link add tvrf type vrf table 9876;then echo "SKIP: Could not add vrf device" return fi - ip -net $ns1 li set eth0 master tvrf - ip -net $ns1 li set tvrf up + ip -net "$ns1" li set eth0 master tvrf + ip -net "$ns1" li set tvrf up - ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876 -ip netns exec ${ns1} nft -f /dev/stdin < /dev/null + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1 + + ip netns exec "$ns1" ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null for n in output post; do for d in tvrf eth0; do - ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1" - if [ $? -ne 0 ] ; then + if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 - ip netns exec ${ns1} nft list ruleset + ip netns exec "$ns1" nft list ruleset ret=1 return fi done done - wait $nfqpid - [ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf" + wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf" wait 2>/dev/null } -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null load_ruleset "filter" 0 -test_ping -ret=$? -if [ $ret -eq 0 ];then +if test_ping; then # queue bypass works (rules were skipped, no listener) echo "PASS: ${ns1} can reach ${ns2}" else -- cgit v1.2.3 From a849e06c8025ed79d579c789cfbc08e2374883a4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:31 +0200 Subject: selftests: netfilter: nft_synproxy.sh: move to lib.sh infra use checktool helper where applicable. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-4-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_synproxy.sh | 77 ++++++++-------------- 1 file changed, 28 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_synproxy.sh b/tools/testing/selftests/net/netfilter/nft_synproxy.sh index b62933b680d6..293f667a6aec 100755 --- a/tools/testing/selftests/net/netfilter/nft_synproxy.sh +++ b/tools/testing/selftests/net/netfilter/nft_synproxy.sh @@ -1,84 +1,65 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -ret=0 - -rnd=$(mktemp -u XXXXXXXX) -nsr="nsr-$rnd" # synproxy machine -ns1="ns1-$rnd" # iperf client -ns2="ns2-$rnd" # iperf server +source lib.sh -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi -} +ret=0 checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" checktool "iperf3 --version" "run test without iperf3" -checktool "ip netns add $nsr" "create net namespace" -modprobe -q nf_conntrack +setup_ns nsr ns1 ns2 -ip netns add $ns1 -ip netns add $ns2 +modprobe -q nf_conntrack cleanup() { - ip netns pids $ns1 | xargs kill 2>/dev/null - ip netns pids $ns2 | xargs kill 2>/dev/null - ip netns del $ns1 - ip netns del $ns2 + ip netns pids "$ns1" | xargs kill 2>/dev/null + ip netns pids "$ns2" | xargs kill 2>/dev/null - ip netns del $nsr + cleanup_all_ns } trap cleanup EXIT -ip link add veth0 netns $nsr type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr type veth peer name eth0 netns $ns2 +ip link add veth0 netns "$nsr" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr" type veth peer name eth0 netns "$ns2" -for dev in lo veth0 veth1; do -ip -net $nsr link set $dev up +for dev in veth0 veth1; do + ip -net "$nsr" link set "$dev" up done -ip -net $nsr addr add 10.0.1.1/24 dev veth0 -ip -net $nsr addr add 10.0.2.1/24 dev veth1 +ip -net "$nsr" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr" addr add 10.0.2.1/24 dev veth1 -ip netns exec $nsr sysctl -q net.ipv4.conf.veth0.forwarding=1 -ip netns exec $nsr sysctl -q net.ipv4.conf.veth1.forwarding=1 -ip netns exec $nsr sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth0.forwarding=1 +ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth1.forwarding=1 +ip netns exec "$nsr" sysctl -q net.netfilter.nf_conntrack_tcp_loose=0 for n in $ns1 $ns2; do - ip -net $n link set lo up - ip -net $n link set eth0 up + ip -net "$n" link set eth0 up done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 # test basic connectivity -if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then echo "ERROR: $ns1 cannot reach $ns2" 1>&2 exit 1 fi -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then echo "ERROR: $ns2 cannot reach $ns1" 1>&2 exit 1 fi -ip netns exec $ns2 iperf3 -s > /dev/null 2>&1 & +ip netns exec "$ns2" iperf3 -s > /dev/null 2>&1 & # ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 & sleep 1 -ip netns exec $nsr nft -f - < /dev/null - -if [ $? -ne 0 ]; then +if ! ip netns exec "$ns1" timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null; then echo "FAIL: iperf3 returned an error" 1>&2 - ret=$? - ip netns exec $nsr nft list ruleset + ret=1 + ip netns exec "$nsr" nft list ruleset else echo "PASS: synproxy connection successful" fi -- cgit v1.2.3 From c1a9d47b59d02f466ead8c023f6822f68285edbe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:32 +0200 Subject: selftests: netfilter: nft_zones_many.sh: move to lib.sh infra Also do shellcheck cleanups here, no functional changes intended. When running tests via vng tool, the packetpath insertion test fails: dd: failed to open '/dev/stdout': Device or resource busy Just omit 'of=' and this will work as intended. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-5-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_zones_many.sh | 93 +++++++++++----------- 1 file changed, 45 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh index 5a8db0b48928..db53de348783 100755 --- a/tools/testing/selftests/net/netfilter/nft_zones_many.sh +++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh @@ -3,11 +3,7 @@ # Test insertion speed for packets with identical addresses/ports # that are all placed in distinct conntrack zones. -sfx=$(mktemp -u "XXXXXXXX") -ns="ns-$sfx" - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh zones=2000 have_ct_tool=0 @@ -15,35 +11,25 @@ ret=0 cleanup() { - ip netns del $ns -} - -checktool (){ - if ! $1 > /dev/null 2>&1; then - echo "SKIP: Could not $2" - exit $ksft_skip - fi + cleanup_all_ns } checktool "nft --version" "run test without nft tool" -checktool "ip -Version" "run test without ip tool" checktool "socat -V" "run test without socat tool" -checktool "ip netns add $ns" "create net namespace" + +setup_ns ns1 trap cleanup EXIT -conntrack -V > /dev/null 2>&1 -if [ $? -eq 0 ];then +if conntrack -V > /dev/null 2>&1; then have_ct_tool=1 fi -ip -net "$ns" link set lo up - test_zones() { local max_zones=$1 -ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 -ip netns exec $ns nft -f /dev/stdin</dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 + dd if=/dev/zero bs=8k count=1000 2>/dev/null | ip netns exec "$ns1" socat -u STDIN UDP:127.0.0.1:12345,sourceport=12345 if [ $? -ne 0 ] ;then ret=1 break @@ -89,14 +82,15 @@ EOF echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)" done - if [ $have_ct_tool -eq 1 ]; then - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) + if [ "$have_ct_tool" -eq 1 ]; then + local count duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) - if [ $count -eq $max_zones ]; then + if [ "$count" -eq "$max_zones" ]; then echo "PASS: inserted $count entries from packet path in $duration ms total" else - ip netns exec $ns conntrack -S 1>&2 + ip netns exec "$ns1" conntrack -S 1>&2 echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries" ret=1 fi @@ -110,18 +104,19 @@ EOF test_conntrack_tool() { local max_zones=$1 - ip netns exec $ns conntrack -F >/dev/null 2>/dev/null + ip netns exec "$ns1" conntrack -F >/dev/null 2>/dev/null - local outerstart=$(date +%s%3N) - local start=$(date +%s%3N) - local stop=$start - local i=0 - while [ $i -lt $max_zones ]; do + local outerstart start stop i + outerstart=$(date +%s%3N) + start=$(date +%s%3N) + stop="$start" + i=0 + while [ "$i" -lt "$max_zones" ]; do i=$((i + 1)) - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1 if [ $? -ne 0 ];then - ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ + ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \ --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null echo "FAIL: conntrack -I returned an error" ret=1 @@ -137,13 +132,15 @@ test_conntrack_tool() { fi done - local count=$(ip netns exec "$ns" conntrack -C) - local duration=$((stop-outerstart)) + local count + local duration + count=$(ip netns exec "$ns1" conntrack -C) + duration=$((stop-outerstart)) - if [ $count -eq $max_zones ]; then + if [ "$count" -eq "$max_zones" ]; then echo "PASS: inserted $count entries via ctnetlink in $duration ms" else - ip netns exec $ns conntrack -S 1>&2 + ip netns exec "$ns1" conntrack -S 1>&2 echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)" ret=1 fi @@ -151,7 +148,7 @@ test_conntrack_tool() { test_zones $zones -if [ $have_ct_tool -eq 1 ];then +if [ "$have_ct_tool" -eq 1 ];then test_conntrack_tool $zones else echo "SKIP: Could not run ctnetlink insertion test without conntrack tool" -- cgit v1.2.3 From 5067fec09403b36f8f6501bdb4c5a9c949ce930b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:33 +0200 Subject: selftests: netfilter: xt_string.sh: move to lib.sh infra Intentional changes: - Use socat instead of netcat - Use a temporary file instead of pipe, else packets do not match "-m string" rules, multiple writes to the pipe cause multiple packets, but this needs only one to work. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-6-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/xt_string.sh | 55 ++++++++++++---------- 1 file changed, 30 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh index 1802653a4728..ec7042b502e4 100755 --- a/tools/testing/selftests/net/netfilter/xt_string.sh +++ b/tools/testing/selftests/net/netfilter/xt_string.sh @@ -5,43 +5,45 @@ ksft_skip=4 rc=0 -if ! iptables --version >/dev/null 2>&1; then - echo "SKIP: Test needs iptables" - exit $ksft_skip -fi -if ! ip -V >/dev/null 2>&1; then - echo "SKIP: Test needs iproute2" - exit $ksft_skip -fi -if ! nc -h >/dev/null 2>&1; then - echo "SKIP: Test needs netcat" - exit $ksft_skip -fi +source lib.sh + +checktool "socat -h" "run test without socat" +checktool "iptables --version" "test needs iptables" + +infile=$(mktemp) + +cleanup() +{ + ip netns del "$netns" + rm -f "$infile" +} + +trap cleanup EXIT + +setup_ns netns + +ip -net "$netns" link add d0 type dummy +ip -net "$netns" link set d0 up +ip -net "$netns" addr add 10.1.2.1/24 dev d0 pattern="foo bar baz" patlen=11 hdrlen=$((20 + 8)) # IPv4 + UDP -ns="ns-$(mktemp -u XXXXXXXX)" -trap 'ip netns del $ns' EXIT -ip netns add "$ns" -ip -net "$ns" link add d0 type dummy -ip -net "$ns" link set d0 up -ip -net "$ns" addr add 10.1.2.1/24 dev d0 - -#ip netns exec "$ns" tcpdump -npXi d0 & + +#ip netns exec "$netns" tcpdump -npXi d0 & #tcpdump_pid=$! -#trap 'kill $tcpdump_pid; ip netns del $ns' EXIT +#trap 'kill $tcpdump_pid; ip netns del $netns' EXIT add_rule() { # (alg, from, to) - ip netns exec "$ns" \ + ip netns exec "$netns" \ iptables -A OUTPUT -o d0 -m string \ --string "$pattern" --algo $1 --from $2 --to $3 } showrules() { # () - ip netns exec "$ns" iptables -v -S OUTPUT | grep '^-A' + ip netns exec "$netns" iptables -v -S OUTPUT | grep '^-A' } zerorules() { - ip netns exec "$ns" iptables -Z OUTPUT + ip netns exec "$netns" iptables -Z OUTPUT } countrule() { # (pattern) showrules | grep -c -- "$*" @@ -51,7 +53,9 @@ send() { # (offset) printf " " done printf "$pattern" - ) | ip netns exec "$ns" nc -w 1 -u 10.1.2.2 27374 + ) > "$infile" + + ip netns exec "$netns" socat -t 1 -u STDIN UDP-SENDTO:10.1.2.2:27374 < "$infile" } add_rule bm 1000 1500 @@ -125,4 +129,5 @@ if [ $(countrule -c 1) -ne 0 ]; then ((rc--)) fi +[ $rc -eq 0 ] && echo "PASS: string match tests" exit $rc -- cgit v1.2.3 From c0f9a2b705c2f53c531c8d51853d5e990db804b9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:34 +0200 Subject: selftests: netfilter: xt_string.sh: shellcheck cleanups no functional change intended. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-7-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/xt_string.sh | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh index ec7042b502e4..8d401c69e317 100755 --- a/tools/testing/selftests/net/netfilter/xt_string.sh +++ b/tools/testing/selftests/net/netfilter/xt_string.sh @@ -37,7 +37,7 @@ hdrlen=$((20 + 8)) # IPv4 + UDP add_rule() { # (alg, from, to) ip netns exec "$netns" \ iptables -A OUTPUT -o d0 -m string \ - --string "$pattern" --algo $1 --from $2 --to $3 + --string "$pattern" --algo "$1" --from "$2" --to "$3" } showrules() { # () ip netns exec "$netns" iptables -v -S OUTPUT | grep '^-A' @@ -49,10 +49,10 @@ countrule() { # (pattern) showrules | grep -c -- "$*" } send() { # (offset) - ( for ((i = 0; i < $1 - $hdrlen; i++)); do - printf " " + ( for ((i = 0; i < $1 - hdrlen; i++)); do + echo -n " " done - printf "$pattern" + echo -n "$pattern" ) > "$infile" ip netns exec "$netns" socat -t 1 -u STDIN UDP-SENDTO:10.1.2.2:27374 < "$infile" @@ -65,8 +65,8 @@ add_rule kmp 1400 1600 zerorules send 0 -send $((1000 - $patlen)) -if [ $(countrule -c 0 0) -ne 4 ]; then +send $((1000 - patlen)) +if [ "$(countrule -c 0 0)" -ne 4 ]; then echo "FAIL: rules match data before --from" showrules ((rc--)) @@ -74,16 +74,16 @@ fi zerorules send 1000 -send $((1400 - $patlen)) -if [ $(countrule -c 2) -ne 2 ]; then +send $((1400 - patlen)) +if [ "$(countrule -c 2)" -ne 2 ]; then echo "FAIL: only two rules should match at low offset" showrules ((rc--)) fi zerorules -send $((1500 - $patlen)) -if [ $(countrule -c 1) -ne 4 ]; then +send $((1500 - patlen)) +if [ "$(countrule -c 1)" -ne 4 ]; then echo "FAIL: all rules should match at end of packet" showrules ((rc--)) @@ -91,7 +91,7 @@ fi zerorules send 1495 -if [ $(countrule -c 1) -ne 1 ]; then +if [ "$(countrule -c 1)" -ne 1 ]; then echo "FAIL: only kmp with proper --to should match pattern spanning fragments" showrules ((rc--)) @@ -99,23 +99,23 @@ fi zerorules send 1500 -if [ $(countrule -c 1) -ne 2 ]; then +if [ "$(countrule -c 1)" -ne 2 ]; then echo "FAIL: two rules should match pattern at start of second fragment" showrules ((rc--)) fi zerorules -send $((1600 - $patlen)) -if [ $(countrule -c 1) -ne 2 ]; then +send $((1600 - patlen)) +if [ "$(countrule -c 1)" -ne 2 ]; then echo "FAIL: two rules should match pattern at end of largest --to" showrules ((rc--)) fi zerorules -send $((1600 - $patlen + 1)) -if [ $(countrule -c 1) -ne 0 ]; then +send $((1600 - patlen + 1)) +if [ "$(countrule -c 1)" -ne 0 ]; then echo "FAIL: no rules should match pattern extending largest --to" showrules ((rc--)) @@ -123,7 +123,7 @@ fi zerorules send 1600 -if [ $(countrule -c 1) -ne 0 ]; then +if [ "$(countrule -c 1)" -ne 0 ]; then echo "FAIL: no rule should match pattern past largest --to" showrules ((rc--)) -- cgit v1.2.3 From d6905f088d2bda50519c6bbf841b1035391eebc9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:35 +0200 Subject: selftests: netfilter: nft_nat_zones.sh: shellcheck cleanups While at it: No need for iperf here, use socat. This also reduces the script runtime. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-8-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_nat_zones.sh | 193 ++++++++------------- 1 file changed, 75 insertions(+), 118 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh index b9ab37380f33..549f264b41f3 100755 --- a/tools/testing/selftests/net/netfilter/nft_nat_zones.sh +++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh @@ -3,15 +3,14 @@ # Test connection tracking zone and NAT source port reallocation support. # -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 +source lib.sh # Don't increase too much, 2000 clients should work # just fine but script can then take several minutes with # KASAN/debug builds. maxclients=100 -have_iperf=1 +have_socat=0 ret=0 # client1---. @@ -31,12 +30,6 @@ ret=0 # NAT Gateway is supposed to do port reallocation for each of the # connections. -sfx=$(mktemp -u "XXXXXXXX") -gw="ns-gw-$sfx" -cl1="ns-cl1-$sfx" -cl2="ns-cl2-$sfx" -srv="ns-srv-$sfx" - v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null) v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null) v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null) @@ -46,61 +39,29 @@ v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null) cleanup() { - ip netns del $gw - ip netns del $srv - for i in $(seq 1 $maxclients); do - ip netns del ns-cl$i-$sfx 2>/dev/null - done - - sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null - sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null - sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null + cleanup_all_ns + + sysctl -q net.ipv4.neigh.default.gc_thresh1="$v4gc1" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh2="$v4gc2" 2>/dev/null + sysctl -q net.ipv4.neigh.default.gc_thresh3="$v4gc3" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh1="$v6gc1" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh2="$v6gc2" 2>/dev/null + sysctl -q net.ipv6.neigh.default.gc_thresh3="$v6gc3" 2>/dev/null } -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi +checktool "nft --version" echo "run test without nft tool" +checktool "conntrack -V" "run test without conntrack tool" -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip +if socat -h >/dev/null 2>&1; then + have_socat=1 fi -conntrack -V > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi - -iperf3 -v >/dev/null 2>&1 -if [ $? -ne 0 ];then - have_iperf=0 -fi - -ip netns add "$gw" -if [ $? -ne 0 ];then - echo "SKIP: Could not create net namespace $gw" - exit $ksft_skip -fi -ip -net "$gw" link set lo up +setup_ns gw srv trap cleanup EXIT -ip netns add "$srv" -if [ $? -ne 0 ];then - echo "SKIP: Could not create server netns $srv" - exit $ksft_skip -fi - ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv" ip -net "$gw" link set veth0 up -ip -net "$srv" link set lo up ip -net "$srv" link set eth0 up sysctl -q net.ipv6.neigh.default.gc_thresh1=512 2>/dev/null @@ -110,55 +71,49 @@ sysctl -q net.ipv4.neigh.default.gc_thresh1=512 2>/dev/null sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" +for i in $(seq 1 "$maxclients");do + setup_ns "cl$i" - ip netns add "$cl" - if [ $? -ne 0 ];then - echo "SKIP: Could not create client netns $cl" - exit $ksft_skip - fi - ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1 - if [ $? -ne 0 ];then + cl=$(eval echo \$cl"$i") + if ! ip link add veth"$i" netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1;then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi done -for i in $(seq 1 $maxclients);do - cl="ns-cl$i-$sfx" - echo netns exec "$cl" ip link set lo up +for i in $(seq 1 "$maxclients");do + cl=$(eval echo \$cl"$i") echo netns exec "$cl" ip link set eth0 up echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2 - echo netns exec "$gw" ip link set veth$i up - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2 - echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0 + echo netns exec "$gw" ip link set "veth$i" up + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".arp_ignore=2 + echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".rp_filter=0 # clients have same IP addresses. echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0 - echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 + echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 nodad echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0 echo netns exec "$cl" ip route add default via dead:1::2 dev eth0 # NB: same addresses on client-facing interfaces. - echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i - echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i + echo netns exec "$gw" ip addr add 10.1.0.2/24 dev "veth$i" + echo netns exec "$gw" ip addr add dead:1::2/64 dev "veth$i" nodad # gw: policy routing - echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i)) - echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i)) + echo netns exec "$gw" ip route add 10.1.0.0/24 dev "veth$i" table $((1000+i)) + echo netns exec "$gw" ip route add dead:1::0/64 dev "veth$i" table $((1000+i)) echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i)) echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i)) - echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i)) + echo netns exec "$gw" ip rule add fwmark "$i" lookup $((1000+i)) done | ip -batch /dev/stdin ip -net "$gw" addr add 10.3.0.1/24 dev veth0 -ip -net "$gw" addr add dead:3::1/64 dev veth0 +ip -net "$gw" addr add dead:3::1/64 dev veth0 nodad ip -net "$srv" addr add 10.3.0.99/24 dev eth0 -ip -net "$srv" addr add dead:3::99/64 dev eth0 +ip -net "$srv" addr add dead:3::99/64 dev eth0 nodad -ip netns exec $gw nft -f /dev/stdin< /dev/null ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null @@ -224,73 +183,72 @@ ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null -for i in $(seq 1 $maxclients); do - cl="ns-cl$i-$sfx" - ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & - if [ $? -ne 0 ]; then - echo FAIL: Ping failure from $cl 1>&2 - ret=1 - break - fi +for i in $(seq 1 "$maxclients"); do + cl=$(eval echo \$cl"$i") + ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 & done -wait +wait || ret=1 -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }" - if [ $? -ne 0 ];then +[ "$ret" -ne 0 ] && "FAIL: Ping failure from $cl" 1>&2 + +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"; then ret=1 echo "FAIL: counter icmp mismatch for veth$i" 1>&2 - ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2 break fi done -ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" -if [ $? -ne 0 ];then +if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"; then ret=1 - echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }" - ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 + echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }" + ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2 fi -if [ $ret -eq 0 ]; then +if [ $ret -eq 0 ]; then echo "PASS: ping test from all $maxclients namespaces" fi -if [ $have_iperf -eq 0 ];then - echo "SKIP: iperf3 not installed" +if [ $have_socat -eq 0 ];then + echo "SKIP: socat not installed" if [ $ret -ne 0 ];then exit $ret fi exit $ksft_skip fi -ip netns exec $srv iperf3 -s > /dev/null 2>&1 & -iperfpid=$! -sleep 1 +listener_ready() +{ + ss -N "$1" -lnt -o "sport = :5201" | grep -q 5201 +} + +ip netns exec "$srv" socat -u TCP-LISTEN:5201,fork STDOUT > /dev/null 2>/dev/null & +socatpid=$! + +busywait 1000 listener_ready "$srv" -for i in $(seq 1 $maxclients); do +for i in $(seq 1 "$maxclients"); do if [ $ret -ne 0 ]; then break fi - cl="ns-cl$i-$sfx" - ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null - if [ $? -ne 0 ]; then - echo FAIL: Failure to connect for $cl 1>&2 - ip netns exec $gw conntrack -S 1>&2 + cl=$(eval echo \$cl"$i") + if ! ip netns exec "$cl" socat -4 -u STDIN TCP:10.3.0.99:5201,sourceport=10000 < /dev/null > /dev/null; then + echo "FAIL: Failure to connect for $cl" 1>&2 + ip netns exec "$gw" conntrack -S 1>&2 ret=1 fi done if [ $ret -eq 0 ];then - echo "PASS: iperf3 connections for all $maxclients net namespaces" + echo "PASS: socat connections for all $maxclients net namespaces" fi -kill $iperfpid +kill $socatpid wait -for i in $(seq 1 $maxclients); do - ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null - if [ $? -ne 0 ];then +for i in $(seq 1 "$maxclients"); do + if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null;then ret=1 echo "FAIL: can't find expected tcp entry for veth$i" 1>&2 break @@ -300,8 +258,7 @@ if [ $ret -eq 0 ];then echo "PASS: Found client connection for all $maxclients net namespaces" fi -ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null -if [ $? -ne 0 ];then +if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null;then ret=1 echo "FAIL: cannot find return entry on veth0" 1>&2 fi -- cgit v1.2.3 From 05af10a88e754dd9e7e2e7a40142df0317d3c632 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:36 +0200 Subject: selftests: netfilter: conntrack_ipip_mtu.sh: shellcheck cleanups No functional change intended. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-9-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/conntrack_ipip_mtu.sh | 74 +++++++++++----------- 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh index ac0dff0f80d7..9832a5d0198a 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh @@ -31,7 +31,7 @@ setup_ns r_a r_b r_w c_a c_b cleanup() { cleanup_all_ns - rm -f ${rx} + rm -f "$rx" } trap cleanup EXIT @@ -46,20 +46,20 @@ listener_ready() test_path() { msg="$1" - ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null & + ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null & busywait $BUSYWAIT_TIMEOUT listener_ready "$c_b" 5000 for i in 1 2 3; do head -c1400 /dev/zero | tr "\000" "a" | \ - ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000 + ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000 done wait - bytes=$(wc -c < ${rx}) + bytes=$(wc -c < "$rx") - if [ $bytes -eq 1400 ];then + if [ "$bytes" -eq 1400 ];then echo "OK: PMTU $msg connection tracking" else echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" @@ -78,24 +78,24 @@ test_path() { # 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) # No iptables rules at all. -ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w} -ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a} +ip link add veth0 netns "$r_a" type veth peer name veth0 netns "$r_w" +ip link add veth1 netns "$r_a" type veth peer name veth0 netns "$c_a" l_addr="10.2.2.1" r_addr="10.4.4.1" -ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip +ip netns exec "$r_a" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit $ksft_skip for dev in lo veth0 veth1 ipip0; do - ip -net ${r_a} link set $dev up + ip -net "$r_a" link set "$dev" up done -ip -net ${r_a} addr add 10.2.2.1/24 dev veth0 -ip -net ${r_a} addr add 192.168.10.1/24 dev veth1 +ip -net "$r_a" addr add 10.2.2.1/24 dev veth0 +ip -net "$r_a" addr add 192.168.10.1/24 dev veth1 -ip -net ${r_a} route add 192.168.20.0/24 dev ipip0 -ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254 +ip -net "$r_a" route add 192.168.20.0/24 dev ipip0 +ip -net "$r_a" route add 10.4.4.0/24 via 10.2.2.254 -ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Detailed setup for Router B # --------------------------- @@ -108,46 +108,46 @@ ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) # No iptables rules at all. -ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w} -ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b} +ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w" +ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b" l_addr="10.4.4.1" r_addr="10.2.2.1" -ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip +ip netns exec "$r_b" ip link add ipip0 type ipip local "${l_addr}" remote "${r_addr}" mode ipip || exit $ksft_skip for dev in veth0 veth1 ipip0; do - ip -net ${r_b} link set $dev up + ip -net "$r_b" link set $dev up done -ip -net ${r_b} addr add 10.4.4.1/24 dev veth0 -ip -net ${r_b} addr add 192.168.20.1/24 dev veth1 +ip -net "$r_b" addr add 10.4.4.1/24 dev veth0 +ip -net "$r_b" addr add 192.168.20.1/24 dev veth1 -ip -net ${r_b} route add 192.168.10.0/24 dev ipip0 -ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254 -ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip -net "$r_b" route add 192.168.10.0/24 dev ipip0 +ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254 +ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Client A -ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 -ip -net ${c_a} link set dev veth0 up -ip -net ${c_a} route add default via 192.168.10.1 +ip -net "$c_a" addr add 192.168.10.2/24 dev veth0 +ip -net "$c_a" link set dev veth0 up +ip -net "$c_a" route add default via 192.168.10.1 # Client A -ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 -ip -net ${c_b} link set dev veth0 up -ip -net ${c_b} route add default via 192.168.20.1 +ip -net "$c_b" addr add 192.168.20.2/24 dev veth0 +ip -net "$c_b" link set dev veth0 up +ip -net "$c_b" route add default via 192.168.20.1 # Wan -ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 -ip -net ${r_w} addr add 10.4.4.254/24 dev veth1 +ip -net "$r_w" addr add 10.2.2.254/24 dev veth0 +ip -net "$r_w" addr add 10.4.4.254/24 dev veth1 -ip -net ${r_w} link set dev veth0 up mtu 1400 -ip -net ${r_w} link set dev veth1 up mtu 1400 +ip -net "$r_w" link set dev veth0 up mtu 1400 +ip -net "$r_w" link set dev veth1 up mtu 1400 -ip -net ${r_a} link set dev veth0 mtu 1400 -ip -net ${r_b} link set dev veth0 mtu 1400 +ip -net "$r_a" link set dev veth0 mtu 1400 +ip -net "$r_b" link set dev veth0 mtu 1400 -ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null +ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null # Path MTU discovery # ------------------ @@ -187,5 +187,5 @@ test_path "without" #packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is #dropped on Router A before sending. -ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW +ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW test_path "with" -- cgit v1.2.3 From 9b443c769b1bee5df800ab069c015bc57b16237a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:37 +0200 Subject: selftests: netfilter: nft_fib.sh: shellcheck cleanups no functional change intended. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-10-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_fib.sh | 128 +++++++++++------------ 1 file changed, 61 insertions(+), 67 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh index 04d6dc886b8a..ce1451c275fd 100755 --- a/tools/testing/selftests/net/netfilter/nft_fib.sh +++ b/tools/testing/selftests/net/netfilter/nft_fib.sh @@ -16,7 +16,7 @@ cleanup() { cleanup_all_ns - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns } checktool "nft --version" "run test without nft" @@ -25,8 +25,7 @@ setup_ns nsrouter ns1 ns2 trap cleanup EXIT -dmesg | grep -q ' nft_rpfilter: ' -if [ $? -eq 0 ]; then +if dmesg | grep -q ' nft_rpfilter: ';then dmesg -c | grep ' nft_rpfilter: ' echo "WARN: a previous test run has failed" 1>&2 fi @@ -36,7 +35,7 @@ sysctl -q net.netfilter.nf_log_all_netns=1 load_ruleset() { local netns=$1 -ip netns exec ${netns} nft -f /dev/stdin <&2 - ip netns exec ${ns} nft list table inet filter + ip netns exec "$ns" nft list table inet filter return 1 fi - if [ $want -gt 0 ]; then + if [ "$want" -gt 0 ]; then echo "PASS: fib expression did drop packets for $address" fi return 0 } -load_ruleset ${nsrouter} -load_ruleset ${ns1} -load_ruleset ${ns2} +load_ruleset "$nsrouter" +load_ruleset "$ns1" +load_ruleset "$ns2" if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: No virtual ethernet pair device support in kernel" exit $ksft_skip fi -ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} +ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" -ip -net ${nsrouter} link set veth0 up -ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 -ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 nodad +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad -ip -net ${nsrouter} link set veth1 up -ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 -ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 nodad +ip -net "$nsrouter" link set veth1 up +ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 +ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad -ip -net ${ns1} link set eth0 up -ip -net ${ns2} link set eth0 up +ip -net "$ns1" link set eth0 up +ip -net "$ns2" link set eth0 up -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr add dead:1::99/64 dev eth0 nodad -ip -net ${ns1} route add default via 10.0.1.1 -ip -net ${ns1} route add default via dead:1::1 +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" route add default via dead:1::1 -ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 -ip -net ${ns2} addr add dead:2::99/64 dev eth0 nodad -ip -net ${ns2} route add default via 10.0.2.1 -ip -net ${ns2} route add default via dead:2::1 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 test_ping() { local daddr4=$1 @@ -155,11 +151,11 @@ test_ping() { return 0 } -ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null -ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null +ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null test_ping 10.0.2.1 dead:2::1 || exit 1 check_drops || exit 1 @@ -169,69 +165,67 @@ check_drops || exit 1 echo "PASS: fib expression did not cause unwanted packet drops" -ip netns exec ${nsrouter} nft flush table inet filter +ip netns exec "$nsrouter" nft flush table inet filter -ip -net ${ns1} route del default -ip -net ${ns1} -6 route del default +ip -net "$ns1" route del default +ip -net "$ns1" -6 route del default -ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 -ip -net ${ns1} addr del dead:1::99/64 dev eth0 +ip -net "$ns1" addr del 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr del dead:1::99/64 dev eth0 -ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr add 10.0.2.99/24 dev eth0 ip -net "$ns1" addr add dead:2::99/64 dev eth0 nodad -ip -net ${ns1} route add default via 10.0.2.1 -ip -net ${ns1} -6 route add default via dead:2::1 +ip -net "$ns1" route add default via 10.0.2.1 +ip -net "$ns1" -6 route add default via dead:2::1 ip -net "$nsrouter" addr add dead:2::1/64 dev veth0 nodad # switch to ruleset that doesn't log, this time # its expected that this does drop the packets. -load_ruleset_count ${nsrouter} +load_ruleset_count "$nsrouter" # ns1 has a default route, but nsrouter does not. # must not check return value, ping to 1.1.1.1 will # fail. -check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 -check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 +check_fib_counter 0 "$nsrouter" 1.1.1.1 || exit 1 +check_fib_counter 0 "$nsrouter" 1c3::c01d || exit 1 ip netns exec "$ns1" ping -W 0.5 -c 1 -q 1.1.1.1 > /dev/null -check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 +check_fib_counter 1 "$nsrouter" 1.1.1.1 || exit 1 ip netns exec "$ns1" ping -W 0.5 -i 0.1 -c 3 -q 1c3::c01d > /dev/null -check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 +check_fib_counter 3 "$nsrouter" 1c3::c01d || exit 1 # delete all rules -ip netns exec ${ns1} nft flush ruleset -ip netns exec ${ns2} nft flush ruleset -ip netns exec ${nsrouter} nft flush ruleset +ip netns exec "$ns1" nft flush ruleset +ip netns exec "$ns2" nft flush ruleset +ip netns exec "$nsrouter" nft flush ruleset -ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad -ip -net ${ns1} addr del 10.0.2.99/24 dev eth0 -ip -net ${ns1} addr del dead:2::99/64 dev eth0 +ip -net "$ns1" addr del 10.0.2.99/24 dev eth0 +ip -net "$ns1" addr del dead:2::99/64 dev eth0 -ip -net ${nsrouter} addr del dead:2::1/64 dev veth0 +ip -net "$nsrouter" addr del dead:2::1/64 dev veth0 # ... pbr ruleset for the router, check iif+oif. -load_pbr_ruleset ${nsrouter} -if [ $? -ne 0 ] ; then +if ! load_pbr_ruleset "$nsrouter";then echo "SKIP: Could not load fib forward ruleset" exit $ksft_skip fi -ip -net ${nsrouter} rule add from all table 128 -ip -net ${nsrouter} rule add from all iif veth0 table 129 -ip -net ${nsrouter} route add table 128 to 10.0.1.0/24 dev veth0 -ip -net ${nsrouter} route add table 129 to 10.0.2.0/24 dev veth1 +ip -net "$nsrouter" rule add from all table 128 +ip -net "$nsrouter" rule add from all iif veth0 table 129 +ip -net "$nsrouter" route add table 128 to 10.0.1.0/24 dev veth0 +ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1 # drop main ipv4 table -ip -net ${nsrouter} -4 rule delete table main +ip -net "$nsrouter" -4 rule delete table main -test_ping 10.0.2.99 dead:2::99 -if [ $? -ne 0 ] ; then - ip -net ${nsrouter} nft list ruleset +if ! test_ping 10.0.2.99 dead:2::99;then + ip -net "$nsrouter" nft list ruleset echo "FAIL: fib mismatch in pbr setup" exit 1 fi -- cgit v1.2.3 From 4d7730154ed542d6a502a7ed0778069fafa2461f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:38 +0200 Subject: selftests: netfilter: nft_meta.sh: small shellcheck cleanup shellcheck complains about missing "", so add those. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-11-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_meta.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_meta.sh b/tools/testing/selftests/net/netfilter/nft_meta.sh index f33154c04d34..71505b6cb252 100755 --- a/tools/testing/selftests/net/netfilter/nft_meta.sh +++ b/tools/testing/selftests/net/netfilter/nft_meta.sh @@ -91,10 +91,10 @@ check_one_counter() local want="packets $2" local verbose="$3" - if ! ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want"; then + if ! ip netns exec "$ns0" nft list counter inet filter "$cname" | grep -q "$want"; then echo "FAIL: $cname, want \"$want\", got" ret=1 - ip netns exec "$ns0" nft list counter inet filter $cname + ip netns exec "$ns0" nft list counter inet filter "$cname" fi } -- cgit v1.2.3 From 1f50b0fef936abe55ef9da56b27cb9567dd1a37b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:39 +0200 Subject: selftests: netfilter: nft_audit.sh: add more skip checks This testcase doesn't work if auditd is running, audit_logread will not receive any data in that case. Add a nftables feature test for the reset keyword and skip this test if that fails. While at it, do a few minor shellcheck cleanups. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-12-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_audit.sh | 30 +++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh index 99ed5bd6e840..b390437696ba 100755 --- a/tools/testing/selftests/net/netfilter/nft_audit.sh +++ b/tools/testing/selftests/net/netfilter/nft_audit.sh @@ -6,11 +6,33 @@ SKIP_RC=4 RC=0 +if [ -r /var/run/auditd.pid ];then + read pid < /var/run/auditd.pid + p=$(pgrep ^auditd$) + + if [ "$pid" -eq "$p" ]; then + echo "SKIP: auditd is running" + exit $SKIP_RC + fi +fi + nft --version >/dev/null 2>&1 || { echo "SKIP: missing nft tool" exit $SKIP_RC } +# nft must be recent enough to support "reset" keyword. +nft --check -f /dev/stdin >/dev/null 2>&1 <$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=500 op=nft_register_rule' @@ -101,7 +123,7 @@ do_test 'nft add counter t2 c1; add counter t2 c2' \ for ((i = 3; i <= 500; i++)); do echo "add counter t2 c$i" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=498 op=nft_register_obj' @@ -115,7 +137,7 @@ do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \ for ((i = 3; i <= 500; i++)); do echo "add quota t2 q$i { 10 bytes }" -done >$rulefile +done > "$rulefile" do_test "nft -f $rulefile" \ 'table=t2 family=2 entries=498 op=nft_register_obj' @@ -157,7 +179,7 @@ table=t2 family=2 entries=135 op=nft_reset_rule' # resetting sets and elements -elem=(22 ,80 ,443) +elem=(22 ",80" ",443") relem="" for i in {1..3}; do relem+="${elem[((i - 1))]}" -- cgit v1.2.3 From 0b2e1db97b4210f8951ca05dedabc46d60458293 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Apr 2024 17:27:40 +0200 Subject: selftests: netfilter: update makefiles and kernel config Jakub reports the Makefile missed a few updates to make kselftest-install work for the netfilter tests and points out that config file lacks many dependencies such as VETH support. The settings file (timeout 8m) is added for nft_concat_range.sh script which can take several minutes to complete. Fixes: 3f189349e52a ("selftests: netfilter: move to net subdir") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/all/20240412175413.04e5e616@kernel.org/ Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240418152744.15105-13-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/Makefile | 5 +++ tools/testing/selftests/net/netfilter/config | 52 +++++++++++++++++++++++++- tools/testing/selftests/net/netfilter/settings | 1 + 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/netfilter/settings (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index dd9a75a33d28..68e4780edfdc 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -42,3 +42,8 @@ $(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS) $(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS) $(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS) + +TEST_FILES := lib.sh + +TEST_INCLUDES := \ + ../lib.sh diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 9df6a9f11384..60b86c7f3ea1 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -1,37 +1,87 @@ CONFIG_AUDIT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BRIDGE=m CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_IP=m CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_T_FILTER=m CONFIG_BRIDGE_NETFILTER=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_CGROUP_BPF=y +CONFIG_DUMMY=m +CONFIG_INET_ESP=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP_NF_RAW=m +CONFIG_IP6_NF_RAW=m CONFIG_IP_SCTP=m CONFIG_IP_VS=m CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_RR=m +CONFIG_IPV6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_MACVLAN=m +CONFIG_NAMESPACES=y CONFIG_NET_CLS_U32=m +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NS=y CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_HTB=m CONFIG_NET_IPIP=m CONFIG_NET_VRF=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_NETLINK_QUEUE=m CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NETFILTER_XTABLES=m CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STRING=m CONFIG_NETFILTER_XT_TARGET_REDIRECT=m CONFIG_NF_CONNTRACK=m -CONFIG_NF_CONNTRACK_EVENTS=m +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_MARK=y CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_CT_NETLINK=m CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_FLOW_TABLE=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_LOG_IPV6=m +CONFIG_NF_NAT=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_BRIDGE=m CONFIG_NF_TABLES_INET=y CONFIG_NF_TABLES_IPV4=y CONFIG_NF_TABLES_IPV6=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_COMPAT=m CONFIG_NFT_CT=m CONFIG_NFT_FIB=m CONFIG_NFT_FIB_INET=m CONFIG_NFT_FIB_IPV4=m CONFIG_NFT_FIB_IPV6=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_LOG=m CONFIG_NFT_MASQ=m CONFIG_NFT_NAT=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REDIR=m CONFIG_NFT_SYNPROXY=m +CONFIG_VETH=m +CONFIG_VLAN_8021Q=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_STATISTICS=y diff --git a/tools/testing/selftests/net/netfilter/settings b/tools/testing/selftests/net/netfilter/settings new file mode 100644 index 000000000000..288bd9704773 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/settings @@ -0,0 +1 @@ +timeout=500 -- cgit v1.2.3 From e7bb43898bcf54da7ffb4819a04c8428f7db24db Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Tue, 16 Apr 2024 08:00:26 +0200 Subject: ASoC: dapm-graph: new tool to visualize DAPM state Add a tool to generate a picture of the current DAPM state for a sound card. dapm-graph is inspired by vizdapm which used to be published on a Wolfson Micro git repository now disappeared, and has a few forks around: https://github.com/mihais/asoc-tools https://github.com/alexandrebelloni/asoc-tools dapm-graph is a full reimplementation with several improvements while still being a self-contained shell script: Improvements to rendered output: - shows the entire card, not one component hierarchy only - each component is rendered in a separate box - shows widget on/off status based on widget information alone (the original vizdapm propagates the "on" green colour to the first input widget) - use bold line and gray background and not only green/red line to show on/off status (for the color blind) Improvements for embedded system developers: - remote mode: get state of remote device (possibly with minimal rootfs) via SSH, but parsing locally for faster operation - compatible with BusyBox shell, not only bash Usability improvements: - flexible command line (uses getopts for parsing) - detailed help text - flag to enable detailed debug logging - graphviz output format detected from file extension, not hard coded - a self-contained shell script Usage is designed to be simple: dapm-grpah -c CARD - get state from debugfs for CARD dapm-grpah -c CARD -r REMOTE_TARGET - same, but remotely via SSH dapm-grpah -d STATE_DIR - from a local copy of the debugfs tree for a card Signed-off-by: Luca Ceresoli Reviewed-by: Alexandre Belloni Link: https://lore.kernel.org/r/20240416-vizdapm-ng-v1-3-5d33c0b57bc5@bootlin.com Signed-off-by: Mark Brown --- MAINTAINERS | 6 + tools/sound/dapm-graph | 303 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 309 insertions(+) create mode 100755 tools/sound/dapm-graph (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index a7d0dd91ac19..aa3b455588e1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20669,6 +20669,12 @@ F: include/trace/events/sof*.h F: include/uapi/sound/asoc.h F: sound/soc/ +SOUND - SOC LAYER / dapm-graph +M: Luca Ceresoli +L: linux-sound@vger.kernel.org +S: Maintained +F: tools/sound/dapm-graph + SOUND - SOUND OPEN FIRMWARE (SOF) DRIVERS M: Pierre-Louis Bossart M: Liam Girdwood diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph new file mode 100755 index 000000000000..57d78f6df041 --- /dev/null +++ b/tools/sound/dapm-graph @@ -0,0 +1,303 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate a graph of the current DAPM state for an audio card +# +# Copyright 2024 Bootlin +# Author: Luca Ceresoli + +set -eu + +STYLE_NODE_ON="shape=box,style=bold,color=green4" +STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95" + +# Print usage and exit +# +# $1 = exit return value +# $2 = error string (required if $1 != 0) +usage() +{ + if [ "${1}" -ne 0 ]; then + echo "${2}" >&2 + fi + + echo " +Generate a graph of the current DAPM state for an audio card. + +The DAPM state can be obtained via debugfs for a card on the local host or +a remote target, or from a local copy of the debugfs tree for the card. + +Usage: + $(basename $0) [options] -c CARD - Local sound card + $(basename $0) [options] -c CARD -r REMOTE_TARGET - Card on remote system + $(basename $0) [options] -d STATE_DIR - Local directory + +Options: + -c CARD Sound card to get DAPM state of + -r REMOTE_TARGET Get DAPM state from REMOTE_TARGET via SSH and SCP + instead of using a local sound card + -d STATE_DIR Get DAPM state from a local copy of a debugfs tree + -o OUT_FILE Output file (default: dapm.dot) + -D Show verbose debugging info + -h Print this help and exit + +The output format is implied by the extension of OUT_FILE: + + * Use the .dot extension to generate a text graph representation in + graphviz dot syntax. + * Any other extension is assumed to be a format supported by graphviz for + rendering, e.g. 'png', 'svg', and will produce both the .dot file and a + picture from it. This requires the 'dot' program from the graphviz + package. +" + + exit ${1} +} + +# Connect to a remote target via SSH, collect all DAPM files from debufs +# into a tarball and get the tarball via SCP into $3/dapm.tar +# +# $1 = target as used by ssh and scp, e.g. "root@192.168.1.1" +# $2 = sound card name +# $3 = temp dir path (present on the host, created on the target) +# $4 = local directory to extract the tarball into +# +# Requires an ssh+scp server, find and tar+gz on the target +# +# Note: the tarball is needed because plain 'scp -r' from debugfs would +# copy only empty files +grab_remote_files() +{ + echo "Collecting DAPM state from ${1}" + dbg_echo "Collected DAPM state in ${3}" + + ssh "${1}" " +set -eu && +cd \"/sys/kernel/debug/asoc/${2}\" && +find * -type d -exec mkdir -p ${3}/dapm-tree/{} \; && +find * -type f -exec cp \"{}\" \"${3}/dapm-tree/{}\" \; && +cd ${3}/dapm-tree && +tar cf ${3}/dapm.tar ." + scp -q "${1}:${3}/dapm.tar" "${3}" + + mkdir -p "${4}" + tar xf "${tmp_dir}/dapm.tar" -C "${4}" +} + +# Parse a widget file and generate graph description in graphviz dot format +# +# Skips any file named "bias_level". +# +# $1 = temporary work dir +# $2 = component name +# $3 = widget filename +process_dapm_widget() +{ + local tmp_dir="${1}" + local c_name="${2}" + local w_file="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + local w_name="$(basename "${w_file}")" + local w_tag="${c_name}_${w_name}" + + if [ "${w_name}" = "bias_level" ]; then + return 0 + fi + + dbg_echo " + Widget: ${w_name}" + + cat "${w_file}" | ( + read line + + if echo "${line}" | grep -q ': On ' + then local node_style="${STYLE_NODE_ON}" + else local node_style="${STYLE_NODE_OFF}" + fi + + local w_type="" + while read line; do + # Collect widget type if present + if echo "${line}" | grep -q '^widget-type '; then + local w_type_raw="$(echo "$line" | cut -d ' ' -f 2)" + dbg_echo " - Widget type: ${w_type_raw}" + + # Note: escaping '\n' is tricky to get working with both + # bash and busybox ash, so use a '%' here and replace it + # later + local w_type="%n[${w_type_raw}]" + fi + + # Collect any links. We could use "in" links or "out" links, + # let's use "in" links + if echo "${line}" | grep -q '^in '; then + local w_src=$(echo "$line" | + awk -F\" '{print $6 "_" $4}' | + sed 's/^(null)_/ROOT_/') + dbg_echo " - Input route from: ${w_src}" + echo " \"${w_src}\" -> \"$w_tag\"" >> "${links_file}" + fi + done + + echo " \"${w_tag}\" [label=\"${w_name}${w_type}\",${node_style}]" | + tr '%' '\\' >> "${dot_file}" + ) +} + +# Parse the DAPM tree for a sound card component and generate graph +# description in graphviz dot format +# +# $1 = temporary work dir +# $2 = component directory +# $3 = forced component name (extracted for path if empty) +process_dapm_component() +{ + local tmp_dir="${1}" + local c_dir="${2}" + local c_name="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + if [ -z "${c_name}" ]; then + # Extract directory name into component name: + # "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a" + c_name="$(basename $(dirname "${c_dir}"))" + fi + + dbg_echo " * Component: ${c_name}" + + echo "" >> "${dot_file}" + echo " subgraph \"${c_name}\" {" >> "${dot_file}" + echo " cluster = true" >> "${dot_file}" + echo " label = \"${c_name}\"" >> "${dot_file}" + echo " color=dodgerblue" >> "${dot_file}" + + # Create empty file to ensure it will exist in all cases + >"${links_file}" + + # Iterate over widgets in the component dir + for w_file in ${c_dir}/*; do + process_dapm_widget "${tmp_dir}" "${c_name}" "${w_file}" + done + + echo " }" >> "${dot_file}" + + cat "${links_file}" >> "${dot_file}" +} + +# Parse the DAPM tree for a sound card and generate graph description in +# graphviz dot format +# +# $1 = temporary work dir +# $2 = directory tree with DAPM state (either in debugfs or a mirror) +process_dapm_tree() +{ + local tmp_dir="${1}" + local dapm_dir="${2}" + local dot_file="${tmp_dir}/main.dot" + + echo "digraph G {" > "${dot_file}" + echo " fontname=\"sans-serif\"" >> "${dot_file}" + echo " node [fontname=\"sans-serif\"]" >> "${dot_file}" + + + # Process root directory (no component) + process_dapm_component "${tmp_dir}" "${dapm_dir}/dapm" "ROOT" + + # Iterate over components + for c_dir in "${dapm_dir}"/*/dapm + do + process_dapm_component "${tmp_dir}" "${c_dir}" "" + done + + echo "}" >> "${dot_file}" +} + +main() +{ + # Parse command line + local out_file="dapm.dot" + local card_name="" + local remote_target="" + local dapm_tree="" + local dbg_on="" + while getopts "c:r:d:o:Dh" arg; do + case $arg in + c) card_name="${OPTARG}" ;; + r) remote_target="${OPTARG}" ;; + d) dapm_tree="${OPTARG}" ;; + o) out_file="${OPTARG}" ;; + D) dbg_on="1" ;; + h) usage 0 ;; + *) usage 1 ;; + esac + done + shift $(($OPTIND - 1)) + + if [ -n "${dapm_tree}" ]; then + if [ -n "${card_name}${remote_target}" ]; then + usage 1 "Cannot use -c and -r with -d" + fi + echo "Using local tree: ${dapm_tree}" + elif [ -n "${remote_target}" ]; then + if [ -z "${card_name}" ]; then + usage 1 "-r requires -c" + fi + echo "Using card ${card_name} from remote target ${remote_target}" + elif [ -n "${card_name}" ]; then + echo "Using local card: ${card_name}" + else + usage 1 "Please choose mode using -c, -r or -d" + fi + + # Define logging function + if [ "${dbg_on}" ]; then + dbg_echo() { + echo "$*" >&2 + } + else + dbg_echo() { + : + } + fi + + # Filename must have a dot in order the infer the format from the + # extension + if ! echo "${out_file}" | grep -qE '\.'; then + echo "Missing extension in output filename ${out_file}" >&2 + usage + exit 1 + fi + + local out_fmt="${out_file##*.}" + local dot_file="${out_file%.*}.dot" + + dbg_echo "dot file: $dot_file" + dbg_echo "Output file: $out_file" + dbg_echo "Output format: $out_fmt" + + tmp_dir="$(mktemp -d /tmp/$(basename $0).XXXXXX)" + trap "{ rm -fr ${tmp_dir}; }" INT TERM EXIT + + if [ -z "${dapm_tree}" ] + then + dapm_tree="/sys/kernel/debug/asoc/${card_name}" + fi + if [ -n "${remote_target}" ]; then + dapm_tree="${tmp_dir}/dapm-tree" + grab_remote_files "${remote_target}" "${card_name}" "${tmp_dir}" "${dapm_tree}" + fi + # In all cases now ${dapm_tree} contains the DAPM state + + process_dapm_tree "${tmp_dir}" "${dapm_tree}" + cp "${tmp_dir}/main.dot" "${dot_file}" + + if [ "${out_file}" != "${dot_file}" ]; then + dot -T"${out_fmt}" "${dot_file}" -o "${out_file}" + fi + + echo "Generated file ${out_file}" +} + +main "${@}" -- cgit v1.2.3 From 231ce08b662a58d4392da998699b3d4a7e2e87cf Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 12:53:53 -0400 Subject: tools/power turbostat: Add "snapshot:" Makefile target Kernel developers often need to diagnose remote customer systems with the latest turbostat, yet customers are running binary distros with out-dated turbostat and the customer has no experience cloning linux kernel trees. Add a turbostat "snapshot" makefile target to create a standalone source snapshot from the developer's git tree, appropriately hacked so that the customer can build turbostat without a kernel tree. Include the turbostat binary in the snapshot, for convenience in those situations where the source and destination are trusted, (and have new enough glibc to execute). The snapshot is named with the date it was taken rather than the turbostat VERSION, as it could occur between VERSIONS... Signed-off-by: Len Brown --- tools/power/x86/turbostat/Makefile | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 92e139b9c792..2d6dce2c8f77 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -3,6 +3,8 @@ CC = $(CROSS_COMPILE)gcc BUILD_OUTPUT := $(CURDIR) PREFIX ?= /usr DESTDIR ?= +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = turbostat-$(DAY) ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -22,9 +24,30 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 .PHONY : clean clean : @rm -f $(BUILD_OUTPUT)/turbostat + @rm -f $(SNAPSHOT).tar.gz install : turbostat - install -d $(DESTDIR)$(PREFIX)/bin + install -d $(DESTDIR)$(PREFIX)/bin install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat - install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 + +snapshot: turbostat + @rm -rf $(SNAPSHOT) + @mkdir $(SNAPSHOT) + @cp turbostat Makefile turbostat.c turbostat.8 ../../../../arch/x86/include/asm/intel-family.h $(SNAPSHOT) + + @sed -e 's/^#include /#include "bits.h"/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo PWD=. > $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DINTEL_FAMILY_HEADER='\"intel-family.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) -- cgit v1.2.3 From ae3326ac5742506409a03ce5d69716a8dba4eabc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 14:45:10 -0400 Subject: tools/power turbostat: Harden probe_intel_uncore_frequency() If sysfs directory "intel_uncore_frequency/cluster00/" exists, then use uncore cluster code (now its own routine). The previous check for "intel_uncore_frequency/package_00_die_00/current_freq_khz", could be unreliable in the face of sparse die id's. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 98256468e248..ca33fb057d1f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5294,16 +5294,13 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void probe_intel_uncore_frequency(void) +static void probe_intel_uncore_frequency_legacy(void) { int i, j; char path[256]; - if (!genuine_intel) - return; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - goto probe_cluster; + return; BIC_PRESENT(BIC_UNCORE_MHZ); @@ -5335,9 +5332,13 @@ static void probe_intel_uncore_frequency(void) fprintf(outf, " %d MHz\n", k / 1000); } } - return; +} + +static void probe_intel_uncore_frequency_cluster(void) +{ + int i; + char path[256]; -probe_cluster: if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; @@ -5351,6 +5352,7 @@ probe_cluster: sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); + /* uncore## start at 00 and skip no numbers, so stop upon first missing */ if (access(path_base, R_OK)) break; @@ -5382,6 +5384,17 @@ probe_cluster: } } +static void probe_intel_uncore_frequency(void) +{ + if (!genuine_intel) + return; + + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0) + probe_intel_uncore_frequency_cluster(); + else + probe_intel_uncore_frequency_legacy(); +} + static void probe_graphics(void) { /* Xe graphics sysfs knobs */ -- cgit v1.2.3 From cda203388687aa075db6f8996c3c4549fa518ea8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 11:56:48 -0400 Subject: tools/power turbostat: Remember global max_die_id This is necessary to gracefully handle sparse die_id's. no functional change Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ca33fb057d1f..e6d643a58cd8 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1415,6 +1415,7 @@ struct topo_params { int allowed_cpus; int allowed_cores; int max_cpu_num; + int max_die_id; int max_node_num; int nodes_per_pkg; int cores_per_node; @@ -6980,7 +6981,6 @@ void topology_probe(bool startup) int i; int max_core_id = 0; int max_package_id = 0; - int max_die_id = 0; int max_siblings = 0; /* Initialize num_cpus, max_cpu_num */ @@ -7097,8 +7097,8 @@ void topology_probe(bool startup) /* get die information */ cpus[i].die_id = get_die_id(i); - if (cpus[i].die_id > max_die_id) - max_die_id = cpus[i].die_id; + if (cpus[i].die_id > topo.max_die_id) + topo.max_die_id = cpus[i].die_id; /* get numa node information */ cpus[i].physical_node_id = get_physical_node_id(&cpus[i]); @@ -7124,9 +7124,9 @@ void topology_probe(bool startup) if (!summary_only && topo.cores_per_node > 1) BIC_PRESENT(BIC_Core); - topo.num_die = max_die_id + 1; + topo.num_die = topo.max_die_id + 1; if (debug > 1) - fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die); + fprintf(outf, "max_die_id %d, sizing for %d die\n", topo.max_die_id, topo.num_die); if (!summary_only && topo.num_die > 1) BIC_PRESENT(BIC_Die); -- cgit v1.2.3 From c8b246ea2ea5771f2a0ca6f6a9a520406e6b6eb7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 21 Apr 2024 12:02:24 -0400 Subject: tools/power turbostat: Survive sparse die_id Turbostat assumed that every package had a die_id = 0. When this assumption was violated, it exited when looking for the package uncore frequency: turbostat: /sys/.../intel_uncore_frequency/package_01_die_00/current_freq_khz: open failed: No such file or directory Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 45 ++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e6d643a58cd8..4b95fd90e16c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2992,14 +2992,29 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) return 0; } -unsigned long long get_uncore_mhz(int package, int die) +unsigned long long get_legacy_uncore_mhz(int package) { char path[128]; + int die; + static int warn_once; - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, - die); + /* + * for this package, use the first die_id that exists + */ + for (die = 0; die <= topo.max_die_id; ++die) { + + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", + package, die); - return (snapshot_sysfs_counter(path) / 1000); + if (access(path, R_OK) == 0) + return (snapshot_sysfs_counter(path) / 1000); + } + if (!warn_once) { + warnx("BUG: %s: No %s", __func__, path); + warn_once = 1; + } + + return 0; } int get_epb(int cpu) @@ -3631,9 +3646,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) - p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + p->uncore_mhz = get_legacy_uncore_mhz(p->package_id); if (DO_BIC(BIC_GFX_rc6)) p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; @@ -5300,22 +5314,22 @@ static void probe_intel_uncore_frequency_legacy(void) int i, j; char path[256]; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - return; - - BIC_PRESENT(BIC_UNCORE_MHZ); - - if (quiet) - return; - for (i = 0; i < topo.num_packages; ++i) { - for (j = 0; j < topo.num_die; ++j) { + for (j = 0; j <= topo.max_die_id; ++j) { int k, l; char path_base[128]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, j); + if (access(path_base, R_OK)) + continue; + + BIC_PRESENT(BIC_UNCORE_MHZ); + + if (quiet) + return; + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); @@ -5480,7 +5494,6 @@ next: else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) -- cgit v1.2.3 From e7a8074d2f62c7c571e1e794f6eb2c90f68c514e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jul 2023 10:00:22 -0300 Subject: tools include UAPI: Sync linux/vhost.h with the kernel sources To get the changes in: 2855c2a7820bc819 ("vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE") 1496c47065f9f841 ("vhost-vdpa: uapi to support reporting per vq size") To pick up these changes and support them: $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > before $ cp include/uapi/linux/vhost.h tools/perf/trace/beauty/include/uapi/linux/vhost.h $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > after $ diff -u before after --- before 2024-04-22 13:39:37.185674799 -0300 +++ after 2024-04-22 13:39:52.043344784 -0300 @@ -50,5 +50,6 @@ [0x7F] = "VDPA_GET_VRING_DESC_GROUP", [0x80] = "VDPA_GET_VQS_COUNT", [0x81] = "VDPA_GET_GROUP_NUM", + [0x82] = "VDPA_GET_VRING_SIZE", [0x8] = "NEW_WORKER", }; $ For instance, see how those 'cmd' ioctl arguments get translated, now VDPA_GET_VRING_SIZE will be as well: # perf trace -a -e ioctl --max-events=10 0.000 ( 0.011 ms): pipewire/2261 ioctl(fd: 60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1) = 0 21.353 ( 0.014 ms): pipewire/2261 ioctl(fd: 60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1) = 0 25.766 ( 0.014 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740) = 0 25.845 ( 0.034 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70) = 0 25.916 ( 0.011 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0) = 0 25.941 ( 0.025 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ATOMIC, arg: 0x7ffe4a22c840) = 0 32.915 ( 0.009 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_RMFB, arg: 0x7ffe4a22cf9c) = 0 42.522 ( 0.013 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740) = 0 42.579 ( 0.031 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70) = 0 42.644 ( 0.010 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0) = 0 # This addresses this perf tools build warning: diff -u tools/perf/trace/beauty/include/uapi/linux/vhost.h include/uapi/linux/vhost.h But this specific process, usually boring, this time around catch a problem, namely the addition of VDPA_GET_VRING_SIZE used an ioctl number already taken, which went on unnoticed and only got caught when the tools/perf/trace/beauty/vhost_virtio_ioctl.sh script was run as part of the perf tools process of updating the tools copies of system headers it uses for creating id->string tables that, well, broke the perf tools build because there were multiple initializations in the strings table for the 0x80 entry... I'm adding here a link to the discussion, that is lacking in the fix for the reported problem, and a quote from one of the developers involved: "Thanks a lot for taking care of this! So given the header is actually buggy pls hang on to this change until I merge the fix for the header (you were CC'd on the patch). It's great we have this redundancy which allowed us to catch the bug in time, and many thanks to Namhyung Kim for reporting the issue!" This is here as a hint for anyone thinking about ways to automate checking these issues in a more automated way... ;-) Link: https://lore.kernel.org/lkml/ 20240402172151-mutt-send-email-mst@kernel.org Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Michael S. Tsirkin Cc: Namhyung Kim Cc: Zhu Lingshan Link: https://lore.kernel.org/lkml/ZiaW-csEZLKK48BE@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/uapi/linux/vhost.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/include/uapi/linux/vhost.h b/tools/perf/trace/beauty/include/uapi/linux/vhost.h index 649560c685f1..b95dd84eef2d 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/vhost.h +++ b/tools/perf/trace/beauty/include/uapi/linux/vhost.h @@ -179,12 +179,6 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) @@ -227,4 +221,18 @@ */ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + +/* Get the queue size of a specific virtqueue. + * userspace set the vring index in vhost_vring_state.index + * kernel set the queue size in vhost_vring_state.num + */ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ + struct vhost_vring_state) #endif -- cgit v1.2.3 From 0a966d606c681b891fc7ef2e3ace67ac507a221d Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 18 Apr 2024 11:47:35 +0100 Subject: tools/net/ynl: Fix extack decoding for directional ops NetlinkProtocol.decode() was looking up ops by response value which breaks when it is used for extack decoding of directional ops. Instead, pass the op to decode(). Signed-off-by: Donald Hunter Link: https://lore.kernel.org/r/20240418104737.77914-3-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index a67f7b6fef92..a3ec7a56180a 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -386,12 +386,9 @@ class NetlinkProtocol: def _decode(self, nl_msg): return nl_msg - def decode(self, ynl, nl_msg): + def decode(self, ynl, nl_msg, op): msg = self._decode(nl_msg) - fixed_header_size = 0 - if ynl: - op = ynl.rsp_by_value[msg.cmd()] - fixed_header_size = ynl._struct_size(op.fixed_header) + fixed_header_size = ynl._struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -797,7 +794,7 @@ class YnlFamily(SpecFamily): if 'bad-attr-offs' not in extack: return - msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set)) + msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op) offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header) path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset, extack['bad-attr-offs']) @@ -922,7 +919,8 @@ class YnlFamily(SpecFamily): print("Netlink done while checking for ntf!?") continue - decoded = self.nlproto.decode(self, nl_msg) + op = self.rsp_by_value[nl_msg.cmd()] + decoded = self.nlproto.decode(self, nl_msg, op) if decoded.cmd() not in self.async_msg_ids: print("Unexpected msg id done while checking for ntf", decoded) continue @@ -979,7 +977,7 @@ class YnlFamily(SpecFamily): done = True break - decoded = self.nlproto.decode(self, nl_msg) + decoded = self.nlproto.decode(self, nl_msg, op) # Check if this is a reply to our request if nl_msg.nl_seq != req_seq or decoded.cmd() != op.rsp_value: -- cgit v1.2.3 From ba8be00f68f5c70eb1df2193251a579923bd9501 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 18 Apr 2024 11:47:36 +0100 Subject: tools/net/ynl: Add multi message support to ynl Add a "--multi " command line to ynl that makes it possible to add several operations to a single netlink request payload. The --multi command line option is repeated for each operation. This is used by the nftables family for transaction batches. For example: ./tools/net/ynl/cli.py \ --spec Documentation/netlink/specs/nftables.yaml \ --multi batch-begin '{"res-id": 10}' \ --multi newtable '{"name": "test", "nfgen-family": 1}' \ --multi newchain '{"name": "chain", "table": "test", "nfgen-family": 1}' \ --multi batch-end '{"res-id": 10}' [None, None, None, None] It can also be used for bundling get requests: ./tools/net/ynl/cli.py \ --spec Documentation/netlink/specs/nftables.yaml \ --multi gettable '{"name": "test", "nfgen-family": 1}' \ --multi getchain '{"name": "chain", "table": "test", "nfgen-family": 1}' \ --output-json [{"name": "test", "use": 1, "handle": 1, "flags": [], "nfgen-family": 1, "version": 0, "res-id": 2}, {"table": "test", "name": "chain", "handle": 1, "use": 0, "nfgen-family": 1, "version": 0, "res-id": 2}] Signed-off-by: Donald Hunter Link: https://lore.kernel.org/r/20240418104737.77914-4-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/cli.py | 25 +++++++++++++++--- tools/net/ynl/lib/ynl.py | 68 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 71 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py index f131e33ac3ee..058926d69ef0 100755 --- a/tools/net/ynl/cli.py +++ b/tools/net/ynl/cli.py @@ -19,13 +19,28 @@ class YnlEncoder(json.JSONEncoder): def main(): - parser = argparse.ArgumentParser(description='YNL CLI sample') + description = """ + YNL CLI utility - a general purpose netlink utility that uses YAML + specs to drive protocol encoding and decoding. + """ + epilog = """ + The --multi option can be repeated to include several do operations + in the same netlink payload. + """ + + parser = argparse.ArgumentParser(description=description, + epilog=epilog) parser.add_argument('--spec', dest='spec', type=str, required=True) parser.add_argument('--schema', dest='schema', type=str) parser.add_argument('--no-schema', action='store_true') parser.add_argument('--json', dest='json_text', type=str) - parser.add_argument('--do', dest='do', type=str) - parser.add_argument('--dump', dest='dump', type=str) + + group = parser.add_mutually_exclusive_group() + group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) + group.add_argument('--multi', dest='multi', nargs=2, action='append', + metavar=('DO-OPERATION', 'JSON_TEXT'), type=str) + group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) + parser.add_argument('--sleep', dest='sleep', type=int) parser.add_argument('--subscribe', dest='ntf', type=str) parser.add_argument('--replace', dest='flags', action='append_const', @@ -73,6 +88,10 @@ def main(): if args.dump: reply = ynl.dump(args.dump, attrs) output(reply) + if args.multi: + ops = [ (item[0], json.loads(item[1]), args.flags or []) for item in args.multi ] + reply = ynl.do_multi(ops) + output(reply) except NlError as e: print(e) exit(1) diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index a3ec7a56180a..ea48f83c2e84 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -938,16 +938,11 @@ class YnlFamily(SpecFamily): return op['do']['request']['attributes'].copy() - def _op(self, method, vals, flags=None, dump=False): - op = self.ops[method] - + def _encode_message(self, op, vals, flags, req_seq): nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK for flag in flags or []: nl_flags |= flag - if dump: - nl_flags |= Netlink.NLM_F_DUMP - req_seq = random.randint(1024, 65535) msg = self.nlproto.message(nl_flags, op.req_value, 1, req_seq) if op.fixed_header: msg += self._encode_struct(op.fixed_header, vals) @@ -955,18 +950,36 @@ class YnlFamily(SpecFamily): for name, value in vals.items(): msg += self._add_attr(op.attr_set.name, name, value, search_attrs) msg = _genl_msg_finalize(msg) + return msg - self.sock.send(msg, 0) + def _ops(self, ops): + reqs_by_seq = {} + req_seq = random.randint(1024, 65535) + payload = b'' + for (method, vals, flags) in ops: + op = self.ops[method] + msg = self._encode_message(op, vals, flags, req_seq) + reqs_by_seq[req_seq] = (op, msg, flags) + payload += msg + req_seq += 1 + + self.sock.send(payload, 0) done = False rsp = [] + op_rsp = [] while not done: reply = self.sock.recv(self._recv_size) nms = NlMsgs(reply, attr_space=op.attr_set) self._recv_dbg_print(reply, nms) for nl_msg in nms: - if nl_msg.extack: - self._decode_extack(msg, op, nl_msg.extack) + if nl_msg.nl_seq in reqs_by_seq: + (op, req_msg, req_flags) = reqs_by_seq[nl_msg.nl_seq] + if nl_msg.extack: + self._decode_extack(req_msg, op, nl_msg.extack) + else: + op = self.rsp_by_value[nl_msg.cmd()] + req_flags = [] if nl_msg.error: raise NlError(nl_msg) @@ -974,13 +987,25 @@ class YnlFamily(SpecFamily): if nl_msg.extack: print("Netlink warning:") print(nl_msg) - done = True + + if Netlink.NLM_F_DUMP in req_flags: + rsp.append(op_rsp) + elif not op_rsp: + rsp.append(None) + elif len(op_rsp) == 1: + rsp.append(op_rsp[0]) + else: + rsp.append(op_rsp) + op_rsp = [] + + del reqs_by_seq[nl_msg.nl_seq] + done = len(reqs_by_seq) == 0 break decoded = self.nlproto.decode(self, nl_msg, op) # Check if this is a reply to our request - if nl_msg.nl_seq != req_seq or decoded.cmd() != op.rsp_value: + if nl_msg.nl_seq not in reqs_by_seq or decoded.cmd() != op.rsp_value: if decoded.cmd() in self.async_msg_ids: self.handle_ntf(decoded) continue @@ -991,18 +1016,23 @@ class YnlFamily(SpecFamily): rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name) if op.fixed_header: rsp_msg.update(self._decode_struct(decoded.raw, op.fixed_header)) - rsp.append(rsp_msg) + op_rsp.append(rsp_msg) - if dump: - return rsp - if not rsp: - return None - if len(rsp) == 1: - return rsp[0] return rsp + def _op(self, method, vals, flags=None, dump=False): + req_flags = flags or [] + if dump: + req_flags.append(Netlink.NLM_F_DUMP) + + ops = [(method, vals, req_flags)] + return self._ops(ops)[0] + def do(self, method, vals, flags=None): return self._op(method, vals, flags) def dump(self, method, vals): - return self._op(method, vals, [], dump=True) + return self._op(method, vals, dump=True) + + def do_multi(self, ops): + return self._ops(ops) -- cgit v1.2.3 From 23710925928310ec481fc0909a4d44ef89f4241a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:35:42 -0700 Subject: selftests: drv-net: test dumping qstats per device Add a test for dumping qstats device by device. ksft framework grows a ksft_raises() helper, to be used under with, which should be familiar to unittest users. Link: https://lore.kernel.org/r/20240420023543.3300306-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 62 ++++++++++++++++++++++++++-- tools/testing/selftests/net/lib/py/ksft.py | 18 ++++++++ 2 files changed, 77 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 947df3eb681f..7a7b16b180e2 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit -from lib.py import ksft_in, ksft_true, KsftSkipEx, KsftXfailEx +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv @@ -77,9 +77,65 @@ def pkt_byte_sum(cfg) -> None: raise Exception("Qstats are lower, fetched later") +def qstat_by_ifindex(cfg) -> None: + global netfam + global rtnl + + # Construct a map ifindex -> [dump, by-index, dump] + ifindexes = {} + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']] = [entry, None, None] + + for ifindex in ifindexes.keys(): + entry = netfam.qstats_get({"ifindex": ifindex}, dump=True) + ksft_eq(len(entry), 1) + ifindexes[entry[0]['ifindex']][1] = entry[0] + + stats = netfam.qstats_get({}, dump=True) + for entry in stats: + ifindexes[entry['ifindex']][2] = entry + + if len(ifindexes) == 0: + raise KsftSkipEx("No ifindex supports qstats") + + # Now make sure the stats match/make sense + for ifindex, triple in ifindexes.items(): + all_keys = triple[0].keys() | triple[1].keys() | triple[2].keys() + + for key in all_keys: + ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) + ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + + # Test invalid dumps + # 0 is invalid + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 0}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -34) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # loopback has no stats + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": 1}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -95) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + # Try to get stats for lowest unused ifindex but not 0 + devs = rtnl.getlink({}, dump=True) + all_ifindexes = set([dev["ifi-index"] for dev in devs]) + lowest = 2 + while lowest in all_ifindexes: + lowest += 1 + + with ksft_raises(NlError) as cm: + netfam.qstats_get({"ifindex": lowest}, dump=True) + ksft_eq(cm.exception.nl_msg.error, -19) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') + + def main() -> None: with NetDrvEnv(__file__) as cfg: - ksft_run([check_pause, check_fec, pkt_byte_sum], + ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 25f2572fa540..e7f79f6185b0 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -53,6 +53,24 @@ def ksft_ge(a, b, comment=""): _fail("Check failed", a, "<", b, comment) +class ksft_raises: + def __init__(self, expected_type): + self.exception = None + self.expected_type = expected_type + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is None: + _fail(f"Expected exception {str(self.expected_type.__name__)}, none raised") + elif self.expected_type != exc_type: + _fail(f"Expected exception {str(self.expected_type.__name__)}, raised {str(exc_type.__name__)}") + self.exception = exc_val + # Suppress the exception if its the expected one + return self.expected_type == exc_type + + def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""): end = time.monotonic() + deadline while True: -- cgit v1.2.3 From 1a20a9a0ddef17c0bd67eece34a7439b02a7b0ba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:52:31 -0700 Subject: selftests: drv-net: define endpoint structures Define the remote endpoint "model". To execute most meaningful device driver tests we need to be able to communicate with a remote system, and have it send traffic to the device under test. Various test environments will have different requirements. 0) "Local" netdevsim-based testing can simply use net namespaces. netdevsim supports connecting two devices now, to form a veth-like construct. 1) Similarly on hosts with multiple NICs, the NICs may be connected together with a loopback cable or internal device loopback. One interface may be placed into separate netns, and tests would proceed much like in the netdevsim case. Note that the loopback config or the moving of one interface into a netns is not expected to be part of selftest code. 2) Some systems may need to communicate with the remote endpoint via SSH. 3) Last but not least environment may have its own custom communication method. Fundamentally we only need two operations: - run a command remotely - deploy a binary (if some tool we need is built as part of kselftests) Wrap these two in a class. Use dynamic loading to load the Remote class. This will allow very easy definition of other communication methods without bothering upstream code base. Stick to the "simple" / "no unnecessary abstractions" model for referring to the remote endpoints. The host / remote object are passed as an argument to the usual cmd() or ip() invocation. For example: ip("link show", json=True, host=remote) Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240420025237.3309296-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/lib/py/__init__.py | 1 + .../testing/selftests/drivers/net/lib/py/remote.py | 15 +++++++++ .../selftests/drivers/net/lib/py/remote_netns.py | 21 ++++++++++++ .../selftests/drivers/net/lib/py/remote_ssh.py | 39 ++++++++++++++++++++++ tools/testing/selftests/net/lib/py/utils.py | 17 +++++----- 5 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 tools/testing/selftests/drivers/net/lib/py/remote.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/remote_netns.py create mode 100644 tools/testing/selftests/drivers/net/lib/py/remote_ssh.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index 4653dffcd962..4789c1a4282d 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -15,3 +15,4 @@ except ModuleNotFoundError as e: sys.exit(4) from .env import * +from .remote import Remote diff --git a/tools/testing/selftests/drivers/net/lib/py/remote.py b/tools/testing/selftests/drivers/net/lib/py/remote.py new file mode 100644 index 000000000000..b1780b987722 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import importlib + +_modules = {} + +def Remote(kind, args, src_path): + global _modules + + if kind not in _modules: + _modules[kind] = importlib.import_module("..remote_" + kind, __name__) + + dir_path = os.path.abspath(src_path + "/../") + return getattr(_modules[kind], "Remote")(args, dir_path) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_netns.py b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py new file mode 100644 index 000000000000..7d5eeb0271bc --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import subprocess + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + + def cmd(self, comm): + return subprocess.Popen(["ip", "netns", "exec", self.name, "bash", "-c", comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def deploy(self, what): + if os.path.isabs(what): + return what + return os.path.abspath(self.dir_path + "/" + what) diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py new file mode 100644 index 000000000000..924addde19a3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0 + +import os +import string +import subprocess +import random + +from lib.py import cmd + + +class Remote: + def __init__(self, name, dir_path): + self.name = name + self.dir_path = dir_path + self._tmpdir = None + + def __del__(self): + if self._tmpdir: + cmd("rm -rf " + self._tmpdir, host=self) + self._tmpdir = None + + def cmd(self, comm): + return subprocess.Popen(["ssh", "-q", self.name, comm], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def _mktmp(self): + return ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + + def deploy(self, what): + if not self._tmpdir: + self._tmpdir = "/tmp/" + self._mktmp() + cmd("mkdir " + self._tmpdir, host=self) + file_name = self._tmpdir + "/" + self._mktmp() + os.path.basename(what) + + if not os.path.isabs(what): + what = os.path.abspath(self.dir_path + "/" + what) + + cmd(f"scp {what} {self.name}:{file_name}") + return file_name diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 19612348c30d..7347d0c0ff05 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -4,10 +4,8 @@ import json as _json import subprocess class cmd: - def __init__(self, comm, shell=True, fail=True, ns=None, background=False): + def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None): if ns: - if isinstance(ns, NetNS): - ns = ns.name comm = f'ip netns exec {ns} ' + comm self.stdout = None @@ -15,15 +13,18 @@ class cmd: self.ret = None self.comm = comm - self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + if host: + self.proc = host.cmd(comm) + else: + self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) if not background: self.process(terminate=False, fail=fail) def process(self, terminate=True, fail=None): if terminate: self.proc.terminate() - stdout, stderr = self.proc.communicate() + stdout, stderr = self.proc.communicate(timeout=5) self.stdout = stdout.decode("utf-8") self.stderr = stderr.decode("utf-8") self.proc.stdout.close() @@ -37,12 +38,12 @@ class cmd: (self.proc.args, stdout, stderr)) -def ip(args, json=None, ns=None): +def ip(args, json=None, ns=None, host=None): cmd_str = "ip " if json: cmd_str += '-j ' cmd_str += args - cmd_obj = cmd(cmd_str, ns=ns) + cmd_obj = cmd(cmd_str, ns=ns, host=host) if json: return _json.loads(cmd_obj.stdout) return cmd_obj -- cgit v1.2.3 From 543389295085f65a405b046e31eb107b2a465bd4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:52:32 -0700 Subject: selftests: drv-net: factor out parsing of the env The tests with a remote end will use a different class, for clarity, but will also need to parse the env. So factor parsing the env out to a function. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240420025237.3309296-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 45 ++++++++++++++--------- 1 file changed, 27 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index e1abe9491daf..a081e168f3db 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -6,12 +6,36 @@ from pathlib import Path from lib.py import ip from lib.py import NetdevSimDev + +def _load_env_file(src_path): + env = os.environ.copy() + + src_dir = Path(src_path).parent.resolve() + if not (src_dir / "net.config").exists(): + return env + + lexer = shlex.shlex(open((src_dir / "net.config").as_posix(), 'r').read()) + k = None + for token in lexer: + if k is None: + k = token + env[k] = "" + elif token == "=": + pass + else: + env[k] = token + k = None + return env + + class NetDrvEnv: + """ + Class for a single NIC / host env, with no remote end + """ def __init__(self, src_path): self._ns = None - self.env = os.environ.copy() - self._load_env_file(src_path) + self.env = _load_env_file(src_path) if 'NETIF' in self.env: self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] @@ -34,19 +58,4 @@ class NetDrvEnv: self._ns.remove() self._ns = None - def _load_env_file(self, src_path): - src_dir = Path(src_path).parent.resolve() - if not (src_dir / "net.config").exists(): - return - - lexer = shlex.shlex(open((src_dir / "net.config").as_posix(), 'r').read()) - k = None - for token in lexer: - if k is None: - k = token - self.env[k] = "" - elif token == "=": - pass - else: - self.env[k] = token - k = None + -- cgit v1.2.3 From 1880f272d2f9ef2c65a78e80ede235b3123075fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:52:33 -0700 Subject: selftests: drv-net: construct environment for running tests which require an endpoint Nothing surprising here, hopefully. Wrap the variables from the environment into a class or spawn a netdevsim based env and pass it to the tests. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240420025237.3309296-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/README.rst | 33 ++++++++ tools/testing/selftests/drivers/net/lib/py/env.py | 98 ++++++++++++++++++++++- tools/testing/selftests/net/lib/py/__init__.py | 1 + tools/testing/selftests/net/lib/py/netns.py | 31 +++++++ 4 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/lib/py/netns.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index 5ef7c417d431..0cbab33dad1f 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -23,8 +23,41 @@ or:: # Variable set in a file NETIF=eth0 +Please note that the config parser is very simple, if there are +any non-alphanumeric characters in the value it needs to be in +double quotes. + NETIF ~~~~~ Name of the netdevice against which the test should be executed. When empty or not set software devices will be used. + +LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Local and remote endpoint IP addresses. + +REMOTE_TYPE +~~~~~~~~~~~ + +Communication method used to run commands on the remote endpoint. +Test framework has built-in support for ``netns`` and ``ssh`` channels. +``netns`` assumes the "remote" interface is part of the same +host, just moved to the specified netns. +``ssh`` communicates with remote endpoint over ``ssh`` and ``scp``. +Using persistent SSH connections is strongly encouraged to avoid +the latency of SSH connection setup on every command. + +Communication methods are defined by classes in ``lib/py/remote_{name}.py``. +It should be possible to add a new method without modifying any of +the framework, by simply adding an appropriately named file to ``lib/py``. + +REMOTE_ARGS +~~~~~~~~~~~ + +Arguments used to construct the communication channel. +Communication channel dependent:: + + for netns - name of the "remote" namespace + for ssh - name/address of the remote host diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index a081e168f3db..579c5b34e6fd 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -4,7 +4,8 @@ import os import shlex from pathlib import Path from lib.py import ip -from lib.py import NetdevSimDev +from lib.py import NetNS, NetdevSimDev +from .remote import Remote def _load_env_file(src_path): @@ -59,3 +60,98 @@ class NetDrvEnv: self._ns = None +class NetDrvEpEnv: + """ + Class for an environment with a local device and "remote endpoint" + which can be used to send traffic in. + + For local testing it creates two network namespaces and a pair + of netdevsim devices. + """ + + # Network prefixes used for local tests + nsim_v4_pfx = "192.0.2." + nsim_v6_pfx = "2001:db8::" + + def __init__(self, src_path): + + self.env = _load_env_file(src_path) + + # Things we try to destroy + self.remote = None + # These are for local testing state + self._netns = None + self._ns = None + self._ns_peer = None + + if "NETIF" in self.env: + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + + self.v4 = self.env.get("LOCAL_V4") + self.v6 = self.env.get("LOCAL_V6") + self.remote_v4 = self.env.get("REMOTE_V4") + self.remote_v6 = self.env.get("REMOTE_V6") + kind = self.env["REMOTE_TYPE"] + args = self.env["REMOTE_ARGS"] + else: + self.create_local() + + self.dev = self._ns.nsims[0].dev + + self.v4 = self.nsim_v4_pfx + "1" + self.v6 = self.nsim_v6_pfx + "1" + self.remote_v4 = self.nsim_v4_pfx + "2" + self.remote_v6 = self.nsim_v6_pfx + "2" + kind = "netns" + args = self._netns.name + + self.remote = Remote(kind, args, src_path) + + self.addr = self.v6 if self.v6 else self.v4 + self.remote_addr = self.remote_v6 if self.remote_v6 else self.remote_v4 + + self.ifname = self.dev['ifname'] + self.ifindex = self.dev['ifindex'] + + def create_local(self): + self._netns = NetNS() + self._ns = NetdevSimDev() + self._ns_peer = NetdevSimDev(ns=self._netns) + + with open("/proc/self/ns/net") as nsfd0, \ + open("/var/run/netns/" + self._netns.name) as nsfd1: + ifi0 = self._ns.nsims[0].ifindex + ifi1 = self._ns_peer.nsims[0].ifindex + NetdevSimDev.ctrl_write('link_device', + f'{nsfd0.fileno()}:{ifi0} {nsfd1.fileno()}:{ifi1}') + + ip(f" addr add dev {self._ns.nsims[0].ifname} {self.nsim_v4_pfx}1/24") + ip(f"-6 addr add dev {self._ns.nsims[0].ifname} {self.nsim_v6_pfx}1/64 nodad") + ip(f" link set dev {self._ns.nsims[0].ifname} up") + + ip(f" addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v4_pfx}2/24", ns=self._netns) + ip(f"-6 addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v6_pfx}2/64 nodad", ns=self._netns) + ip(f" link set dev {self._ns_peer.nsims[0].ifname} up", ns=self._netns) + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + """ + __exit__ gets called at the end of a "with" block. + """ + self.__del__() + + def __del__(self): + if self._ns: + self._ns.remove() + self._ns = None + if self._ns_peer: + self._ns_peer.remove() + self._ns_peer = None + if self._netns: + del self._netns + self._netns = None + if self.remote: + del self.remote + self.remote = None diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py index ded7102df18a..b6d498d125fe 100644 --- a/tools/testing/selftests/net/lib/py/__init__.py +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -2,6 +2,7 @@ from .consts import KSRC from .ksft import * +from .netns import NetNS from .nsim import * from .utils import * from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py new file mode 100644 index 000000000000..ecff85f9074f --- /dev/null +++ b/tools/testing/selftests/net/lib/py/netns.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 + +from .utils import ip +import random +import string + + +class NetNS: + def __init__(self, name=None): + if name: + self.name = name + else: + self.name = ''.join(random.choice(string.ascii_lowercase) for _ in range(8)) + ip('netns add ' + self.name) + + def __del__(self): + if self.name: + ip('netns del ' + self.name) + self.name = None + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + self.__del__() + + def __str__(self): + return self.name + + def __repr__(self): + return f"NetNS({self.name})" -- cgit v1.2.3 From a48a87c0866444343354f211d30b5acab54bb800 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:52:34 -0700 Subject: selftests: drv-net: add a trivial ping test Add a very simple test for testing with a remote system. Both IPv4 and IPv6 connectivity is optional, later change will add checks to skip tests based on available addresses. Using netdevsim: $ ./run_kselftest.sh -t drivers/net:ping.py TAP version 13 1..1 # timeout set to 45 # selftests: drivers/net: ping.py # KTAP version 1 # 1..2 # ok 1 ping.test_v4 # ok 2 ping.test_v6 # # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 ok 1 selftests: drivers/net: ping.py Command line SSH: $ NETIF=virbr0 REMOTE_TYPE=ssh REMOTE_ARGS=root@192.168.122.123 \ LOCAL_V4=192.168.122.1 REMOTE_V4=192.168.122.123 \ ./tools/testing/selftests/drivers/net/ping.py KTAP version 1 1..2 ok 1 ping.test_v4 ok 2 ping.test_v6 # SKIP Test requires IPv6 connectivity # Totals: pass:1 fail:0 xfail:1 xpass:0 skip:0 error:0 Existing devices placed in netns (and using net.config): $ cat drivers/net/net.config NETIF=veth0 REMOTE_TYPE=netns REMOTE_ARGS=red LOCAL_V4="192.168.1.1" REMOTE_V4="192.168.1.2" $ ./run_kselftest.sh -t drivers/net:ping.py TAP version 13 1..1 # timeout set to 45 # selftests: drivers/net: ping.py # KTAP version 1 # 1..2 # ok 1 ping.test_v4 # ok 2 ping.test_v6 # SKIP Test requires IPv6 connectivity # # Totals: pass:1 fail:0 xfail:1 xpass:0 skip:0 error:0 Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240420025237.3309296-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/Makefile | 5 ++++- tools/testing/selftests/drivers/net/ping.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/ping.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 379cdb1960a7..754ec643768a 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -2,6 +2,9 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) -TEST_PROGS := stats.py +TEST_PROGS := \ + ping.py \ + stats.py \ +# end of TEST_PROGS include ../../lib.mk diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py new file mode 100755 index 000000000000..e75908d7c558 --- /dev/null +++ b/tools/testing/selftests/drivers/net/ping.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvEpEnv +from lib.py import cmd + + +def test_v4(cfg) -> None: + cmd(f"ping -c 1 -W0.5 {cfg.remote_v4}") + cmd(f"ping -c 1 -W0.5 {cfg.v4}", host=cfg.remote) + + +def test_v6(cfg) -> None: + cmd(f"ping -c 1 -W0.5 {cfg.remote_v6}") + cmd(f"ping -c 1 -W0.5 {cfg.v6}", host=cfg.remote) + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + ksft_run([test_v4, test_v6], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 01b431641c33d488ecc6cd6d9e01f7f073bfa54f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:52:35 -0700 Subject: selftests: net: support matching cases by name prefix While writing tests with a lot more cases I got tired of having to jump back and forth to add the name of the test to the ksft_run() list. Most unittest frameworks do some name matching, e.g. assume that functions with names starting with test_ are test cases. Support similar flow in ksft_run(). Let the author list the desired prefixes. globals() need to be passed explicitly, IDK how to work around that. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240420025237.3309296-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/ping.py | 3 +-- tools/testing/selftests/net/lib/py/ksft.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index e75908d7c558..9f65a0764aab 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -18,8 +18,7 @@ def test_v6(cfg) -> None: def main() -> None: with NetDrvEpEnv(__file__) as cfg: - ksft_run([test_v4, test_v6], - args=(cfg, )) + ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index e7f79f6185b0..f84e9fdd0032 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -99,7 +99,18 @@ def ktap_result(ok, cnt=1, case="", comment=""): print(res) -def ksft_run(cases, args=()): +def ksft_run(cases=None, globs=None, case_pfx=None, args=()): + cases = cases or [] + + if globs and case_pfx: + for key, value in globs.items(): + if not callable(value): + continue + for prefix in case_pfx: + if key.startswith(prefix): + cases.append(value) + break + totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} print("KTAP version 1") -- cgit v1.2.3 From 31611cea8f0f45f0b803b010be47a37792ba58a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:52:36 -0700 Subject: selftests: drv-net: add a TCP ping test case (and useful helpers) More complex tests often have to spawn a background process, like a server which will respond to requests or tcpdump. Add support for creating such processes using the with keyword: with bkg("my-daemon", ..): # my-daemon is alive in this block My initial thought was to add this support to cmd() directly but it runs the command in the constructor, so by the time we __enter__ it's too late to make sure we used "background=True". Second useful helper transplanted from net_helper.sh is wait_port_listen(). The test itself uses socat, which insists on v6 addresses being wrapped in [], it's not the only command which requires this format, so add the wrapped address to env. The hope is to save test code from checking if address is v6. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240420025237.3309296-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 5 +++ tools/testing/selftests/drivers/net/ping.py | 21 ++++++++++- tools/testing/selftests/net/lib/py/utils.py | 43 +++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 579c5b34e6fd..dd5cb0226a31 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -110,6 +110,11 @@ class NetDrvEpEnv: self.addr = self.v6 if self.v6 else self.v4 self.remote_addr = self.remote_v6 if self.remote_v6 else self.remote_v4 + self.addr_ipver = "6" if self.v6 else "4" + # Bracketed addresses, some commands need IPv6 to be inside [] + self.baddr = f"[{self.v6}]" if self.v6 else self.v4 + self.remote_baddr = f"[{self.remote_v6}]" if self.remote_v6 else self.remote_v4 + self.ifname = self.dev['ifname'] self.ifindex = self.dev['ifindex'] diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index 9f65a0764aab..4b49de59231c 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -2,8 +2,9 @@ # SPDX-License-Identifier: GPL-2.0 from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq from lib.py import NetDrvEpEnv -from lib.py import cmd +from lib.py import bkg, cmd, wait_port_listen, rand_port def test_v4(cfg) -> None: @@ -16,6 +17,24 @@ def test_v6(cfg) -> None: cmd(f"ping -c 1 -W0.5 {cfg.v6}", host=cfg.remote) +def test_tcp(cfg) -> None: + port = rand_port() + listen_cmd = f"socat -{cfg.addr_ipver} -t 2 -u TCP-LISTEN:{port},reuseport STDOUT" + + with bkg(listen_cmd, exit_wait=True) as nc: + wait_port_listen(port) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.baddr}:{port}", + shell=True, host=cfg.remote) + ksft_eq(nc.stdout.strip(), "ping") + + with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc: + wait_port_listen(port, host=cfg.remote) + + cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.remote_baddr}:{port}", shell=True) + ksft_eq(nc.stdout.strip(), "ping") + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 7347d0c0ff05..d3715e6c21f2 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -1,7 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 import json as _json +import random +import re import subprocess +import time + class cmd: def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None): @@ -38,6 +42,20 @@ class cmd: (self.proc.args, stdout, stderr)) +class bkg(cmd): + def __init__(self, comm, shell=True, fail=True, ns=None, host=None, + exit_wait=False): + super().__init__(comm, background=True, + shell=shell, fail=fail, ns=ns, host=host) + self.terminate = not exit_wait + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + return self.process(terminate=self.terminate) + + def ip(args, json=None, ns=None, host=None): cmd_str = "ip " if json: @@ -47,3 +65,28 @@ def ip(args, json=None, ns=None, host=None): if json: return _json.loads(cmd_obj.stdout) return cmd_obj + + +def rand_port(): + """ + Get unprivileged port, for now just random, one day we may decide to check if used. + """ + return random.randint(1024, 65535) + + +def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5): + end = time.monotonic() + deadline + + pattern = f":{port:04X} .* " + if proto == "tcp": # for tcp protocol additionally check the socket state + pattern += "0A" + pattern = re.compile(pattern) + + while True: + data = cmd(f'cat /proc/net/{proto}*', ns=ns, host=host, shell=True).stdout + for row in data.split("\n"): + if pattern.search(row): + return + if time.monotonic() > end: + raise Exception("Waiting for port listen timed out") + time.sleep(sleep) -- cgit v1.2.3 From f1e68a1a4a404e739f93b4e48344f9101b581771 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:52:37 -0700 Subject: selftests: drv-net: add require_XYZ() helpers for validating env Wrap typical checks like whether given command used by the test is available in helpers. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240420025237.3309296-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 29 ++++++++++++++++++++++- tools/testing/selftests/drivers/net/ping.py | 6 +++++ 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index dd5cb0226a31..a3db1bb1afeb 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -3,7 +3,8 @@ import os import shlex from pathlib import Path -from lib.py import ip +from lib.py import KsftSkipEx +from lib.py import cmd, ip from lib.py import NetNS, NetdevSimDev from .remote import Remote @@ -118,6 +119,8 @@ class NetDrvEpEnv: self.ifname = self.dev['ifname'] self.ifindex = self.dev['ifindex'] + self._required_cmd = {} + def create_local(self): self._netns = NetNS() self._ns = NetdevSimDev() @@ -160,3 +163,27 @@ class NetDrvEpEnv: if self.remote: del self.remote self.remote = None + + def require_v4(self): + if not self.v4 or not self.remote_v4: + raise KsftSkipEx("Test requires IPv4 connectivity") + + def require_v6(self): + if not self.v6 or not self.remote_v6: + raise KsftSkipEx("Test requires IPv6 connectivity") + + def _require_cmd(self, comm, key, host=None): + cached = self._required_cmd.get(comm, {}) + if cached.get(key) is None: + cached[key] = cmd("command -v -- " + comm, fail=False, + shell=True, host=host).ret == 0 + self._required_cmd[comm] = cached + return cached[key] + + def require_cmd(self, comm, local=True, remote=False): + if local: + if not self._require_cmd(comm, "local"): + raise KsftSkipEx("Test requires command: " + comm) + if remote: + if not self._require_cmd(comm, "remote"): + raise KsftSkipEx("Test requires (remote) command: " + comm) diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index 4b49de59231c..eb83e7b48797 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -8,16 +8,22 @@ from lib.py import bkg, cmd, wait_port_listen, rand_port def test_v4(cfg) -> None: + cfg.require_v4() + cmd(f"ping -c 1 -W0.5 {cfg.remote_v4}") cmd(f"ping -c 1 -W0.5 {cfg.v4}", host=cfg.remote) def test_v6(cfg) -> None: + cfg.require_v6() + cmd(f"ping -c 1 -W0.5 {cfg.remote_v6}") cmd(f"ping -c 1 -W0.5 {cfg.v6}", host=cfg.remote) def test_tcp(cfg) -> None: + cfg.require_cmd("socat", remote=True) + port = rand_port() listen_cmd = f"socat -{cfg.addr_ipver} -t 2 -u TCP-LISTEN:{port},reuseport STDOUT" -- cgit v1.2.3 From b29781afaed29d5e41557231c46abd25bdc8d0c4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 13 Sep 2023 08:50:10 -0300 Subject: tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: be482ff9500999f5 ("x86/bhi: Enumerate Branch History Injection (BHI) bug") 0f4a837615ff925b ("x86/bhi: Define SPEC_CTRL_BHI_DIS_S") That cause no changes to tooling: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > x86_msr.before $ objdump -dS /tmp/build/perf-tools-next/util/amd-sample-raw.o > amd-sample-raw.o.before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ make -C tools/perf O=/tmp/build/perf-tools-next CC /tmp/build/perf-tools-next/trace/beauty/tracepoints/x86_msr.o CC /tmp/build/perf-tools-next/util/amd-sample-raw.o $ objdump -dS /tmp/build/perf-tools-next/util/amd-sample-raw.o > amd-sample-raw.o.after $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > x86_msr.after $ diff -u x86_msr.before x86_msr.after $ diff -u amd-sample-raw.o.before amd-sample-raw.o.after Just silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Adrian Hunter Cc: Daniel Sneddon Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Pawan Gupta Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/ZifCnEZFx5MZQuIW@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 05956bd8bacf..e72c2b872957 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -61,10 +61,13 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) /* A mask for bits which the kernel toggles when controlling mitigations */ #define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ - | SPEC_CTRL_RRSBA_DIS_S) + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -163,6 +166,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. -- cgit v1.2.3 From f1d0a2fbb0088675cceeab73d1f4b4308b289984 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:06 +0200 Subject: tools: sync include/uapi/linux/bpf.h cp include/uapi/linux/bpf.h tools/include/uapi/linux/bpf.h Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-6-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index cee0a7915c08..e4ae83550fb3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7306,6 +7306,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); -- cgit v1.2.3 From b4abee7c1ae3d59440e7915da28c6d2cd394738a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:10 +0200 Subject: selftests/bpf: add bpf_wq tests We simply try in all supported map types if we can store/load a bpf_wq. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-10-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/wq.c | 11 +++ tools/testing/selftests/bpf/progs/wq.c | 129 ++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/wq.c create mode 100644 tools/testing/selftests/bpf/progs/wq.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c new file mode 100644 index 000000000000..9a07b8bc2c52 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires */ +#include +#include "wq.skel.h" + +void serial_test_wq(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq); +} diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c new file mode 100644 index 000000000000..833f54aa8fdf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct hmap_elem { + int counter; + struct bpf_timer timer; /* unused */ + struct bpf_spin_lock lock; /* unused */ + struct bpf_wq work; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap_malloc SEC(".maps"); + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +static int test_elem_callback(void *map, int *key) +{ + struct elem init = {}, *val; + + if (map == &lru && + bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + return 0; +} + +static int test_hmap_elem_callback(void *map, int *key) +{ + struct hmap_elem init = {}, *val; + + if (bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + return 0; +} + +SEC("tc") +/* test that workqueues can be used from an array */ +__retval(0) +long test_call_array_sleepable(void *ctx) +{ + int key = 0; + + return test_elem_callback(&array, &key); +} + +SEC("syscall") +/* Same test than above but from a sleepable context. */ +__retval(0) +long test_syscall_array_sleepable(void *ctx) +{ + int key = 1; + + return test_elem_callback(&array, &key); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap */ +__retval(0) +long test_call_hash_sleepable(void *ctx) +{ + int key = 2; + + return test_hmap_elem_callback(&hmap, &key); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap with NO_PREALLOC. */ +__retval(0) +long test_call_hash_malloc_sleepable(void *ctx) +{ + int key = 3; + + return test_hmap_elem_callback(&hmap_malloc, &key); +} + +SEC("tc") +/* test that workqueues can be used from a LRU map */ +__retval(0) +long test_call_lru_sleepable(void *ctx) +{ + int key = 4; + + return test_elem_callback(&lru, &key); +} -- cgit v1.2.3 From e3d9eac99afd94980475833479332fefd74c5c2b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:12 +0200 Subject: selftests/bpf: wq: add bpf_wq_init() checks Allows to test if allocation/free works Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-12-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 1 + tools/testing/selftests/bpf/prog_tests/wq.c | 8 +++ tools/testing/selftests/bpf/progs/wq.c | 10 ++++ tools/testing/selftests/bpf/progs/wq_failures.c | 78 +++++++++++++++++++++++++ 4 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/wq_failures.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 3329ea080865..785b91b629be 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -470,4 +470,5 @@ extern int bpf_iter_css_new(struct bpf_iter_css *it, extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; +extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 9a07b8bc2c52..26ab69796103 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -2,6 +2,7 @@ /* Copyright (c) 2024 Benjamin Tissoires */ #include #include "wq.skel.h" +#include "wq_failures.skel.h" void serial_test_wq(void) { @@ -9,3 +10,10 @@ void serial_test_wq(void) RUN_TESTS(wq); } + +void serial_test_failures_wq(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq_failures); +} diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index 833f54aa8fdf..ed2fe26e14ef 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -52,6 +52,7 @@ struct { static int test_elem_callback(void *map, int *key) { struct elem init = {}, *val; + struct bpf_wq *wq; if (map == &lru && bpf_map_update_elem(map, key, &init, 0)) @@ -61,12 +62,17 @@ static int test_elem_callback(void *map, int *key) if (!val) return -2; + wq = &val->w; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + return 0; } static int test_hmap_elem_callback(void *map, int *key) { struct hmap_elem init = {}, *val; + struct bpf_wq *wq; if (bpf_map_update_elem(map, key, &init, 0)) return -1; @@ -75,6 +81,10 @@ static int test_hmap_elem_callback(void *map, int *key) if (!val) return -2; + wq = &val->work; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + return 0; } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c new file mode 100644 index 000000000000..db7015c7d541 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +SEC("tc") +/* test that bpf_wq_init takes a map as a second argument + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("pointer in R2 isn't map pointer") +long test_wq_init_nomap(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &key, 0) != 0) + return -3; + + return 0; +} + +SEC("tc") +/* test that the workqueue is part of the map in bpf_wq_init + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("workqueue pointer in R1 map_uid=0 doesn't match map pointer in R2 map_uid=0") +long test_wq_init_wrong_map(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &lru, 0) != 0) + return -3; + + return 0; +} -- cgit v1.2.3 From 01b7b1c5f3cc029bdd2652eba61e953ccd286c0e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:14 +0200 Subject: selftests/bpf: add checks for bpf_wq_set_callback() We assign the callback and set everything up. The actual tests of these callbacks will be done when bpf_wq_start() is available. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-14-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 5 ++ .../selftests/bpf/bpf_testmod/bpf_testmod.c | 5 ++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 1 + tools/testing/selftests/bpf/progs/wq.c | 41 +++++++++++--- tools/testing/selftests/bpf/progs/wq_failures.c | 66 ++++++++++++++++++++++ 5 files changed, 111 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 785b91b629be..b80b39f76034 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -471,4 +471,9 @@ extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags__k, void *aux__ign) __ksym; +#define bpf_wq_set_callback(timer, cb, flags) \ + bpf_wq_set_callback_impl(timer, cb, flags, NULL) #endif diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 39ad96a18123..eb2b78552ca2 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -494,6 +494,10 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused return arg; } +__bpf_kfunc void bpf_kfunc_call_test_sleepable(void) +{ +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -520,6 +524,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7c664dd61059..ce5cd763561c 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -96,6 +96,7 @@ void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; void bpf_kfunc_call_test_destructive(void) __ksym; +void bpf_kfunc_call_test_sleepable(void) __ksym; void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); struct prog_test_member *bpf_kfunc_call_memb_acquire(void); diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index ed2fe26e14ef..dc301b52f91c 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -49,7 +49,8 @@ struct { __type(value, struct elem); } lru SEC(".maps"); -static int test_elem_callback(void *map, int *key) +static int test_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) { struct elem init = {}, *val; struct bpf_wq *wq; @@ -66,10 +67,14 @@ static int test_elem_callback(void *map, int *key) if (bpf_wq_init(wq, map, 0) != 0) return -3; + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + return 0; } -static int test_hmap_elem_callback(void *map, int *key) +static int test_hmap_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) { struct hmap_elem init = {}, *val; struct bpf_wq *wq; @@ -85,6 +90,28 @@ static int test_hmap_elem_callback(void *map, int *key) if (bpf_wq_init(wq, map, 0) != 0) return -3; + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + return 0; +} + +__u32 ok; +__u32 ok_sleepable; + +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + ok |= (1 << *key); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + ok_sleepable |= (1 << *key); return 0; } @@ -95,7 +122,7 @@ long test_call_array_sleepable(void *ctx) { int key = 0; - return test_elem_callback(&array, &key); + return test_elem_callback(&array, &key, wq_cb_sleepable); } SEC("syscall") @@ -105,7 +132,7 @@ long test_syscall_array_sleepable(void *ctx) { int key = 1; - return test_elem_callback(&array, &key); + return test_elem_callback(&array, &key, wq_cb_sleepable); } SEC("tc") @@ -115,7 +142,7 @@ long test_call_hash_sleepable(void *ctx) { int key = 2; - return test_hmap_elem_callback(&hmap, &key); + return test_hmap_elem_callback(&hmap, &key, wq_callback); } SEC("tc") @@ -125,7 +152,7 @@ long test_call_hash_malloc_sleepable(void *ctx) { int key = 3; - return test_hmap_elem_callback(&hmap_malloc, &key); + return test_hmap_elem_callback(&hmap_malloc, &key, wq_callback); } SEC("tc") @@ -135,5 +162,5 @@ long test_call_lru_sleepable(void *ctx) { int key = 4; - return test_elem_callback(&lru, &key); + return test_elem_callback(&lru, &key, wq_callback); } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index db7015c7d541..4cbdb425f223 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -27,6 +27,20 @@ struct { __type(value, struct elem); } lru SEC(".maps"); +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + return 0; +} + SEC("tc") /* test that bpf_wq_init takes a map as a second argument */ @@ -76,3 +90,55 @@ long test_wq_init_wrong_map(void *ctx) return 0; } + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("arg#0 doesn't point to a map value") +long test_wrong_wq_pointer(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)&wq, wq_callback, 0)) + return 3; + + return -22; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") +long test_wrong_wq_pointer_offset(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)wq + 1, wq_cb_sleepable, 0)) + return 3; + + return -22; +} -- cgit v1.2.3 From 8290dba51910d36721ced6ccf03049ed6b7ea2ce Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:16 +0200 Subject: selftests/bpf: wq: add bpf_wq_start() checks Allows to test if allocation/free works Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-16-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 1 + tools/testing/selftests/bpf/prog_tests/wq.c | 22 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/wq.c | 20 +++++++++++++++++--- 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index b80b39f76034..93c5a6c446b3 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -471,6 +471,7 @@ extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, struct bpf_wq *wq), unsigned int flags__k, void *aux__ign) __ksym; diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 26ab69796103..8a4a91d944cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -6,9 +6,31 @@ void serial_test_wq(void) { + struct wq *wq_skel = NULL; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); RUN_TESTS(wq); + + /* re-run the success test to check if the timer was actually executed */ + + wq_skel = wq__open_and_load(); + if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + return; + + err = wq__attach(wq_skel); + if (!ASSERT_OK(err, "wq_attach")) + return; + + prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(50); /* 10 usecs should be enough, but give it extra */ + + ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); } void serial_test_failures_wq(void) diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index dc301b52f91c..49e712acbf60 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -49,12 +49,19 @@ struct { __type(value, struct elem); } lru SEC(".maps"); +__u32 ok; +__u32 ok_sleepable; + static int test_elem_callback(void *map, int *key, int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) { struct elem init = {}, *val; struct bpf_wq *wq; + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + if (map == &lru && bpf_map_update_elem(map, key, &init, 0)) return -1; @@ -70,6 +77,9 @@ static int test_elem_callback(void *map, int *key, if (bpf_wq_set_callback(wq, callback_fn, 0)) return -4; + if (bpf_wq_start(wq, 0)) + return -5; + return 0; } @@ -79,6 +89,10 @@ static int test_hmap_elem_callback(void *map, int *key, struct hmap_elem init = {}, *val; struct bpf_wq *wq; + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + if (bpf_map_update_elem(map, key, &init, 0)) return -1; @@ -93,12 +107,12 @@ static int test_hmap_elem_callback(void *map, int *key, if (bpf_wq_set_callback(wq, callback_fn, 0)) return -4; + if (bpf_wq_start(wq, 0)) + return -5; + return 0; } -__u32 ok; -__u32 ok_sleepable; - /* callback for non sleepable workqueue */ static int wq_callback(void *map, int *key, struct bpf_wq *work) { -- cgit v1.2.3 From 3134396f1cba939783b87c63dae3a54708285a9a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 24 Apr 2024 03:13:15 +0000 Subject: selftests/bpf: Add tests for preempt kfuncs Add tests for nested cases, nested count preservation upon different subprog calls that disable/enable preemption, and test sleepable helper call in non-preemptible regions. 182/1 preempt_lock/preempt_lock_missing_1:OK 182/2 preempt_lock/preempt_lock_missing_2:OK 182/3 preempt_lock/preempt_lock_missing_3:OK 182/4 preempt_lock/preempt_lock_missing_3_minus_2:OK 182/5 preempt_lock/preempt_lock_missing_1_subprog:OK 182/6 preempt_lock/preempt_lock_missing_2_subprog:OK 182/7 preempt_lock/preempt_lock_missing_2_minus_1_subprog:OK 182/8 preempt_lock/preempt_balance:OK 182/9 preempt_lock/preempt_balance_subprog_test:OK 182/10 preempt_lock/preempt_global_subprog_test:OK 182/11 preempt_lock/preempt_sleepable_helper:OK 182 preempt_lock:OK Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240424031315.2757363-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/preempt_lock.c | 9 ++ tools/testing/selftests/bpf/progs/preempt_lock.c | 135 +++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/preempt_lock.c create mode 100644 tools/testing/selftests/bpf/progs/preempt_lock.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/preempt_lock.c b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c new file mode 100644 index 000000000000..02917c672441 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +void test_preempt_lock(void) +{ + RUN_TESTS(preempt_lock); +} diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c new file mode 100644 index 000000000000..6c637ee01ec4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "bpf_misc.h" + +void bpf_preempt_disable(void) __ksym; +void bpf_preempt_enable(void) __ksym; + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("3 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_3(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_enable(); + bpf_preempt_enable(); + return 0; +} + +static __noinline void preempt_disable(void) +{ + bpf_preempt_disable(); +} + +static __noinline void preempt_enable(void) +{ + bpf_preempt_enable(); +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + preempt_enable(); + return 0; +} + +static __noinline void preempt_balance_subprog(void) +{ + preempt_disable(); + preempt_enable(); +} + +SEC("?tc") +__success int preempt_balance(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_enable(); + return 0; +} + +SEC("?tc") +__success int preempt_balance_subprog_test(struct __sk_buff *ctx) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#") +int preempt_sleepable_helper(void *ctx) +{ + u32 data; + + bpf_preempt_disable(); + bpf_copy_from_user(&data, sizeof(data), NULL); + bpf_preempt_enable(); + return 0; +} + +int __noinline preempt_global_subprog(void) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?tc") +__failure __msg("global function calls are not allowed with preemption disabled") +int preempt_global_subprog_test(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_global_subprog(); + preempt_enable(); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 160933e330f4c5a13931d725a4d952a4b9aefa71 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 24 Apr 2024 17:39:59 +0000 Subject: KVM: selftests: Add test for uaccesses to non-existent vgic-v2 CPUIF Assert that accesses to a non-existent vgic-v2 CPU interface consistently fail across the various KVM device attr ioctls. This also serves as a regression test for a bug wherein KVM hits a NULL dereference when the CPUID specified in the ioctl is invalid. Note that there is no need to print the observed errno, as TEST_ASSERT() will take care of it. Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240424173959.3776798-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index eef816b80993..ca917c71ff60 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -84,6 +84,18 @@ static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, return v; } +static struct vm_gic vm_gic_create_barebones(uint32_t gic_dev_type) +{ + struct vm_gic v; + + v.gic_dev_type = gic_dev_type; + v.vm = vm_create_barebones(); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + return v; +} + + static void vm_gic_destroy(struct vm_gic *v) { close(v->gic_fd); @@ -357,6 +369,40 @@ static void test_vcpus_then_vgic(uint32_t gic_dev_type) vm_gic_destroy(&v); } +#define KVM_VGIC_V2_ATTR(offset, cpu) \ + (FIELD_PREP(KVM_DEV_ARM_VGIC_OFFSET_MASK, offset) | \ + FIELD_PREP(KVM_DEV_ARM_VGIC_CPUID_MASK, cpu)) + +#define GIC_CPU_CTRL 0x00 + +static void test_v2_uaccess_cpuif_no_vcpus(void) +{ + struct vm_gic v; + u64 val = 0; + int ret; + + v = vm_gic_create_barebones(KVM_DEV_TYPE_ARM_VGIC_V2); + subtest_dist_rdist(&v); + + ret = __kvm_has_device_attr(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0)); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_get(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + + vm_gic_destroy(&v); +} + static void test_v3_new_redist_regions(void) { struct kvm_vcpu *vcpus[NR_VCPUS]; @@ -675,6 +721,9 @@ void run_tests(uint32_t gic_dev_type) test_vcpus_then_vgic(gic_dev_type); test_vgic_then_vcpus(gic_dev_type); + if (VGIC_DEV_IS_V2(gic_dev_type)) + test_v2_uaccess_cpuif_no_vcpus(); + if (VGIC_DEV_IS_V3(gic_dev_type)) { test_v3_new_redist_regions(); test_v3_typer_accesses(); -- cgit v1.2.3 From 151f7442436658ee84076681d8f52e987fe147ea Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:27 +0800 Subject: selftests/bpf: Fix a fd leak in error paths in open_netns As Martin mentioned in review comment, there is an existing bug that orig_netns_fd will be leaked in the later "goto fail;" case after open("/proc/self/ns/net") in open_netns() in network_helpers.c. This patch adds "close(token->orig_netns_fd);" before "free(token);" to fix it. Fixes: a30338840fa5 ("selftests/bpf: Move open_netns() and close_netns() into network_helpers.c") Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/a104040b47c3c34c67f3f125cdfdde244a870d3c.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 9d63d2ac13d8..04676572fc1e 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -478,6 +478,8 @@ struct nstoken *open_netns(const char *name) return token; fail: + if (token->orig_netns_fd != -1) + close(token->orig_netns_fd); free(token); return NULL; } -- cgit v1.2.3 From 285cffbaa8e6056c2595e07e3a320e55c71870ad Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:28 +0800 Subject: selftests/bpf: Use log_err in open_netns/close_netns ASSERT helpers defined in test_progs.h shouldn't be used in public functions like open_netns() and close_netns(). Since they depend on test__fail() which defined in test_progs.c. Public functions may be used not only in test_progs.c, but in other tests like test_sock_addr.c in the next commit. This patch uses log_err() to replace ASSERT helpers in open_netns() and close_netns() in network_helpers.c to decouple dependencies, then uses ASSERT_OK_PTR() to check the return values of all open_netns(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/d1dad22b2ff4909af3f8bfd0667d046e235303cb.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 19 ++++++++++++++----- tools/testing/selftests/bpf/prog_tests/empty_skb.c | 2 ++ .../selftests/bpf/prog_tests/ip_check_defrag.c | 2 ++ tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_tunnel.c | 4 ++++ tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 16 ++++++++++++++++ 6 files changed, 39 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 04676572fc1e..a9e4905a2115 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -459,22 +459,30 @@ struct nstoken *open_netns(const char *name) struct nstoken *token; token = calloc(1, sizeof(struct nstoken)); - if (!ASSERT_OK_PTR(token, "malloc token")) + if (!token) { + log_err("Failed to malloc token"); return NULL; + } token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY); - if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net")) + if (token->orig_netns_fd == -1) { + log_err("Failed to open(/proc/self/ns/net)"); goto fail; + } snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); nsfd = open(nspath, O_RDONLY | O_CLOEXEC); - if (!ASSERT_GE(nsfd, 0, "open netns fd")) + if (nsfd == -1) { + log_err("Failed to open(%s)", nspath); goto fail; + } err = setns(nsfd, CLONE_NEWNET); close(nsfd); - if (!ASSERT_OK(err, "setns")) + if (err) { + log_err("Failed to setns(nsfd)"); goto fail; + } return token; fail: @@ -489,7 +497,8 @@ void close_netns(struct nstoken *token) if (!token) return; - ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns"); + if (setns(token->orig_netns_fd, CLONE_NEWNET)) + log_err("Failed to setns(orig_netns_fd)"); close(token->orig_netns_fd); free(token); } diff --git a/tools/testing/selftests/bpf/prog_tests/empty_skb.c b/tools/testing/selftests/bpf/prog_tests/empty_skb.c index 261228eb68e8..438583e1f2d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/empty_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/empty_skb.c @@ -94,6 +94,8 @@ void test_empty_skb(void) SYS(out, "ip netns add empty_skb"); tok = open_netns("empty_skb"); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add veth0 type veth peer veth1"); SYS(out, "ip link set dev veth0 up"); SYS(out, "ip link set dev veth1 up"); diff --git a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c index 8dd2af9081f4..284764e7179f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c @@ -88,6 +88,8 @@ static int attach(struct ip_check_defrag *skel, bool ipv6) int err = -1; nstoken = open_netns(NS1); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto out; skel->links.defrag = bpf_program__attach_netfilter(skel->progs.defrag, &opts); if (!ASSERT_OK_PTR(skel->links.defrag, "program attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index dbe06aeaa2b2..b1073d36d77a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -530,7 +530,7 @@ static int wait_netstamp_needed_key(void) __u64 tstamp = 0; nstoken = open_netns(NS_DST); - if (!nstoken) + if (!ASSERT_OK_PTR(nstoken, "setns dst")) return -1; srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0); diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 5f1fb0a2ea56..cec746e77cd3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -612,6 +612,8 @@ static void test_ipip_tunnel(enum ipip_encap encap) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); if (!ASSERT_OK(err, "test_ping")) goto done; @@ -666,6 +668,8 @@ static void test_xfrm_tunnel(void) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); close_netns(nstoken); if (!ASSERT_OK(err, "test_ping")) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 05edcf32f528..f76b5d67a3ee 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -384,6 +384,8 @@ void test_xdp_metadata(void) SYS(out, "ip netns add " RX_NETNS_NAME); tok = open_netns(TX_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add numtxqueues 1 numrxqueues 1 " TX_NAME " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1"); SYS(out, "ip link set " RX_NAME " netns " RX_NETNS_NAME); @@ -400,6 +402,8 @@ void test_xdp_metadata(void) SYS(out, "ip -4 neigh add " RX_ADDR " lladdr " RX_MAC " dev " TX_NAME_VLAN); switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; SYS(out, "ip link set dev " RX_NAME " address " RX_MAC); SYS(out, "ip link set dev " RX_NAME " up"); @@ -449,6 +453,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Setup separate AF_XDP for TX interface nad send packet to the RX socket. */ tx_ifindex = if_nametoindex(TX_NAME); @@ -461,6 +467,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; /* Verify packet sent from AF_XDP has proper metadata. */ if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, true), 0, @@ -468,6 +476,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; complete_tx(&tx_xsk); /* Now check metadata of packet, generated with network stack */ @@ -475,6 +485,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, false), 0, "verify_xsk_metadata")) @@ -498,6 +510,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Send packet to trigger . */ if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0, @@ -505,6 +519,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; while (!retries--) { if (bpf_obj2->bss->called) -- cgit v1.2.3 From e1cdb70d075e02b1f410b8446a8ff959fa15f0ee Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:29 +0800 Subject: selftests/bpf: Use start_server_addr in test_sock_addr Include network_helpers.h in test_sock_addr.c, use the newly added public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. In order to use functions defined in network_helpers.c in test_sock_addr.c, Makefile needs to be updated and needs to be included in network_helpers.h to avoid compilation errors. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/3101f57bde5502383eb41723c8956cc26be06893.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 3 ++- tools/testing/selftests/bpf/network_helpers.h | 1 + tools/testing/selftests/bpf/test_sock_addr.c | 38 +++------------------------ 3 files changed, 7 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index edc73f8f5aef..0ce5c3c26b5c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -290,11 +290,12 @@ UNPRIV_HELPERS := $(OUTPUT)/unpriv_helpers.o TRACE_HELPERS := $(OUTPUT)/trace_helpers.o JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o +NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(NETWORK_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index aef297dfa6ca..5a8c5cf4ec1a 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -11,6 +11,7 @@ typedef __u16 __sum16; #include #include #include +#include #include #include #include diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 80c42583f597..ca4fb142b7b7 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -19,6 +19,7 @@ #include #include "cgroup_helpers.h" +#include "network_helpers.h" #include "testing_helpers.h" #include "bpf_util.h" @@ -939,37 +940,6 @@ static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); } -static int start_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int fd; - - fd = socket(addr->ss_family, type, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (bind(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (type == SOCK_STREAM) { - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - } - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static int connect_to_server(int type, const struct sockaddr_storage *addr, socklen_t addr_len) { @@ -1178,7 +1148,7 @@ static int run_bind_test_case(const struct sock_addr_test *test) if (init_addrs(test, &requested_addr, &expected_addr, NULL)) goto err; - servfd = start_server(test->type, &requested_addr, addr_len); + servfd = start_server_addr(test->type, &requested_addr, addr_len, NULL); if (servfd == -1) goto err; @@ -1214,7 +1184,7 @@ static int run_connect_test_case(const struct sock_addr_test *test) goto err; /* Prepare server to connect to */ - servfd = start_server(test->type, &expected_addr, addr_len); + servfd = start_server_addr(test->type, &expected_addr, addr_len, NULL); if (servfd == -1) goto err; @@ -1271,7 +1241,7 @@ static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg) goto err; /* Prepare server to sendmsg to */ - servfd = start_server(test->type, &server_addr, addr_len); + servfd = start_server_addr(test->type, &server_addr, addr_len, NULL); if (servfd == -1) goto err; -- cgit v1.2.3 From c6c40798428180516df80ce89da7bbfe1f6a828a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:30 +0800 Subject: selftests/bpf: Use connect_to_addr in test_sock_addr This patch uses public network helper connect_to_addr() exported in network_helpers.h instead of the local defined function connect_to_server() in test_sock_addr.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/f263797712d93fdfaf2943585c5dfae56714a00b.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_sock_addr.c | 36 ++-------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index ca4fb142b7b7..2e29dd9d8fc3 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -940,38 +940,6 @@ static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); } -static int connect_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Fail to connect to server"); - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - int init_pktinfo(int domain, struct cmsghdr *cmsg) { struct in6_pktinfo *pktinfo6; @@ -1156,7 +1124,7 @@ static int run_bind_test_case(const struct sock_addr_test *test) goto err; /* Try to connect to server just in case */ - clientfd = connect_to_server(test->type, &expected_addr, addr_len); + clientfd = connect_to_addr(test->type, &expected_addr, addr_len, NULL); if (clientfd == -1) goto err; @@ -1188,7 +1156,7 @@ static int run_connect_test_case(const struct sock_addr_test *test) if (servfd == -1) goto err; - clientfd = connect_to_server(test->type, &requested_addr, addr_len); + clientfd = connect_to_addr(test->type, &requested_addr, addr_len, NULL); if (clientfd == -1) goto err; -- cgit v1.2.3 From e4c68bbaff1153e8730c16f566c78a25b2046372 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:31 +0800 Subject: selftests/bpf: Use make_sockaddr in test_sock_addr This patch uses public helper make_sockaddr() exported in network_helpers.h instead of the local defined function mk_sockaddr() in test_sock_addr.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/1473e189d6ca1a3925de4c5354d191a14eca0f3f.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_sock_addr.c | 64 ++++++---------------------- 1 file changed, 12 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 2e29dd9d8fc3..c412de84b88f 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -605,44 +605,6 @@ static struct sock_addr_test tests[] = { }, }; -static int mk_sockaddr(int domain, const char *ip, unsigned short port, - struct sockaddr *addr, socklen_t addr_len) -{ - struct sockaddr_in6 *addr6; - struct sockaddr_in *addr4; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - return -1; - } - - memset(addr, 0, addr_len); - - if (domain == AF_INET) { - if (addr_len < sizeof(struct sockaddr_in)) - return -1; - addr4 = (struct sockaddr_in *)addr; - addr4->sin_family = domain; - addr4->sin_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) { - log_err("Invalid IPv4: %s", ip); - return -1; - } - } else if (domain == AF_INET6) { - if (addr_len < sizeof(struct sockaddr_in6)) - return -1; - addr6 = (struct sockaddr_in6 *)addr; - addr6->sin6_family = domain; - addr6->sin6_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - } - - return 0; -} - static int load_insns(const struct sock_addr_test *test, const struct bpf_insn *insns, size_t insns_cnt) { @@ -757,9 +719,9 @@ static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) return -1; } - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr *)&dst4_rw_addr, - sizeof(dst4_rw_addr)) == -1) + if (make_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + (struct sockaddr_storage *)&dst4_rw_addr, + NULL) == -1) return -1; struct bpf_insn insns[] = { @@ -820,9 +782,9 @@ static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, return -1; } - if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, - (struct sockaddr *)&dst6_rw_addr, - sizeof(dst6_rw_addr)) == -1) + if (make_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, + (struct sockaddr_storage *)&dst6_rw_addr, + NULL) == -1) return -1; struct bpf_insn insns[] = { @@ -1084,19 +1046,17 @@ static int init_addrs(const struct sock_addr_test *test, struct sockaddr_storage *expected_addr, struct sockaddr_storage *expected_src_addr) { - socklen_t addr_len = sizeof(struct sockaddr_storage); - - if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port, - (struct sockaddr *)expected_addr, addr_len) == -1) + if (make_sockaddr(test->domain, test->expected_ip, test->expected_port, + expected_addr, NULL) == -1) goto err; - if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port, - (struct sockaddr *)requested_addr, addr_len) == -1) + if (make_sockaddr(test->domain, test->requested_ip, test->requested_port, + requested_addr, NULL) == -1) goto err; if (test->expected_src_ip && - mk_sockaddr(test->domain, test->expected_src_ip, 0, - (struct sockaddr *)expected_src_addr, addr_len) == -1) + make_sockaddr(test->domain, test->expected_src_ip, 0, + expected_src_addr, NULL) == -1) goto err; return 0; -- cgit v1.2.3 From 82e38a505c9868e784ec31e743fd8a9fa5ca1084 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 24 Apr 2024 13:57:18 -0700 Subject: selftests/bpf: Fix wq test. The wq test was missing destroy(skel) part which was causing bpf progs to stay loaded. That was causing test_progs to complain with "Failed to unload bpf_testmod.ko from kernel: -11" message, but adding destroy() wasn't enough, since wq callback may be delayed, so loop on unload of bpf_testmod if errno is EAGAIN. Acked-by: Andrii Nakryiko Fixes: 8290dba51910 ("selftests/bpf: wq: add bpf_wq_start() checks") Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/wq.c | 1 + tools/testing/selftests/bpf/testing_helpers.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 8a4a91d944cc..c4bacd3160e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -31,6 +31,7 @@ void serial_test_wq(void) usleep(50); /* 10 usecs should be enough, but give it extra */ ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); + wq__destroy(wq_skel); } void serial_test_failures_wq(void) diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 28b6646662af..d5379a0e6da8 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -368,9 +368,23 @@ int delete_module(const char *name, int flags) int unload_bpf_testmod(bool verbose) { + int ret, cnt = 0; + if (kern_sync_rcu()) fprintf(stdout, "Failed to trigger kernel-side RCU sync!\n"); - if (delete_module("bpf_testmod", 0)) { + + for (;;) { + ret = delete_module("bpf_testmod", 0); + if (!ret || errno != EAGAIN) + break; + if (++cnt > 10000) { + fprintf(stdout, "Unload of bpf_testmod timed out\n"); + break; + } + usleep(100); + } + + if (ret) { if (errno == ENOENT) { if (verbose) fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); -- cgit v1.2.3 From 91541ab192fc7f573e6c711ba9c2ae22a299c408 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:23 -0700 Subject: selftests: bpf: crypto skcipher algo selftests Add simple tc hook selftests to show the way to work with new crypto BPF API. Some tricky dynptr initialization is used to provide empty iv dynptr. Simple AES-ECB algo is used to demonstrate encryption and decryption of fixed size buffers. Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240422225024.2847039-4-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/config | 5 + .../selftests/bpf/prog_tests/crypto_sanity.c | 197 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/crypto_basic.c | 68 +++++++ tools/testing/selftests/bpf/progs/crypto_common.h | 66 +++++++ tools/testing/selftests/bpf/progs/crypto_sanity.c | 169 ++++++++++++++++++ 5 files changed, 505 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/crypto_sanity.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_basic.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_common.h create mode 100644 tools/testing/selftests/bpf/progs/crypto_sanity.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index afd675b1bf80..eeabd798bc3a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -13,7 +13,12 @@ CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_USER_API=y CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_USER_API_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_AES=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF4=y diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c new file mode 100644 index 000000000000..b1a3a49a822a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include + +#include "test_progs.h" +#include "network_helpers.h" +#include "crypto_sanity.skel.h" +#include "crypto_basic.skel.h" + +#define NS_TEST "crypto_sanity_ns" +#define IPV6_IFACE_ADDR "face::1" +static const unsigned char crypto_key[] = "testtest12345678"; +static const char plain_text[] = "stringtoencrypt0"; +static int opfd = -1, tfmfd = -1; +static const char algo[] = "ecb(aes)"; +static int init_afalg(void) +{ + struct sockaddr_alg sa = { + .salg_family = AF_ALG, + .salg_type = "skcipher", + .salg_name = "ecb(aes)" + }; + + tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (tfmfd == -1) + return errno; + if (bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) == -1) + return errno; + if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, crypto_key, 16) == -1) + return errno; + opfd = accept(tfmfd, NULL, 0); + if (opfd == -1) + return errno; + return 0; +} + +static void deinit_afalg(void) +{ + if (tfmfd != -1) + close(tfmfd); + if (opfd != -1) + close(opfd); +} + +static void do_crypt_afalg(const void *src, void *dst, int size, bool encrypt) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char cbuf[CMSG_SPACE(4)] = {0}; + struct iovec iov; + + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_OP; + cmsg->cmsg_len = CMSG_LEN(4); + *(__u32 *)CMSG_DATA(cmsg) = encrypt ? ALG_OP_ENCRYPT : ALG_OP_DECRYPT; + + iov.iov_base = (char *)src; + iov.iov_len = size; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + sendmsg(opfd, &msg, 0); + read(opfd, dst, size); +} + +void test_crypto_basic(void) +{ + RUN_TESTS(crypto_basic); +} + +void test_crypto_sanity(void) +{ + LIBBPF_OPTS(bpf_tc_hook, qdisc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_enc); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_dec); + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct nstoken *nstoken = NULL; + struct crypto_sanity *skel; + char afalg_plain[16] = {0}; + char afalg_dst[16] = {0}; + struct sockaddr_in6 addr; + int sockfd, err, pfd; + socklen_t addrlen; + u16 udp_test_port; + + skel = crypto_sanity__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open")) + return; + + SYS(fail, "ip netns add %s", NS_TEST); + SYS(fail, "ip -net %s -6 addr add %s/128 dev lo nodad", NS_TEST, IPV6_IFACE_ADDR); + SYS(fail, "ip -net %s link set dev lo up", NS_TEST); + + nstoken = open_netns(NS_TEST); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto fail; + + err = init_afalg(); + if (!ASSERT_OK(err, "AF_ALG init fail")) + goto fail; + + qdisc_hook.ifindex = if_nametoindex("lo"); + if (!ASSERT_GT(qdisc_hook.ifindex, 0, "if_nametoindex lo")) + goto fail; + + skel->bss->key_len = 16; + skel->bss->authsize = 0; + udp_test_port = skel->data->udp_test_port; + memcpy(skel->bss->key, crypto_key, sizeof(crypto_key)); + snprintf(skel->bss->algo, 128, "%s", algo); + pfd = bpf_program__fd(skel->progs.skb_crypto_setup); + if (!ASSERT_GT(pfd, 0, "skb_crypto_setup fd")) + goto fail; + + err = bpf_prog_test_run_opts(pfd, &opts); + if (!ASSERT_OK(err, "skb_crypto_setup") || + !ASSERT_OK(opts.retval, "skb_crypto_setup retval")) + goto fail; + + if (!ASSERT_OK(skel->bss->status, "skb_crypto_setup status")) + goto fail; + + err = bpf_tc_hook_create(&qdisc_hook); + if (!ASSERT_OK(err, "create qdisc hook")) + goto fail; + + addrlen = sizeof(addr); + err = make_sockaddr(AF_INET6, IPV6_IFACE_ADDR, udp_test_port, + (void *)&addr, &addrlen); + if (!ASSERT_OK(err, "make_sockaddr")) + goto fail; + + tc_attach_enc.prog_fd = bpf_program__fd(skel->progs.encrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "attach encrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "encrypt socket")) + goto fail; + err = sendto(sockfd, plain_text, sizeof(plain_text), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(plain_text), "encrypt send")) + goto fail; + + do_crypt_afalg(plain_text, afalg_dst, sizeof(afalg_dst), true); + + if (!ASSERT_OK(skel->bss->status, "encrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_dst, sizeof(afalg_dst), "encrypt AF_ALG")) + goto fail; + + tc_attach_enc.flags = tc_attach_enc.prog_fd = tc_attach_enc.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "bpf_tc_detach encrypt")) + goto fail; + + tc_attach_dec.prog_fd = bpf_program__fd(skel->progs.decrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_dec); + if (!ASSERT_OK(err, "attach decrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "decrypt socket")) + goto fail; + err = sendto(sockfd, afalg_dst, sizeof(afalg_dst), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(afalg_dst), "decrypt send")) + goto fail; + + do_crypt_afalg(afalg_dst, afalg_plain, sizeof(afalg_plain), false); + + if (!ASSERT_OK(skel->bss->status, "decrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_plain, sizeof(afalg_plain), "decrypt AF_ALG")) + goto fail; + + tc_attach_dec.flags = tc_attach_dec.prog_fd = tc_attach_dec.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_dec); + ASSERT_OK(err, "bpf_tc_detach decrypt"); + +fail: + close_netns(nstoken); + deinit_afalg(); + SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null"); + crypto_sanity__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/crypto_basic.c b/tools/testing/selftests/bpf/progs/crypto_basic.c new file mode 100644 index 000000000000..8cf7168b42d5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_basic.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +int status; +SEC("syscall") +int crypto_release(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +SEC("syscall") +__failure __msg("Unreleased reference") +int crypto_acquire(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + cctx = bpf_crypto_ctx_acquire(cctx); + if (!cctx) + return -EINVAL; + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/crypto_common.h b/tools/testing/selftests/bpf/progs/crypto_common.h new file mode 100644 index 000000000000..57dd7a68a8c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_common.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#ifndef _CRYPTO_COMMON_H +#define _CRYPTO_COMMON_H + +#include "errno.h" +#include + +struct bpf_crypto_ctx *bpf_crypto_ctx_create(const struct bpf_crypto_params *params, + u32 params__sz, int *err) __ksym; +struct bpf_crypto_ctx *bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) __ksym; +void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) __ksym; +int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; +int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; + +struct __crypto_ctx_value { + struct bpf_crypto_ctx __kptr * ctx; +}; + +struct array_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct __crypto_ctx_value); + __uint(max_entries, 1); +} __crypto_ctx_map SEC(".maps"); + +static inline struct __crypto_ctx_value *crypto_ctx_value_lookup(void) +{ + u32 key = 0; + + return bpf_map_lookup_elem(&__crypto_ctx_map, &key); +} + +static inline int crypto_ctx_insert(struct bpf_crypto_ctx *ctx) +{ + struct __crypto_ctx_value local, *v; + struct bpf_crypto_ctx *old; + u32 key = 0; + int err; + + local.ctx = NULL; + err = bpf_map_update_elem(&__crypto_ctx_map, &key, &local, 0); + if (err) { + bpf_crypto_ctx_release(ctx); + return err; + } + + v = bpf_map_lookup_elem(&__crypto_ctx_map, &key); + if (!v) { + bpf_crypto_ctx_release(ctx); + return -ENOENT; + } + + old = bpf_kptr_xchg(&v->ctx, ctx); + if (old) { + bpf_crypto_ctx_release(old); + return -EEXIST; + } + + return 0; +} + +#endif /* _CRYPTO_COMMON_H */ diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c new file mode 100644 index 000000000000..1be0a3fa5efd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +unsigned char key[256] = {}; +u16 udp_test_port = 7777; +u32 authsize, key_len; +char algo[128] = {}; +char dst[16] = {}; +int status; + +static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) +{ + struct ipv6hdr ip6h; + struct udphdr udph; + u32 offset; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6)) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h))) + return -1; + + if (ip6h.nexthdr != IPPROTO_UDP) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(ip6h), &udph, sizeof(udph))) + return -1; + + if (udph.dest != __bpf_htons(udp_test_port)) + return -1; + + offset = ETH_HLEN + sizeof(ip6h) + sizeof(udph); + if (skb->len < offset + 16) + return -1; + + /* let's make sure that 16 bytes of payload are in the linear part of skb */ + bpf_skb_pull_data(skb, offset + 16); + bpf_dynptr_from_skb(skb, 0, psrc); + bpf_dynptr_adjust(psrc, offset, offset + 16); + + return 0; +} + +SEC("syscall") +int skb_crypto_setup(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + if (key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int decrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +SEC("tc") +int encrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + status = 0; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 8000e627dc98efc44658af6150fd81c62d936b1b Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:24 -0700 Subject: selftests: bpf: crypto: add benchmark for crypto functions Some simple benchmarks are added to understand the baseline of performance. Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240422225024.2847039-5-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/bench.c | 6 + .../selftests/bpf/benchs/bench_bpf_crypto.c | 185 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/crypto_bench.c | 109 ++++++++++++ 4 files changed, 302 insertions(+) create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_bench.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0ce5c3c26b5c..ff1af05add6e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -730,6 +730,7 @@ $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tas $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h +$(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -749,6 +750,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_hashmap_lookup.o \ $(OUTPUT)/bench_local_storage_create.o \ $(OUTPUT)/bench_htab_mem.o \ + $(OUTPUT)/bench_bpf_crypto.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 82de56c8162e..627b74ae041b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -281,6 +281,7 @@ extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; extern struct argp bench_trigger_batch_argp; +extern struct argp bench_crypto_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -294,6 +295,7 @@ static const struct argp_child bench_parsers[] = { { &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 }, { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, + { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, {}, }; @@ -538,6 +540,8 @@ extern const struct bench bench_local_storage_tasks_trace; extern const struct bench bench_bpf_hashmap_lookup; extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; +extern const struct bench bench_crypto_encrypt; +extern const struct bench bench_crypto_decrypt; static const struct bench *benchs[] = { &bench_count_global, @@ -590,6 +594,8 @@ static const struct bench *benchs[] = { &bench_bpf_hashmap_lookup, &bench_local_storage_create, &bench_htab_mem, + &bench_crypto_encrypt, + &bench_crypto_decrypt, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c new file mode 100644 index 000000000000..2845edaba8db --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include "bench.h" +#include "crypto_bench.skel.h" + +#define MAX_CIPHER_LEN 32 +static char *input; +static struct crypto_ctx { + struct crypto_bench *skel; + int pfd; +} ctx; + +static struct crypto_args { + u32 crypto_len; + char *crypto_cipher; +} args = { + .crypto_len = 16, + .crypto_cipher = "ecb(aes)", +}; + +enum { + ARG_CRYPTO_LEN = 5000, + ARG_CRYPTO_CIPHER = 5001, +}; + +static const struct argp_option opts[] = { + { "crypto-len", ARG_CRYPTO_LEN, "CRYPTO_LEN", 0, + "Set the length of crypto buffer" }, + { "crypto-cipher", ARG_CRYPTO_CIPHER, "CRYPTO_CIPHER", 0, + "Set the cipher to use (default:ecb(aes))" }, + {}, +}; + +static error_t crypto_parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_CRYPTO_LEN: + args.crypto_len = strtoul(arg, NULL, 10); + if (!args.crypto_len || + args.crypto_len > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "Invalid crypto buffer len (limit %zu)\n", + sizeof(ctx.skel->bss->dst)); + argp_usage(state); + } + break; + case ARG_CRYPTO_CIPHER: + args.crypto_cipher = strdup(arg); + if (!strlen(args.crypto_cipher) || + strlen(args.crypto_cipher) > MAX_CIPHER_LEN) { + fprintf(stderr, "Invalid crypto cipher len (limit %d)\n", + MAX_CIPHER_LEN); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_crypto_argp = { + .options = opts, + .parser = crypto_parse_arg, +}; + +static void crypto_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "bpf crypto benchmark doesn't support consumer!\n"); + exit(1); + } +} + +static void crypto_setup(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + + int err, pfd; + size_t i, sz; + + sz = args.crypto_len; + if (!sz || sz > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "invalid encrypt buffer size (source %zu, target %zu)\n", + sz, sizeof(ctx.skel->bss->dst)); + exit(1); + } + + setup_libbpf(); + + ctx.skel = crypto_bench__open(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + snprintf(ctx.skel->bss->cipher, 128, "%s", args.crypto_cipher); + memcpy(ctx.skel->bss->key, "12345678testtest", 16); + ctx.skel->bss->key_len = 16; + ctx.skel->bss->authsize = 0; + + srandom(time(NULL)); + input = malloc(sz); + for (i = 0; i < sz - 1; i++) + input[i] = '1' + random() % 9; + input[sz - 1] = '\0'; + + ctx.skel->rodata->len = args.crypto_len; + + err = crypto_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + pfd = bpf_program__fd(ctx.skel->progs.crypto_setup); + if (pfd < 0) { + fprintf(stderr, "failed to get fd for setup prog\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + err = bpf_prog_test_run_opts(pfd, &opts); + if (err || ctx.skel->bss->status) { + fprintf(stderr, "failed to run setup prog: err %d, status %d\n", + err, ctx.skel->bss->status); + crypto_bench__destroy(ctx.skel); + exit(1); + } +} + +static void crypto_encrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_encrypt); +} + +static void crypto_decrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_decrypt); +} + +static void crypto_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void *crypto_producer(void *unused) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .repeat = 64, + .data_in = input, + .data_size_in = args.crypto_len, + ); + + while (true) + (void)bpf_prog_test_run_opts(ctx.pfd, &opts); + return NULL; +} + +const struct bench bench_crypto_encrypt = { + .name = "crypto-encrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_encrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_crypto_decrypt = { + .name = "crypto-decrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_decrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c new file mode 100644 index 000000000000..e61fe0882293 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_bench.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +const volatile unsigned int len = 16; +char cipher[128] = {}; +u32 key_len, authsize; +char dst[256] = {}; +u8 key[256] = {}; +long hits = 0; +int status; + +SEC("syscall") +int crypto_setup(void *args) +{ + struct bpf_crypto_ctx *cctx; + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + int err = 0; + + status = 0; + + if (!cipher[0] || !key_len || key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, cipher, sizeof(cipher)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int crypto_encrypt(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return 0; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return 0; + } + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +SEC("tc") +int crypto_decrypt(struct __sk_buff *skb) +{ + struct bpf_dynptr psrc, pdst, iv; + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + + v = crypto_ctx_value_lookup(); + if (!v) + return -ENOENT; + + ctx = v->ctx; + if (!ctx) + return -ENOENT; + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 546fb63fe85e52c178684d4a7fac161708cdf34a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Apr 2024 15:05:44 +0200 Subject: selftests: netfilter: nft_concat_range.sh: move to lib.sh infra Use busywait helper instead of unconditional sleep, reduces run time from 6m to 2:30 on my system. The busywait helper calls the function passed to it as argument; disable the shellcheck test for unreachable code, it generates many (false) warnings here. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240423130604.7013-2-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_concat_range.sh | 62 +++++++++++++--------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index e908009576c7..877c9d3777d2 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # nft_concat_range.sh - Tests for sets with concatenation of ranged fields @@ -7,10 +7,10 @@ # # Author: Stefano Brivio # -# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031 +# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031,SC2317 # ^ Configuration and templates sourced with eval, counters reused in subshells -KSELFTEST_SKIP=4 +source lib.sh # Available test groups: # - reported_issues: check for issues that were reported in the past @@ -473,8 +473,6 @@ setup_veth() { B() { ip netns exec B "$@" >/dev/null 2>&1 } - - sleep 2 } # Fill in set template and initialise set @@ -679,10 +677,17 @@ setup_send_udp6() { fi } +listener_ready() +{ + port="$1" + ss -lnt -o "sport = :$port" | grep -q "$port" +} + # Set up function to send TCP traffic on IPv4 setup_flood_tcp() { if command -v iperf3 >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -699,7 +704,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \ @@ -711,6 +716,7 @@ setup_flood_tcp() { } elif command -v iperf >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -727,7 +733,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \ @@ -739,6 +745,7 @@ setup_flood_tcp() { } elif command -v netperf >/dev/null; then flood_tcp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -755,7 +762,7 @@ setup_flood_tcp() { # shellcheck disable=SC2086 # this needs split options netserver -4 ${dst_port} -L "${dst_addr4}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" # shellcheck disable=SC2086 # this needs split options B netperf -4 -H "${dst_addr4}" ${dst_port} \ @@ -774,6 +781,7 @@ setup_flood_tcp() { setup_flood_tcp6() { if command -v iperf3 >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -790,7 +798,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}" # shellcheck disable=SC2086 # this needs split options B iperf3 -c "${dst_addr6}" ${dst_port} \ @@ -802,6 +810,7 @@ setup_flood_tcp6() { } elif command -v iperf >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -818,7 +827,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -c "${dst_addr6}" -V ${dst_port} \ @@ -830,6 +839,7 @@ setup_flood_tcp6() { } elif command -v netperf >/dev/null; then flood_tcp6() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr6}" ]; then B ip addr add "${src_addr6}" dev veth_b nodad @@ -846,7 +856,7 @@ setup_flood_tcp6() { # shellcheck disable=SC2086 # this needs split options netserver -6 ${dst_port} -L "${dst_addr6}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B netperf -6 -H "${dst_addr6}" ${dst_port} \ @@ -865,6 +875,7 @@ setup_flood_tcp6() { setup_flood_udp() { if command -v iperf3 >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -881,7 +892,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options iperf3 -s -DB "${dst_addr4}" ${dst_port} - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \ @@ -893,6 +904,7 @@ setup_flood_udp() { } elif command -v iperf >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -909,7 +921,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \ @@ -921,6 +933,7 @@ setup_flood_udp() { } elif command -v netperf >/dev/null; then flood_udp() { + local n_port="${dst_port}" [ -n "${dst_port}" ] && dst_port="-p ${dst_port}" if [ -n "${src_addr4}" ]; then B ip addr add "${src_addr4}/16" dev veth_b @@ -937,7 +950,7 @@ setup_flood_udp() { # shellcheck disable=SC2086 # this needs split options netserver -4 ${dst_port} -L "${dst_addr4}" \ >/dev/null 2>&1 - sleep 2 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port" # shellcheck disable=SC2086 # this needs split options B netperf -4 -H "${dst_addr4}" ${dst_port} \ @@ -990,14 +1003,13 @@ cleanup() { killall netperf 2>/dev/null killall netserver 2>/dev/null rm -f ${tmp} - sleep 2 } # Entry point for setup functions setup() { if [ "$(id -u)" -ne 0 ]; then echo " need to run as root" - exit ${KSELFTEST_SKIP} + exit ${ksft_skip} fi cleanup @@ -1258,7 +1270,7 @@ send_nomatch() { # - check that packets outside range don't match it # - remove some elements, check that packets don't match anymore test_correctness() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} range_size=1 for i in $(seq "${start}" $((start + count))); do @@ -1307,7 +1319,7 @@ test_concurrency() { proto=${flood_proto} tools=${flood_tools} chain_spec=${flood_spec} - setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth flood_"${proto}" set || return ${ksft_skip} range_size=1 cstart=${start} @@ -1325,7 +1337,7 @@ test_concurrency() { start=$((end + range_size)) done - sleep 10 + sleep $((RANDOM%10)) pids= for c in $(seq 1 "$(nproc)"); do ( @@ -1407,7 +1419,7 @@ test_concurrency() { # - add all the elements with 3s timeout while checking that packets match # - wait 3s after the last insertion, check that packets don't match any entry test_timeout() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} timeout=3 range_size=1 @@ -1450,7 +1462,7 @@ test_performance() { chain_spec=${perf_spec} dst="${perf_dst}" src="${perf_src}" - setup veth perf set || return ${KSELFTEST_SKIP} + setup veth perf set || return ${ksft_skip} first=${start} range_size=1 @@ -1523,7 +1535,7 @@ test_bug_flush_remove_add() { elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' for i in `seq 1 100`; do - nft add table t ${set_cmd} || return ${KSELFTEST_SKIP} + nft add table t ${set_cmd} || return ${ksft_skip} nft add element t s ${elem1} 2>/dev/null || return 1 nft flush set t s 2>/dev/null || return 1 nft add element t s ${elem2} 2>/dev/null || return 1 @@ -1534,7 +1546,7 @@ test_bug_flush_remove_add() { # - add ranged element, check that packets match it # - reload the set, check packets still match test_bug_reload() { - setup veth send_"${proto}" set || return ${KSELFTEST_SKIP} + setup veth send_"${proto}" set || return ${ksft_skip} rstart=${start} range_size=1 @@ -1635,11 +1647,11 @@ for name in ${TESTS}; do printf "[FAIL]\n" err_flush exit 1 - elif [ $ret -eq ${KSELFTEST_SKIP} ]; then + elif [ $ret -eq ${ksft_skip} ]; then printf "[SKIP]\n" err_flush fi done done -[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0 +[ ${passed} -eq 0 ] && exit ${ksft_skip} || exit 0 -- cgit v1.2.3 From ba6fbd383c12dfe6833968e3555ada422720a76f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Apr 2024 15:05:45 +0200 Subject: selftests: netfilter: nft_concat_range.sh: drop netcat support Tests fail on my workstation with netcat 110, instead of debugging+more workarounds just remove this. Tests will fall back to bash or socat. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240423130604.7013-3-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_concat_range.sh | 74 ++++------------------ 1 file changed, 13 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 877c9d3777d2..2160de014525 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -66,7 +66,7 @@ src start 1 count 5 src_delta 2000 -tools sendip nc bash +tools sendip bash proto udp race_repeat 3 @@ -91,7 +91,7 @@ src start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 3 @@ -116,7 +116,7 @@ src start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 3 @@ -141,7 +141,7 @@ src start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -163,7 +163,7 @@ src mac start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 0 @@ -185,7 +185,7 @@ src mac proto start 10 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp6 race_repeat 0 @@ -207,7 +207,7 @@ src addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 3 @@ -227,7 +227,7 @@ src addr6 port start 10 count 5 src_delta 2000 -tools sendip socat nc +tools sendip socat proto udp6 race_repeat 3 @@ -247,7 +247,7 @@ src mac proto addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -264,7 +264,7 @@ src mac start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -286,7 +286,7 @@ src mac addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -337,7 +337,7 @@ src addr4 start 1 count 5 src_delta 2000 -tools sendip socat nc +tools sendip socat proto udp race_repeat 3 @@ -363,7 +363,7 @@ src mac start 1 count 1 src_delta 2000 -tools sendip socat nc bash +tools sendip socat bash proto udp race_repeat 0 @@ -486,12 +486,6 @@ check_tools() { __tools= for tool in ${tools}; do - if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \ - ! nc -u -w0 1.1.1.1 1 2>/dev/null; then - # Some GNU netcat builds might not support IPv6 - __tools="${__tools} netcat-openbsd" - continue - fi __tools="${__tools} ${tool}" command -v "${tool}" >/dev/null && return 0 @@ -554,29 +548,6 @@ setup_send_udp() { echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}" - src_addr4= - src_port= - } - elif command -v nc >/dev/null; then - if nc -u -w0 1.1.1.1 1 2>/dev/null; then - # OpenBSD netcat - nc_opt="-w0" - else - # GNU netcat - nc_opt="-q0" - fi - - send_udp() { - if [ -n "${src_addr4}" ]; then - B ip addr add "${src_addr4}" dev veth_b - __src_addr4="-s ${src_addr4}" - fi - ip addr add "${dst_addr4}" dev veth_a 2>/dev/null - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \ - "${src_port}" "${dst_addr4}" "${dst_port}" - src_addr4= src_port= } @@ -645,25 +616,6 @@ setup_send_udp6() { echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}" } - elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then - # GNU netcat might not work with IPv6, try next tool - send_udp6() { - ip -6 addr add "${dst_addr6}" dev veth_a nodad \ - 2>/dev/null - if [ -n "${src_addr6}" ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - else - src_addr6="2001:db8::2" - fi - [ -n "${src_port}" ] && src_port="-p ${src_port}" - - # shellcheck disable=SC2086 # this needs split options - echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \ - ${dst_addr6} ${dst_port} - - src_addr6= - src_port= - } elif [ -z "$(bash -c 'type -p')" ]; then send_udp6() { ip -6 addr add "${dst_addr6}" dev veth_a nodad \ -- cgit v1.2.3 From c54fa6ae35b9752f557d47f1f5df1500f5b6f2df Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Apr 2024 15:05:46 +0200 Subject: selftests: netfilter: nft_concat_range.sh: shellcheck cleanups no functional changes intended. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240423130604.7013-4-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_concat_range.sh | 53 ++++++++++------------ 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 2160de014525..2b6661519055 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -546,7 +546,7 @@ setup_send_udp() { ip addr add "${dst_addr4}" dev veth_a 2>/dev/null [ -z "${dst_port}" ] && dst_port=12345 - echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}" + echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:"$dst_addr4":"$dst_port""${__socatbind}" src_addr4= src_port= @@ -601,11 +601,7 @@ setup_send_udp6() { __socatbind6= if [ -n "${src_addr6}" ]; then - if [ -n "${src_addr6} != "${src_addr6_added} ]; then - B ip addr add "${src_addr6}" dev veth_b nodad - - src_addr6_added=${src_addr6} - fi + B ip addr add "${src_addr6}" dev veth_b nodad __socatbind6=",bind=[${src_addr6}]" @@ -614,7 +610,7 @@ setup_send_udp6() { fi fi - echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}" + echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:["$dst_addr6"]:"$dst_port""${__socatbind6}" } elif [ -z "$(bash -c 'type -p')" ]; then send_udp6() { @@ -947,6 +943,7 @@ cleanup() { ip link del dummy0 2>/dev/null ip route del default 2>/dev/null ip -6 route del default 2>/dev/null + ip netns pids B 2>/dev/null | xargs kill 2>/dev/null ip netns del B 2>/dev/null ip link del veth_a 2>/dev/null timeout= @@ -954,7 +951,7 @@ cleanup() { killall iperf 2>/dev/null killall netperf 2>/dev/null killall netserver 2>/dev/null - rm -f ${tmp} + rm -f "$tmp" } # Entry point for setup functions @@ -1237,7 +1234,7 @@ test_correctness() { srcend=$((end + src_delta)) add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1 @@ -1245,7 +1242,7 @@ test_correctness() { # Delete elements now and then if [ $((i % 3)) -eq 0 ]; then del "$(format)" || return 1 - for j in $(seq ${start} \ + for j in $(seq "$start" \ $((range_size / 2 + 1)) ${end}); do send_nomatch "${j}" $((j + src_delta)) \ || return 1 @@ -1276,7 +1273,7 @@ test_concurrency() { range_size=1 cstart=${start} flood_pids= - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1299,7 +1296,7 @@ test_concurrency() { # $start needs to be local to this subshell # shellcheck disable=SC2030 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1314,7 +1311,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1330,7 +1327,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1343,7 +1340,7 @@ test_concurrency() { range_size=1 start=${cstart} - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1375,14 +1372,14 @@ test_timeout() { timeout=3 range_size=1 - for i in $(seq "${start}" $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) add "$(format)" || return 1 - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done @@ -1390,12 +1387,12 @@ test_timeout() { start=$((end + range_size)) done sleep 3 - for i in $(seq ${start} $((start + count))); do + for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_nomatch "${j}" $((j + src_delta)) || return 1 done @@ -1420,7 +1417,7 @@ test_performance() { range_size=1 for set in test norange noconcat; do start=${first} - for i in $(seq ${start} $((start + perf_entries))); do + for i in $(seq "$start" $((start + perf_entries))); do end=$((start + range_size)) srcstart=$((start + src_delta)) srcend=$((end + src_delta)) @@ -1428,7 +1425,7 @@ test_performance() { if [ $((end / 65534)) -gt $((start / 65534)) ]; then start=${end} end=$((end + 1)) - elif [ ${start} -eq ${end} ]; then + elif [ "$start" -eq "$end" ]; then end=$((start + 1)) fi @@ -1439,7 +1436,7 @@ test_performance() { nft -f "${tmp}" done - perf $((end - 1)) ${srcstart} + perf $((end - 1)) "$srcstart" sleep 2 @@ -1486,11 +1483,11 @@ test_bug_flush_remove_add() { set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' - for i in `seq 1 100`; do - nft add table t ${set_cmd} || return ${ksft_skip} - nft add element t s ${elem1} 2>/dev/null || return 1 + for i in $(seq 1 100); do + nft add table t "$set_cmd" || return ${ksft_skip} + nft add element t s "$elem1" 2>/dev/null || return 1 nft flush set t s 2>/dev/null || return 1 - nft add element t s ${elem2} 2>/dev/null || return 1 + nft add element t s "$elem2" 2>/dev/null || return 1 done nft flush ruleset } @@ -1537,7 +1534,7 @@ test_bug_reload() { srcstart=$((start + src_delta)) srcend=$((end + src_delta)) - for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do + for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do send_match "${j}" $((j + src_delta)) || return 1 done @@ -1560,7 +1557,7 @@ trap cleanup EXIT # Entry point for test runs passed=0 for name in ${TESTS}; do - printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')" + printf "TEST: %s\n" "$(echo "$name" | tr '_' ' ')" if [ "${name}" = "reported_issues" ]; then SUBTESTS="${BUGS}" else -- cgit v1.2.3 From f84ab634904cc491239a5165d5d7ded0acde02d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Apr 2024 15:05:47 +0200 Subject: selftests: netfilter: nft_flowtable.sh: re-run with random mtu sizes Now that the test runs much faster, also re-run it with random MTU sizes for the different link legs. flowtable should pass ip fragments, if any, up to the normal forwarding path. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240423130604.7013-5-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/netfilter/nft_flowtable.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index d765c65c31f3..8b5a3a7e22f0 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -100,6 +100,14 @@ if ! ip -net $nsr2 link set veth1 mtu $rmtu; then exit 1 fi +if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then + exit 1 +fi + +if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then + exit 1 +fi + ip -net $ns2 link set eth0 mtu $rmtu # transfer-net between nsr1 and nsr2. @@ -633,4 +641,15 @@ else ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 fi +if [ x"$1" = x ]; then + low=1280 + mtu=$((65536 - low)) + o=$(((RANDOM%mtu) + low)) + l=$(((RANDOM%mtu) + low)) + r=$(((RANDOM%mtu) + low)) + + echo "re-run with random mtus: -o $o -l $l -r $r" + $0 -o "$o" -l "$l" -r "$r" +fi + exit $ret -- cgit v1.2.3 From a18f284574ada9b5c77173683b13625df7fb33f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Apr 2024 15:05:48 +0200 Subject: selftests: netfilter: nft_flowtable.sh: shellcheck cleanups no functional changes intended except that test will now SKIP in case kernel lacks bridge support and initial rule load failure provides nft version information. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240423130604.7013-6-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/nft_flowtable.sh | 257 +++++++++++---------- 1 file changed, 136 insertions(+), 121 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index 8b5a3a7e22f0..86d516e8acd6 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -37,17 +37,17 @@ cleanup() { rm -f "$nsin" "$ns1out" "$ns2out" - [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns + [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns" } trap cleanup EXIT sysctl -q net.netfilter.nf_log_all_netns=1 -ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1 -ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 +ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1" +ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2" -ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 +ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2" for dev in veth0 veth1; do ip -net "$nsr1" link set "$dev" up @@ -90,13 +90,13 @@ do esac done -if ! ip -net $nsr1 link set veth0 mtu $omtu; then +if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then exit 1 fi -ip -net $ns1 link set eth0 mtu $omtu +ip -net "$ns1" link set eth0 mtu "$omtu" -if ! ip -net $nsr2 link set veth1 mtu $rmtu; then +if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then exit 1 fi @@ -108,7 +108,7 @@ if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then exit 1 fi -ip -net $ns2 link set eth0 mtu $rmtu +ip -net "$ns2" link set eth0 mtu "$rmtu" # transfer-net between nsr1 and nsr2. # these addresses are not used for connections. @@ -119,35 +119,34 @@ ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad for i in 0 1; do - ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null - ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null done -for ns in $ns1 $ns2;do - ip -net $ns link set lo up - ip -net $ns link set eth0 up +for ns in "$ns1" "$ns2";do + ip -net "$ns" link set eth0 up - if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then + if ! ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then echo "ERROR: Check Originator/Responder values (problem during address addition)" exit 1 fi # don't set ip DF bit for first two tests - ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null + ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null done -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns2 addr add 10.0.2.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 nodad -ip -net $ns2 addr add dead:2::99/64 dev eth0 nodad -ip -net $ns1 route add default via dead:1::1 -ip -net $ns2 route add default via dead:2::1 +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$ns2" route add default via dead:2::1 -ip -net $nsr1 route add default via 192.168.10.2 -ip -net $nsr2 route add default via 192.168.10.1 +ip -net "$nsr1" route add default via 192.168.10.2 +ip -net "$nsr2" route add default via 192.168.10.1 -ip netns exec $nsr1 nft -f - < /dev/null; then +if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then echo "ERROR: $ns1 cannot reach ns2" 1>&2 exit 1 fi -if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then +if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then echo "ERROR: $ns2 cannot reach $ns1" 1>&2 exit 1 fi @@ -235,23 +235,27 @@ check_counters() local what=$1 local ok=1 - local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets) - local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets) + local orig repl + orig=$(ip netns exec "$nsr1" nft reset counter inet filter routed_orig | grep packets) + repl=$(ip netns exec "$nsr1" nft reset counter inet filter routed_repl | grep packets) local orig_cnt=${orig#*bytes} local repl_cnt=${repl#*bytes} - local fs=$(du -sb $nsin) + local fs + fs=$(du -sb "$nsin") local max_orig=${fs%%/*} local max_repl=$((max_orig/4)) - if [ $orig_cnt -gt $max_orig ];then + # flowtable fastpath should bypass normal routing one, i.e. the counters in forward hook + # should always be lower than the size of the transmitted file (max_orig). + if [ "$orig_cnt" -gt "$max_orig" ];then echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 ret=1 ok=0 fi - if [ $repl_cnt -gt $max_repl ];then + if [ "$repl_cnt" -gt $max_repl ];then echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 ret=1 ok=0 @@ -267,39 +271,40 @@ check_dscp() local what=$1 local ok=1 - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets) + local counter + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp3 | grep packets) local pc4=${counter%*bytes*} local pc4=${pc4#*packets} - local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets) + counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp0 | grep packets) local pc4z=${counter%*bytes*} local pc4z=${pc4z#*packets} case "$what" in "dscp_none") - if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then + if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_fwd") - if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_ingress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_egress") - if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then + if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 @@ -311,7 +316,7 @@ check_dscp() ok=0 esac - if [ $ok -eq 1 ] ;then + if [ "$ok" -eq 1 ] ;then echo "PASS: $what: dscp packet counters match" fi } @@ -356,10 +361,12 @@ test_tcp_forwarding_ip() if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then lret=1 + ret=1 fi if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then lret=1 + ret=1 fi return $lret @@ -376,7 +383,7 @@ test_tcp_forwarding_set_dscp() { check_dscp "dscp_none" -ip netns exec $nsr1 nft -f - <&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # delete default route, i.e. ns2 won't be able to reach ns1 and # will depend on ns1 being masqueraded in nsr1. # expect ns1 has nsr1 address. -ip -net $ns2 route del default via 10.0.2.1 -ip -net $ns2 route del default via dead:2::1 -ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 +ip -net "$ns2" route del default via 10.0.2.1 +ip -net "$ns2" route del default via dead:2::1 +ip -net "$ns2" route add 192.168.10.1 via 10.0.2.1 # Second test: # Same, but with NAT enabled. Same as in first test: we expect normal forward path # to handle most packets. -ip netns exec $nsr1 nft -f - <&2 exit 0 fi -if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 0 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi @@ -501,35 +508,40 @@ fi # Same as second test, but with PMTU discovery enabled. This # means that we expect the fastpath to handle packets as soon # as the endpoints adjust the packet size. -ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec "$ns1" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null # reset counters. # With pmtu in-place we'll also check that nft counters # are lower than file size and packets were forwarded via flowtable layer. # For earlier tests (large mtus), packets cannot be handled via flowtable # (except pure acks and other small packets). -ip netns exec $nsr1 nft reset counters table inet filter >/dev/null +ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null -if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset fi # Another test: # Add bridge interface br0 to Router1, with NAT enabled. -ip -net $nsr1 link add name br0 type bridge -ip -net $nsr1 addr flush dev veth0 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set veth0 master br0 -ip -net $nsr1 addr add 10.0.1.1/24 dev br0 -ip -net $nsr1 addr add dead:1::1/64 dev br0 nodad -ip -net $nsr1 link set up dev br0 +test_bridge() { +if ! ip -net "$nsr1" link add name br0 type bridge 2>/dev/null;then + echo "SKIP: could not add bridge br0" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" addr flush dev veth0 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set veth0 master br0 +ip -net "$nsr1" addr add 10.0.1.1/24 dev br0 +ip -net "$nsr1" addr add dead:1::1/64 dev br0 nodad +ip -net "$nsr1" link set up dev br0 -ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null +ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null # br0 with NAT enabled. -ip netns exec $nsr1 nft -f - <&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # Another test: # Add bridge interface br0 to Router1, with NAT and VLAN. -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set down dev veth0 -ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10 -ip -net $nsr1 link set up dev veth0 -ip -net $nsr1 link set up dev veth0.10 -ip -net $nsr1 link set veth0.10 master br0 - -ip -net $ns1 addr flush dev eth0 -ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10 -ip -net $ns1 link set eth0 up -ip -net $ns1 link set eth0.10 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0.10 nodad - -if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then +ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set down dev veth0 +ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10 +ip -net "$nsr1" link set up dev veth0 +ip -net "$nsr1" link set up dev veth0.10 +ip -net "$nsr1" link set veth0.10 master br0 + +ip -net "$ns1" addr flush dev eth0 +ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" link set eth0.10 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0.10 nodad + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 - ip netns exec $nsr1 nft list ruleset + ip netns exec "$nsr1" nft list ruleset ret=1 fi # restore test topology (remove bridge and VLAN) -ip -net $nsr1 link set veth0 nomaster -ip -net $nsr1 link set veth0 down -ip -net $nsr1 link set veth0.10 down -ip -net $nsr1 link delete veth0.10 type vlan -ip -net $nsr1 link delete br0 type bridge -ip -net $ns1 addr flush dev eth0.10 -ip -net $ns1 link set eth0.10 down -ip -net $ns1 link set eth0 down -ip -net $ns1 link delete eth0.10 type vlan +ip -net "$nsr1" link set veth0 nomaster +ip -net "$nsr1" link set veth0 down +ip -net "$nsr1" link set veth0.10 down +ip -net "$nsr1" link delete veth0.10 type vlan +ip -net "$nsr1" link delete br0 type bridge +ip -net "$ns1" addr flush dev eth0.10 +ip -net "$ns1" link set eth0.10 down +ip -net "$ns1" link set eth0 down +ip -net "$ns1" link delete eth0.10 type vlan # restore address in ns1 and nsr1 -ip -net $ns1 link set eth0 up -ip -net $ns1 addr add 10.0.1.99/24 dev eth0 -ip -net $ns1 route add default via 10.0.1.1 -ip -net $ns1 addr add dead:1::99/64 dev eth0 nodad -ip -net $ns1 route add default via dead:1::1 -ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net $nsr1 addr add dead:1::1/64 dev veth0 nodad -ip -net $nsr1 link set up dev veth0 +ip -net "$ns1" link set eth0 up +ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 +ip -net "$ns1" route add default via 10.0.1.1 +ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" route add default via dead:1::1 +ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0 +ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad +ip -net "$nsr1" link set up dev veth0 +} + +test_bridge KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1) @@ -613,25 +628,25 @@ do_esp() { local spi_out=$6 local spi_in=$7 - ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet - ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet + ip -net "$ns" xfrm state add src "$remote" dst "$me" proto esp spi "$spi_in" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$rnet" dst "$lnet" + ip -net "$ns" xfrm state add src "$me" dst "$remote" proto esp spi "$spi_out" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$lnet" dst "$rnet" # to encrypt packets as they go out (includes forwarded packets that need encapsulation) - ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow + ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow # to fwd decrypted packets after esp processing: - ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow + ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow } -do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 +do_esp "$nsr1" 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2" -do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 +do_esp "$nsr2" 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1" -ip netns exec $nsr1 nft delete table ip nat +ip netns exec "$nsr1" nft delete table ip nat # restore default routes -ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 -ip -net $ns2 route add default via 10.0.2.1 -ip -net $ns2 route add default via dead:2::1 +ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 +ip -net "$ns2" route add default via 10.0.2.1 +ip -net "$ns2" route add default via dead:2::1 if test_tcp_forwarding "$ns1" "$ns2"; then check_counters "ipsec tunnel mode for ns1/ns2" @@ -641,7 +656,7 @@ else ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 fi -if [ x"$1" = x ]; then +if [ "$1" = "" ]; then low=1280 mtu=$((65536 - low)) o=$(((RANDOM%mtu) + low)) -- cgit v1.2.3 From bb0ee78f9418374e087485888af374cc932f63c8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Apr 2024 15:05:49 +0200 Subject: selftests: netfilter: skip tests on early errors br_netfilter: If we can't add the needed initial nftables ruleset skip the test, kernel doesn't support a required feature. rpath: run a subset of the tests if possible, but make sure we return the skip return value so they are marked appropriately by the kselftest framework. nft_audit.sh: provide version information when skipping, this should help catching kernel problem (feature not available in kernel) vs. userspace issue (parser doesn't support keyword). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240423130604.7013-7-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/br_netfilter.sh | 4 ++++ tools/testing/selftests/net/netfilter/nft_audit.sh | 3 ++- tools/testing/selftests/net/netfilter/rpath.sh | 10 ++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh index 1084faf88f0b..d7806753f5de 100755 --- a/tools/testing/selftests/net/netfilter/br_netfilter.sh +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -124,6 +124,10 @@ table bridge filter { } } EOF +if [ "$?" -ne 0 ];then + echo "SKIP: could not add nftables ruleset" + exit $ksft_skip +fi # place 1, 2 & 3 in same subnet, connected via ns0:br0. # ns4 is placed in same subnet as well, but its not diff --git a/tools/testing/selftests/net/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh index b390437696ba..902f8114bc80 100755 --- a/tools/testing/selftests/net/netfilter/nft_audit.sh +++ b/tools/testing/selftests/net/netfilter/nft_audit.sh @@ -29,7 +29,8 @@ reset rules t c EOF if [ "$?" -ne 0 ];then - echo "SKIP: nft reset feature test failed" + echo -n "SKIP: nft reset feature test failed: " + nft --version exit $SKIP_RC fi diff --git a/tools/testing/selftests/net/netfilter/rpath.sh b/tools/testing/selftests/net/netfilter/rpath.sh index 5289c8447a41..4485fd7675ed 100755 --- a/tools/testing/selftests/net/netfilter/rpath.sh +++ b/tools/testing/selftests/net/netfilter/rpath.sh @@ -64,12 +64,18 @@ ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad # firewall matches to test [ -n "$iptables" ] && { common='-t raw -A PREROUTING -s 192.168.0.0/16' - ip netns exec "$ns2" "$iptables" $common -m rpfilter + if ! ip netns exec "$ns2" "$iptables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert } [ -n "$ip6tables" ] && { common='-t raw -A PREROUTING -s fec0::/16' - ip netns exec "$ns2" "$ip6tables" $common -m rpfilter + if ! ip netns exec "$ns2" "$ip6tables" $common -m rpfilter;then + echo "Cannot add rpfilter rule" + exit $ksft_skip + fi ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert } [ -n "$nft" ] && ip netns exec "$ns2" $nft -f - < Date: Tue, 23 Apr 2024 15:05:50 +0200 Subject: selftests: netfilter: conntrack_vrf.sh: prefer socat, not iperf3 Use socat, like most of the other scripts already do. This also makes the script complete slightly faster (3s -> 1s). iperf3 establishes two connections (1 control connection, and 1+x depending on test), so adjust expected counter values as well. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240423130604.7013-8-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/conntrack_vrf.sh | 40 ++++++++++++---------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh index f7417004ec71..073e8e62d350 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh @@ -43,15 +43,9 @@ cleanup() cleanup_all_ns } -if ! nft --version > /dev/null 2>&1;then - echo "SKIP: Could not run test without nft tool" - exit $ksft_skip -fi - -if ! conntrack --version > /dev/null 2>&1;then - echo "SKIP: Could not run test without conntrack tool" - exit $ksft_skip -fi +checktool "nft --version" "run test without nft" +checktool "conntrack --version" "run test without conntrack" +checktool "socat -h" "run test without socat" trap cleanup EXIT @@ -79,7 +73,15 @@ ip -net "$ns1" li set veth0 up ip -net "$ns0" addr add $IP0/$PFXL dev veth0 ip -net "$ns1" addr add $IP1/$PFXL dev veth0 -ip netns exec "$ns1" iperf3 -s > /dev/null 2>&1 & +listener_ready() +{ + local ns="$1" + + ss -N "$ns" -l -n -t -o "sport = :55555" | grep -q "55555" +} + +ip netns exec "$ns1" socat -u -4 TCP-LISTEN:55555,reuseaddr,fork STDOUT > /dev/null & +busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" # test vrf ingress handling. # The incoming connection should be placed in conntrack zone 1, @@ -160,16 +162,16 @@ table ip nat { } } EOF - if ! ip netns exec "$ns0" ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device" + if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on vrf device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 2' && - if ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1' && + ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then + echo "PASS: connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" else echo "FAIL: vrf rules have unexpected counter value" ret=1 @@ -195,15 +197,15 @@ table ip nat { } } EOF - if ! ip netns exec "$ns0" ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null; then - echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device" + if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then + echo "FAIL: connect failure with masquerade + sport rewrite on veth device" ret=1 return fi # must also check that nat table was evaluated on second (lower device) iteration. - if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 2'; then - echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device" + if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1'; then + echo "PASS: connect with masquerade + sport rewrite on veth device" else echo "FAIL: vrf masq rule has unexpected counter value" ret=1 -- cgit v1.2.3 From 8e2b318a65c30626d49d3bf1940037afb386e596 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 22 Apr 2024 12:25:42 +0200 Subject: selftests: netfilter: nft_zones_many.sh: set ct sysctl after ruleset load nf_conntrack_udp_timeout sysctl only exist once conntrack module is loaded, if this test runs standalone on a modular kernel sysctl setting fails, this can result in test failure as udp conntrack entries expire too fast. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240422102546.2494-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_zones_many.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh index db53de348783..4ad75038f6ff 100755 --- a/tools/testing/selftests/net/netfilter/nft_zones_many.sh +++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh @@ -28,7 +28,6 @@ fi test_zones() { local max_zones=$1 -ip netns exec "$ns1" sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600 ip netns exec "$ns1" nft -f /dev/stdin< Date: Mon, 22 Apr 2024 12:33:53 +0200 Subject: selftests: netfilter: fix conntrack_dump_flush retval on unsupported kernel With CONFIG_NETFILTER=n test passes instead of skip. Before: ./run_kselftest.sh -t net/netfilter:conntrack_dump_flush [..] # Starting 3 tests from 1 test cases. # RUN conntrack_dump_flush.test_dump_by_zone ... mnl_socket_open: Protocol not supported [..] ok 3 conntrack_dump_flush.test_flush_by_zone_default # PASSED: 3 / 3 tests passed. # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 After: mnl_socket_open: Protocol not supported [..] ok 3 conntrack_dump_flush.test_flush_by_zone_default # SKIP cannot open netlink_netfilter socket # PASSED: 3 / 3 tests passed. # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:3 error:0 Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240422103358.3511-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/conntrack_dump_flush.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c index ca8d6b976c42..bd9317bf5ada 100644 --- a/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c +++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c @@ -313,13 +313,11 @@ FIXTURE_SETUP(conntrack_dump_flush) self->sock = mnl_socket_open(NETLINK_NETFILTER); if (!self->sock) { perror("mnl_socket_open"); - exit(EXIT_FAILURE); + SKIP(return, "cannot open netlink_netfilter socket"); } - if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) { - perror("mnl_socket_bind"); - exit(EXIT_FAILURE); - } + ret = mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID); + EXPECT_EQ(ret, 0); ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); if (ret < 0 && errno == EPERM) -- cgit v1.2.3 From 6b88ce902f0bdcb3694a6ceddc8b3d0b40db3772 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 23 Apr 2024 11:35:41 -0700 Subject: selftests: net: name bpf objects consistently and simplify Makefile The BPF sources moved with bpf_offload.py have a suffix of .bpf.c which seems to be useful convention. Rename the 2 other BPF sources we had. Use wildcard in the Makefile, since we can match all those files easily now. Link: https://lore.kernel.org/r/20240423183542.3807234-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 9 +- tools/testing/selftests/net/nat6to4.bpf.c | 285 ++++++++++++++++++++++++++ tools/testing/selftests/net/nat6to4.c | 285 -------------------------- tools/testing/selftests/net/udpgro.sh | 2 +- tools/testing/selftests/net/udpgro_bench.sh | 2 +- tools/testing/selftests/net/udpgro_frglist.sh | 8 +- tools/testing/selftests/net/udpgro_fwd.sh | 2 +- tools/testing/selftests/net/veth.sh | 2 +- tools/testing/selftests/net/xdp_dummy.bpf.c | 13 ++ tools/testing/selftests/net/xdp_dummy.c | 13 -- 10 files changed, 309 insertions(+), 312 deletions(-) create mode 100644 tools/testing/selftests/net/nat6to4.bpf.c delete mode 100644 tools/testing/selftests/net/nat6to4.c create mode 100644 tools/testing/selftests/net/xdp_dummy.bpf.c delete mode 100644 tools/testing/selftests/net/xdp_dummy.c (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7e7f243d0ab2..c388308ff517 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -82,10 +82,6 @@ TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello TEST_GEN_FILES += csum -TEST_GEN_FILES += nat6to4.o -TEST_GEN_FILES += xdp_dummy.o -TEST_GEN_FILES += sample_ret0.bpf.o -TEST_GEN_FILES += sample_map_ret0.bpf.o TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh @@ -100,6 +96,8 @@ TEST_PROGS += bpf_offload.py TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh +TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) + TEST_INCLUDES := forwarding/lib.sh include ../lib.mk @@ -146,8 +144,7 @@ endif CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) -BPF_PROG_OBJS := $(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o \ - $(OUTPUT)/sample_map_ret0.bpf.o $(OUTPUT)/sample_ret0.bpf.o +BPF_PROG_OBJS := $(patsubst %.c,$(OUTPUT)/%.o,$(wildcard *.bpf.c)) $(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS) $(call msg,BPF_PROG,,$@) diff --git a/tools/testing/selftests/net/nat6to4.bpf.c b/tools/testing/selftests/net/nat6to4.bpf.c new file mode 100644 index 000000000000..ac54c36b25fc --- /dev/null +++ b/tools/testing/selftests/net/nat6to4.bpf.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This code is taken from the Android Open Source Project and the author + * (Maciej Żenczykowski) has gave permission to relicense it under the + * GPLv2. Therefore this program is free software; + * You can redistribute it and/or modify it under the terms of the GNU + * General Public License version 2 as published by the Free Software + * Foundation + + * The original headers, including the original license headers, are + * included below for completeness. + * + * Copyright (C) 2019 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +#include +#include + +#define IP_DF 0x4000 // Flag: "Don't Fragment" + +SEC("schedcls/ingress6/nat_6") +int sched_cls_ingress6_nat_6_prog(struct __sk_buff *skb) +{ + const int l2_header_size = sizeof(struct ethhdr); + void *data = (void *)(long)skb->data; + const void *data_end = (void *)(long)skb->data_end; + const struct ethhdr * const eth = data; // used iff is_ethernet + const struct ipv6hdr * const ip6 = (void *)(eth + 1); + + // Require ethernet dst mac address to be our unicast address. + if (skb->pkt_type != PACKET_HOST) + return TC_ACT_OK; + + // Must be meta-ethernet IPv6 frame + if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + // Must have (ethernet and) ipv6 header + if (data + l2_header_size + sizeof(*ip6) > data_end) + return TC_ACT_OK; + + // Ethertype - if present - must be IPv6 + if (eth->h_proto != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + // IP version must be 6 + if (ip6->version != 6) + return TC_ACT_OK; + // Maximum IPv6 payload length that can be translated to IPv4 + if (bpf_ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr)) + return TC_ACT_OK; + switch (ip6->nexthdr) { + case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 + case IPPROTO_UDP: // address means there is no need to update their checksums. + case IPPROTO_GRE: // We do not need to bother looking at GRE/ESP headers, + case IPPROTO_ESP: // since there is never a checksum to update. + break; + default: // do not know how to handle anything else + return TC_ACT_OK; + } + + struct ethhdr eth2; // used iff is_ethernet + + eth2 = *eth; // Copy over the ethernet header (src/dst mac) + eth2.h_proto = bpf_htons(ETH_P_IP); // But replace the ethertype + + struct iphdr ip = { + .version = 4, // u4 + .ihl = sizeof(struct iphdr) / sizeof(__u32), // u4 + .tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8 + .tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + sizeof(struct iphdr)), // u16 + .id = 0, // u16 + .frag_off = bpf_htons(IP_DF), // u16 + .ttl = ip6->hop_limit, // u8 + .protocol = ip6->nexthdr, // u8 + .check = 0, // u16 + .saddr = 0x0201a8c0, // u32 + .daddr = 0x0101a8c0, // u32 + }; + + // Calculate the IPv4 one's complement checksum of the IPv4 header. + __wsum sum4 = 0; + + for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i) + sum4 += ((__u16 *)&ip)[i]; + + // Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4 + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 0x1FFFE + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 + ip.check = (__u16)~sum4; // sum4 cannot be zero, so this is never 0xFFFF + + // Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header. + __wsum sum6 = 0; + // We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits) + for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i) + sum6 += ~((__u16 *)ip6)[i]; // note the bitwise negation + + // Note that there is no L4 checksum update: we are relying on the checksum neutrality + // of the ipv6 address chosen by netd's ClatdController. + + // Packet mutations begin - point of no return, but if this first modification fails + // the packet is probably still pristine, so let clatd handle it. + if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0)) + return TC_ACT_OK; + bpf_csum_update(skb, sum6); + + data = (void *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + if (data + l2_header_size + sizeof(struct iphdr) > data_end) + return TC_ACT_SHOT; + + struct ethhdr *new_eth = data; + + // Copy over the updated ethernet header + *new_eth = eth2; + + // Copy over the new ipv4 header. + *(struct iphdr *)(new_eth + 1) = ip; + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); +} + +SEC("schedcls/egress4/snat4") +int sched_cls_egress4_snat4_prog(struct __sk_buff *skb) +{ + const int l2_header_size = sizeof(struct ethhdr); + void *data = (void *)(long)skb->data; + const void *data_end = (void *)(long)skb->data_end; + const struct ethhdr *const eth = data; // used iff is_ethernet + const struct iphdr *const ip4 = (void *)(eth + 1); + + // Must be meta-ethernet IPv4 frame + if (skb->protocol != bpf_htons(ETH_P_IP)) + return TC_ACT_OK; + + // Must have ipv4 header + if (data + l2_header_size + sizeof(struct ipv6hdr) > data_end) + return TC_ACT_OK; + + // Ethertype - if present - must be IPv4 + if (eth->h_proto != bpf_htons(ETH_P_IP)) + return TC_ACT_OK; + + // IP version must be 4 + if (ip4->version != 4) + return TC_ACT_OK; + + // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header + if (ip4->ihl != 5) + return TC_ACT_OK; + + // Maximum IPv6 payload length that can be translated to IPv4 + if (bpf_htons(ip4->tot_len) > 0xFFFF - sizeof(struct ipv6hdr)) + return TC_ACT_OK; + + // Calculate the IPv4 one's complement checksum of the IPv4 header. + __wsum sum4 = 0; + + for (int i = 0; i < sizeof(*ip4) / sizeof(__u16); ++i) + sum4 += ((__u16 *)ip4)[i]; + + // Note that sum4 is guaranteed to be non-zero by virtue of ip4->version == 4 + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 0x1FFFE + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 + // for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF + if (sum4 != 0xFFFF) + return TC_ACT_OK; + + // Minimum IPv4 total length is the size of the header + if (bpf_ntohs(ip4->tot_len) < sizeof(*ip4)) + return TC_ACT_OK; + + // We are incapable of dealing with IPv4 fragments + if (ip4->frag_off & ~bpf_htons(IP_DF)) + return TC_ACT_OK; + + switch (ip4->protocol) { + case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 + case IPPROTO_GRE: // address means there is no need to update their checksums. + case IPPROTO_ESP: // We do not need to bother looking at GRE/ESP headers, + break; // since there is never a checksum to update. + + case IPPROTO_UDP: // See above comment, but must also have UDP header... + if (data + sizeof(*ip4) + sizeof(struct udphdr) > data_end) + return TC_ACT_OK; + const struct udphdr *uh = (const struct udphdr *)(ip4 + 1); + // If IPv4/UDP checksum is 0 then fallback to clatd so it can calculate the + // checksum. Otherwise the network or more likely the NAT64 gateway might + // drop the packet because in most cases IPv6/UDP packets with a zero checksum + // are invalid. See RFC 6935. TODO: calculate checksum via bpf_csum_diff() + if (!uh->check) + return TC_ACT_OK; + break; + + default: // do not know how to handle anything else + return TC_ACT_OK; + } + struct ethhdr eth2; // used iff is_ethernet + + eth2 = *eth; // Copy over the ethernet header (src/dst mac) + eth2.h_proto = bpf_htons(ETH_P_IPV6); // But replace the ethertype + + struct ipv6hdr ip6 = { + .version = 6, // __u8:4 + .priority = ip4->tos >> 4, // __u8:4 + .flow_lbl = {(ip4->tos & 0xF) << 4, 0, 0}, // __u8[3] + .payload_len = bpf_htons(bpf_ntohs(ip4->tot_len) - 20), // __be16 + .nexthdr = ip4->protocol, // __u8 + .hop_limit = ip4->ttl, // __u8 + }; + ip6.saddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); + ip6.saddr.in6_u.u6_addr32[1] = 0; + ip6.saddr.in6_u.u6_addr32[2] = 0; + ip6.saddr.in6_u.u6_addr32[3] = bpf_htonl(1); + ip6.daddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); + ip6.daddr.in6_u.u6_addr32[1] = 0; + ip6.daddr.in6_u.u6_addr32[2] = 0; + ip6.daddr.in6_u.u6_addr32[3] = bpf_htonl(2); + + // Calculate the IPv6 16-bit one's complement checksum of the IPv6 header. + __wsum sum6 = 0; + // We'll end up with a non-zero sum due to ip6.version == 6 + for (int i = 0; i < sizeof(ip6) / sizeof(__u16); ++i) + sum6 += ((__u16 *)&ip6)[i]; + + // Packet mutations begin - point of no return, but if this first modification fails + // the packet is probably still pristine, so let clatd handle it. + if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0)) + return TC_ACT_OK; + + // This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet. + // In such a case, skb->csum is a 16-bit one's complement sum of the entire payload, + // thus we need to subtract out the ipv4 header's sum, and add in the ipv6 header's sum. + // However, we've already verified the ipv4 checksum is correct and thus 0. + // Thus we only need to add the ipv6 header's sum. + // + // bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error + // (-ENOTSUPP) if it isn't. So we just ignore the return code (see above for more details). + bpf_csum_update(skb, sum6); + + // bpf_skb_change_proto() invalidates all pointers - reload them. + data = (void *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + + // I cannot think of any valid way for this error condition to trigger, however I do + // believe the explicit check is required to keep the in kernel ebpf verifier happy. + if (data + l2_header_size + sizeof(ip6) > data_end) + return TC_ACT_SHOT; + + struct ethhdr *new_eth = data; + + // Copy over the updated ethernet header + *new_eth = eth2; + // Copy over the new ipv4 header. + *(struct ipv6hdr *)(new_eth + 1) = ip6; + return TC_ACT_OK; +} + +char _license[] SEC("license") = ("GPL"); diff --git a/tools/testing/selftests/net/nat6to4.c b/tools/testing/selftests/net/nat6to4.c deleted file mode 100644 index ac54c36b25fc..000000000000 --- a/tools/testing/selftests/net/nat6to4.c +++ /dev/null @@ -1,285 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * This code is taken from the Android Open Source Project and the author - * (Maciej Żenczykowski) has gave permission to relicense it under the - * GPLv2. Therefore this program is free software; - * You can redistribute it and/or modify it under the terms of the GNU - * General Public License version 2 as published by the Free Software - * Foundation - - * The original headers, including the original license headers, are - * included below for completeness. - * - * Copyright (C) 2019 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include - -#include -#include - -#define IP_DF 0x4000 // Flag: "Don't Fragment" - -SEC("schedcls/ingress6/nat_6") -int sched_cls_ingress6_nat_6_prog(struct __sk_buff *skb) -{ - const int l2_header_size = sizeof(struct ethhdr); - void *data = (void *)(long)skb->data; - const void *data_end = (void *)(long)skb->data_end; - const struct ethhdr * const eth = data; // used iff is_ethernet - const struct ipv6hdr * const ip6 = (void *)(eth + 1); - - // Require ethernet dst mac address to be our unicast address. - if (skb->pkt_type != PACKET_HOST) - return TC_ACT_OK; - - // Must be meta-ethernet IPv6 frame - if (skb->protocol != bpf_htons(ETH_P_IPV6)) - return TC_ACT_OK; - - // Must have (ethernet and) ipv6 header - if (data + l2_header_size + sizeof(*ip6) > data_end) - return TC_ACT_OK; - - // Ethertype - if present - must be IPv6 - if (eth->h_proto != bpf_htons(ETH_P_IPV6)) - return TC_ACT_OK; - - // IP version must be 6 - if (ip6->version != 6) - return TC_ACT_OK; - // Maximum IPv6 payload length that can be translated to IPv4 - if (bpf_ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr)) - return TC_ACT_OK; - switch (ip6->nexthdr) { - case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 - case IPPROTO_UDP: // address means there is no need to update their checksums. - case IPPROTO_GRE: // We do not need to bother looking at GRE/ESP headers, - case IPPROTO_ESP: // since there is never a checksum to update. - break; - default: // do not know how to handle anything else - return TC_ACT_OK; - } - - struct ethhdr eth2; // used iff is_ethernet - - eth2 = *eth; // Copy over the ethernet header (src/dst mac) - eth2.h_proto = bpf_htons(ETH_P_IP); // But replace the ethertype - - struct iphdr ip = { - .version = 4, // u4 - .ihl = sizeof(struct iphdr) / sizeof(__u32), // u4 - .tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8 - .tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + sizeof(struct iphdr)), // u16 - .id = 0, // u16 - .frag_off = bpf_htons(IP_DF), // u16 - .ttl = ip6->hop_limit, // u8 - .protocol = ip6->nexthdr, // u8 - .check = 0, // u16 - .saddr = 0x0201a8c0, // u32 - .daddr = 0x0101a8c0, // u32 - }; - - // Calculate the IPv4 one's complement checksum of the IPv4 header. - __wsum sum4 = 0; - - for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i) - sum4 += ((__u16 *)&ip)[i]; - - // Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4 - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 0x1FFFE - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 - ip.check = (__u16)~sum4; // sum4 cannot be zero, so this is never 0xFFFF - - // Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header. - __wsum sum6 = 0; - // We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits) - for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i) - sum6 += ~((__u16 *)ip6)[i]; // note the bitwise negation - - // Note that there is no L4 checksum update: we are relying on the checksum neutrality - // of the ipv6 address chosen by netd's ClatdController. - - // Packet mutations begin - point of no return, but if this first modification fails - // the packet is probably still pristine, so let clatd handle it. - if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0)) - return TC_ACT_OK; - bpf_csum_update(skb, sum6); - - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - if (data + l2_header_size + sizeof(struct iphdr) > data_end) - return TC_ACT_SHOT; - - struct ethhdr *new_eth = data; - - // Copy over the updated ethernet header - *new_eth = eth2; - - // Copy over the new ipv4 header. - *(struct iphdr *)(new_eth + 1) = ip; - return bpf_redirect(skb->ifindex, BPF_F_INGRESS); -} - -SEC("schedcls/egress4/snat4") -int sched_cls_egress4_snat4_prog(struct __sk_buff *skb) -{ - const int l2_header_size = sizeof(struct ethhdr); - void *data = (void *)(long)skb->data; - const void *data_end = (void *)(long)skb->data_end; - const struct ethhdr *const eth = data; // used iff is_ethernet - const struct iphdr *const ip4 = (void *)(eth + 1); - - // Must be meta-ethernet IPv4 frame - if (skb->protocol != bpf_htons(ETH_P_IP)) - return TC_ACT_OK; - - // Must have ipv4 header - if (data + l2_header_size + sizeof(struct ipv6hdr) > data_end) - return TC_ACT_OK; - - // Ethertype - if present - must be IPv4 - if (eth->h_proto != bpf_htons(ETH_P_IP)) - return TC_ACT_OK; - - // IP version must be 4 - if (ip4->version != 4) - return TC_ACT_OK; - - // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header - if (ip4->ihl != 5) - return TC_ACT_OK; - - // Maximum IPv6 payload length that can be translated to IPv4 - if (bpf_htons(ip4->tot_len) > 0xFFFF - sizeof(struct ipv6hdr)) - return TC_ACT_OK; - - // Calculate the IPv4 one's complement checksum of the IPv4 header. - __wsum sum4 = 0; - - for (int i = 0; i < sizeof(*ip4) / sizeof(__u16); ++i) - sum4 += ((__u16 *)ip4)[i]; - - // Note that sum4 is guaranteed to be non-zero by virtue of ip4->version == 4 - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 0x1FFFE - sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 - // for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF - if (sum4 != 0xFFFF) - return TC_ACT_OK; - - // Minimum IPv4 total length is the size of the header - if (bpf_ntohs(ip4->tot_len) < sizeof(*ip4)) - return TC_ACT_OK; - - // We are incapable of dealing with IPv4 fragments - if (ip4->frag_off & ~bpf_htons(IP_DF)) - return TC_ACT_OK; - - switch (ip4->protocol) { - case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 - case IPPROTO_GRE: // address means there is no need to update their checksums. - case IPPROTO_ESP: // We do not need to bother looking at GRE/ESP headers, - break; // since there is never a checksum to update. - - case IPPROTO_UDP: // See above comment, but must also have UDP header... - if (data + sizeof(*ip4) + sizeof(struct udphdr) > data_end) - return TC_ACT_OK; - const struct udphdr *uh = (const struct udphdr *)(ip4 + 1); - // If IPv4/UDP checksum is 0 then fallback to clatd so it can calculate the - // checksum. Otherwise the network or more likely the NAT64 gateway might - // drop the packet because in most cases IPv6/UDP packets with a zero checksum - // are invalid. See RFC 6935. TODO: calculate checksum via bpf_csum_diff() - if (!uh->check) - return TC_ACT_OK; - break; - - default: // do not know how to handle anything else - return TC_ACT_OK; - } - struct ethhdr eth2; // used iff is_ethernet - - eth2 = *eth; // Copy over the ethernet header (src/dst mac) - eth2.h_proto = bpf_htons(ETH_P_IPV6); // But replace the ethertype - - struct ipv6hdr ip6 = { - .version = 6, // __u8:4 - .priority = ip4->tos >> 4, // __u8:4 - .flow_lbl = {(ip4->tos & 0xF) << 4, 0, 0}, // __u8[3] - .payload_len = bpf_htons(bpf_ntohs(ip4->tot_len) - 20), // __be16 - .nexthdr = ip4->protocol, // __u8 - .hop_limit = ip4->ttl, // __u8 - }; - ip6.saddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); - ip6.saddr.in6_u.u6_addr32[1] = 0; - ip6.saddr.in6_u.u6_addr32[2] = 0; - ip6.saddr.in6_u.u6_addr32[3] = bpf_htonl(1); - ip6.daddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); - ip6.daddr.in6_u.u6_addr32[1] = 0; - ip6.daddr.in6_u.u6_addr32[2] = 0; - ip6.daddr.in6_u.u6_addr32[3] = bpf_htonl(2); - - // Calculate the IPv6 16-bit one's complement checksum of the IPv6 header. - __wsum sum6 = 0; - // We'll end up with a non-zero sum due to ip6.version == 6 - for (int i = 0; i < sizeof(ip6) / sizeof(__u16); ++i) - sum6 += ((__u16 *)&ip6)[i]; - - // Packet mutations begin - point of no return, but if this first modification fails - // the packet is probably still pristine, so let clatd handle it. - if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0)) - return TC_ACT_OK; - - // This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet. - // In such a case, skb->csum is a 16-bit one's complement sum of the entire payload, - // thus we need to subtract out the ipv4 header's sum, and add in the ipv6 header's sum. - // However, we've already verified the ipv4 checksum is correct and thus 0. - // Thus we only need to add the ipv6 header's sum. - // - // bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error - // (-ENOTSUPP) if it isn't. So we just ignore the return code (see above for more details). - bpf_csum_update(skb, sum6); - - // bpf_skb_change_proto() invalidates all pointers - reload them. - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - - // I cannot think of any valid way for this error condition to trigger, however I do - // believe the explicit check is required to keep the in kernel ebpf verifier happy. - if (data + l2_header_size + sizeof(ip6) > data_end) - return TC_ACT_SHOT; - - struct ethhdr *new_eth = data; - - // Copy over the updated ethernet header - *new_eth = eth2; - // Copy over the new ipv4 header. - *(struct ipv6hdr *)(new_eth + 1) = ip6; - return TC_ACT_OK; -} - -char _license[] SEC("license") = ("GPL"); diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 8802604148dd..11a1ebda564f 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" # set global exit status, but never reset nonzero one. check_err() diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh index 7080eae5312b..c51ea90a1395 100755 --- a/tools/testing/selftests/net/udpgro_bench.sh +++ b/tools/testing/selftests/net/udpgro_bench.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh index e1ff645bd3d1..17404f49cdb6 100755 --- a/tools/testing/selftests/net/udpgro_frglist.sh +++ b/tools/testing/selftests/net/udpgro_frglist.sh @@ -7,7 +7,7 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" cleanup() { local -r jobs="$(jobs -p)" @@ -42,8 +42,8 @@ run_one() { ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp tc -n "${PEER_NS}" qdisc add dev veth1 clsact - tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.o section schedcls/ingress6/nat_6 direct-action - tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.o section schedcls/egress4/snat4 direct-action + tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.bpf.o section schedcls/ingress6/nat_6 direct-action + tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action echo ${rx_args} ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r & @@ -89,7 +89,7 @@ if [ ! -f ${BPF_FILE} ]; then exit -1 fi -if [ ! -f nat6to4.o ]; then +if [ ! -f nat6to4.bpf.o ]; then echo "Missing nat6to4 helper. Run 'make' first" exit -1 fi diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 83ed987cff34..550d8eb3e224 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -3,7 +3,7 @@ source net_helper.sh -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly BASE="ns-$(mktemp -u XXXXXX)" readonly SRC=2 readonly DST=1 diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 3a394b43e274..4f1edbafb946 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -BPF_FILE="xdp_dummy.o" +BPF_FILE="xdp_dummy.bpf.o" readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" readonly BASE=`basename $STATS` readonly SRC=2 diff --git a/tools/testing/selftests/net/xdp_dummy.bpf.c b/tools/testing/selftests/net/xdp_dummy.bpf.c new file mode 100644 index 000000000000..d988b2e0cee8 --- /dev/null +++ b/tools/testing/selftests/net/xdp_dummy.bpf.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define KBUILD_MODNAME "xdp_dummy" +#include +#include + +SEC("xdp") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/xdp_dummy.c deleted file mode 100644 index d988b2e0cee8..000000000000 --- a/tools/testing/selftests/net/xdp_dummy.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#define KBUILD_MODNAME "xdp_dummy" -#include -#include - -SEC("xdp") -int xdp_dummy_prog(struct xdp_md *ctx) -{ - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 3f584c211d8cc049848620472aaec1bebd8ddaae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 23 Apr 2024 11:35:42 -0700 Subject: selftests: net: extract BPF building logic from the Makefile The BPF sample building code looks a little bit spaghetti-ish so move it out to its own Makefile snippet. Similar in the spirit to how we include lib.mk. libynl will soon get a similar snippet. There is a small change hiding in the move, the relative paths (../../.., ../.. etc) are replaced with variables from lib.mk such as top_srcdir and selfdir. Link: https://lore.kernel.org/r/20240423183542.3807234-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 53 +----------------------------------- tools/testing/selftests/net/bpf.mk | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 52 deletions(-) create mode 100644 tools/testing/selftests/net/bpf.mk (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index c388308ff517..5befca249452 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -108,55 +108,4 @@ $(OUTPUT)/tcp_inq: LDLIBS += -lpthread $(OUTPUT)/bind_bhash: LDLIBS += -lpthread $(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/ -# Rules to generate bpf objs -CLANG ?= clang -SCRATCH_DIR := $(OUTPUT)/tools -BUILD_DIR := $(SCRATCH_DIR)/build -BPFDIR := $(abspath ../../../lib/bpf) -APIDIR := $(abspath ../../../include/uapi) - -CCINCLUDE += -I../bpf -CCINCLUDE += -I../../../../usr/include/ -CCINCLUDE += -I$(SCRATCH_DIR)/include - -BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a - -MAKE_DIRS := $(BUILD_DIR)/libbpf -$(MAKE_DIRS): - $(call msg,MKDIR,,$@) - $(Q)mkdir -p $@ - -# Get Clang's default includes on this system, as opposed to those seen by -# '--target=bpf'. This fixes "missing" files on some architectures/distros, -# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. -# -# Use '-idirafter': Don't interfere with include mechanics except where the -# build would have failed anyways. -define get_sys_includes -$(shell $(1) $(2) -v -E - &1 \ - | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -$(shell $(1) $(2) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) $(2) -dM -E - Date: Mon, 22 Apr 2024 20:01:52 +0000 Subject: KVM: selftests: Align with kernel's GIC definitions There are a few subtle incongruencies between the GIC definitions used by the kernel and selftests. Furthermore, the selftests header blends implementation detail (e.g. default priority) with the architectural definitions. This is all rather annoying, since bulk imports of the kernel header is not possible. Move selftests-specific definitions out of the offending header and realign tests on the canonical definitions for things like sysregs. Finally, haul in a fresh copy of the gicv3 header to enable a forthcoming ITS selftest. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240422200158.2606761-14-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 4 +- .../testing/selftests/kvm/include/aarch64/gic_v3.h | 586 +++++++++++++++++++-- tools/testing/selftests/kvm/lib/aarch64/gic_v3.c | 13 +- 3 files changed, 568 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 2e64b4856e38..d61a6302f467 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -152,7 +152,7 @@ static void reset_stats(void) static uint64_t gic_read_ap1r0(void) { - uint64_t reg = read_sysreg_s(SYS_ICV_AP1R0_EL1); + uint64_t reg = read_sysreg_s(SYS_ICC_AP1R0_EL1); dsb(sy); return reg; @@ -160,7 +160,7 @@ static uint64_t gic_read_ap1r0(void) static void gic_write_ap1r0(uint64_t val) { - write_sysreg_s(val, SYS_ICV_AP1R0_EL1); + write_sysreg_s(val, SYS_ICC_AP1R0_EL1); isb(); } diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h index ba0886e8a2bb..a76615fa39a1 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h @@ -1,82 +1,604 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * ARM Generic Interrupt Controller (GIC) v3 specific defines + * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier */ - -#ifndef SELFTEST_KVM_GICV3_H -#define SELFTEST_KVM_GICV3_H - -#include +#ifndef __SELFTESTS_GIC_V3_H +#define __SELFTESTS_GIC_V3_H /* - * Distributor registers + * Distributor registers. We assume we're running non-secure, with ARE + * being set. Secure-only and non-ARE registers are not described. */ #define GICD_CTLR 0x0000 #define GICD_TYPER 0x0004 +#define GICD_IIDR 0x0008 +#define GICD_TYPER2 0x000C +#define GICD_STATUSR 0x0010 +#define GICD_SETSPI_NSR 0x0040 +#define GICD_CLRSPI_NSR 0x0048 +#define GICD_SETSPI_SR 0x0050 +#define GICD_CLRSPI_SR 0x0058 #define GICD_IGROUPR 0x0080 #define GICD_ISENABLER 0x0100 #define GICD_ICENABLER 0x0180 #define GICD_ISPENDR 0x0200 #define GICD_ICPENDR 0x0280 -#define GICD_ICACTIVER 0x0380 #define GICD_ISACTIVER 0x0300 +#define GICD_ICACTIVER 0x0380 #define GICD_IPRIORITYR 0x0400 #define GICD_ICFGR 0x0C00 +#define GICD_IGRPMODR 0x0D00 +#define GICD_NSACR 0x0E00 +#define GICD_IGROUPRnE 0x1000 +#define GICD_ISENABLERnE 0x1200 +#define GICD_ICENABLERnE 0x1400 +#define GICD_ISPENDRnE 0x1600 +#define GICD_ICPENDRnE 0x1800 +#define GICD_ISACTIVERnE 0x1A00 +#define GICD_ICACTIVERnE 0x1C00 +#define GICD_IPRIORITYRnE 0x2000 +#define GICD_ICFGRnE 0x3000 +#define GICD_IROUTER 0x6000 +#define GICD_IROUTERnE 0x8000 +#define GICD_IDREGS 0xFFD0 +#define GICD_PIDR2 0xFFE8 + +#define ESPI_BASE_INTID 4096 /* - * The assumption is that the guest runs in a non-secure mode. - * The following bits of GICD_CTLR are defined accordingly. + * Those registers are actually from GICv2, but the spec demands that they + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3). */ +#define GICD_ITARGETSR 0x0800 +#define GICD_SGIR 0x0F00 +#define GICD_CPENDSGIR 0x0F10 +#define GICD_SPENDSGIR 0x0F20 + #define GICD_CTLR_RWP (1U << 31) #define GICD_CTLR_nASSGIreq (1U << 8) +#define GICD_CTLR_DS (1U << 6) #define GICD_CTLR_ARE_NS (1U << 4) #define GICD_CTLR_ENABLE_G1A (1U << 1) #define GICD_CTLR_ENABLE_G1 (1U << 0) +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + +/* + * In systems with a single security state (what we emulate in KVM) + * the meaning of the interrupt group enable bits is slightly different + */ +#define GICD_CTLR_ENABLE_SS_G1 (1U << 1) +#define GICD_CTLR_ENABLE_SS_G0 (1U << 0) + +#define GICD_TYPER_RSS (1U << 26) +#define GICD_TYPER_LPIS (1U << 17) +#define GICD_TYPER_MBIS (1U << 16) +#define GICD_TYPER_ESPI (1U << 8) + +#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) +#define GICD_TYPER_NUM_LPIS(typer) ((((typer) >> 11) & 0x1f) + 1) #define GICD_TYPER_SPIS(typer) ((((typer) & 0x1f) + 1) * 32) -#define GICD_INT_DEF_PRI_X4 0xa0a0a0a0 +#define GICD_TYPER_ESPIS(typer) \ + (((typer) & GICD_TYPER_ESPI) ? GICD_TYPER_SPIS((typer) >> 27) : 0) + +#define GICD_TYPER2_nASSGIcap (1U << 8) +#define GICD_TYPER2_VIL (1U << 7) +#define GICD_TYPER2_VID GENMASK(4, 0) + +#define GICD_IROUTER_SPI_MODE_ONE (0U << 31) +#define GICD_IROUTER_SPI_MODE_ANY (1U << 31) + +#define GIC_PIDR2_ARCH_MASK 0xf0 +#define GIC_PIDR2_ARCH_GICv3 0x30 +#define GIC_PIDR2_ARCH_GICv4 0x40 + +#define GIC_V3_DIST_SIZE 0x10000 + +#define GIC_PAGE_SIZE_4K 0ULL +#define GIC_PAGE_SIZE_16K 1ULL +#define GIC_PAGE_SIZE_64K 2ULL +#define GIC_PAGE_SIZE_MASK 3ULL /* - * Redistributor registers + * Re-Distributor registers, offsets from RD_base */ -#define GICR_CTLR 0x000 -#define GICR_WAKER 0x014 +#define GICR_CTLR GICD_CTLR +#define GICR_IIDR 0x0004 +#define GICR_TYPER 0x0008 +#define GICR_STATUSR GICD_STATUSR +#define GICR_WAKER 0x0014 +#define GICR_SETLPIR 0x0040 +#define GICR_CLRLPIR 0x0048 +#define GICR_PROPBASER 0x0070 +#define GICR_PENDBASER 0x0078 +#define GICR_INVLPIR 0x00A0 +#define GICR_INVALLR 0x00B0 +#define GICR_SYNCR 0x00C0 +#define GICR_IDREGS GICD_IDREGS +#define GICR_PIDR2 GICD_PIDR2 + +#define GICR_CTLR_ENABLE_LPIS (1UL << 0) +#define GICR_CTLR_CES (1UL << 1) +#define GICR_CTLR_IR (1UL << 2) +#define GICR_CTLR_RWP (1UL << 3) -#define GICR_CTLR_RWP (1U << 3) +#define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) + +#define EPPI_BASE_INTID 1056 + +#define GICR_TYPER_NR_PPIS(r) \ + ({ \ + unsigned int __ppinum = ((r) >> 27) & 0x1f; \ + unsigned int __nr_ppis = 16; \ + if (__ppinum == 1 || __ppinum == 2) \ + __nr_ppis += __ppinum * 32; \ + \ + __nr_ppis; \ + }) #define GICR_WAKER_ProcessorSleep (1U << 1) #define GICR_WAKER_ChildrenAsleep (1U << 2) +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL +#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL + +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + +#define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + +#define GICR_PENDBASER_PTZ BIT_ULL(62) + /* - * Redistributor registers, offsets from SGI base + * Re-Distributor registers, offsets from SGI_base */ #define GICR_IGROUPR0 GICD_IGROUPR #define GICR_ISENABLER0 GICD_ISENABLER #define GICR_ICENABLER0 GICD_ICENABLER #define GICR_ISPENDR0 GICD_ISPENDR +#define GICR_ICPENDR0 GICD_ICPENDR #define GICR_ISACTIVER0 GICD_ISACTIVER #define GICR_ICACTIVER0 GICD_ICACTIVER -#define GICR_ICENABLER GICD_ICENABLER -#define GICR_ICACTIVER GICD_ICACTIVER #define GICR_IPRIORITYR0 GICD_IPRIORITYR +#define GICR_ICFGR0 GICD_ICFGR +#define GICR_IGRPMODR0 GICD_IGRPMODR +#define GICR_NSACR GICD_NSACR + +#define GICR_TYPER_PLPIS (1U << 0) +#define GICR_TYPER_VLPIS (1U << 1) +#define GICR_TYPER_DIRTY (1U << 2) +#define GICR_TYPER_DirectLPIS (1U << 3) +#define GICR_TYPER_LAST (1U << 4) +#define GICR_TYPER_RVPEID (1U << 7) +#define GICR_TYPER_COMMON_LPI_AFF GENMASK_ULL(25, 24) +#define GICR_TYPER_AFFINITY GENMASK_ULL(63, 32) + +#define GICR_INVLPIR_INTID GENMASK_ULL(31, 0) +#define GICR_INVLPIR_VPEID GENMASK_ULL(47, 32) +#define GICR_INVLPIR_V GENMASK_ULL(63, 63) + +#define GICR_INVALLR_VPEID GICR_INVLPIR_VPEID +#define GICR_INVALLR_V GICR_INVLPIR_V + +#define GIC_V3_REDIST_SIZE 0x20000 + +#define LPI_PROP_GROUP1 (1 << 1) +#define LPI_PROP_ENABLED (1 << 0) + +/* + * Re-Distributor registers, offsets from VLPI_base + */ +#define GICR_VPROPBASER 0x0070 + +#define GICR_VPROPBASER_IDBITS_MASK 0x1f + +#define GICR_VPROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_SHIFT (56) + +#define GICR_VPROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, SHAREABILITY_MASK) +#define GICR_VPROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, MASK) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, OUTER, MASK) +#define GICR_VPROPBASER_CACHEABILITY_MASK \ + GICR_VPROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, InnerShareable) + +#define GICR_VPROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nCnB) +#define GICR_VPROPBASER_nC GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nC) +#define GICR_VPROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) +#define GICR_VPROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWb) +#define GICR_VPROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWt) +#define GICR_VPROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWb) +#define GICR_VPROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWt) +#define GICR_VPROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWb) + +/* + * GICv4.1 VPROPBASER reinvention. A subtle mix between the old + * VPROPBASER and ITS_BASER. Just not quite any of the two. + */ +#define GICR_VPROPBASER_4_1_VALID (1ULL << 63) +#define GICR_VPROPBASER_4_1_ENTRY_SIZE GENMASK_ULL(61, 59) +#define GICR_VPROPBASER_4_1_INDIRECT (1ULL << 55) +#define GICR_VPROPBASER_4_1_PAGE_SIZE GENMASK_ULL(54, 53) +#define GICR_VPROPBASER_4_1_Z (1ULL << 52) +#define GICR_VPROPBASER_4_1_ADDR GENMASK_ULL(51, 12) +#define GICR_VPROPBASER_4_1_SIZE GENMASK_ULL(6, 0) + +#define GICR_VPENDBASER 0x0078 + +#define GICR_VPENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_VPENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, SHAREABILITY_MASK) +#define GICR_VPENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, MASK) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, OUTER, MASK) +#define GICR_VPENDBASER_CACHEABILITY_MASK \ + GICR_VPENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPENDBASER_NonShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, NonShareable) + +#define GICR_VPENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, InnerShareable) + +#define GICR_VPENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nCnB) +#define GICR_VPENDBASER_nC GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nC) +#define GICR_VPENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) +#define GICR_VPENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWb) +#define GICR_VPENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWt) +#define GICR_VPENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWb) +#define GICR_VPENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWt) +#define GICR_VPENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWb) + +#define GICR_VPENDBASER_Dirty (1ULL << 60) +#define GICR_VPENDBASER_PendingLast (1ULL << 61) +#define GICR_VPENDBASER_IDAI (1ULL << 62) +#define GICR_VPENDBASER_Valid (1ULL << 63) + +/* + * GICv4.1 VPENDBASER, used for VPE residency. On top of these fields, + * also use the above Valid, PendingLast and Dirty. + */ +#define GICR_VPENDBASER_4_1_DB (1ULL << 62) +#define GICR_VPENDBASER_4_1_VGRP0EN (1ULL << 59) +#define GICR_VPENDBASER_4_1_VGRP1EN (1ULL << 58) +#define GICR_VPENDBASER_4_1_VPEID GENMASK_ULL(15, 0) + +#define GICR_VSGIR 0x0080 + +#define GICR_VSGIR_VPEID GENMASK(15, 0) + +#define GICR_VSGIPENDR 0x0088 + +#define GICR_VSGIPENDR_BUSY (1U << 31) +#define GICR_VSGIPENDR_PENDING GENMASK(15, 0) + +/* + * ITS registers, offsets from ITS_base + */ +#define GITS_CTLR 0x0000 +#define GITS_IIDR 0x0004 +#define GITS_TYPER 0x0008 +#define GITS_MPIDR 0x0018 +#define GITS_CBASER 0x0080 +#define GITS_CWRITER 0x0088 +#define GITS_CREADR 0x0090 +#define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 +#define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc + +#define GITS_TRANSLATER 0x10040 + +#define GITS_SGIR 0x20020 + +#define GITS_SGIR_VPEID GENMASK_ULL(47, 32) +#define GITS_SGIR_VINTID GENMASK_ULL(3, 0) + +#define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_ImDe (1U << 1) +#define GITS_CTLR_ITS_NUMBER_SHIFT 4 +#define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) +#define GITS_CTLR_QUIESCENT (1U << 31) + +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_VLPIS (1UL << 1) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 +#define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS_SHIFT 8 +#define GITS_TYPER_DEVBITS_SHIFT 13 +#define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) +#define GITS_TYPER_PTA (1UL << 19) +#define GITS_TYPER_HCC_SHIFT 24 +#define GITS_TYPER_HCC(r) (((r) >> GITS_TYPER_HCC_SHIFT) & 0xff) +#define GITS_TYPER_VMOVP (1ULL << 37) +#define GITS_TYPER_VMAPP (1ULL << 40) +#define GITS_TYPER_SVPET GENMASK_ULL(42, 41) -/* CPU interface registers */ -#define SYS_ICC_PMR_EL1 sys_reg(3, 0, 4, 6, 0) -#define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) -#define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) -#define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) -#define SYS_ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) -#define SYS_ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) -#define SYS_ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 -#define SYS_ICV_AP1R0_EL1 sys_reg(3, 0, 12, 9, 0) +#define GITS_CBASER_VALID (1ULL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK -#define ICC_PMR_DEF_PRIO 0xf0 +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) + +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + +#define GITS_BASER_NR_REGS 8 + +#define GITS_BASER_VALID (1ULL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) + +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + +#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) +#define GITS_BASER_PHYS_52_to_48(phys) \ + (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + +#define GITS_BASER_SHAREABILITY_SHIFT (10) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) +#define GITS_BASER_PAGE_SIZE_SHIFT (8) +#define __GITS_BASER_PSZ(sz) (GIC_PAGE_SIZE_ ## sz << GITS_BASER_PAGE_SIZE_SHIFT) +#define GITS_BASER_PAGE_SIZE_4K __GITS_BASER_PSZ(4K) +#define GITS_BASER_PAGE_SIZE_16K __GITS_BASER_PSZ(16K) +#define GITS_BASER_PAGE_SIZE_64K __GITS_BASER_PSZ(64K) +#define GITS_BASER_PAGE_SIZE_MASK __GITS_BASER_PSZ(MASK) +#define GITS_BASER_PAGES_MAX 256 +#define GITS_BASER_PAGES_SHIFT (0) +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) + +#define GITS_BASER_TYPE_NONE 0 +#define GITS_BASER_TYPE_DEVICE 1 +#define GITS_BASER_TYPE_VCPU 2 +#define GITS_BASER_TYPE_RESERVED3 3 +#define GITS_BASER_TYPE_COLLECTION 4 +#define GITS_BASER_TYPE_RESERVED5 5 +#define GITS_BASER_TYPE_RESERVED6 6 +#define GITS_BASER_TYPE_RESERVED7 7 + +#define GITS_LVL1_ENTRY_SIZE (8UL) + +/* + * ITS commands + */ +#define GITS_CMD_MAPD 0x08 +#define GITS_CMD_MAPC 0x09 +#define GITS_CMD_MAPTI 0x0a +#define GITS_CMD_MAPI 0x0b +#define GITS_CMD_MOVI 0x01 +#define GITS_CMD_DISCARD 0x0f +#define GITS_CMD_INV 0x0c +#define GITS_CMD_MOVALL 0x0e +#define GITS_CMD_INVALL 0x0d +#define GITS_CMD_INT 0x03 +#define GITS_CMD_CLEAR 0x04 +#define GITS_CMD_SYNC 0x05 + +/* + * GICv4 ITS specific commands + */ +#define GITS_CMD_GICv4(x) ((x) | 0x20) +#define GITS_CMD_VINVALL GITS_CMD_GICv4(GITS_CMD_INVALL) +#define GITS_CMD_VMAPP GITS_CMD_GICv4(GITS_CMD_MAPC) +#define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) +#define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) +#define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) +/* VMOVP, VSGI and INVDB are the odd ones, as they dont have a physical counterpart */ +#define GITS_CMD_VMOVP GITS_CMD_GICv4(2) +#define GITS_CMD_VSGI GITS_CMD_GICv4(3) +#define GITS_CMD_INVDB GITS_CMD_GICv4(0xe) + +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 +#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + +/* + * CPU interface registers + */ +#define ICC_CTLR_EL1_EOImode_SHIFT (1) +#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_CBPR_SHIFT 0 +#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PMHE_SHIFT 6 +#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) +#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 +#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) +#define ICC_CTLR_EL1_ID_BITS_SHIFT 11 +#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT) +#define ICC_CTLR_EL1_SEIS_SHIFT 14 +#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) +#define ICC_CTLR_EL1_A3V_SHIFT 15 +#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_CTLR_EL1_RSS (0x1 << 18) +#define ICC_CTLR_EL1_ExtRange (0x1 << 19) +#define ICC_PMR_EL1_SHIFT 0 +#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) +#define ICC_BPR0_EL1_SHIFT 0 +#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT) +#define ICC_BPR1_EL1_SHIFT 0 +#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT) +#define ICC_IGRPEN0_EL1_SHIFT 0 +#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) +#define ICC_IGRPEN1_EL1_SHIFT 0 +#define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) +#define ICC_SRE_EL1_DIB (1U << 2) +#define ICC_SRE_EL1_DFB (1U << 1) #define ICC_SRE_EL1_SRE (1U << 0) -#define ICC_IGRPEN1_EL1_ENABLE (1U << 0) +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) + +#define ICC_IAR1_EL1_SPURIOUS 0x3ff + +#define ICC_SRE_EL2_SRE (1 << 0) +#define ICC_SRE_EL2_ENABLE (1 << 3) -#define GICV3_MAX_CPUS 512 +#define ICC_SGI1R_TARGET_LIST_SHIFT 0 +#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT) +#define ICC_SGI1R_AFFINITY_1_SHIFT 16 +#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_SGI_ID_SHIFT 24 +#define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) +#define ICC_SGI1R_AFFINITY_2_SHIFT 32 +#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) +#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_RS_SHIFT 44 +#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) +#define ICC_SGI1R_AFFINITY_3_SHIFT 48 +#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) -#endif /* SELFTEST_KVM_GICV3_H */ +#endif diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c index 263bf3ed8fd5..cd8f0e209599 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c @@ -9,9 +9,20 @@ #include "processor.h" #include "delay.h" +#include "gic.h" #include "gic_v3.h" #include "gic_private.h" +#define GICV3_MAX_CPUS 512 + +#define GICD_INT_DEF_PRI 0xa0 +#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ + (GICD_INT_DEF_PRI << 16) |\ + (GICD_INT_DEF_PRI << 8) |\ + GICD_INT_DEF_PRI) + +#define ICC_PMR_DEF_PRIO 0xf0 + struct gicv3_data { void *dist_base; void *redist_base[GICV3_MAX_CPUS]; @@ -320,7 +331,7 @@ static void gicv3_cpu_init(unsigned int cpu, void *redist_base) write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); /* Enable non-secure Group-1 interrupts */ - write_sysreg_s(ICC_IGRPEN1_EL1_ENABLE, SYS_ICC_GRPEN1_EL1); + write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); gicv3_data.redist_base[cpu] = redist_base_cpu; } -- cgit v1.2.3 From 1505bc70f80df9824e9d68d15a7452856df7488c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 22 Apr 2024 20:01:53 +0000 Subject: KVM: selftests: Standardise layout of GIC frames It would appear that all of the selftests are using the same exact layout for the GIC frames. Fold this back into the library implementation to avoid defining magic values all over the selftests. This is an extension of Colton's change, ripping out parameterization of from the library internals in addition to the public interfaces. Co-developed-by: Colton Lewis Signed-off-by: Colton Lewis Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240422200158.2606761-15-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 8 +-- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 11 +--- .../selftests/kvm/aarch64/vpmu_counter_access.c | 6 +-- tools/testing/selftests/kvm/dirty_log_perf_test.c | 5 +- tools/testing/selftests/kvm/include/aarch64/gic.h | 12 ++++- tools/testing/selftests/kvm/include/aarch64/vgic.h | 3 +- tools/testing/selftests/kvm/lib/aarch64/gic.c | 18 +++---- .../selftests/kvm/lib/aarch64/gic_private.h | 4 +- tools/testing/selftests/kvm/lib/aarch64/gic_v3.c | 62 ++++++++++------------ tools/testing/selftests/kvm/lib/aarch64/vgic.c | 18 +++---- 10 files changed, 62 insertions(+), 85 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 4eaba83cdcf3..89be559cdb7e 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -14,9 +14,6 @@ #include "timer_test.h" #include "vgic.h" -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - enum guest_stage { GUEST_STAGE_VTIMER_CVAL = 1, GUEST_STAGE_VTIMER_TVAL, @@ -149,8 +146,7 @@ static void guest_code(void) local_irq_disable(); - gic_init(GIC_V3, test_args.nr_vcpus, - (void *)GICD_BASE_GPA, (void *)GICR_BASE_GPA); + gic_init(GIC_V3, test_args.nr_vcpus); timer_set_ctl(VIRTUAL, CTL_IMASK); timer_set_ctl(PHYSICAL, CTL_IMASK); @@ -209,7 +205,7 @@ struct kvm_vm *test_vm_create(void) vcpu_init_descriptor_tables(vcpus[i]); test_init_timer_irq(vm); - gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); /* Make all the test's cmdline args visible to the guest */ diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index d61a6302f467..a51dbd2a5f84 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -19,9 +19,6 @@ #include "gic_v3.h" #include "vgic.h" -#define GICD_BASE_GPA 0x08000000ULL -#define GICR_BASE_GPA 0x080A0000ULL - /* * Stores the user specified args; it's passed to the guest and to every test * function. @@ -49,9 +46,6 @@ struct test_args { #define IRQ_DEFAULT_PRIO (LOWEST_PRIO - 1) #define IRQ_DEFAULT_PRIO_REG (IRQ_DEFAULT_PRIO << KVM_PRIO_SHIFT) /* 0xf0 */ -static void *dist = (void *)GICD_BASE_GPA; -static void *redist = (void *)GICR_BASE_GPA; - /* * The kvm_inject_* utilities are used by the guest to ask the host to inject * interrupts (e.g., using the KVM_IRQ_LINE ioctl). @@ -478,7 +472,7 @@ static void guest_code(struct test_args *args) bool level_sensitive = args->level_sensitive; struct kvm_inject_desc *f, *inject_fns; - gic_init(GIC_V3, 1, dist, redist); + gic_init(GIC_V3, 1); for (i = 0; i < nr_irqs; i++) gic_irq_enable(i); @@ -764,8 +758,7 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); vcpu_args_set(vcpu, 1, args_gva); - gic_fd = vgic_v3_setup(vm, 1, nr_irqs, - GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, 1, nr_irqs); __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3, skipping"); vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index f2fb0e3f14bc..d31b9f64ba14 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -404,9 +404,6 @@ static void guest_code(uint64_t expected_pmcr_n) GUEST_DONE(); } -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - /* Create a VM that has one vCPU with PMUv3 configured. */ static void create_vpmu_vm(void *guest_code) { @@ -438,8 +435,7 @@ static void create_vpmu_vm(void *guest_code) init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); vpmu_vm.vcpu = aarch64_vcpu_add(vpmu_vm.vm, 0, &init, guest_code); vcpu_init_descriptor_tables(vpmu_vm.vcpu); - vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64, - GICD_BASE_GPA, GICR_BASE_GPA); + vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64); __TEST_REQUIRE(vpmu_vm.gic_fd >= 0, "Failed to create vgic-v3, skipping"); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 504f6fe980e8..61535c4f3405 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -22,9 +22,6 @@ #ifdef __aarch64__ #include "aarch64/vgic.h" -#define GICD_BASE_GPA 0x8000000ULL -#define GICR_BASE_GPA 0x80A0000ULL - static int gic_fd; static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) @@ -33,7 +30,7 @@ static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) * The test can still run even if hardware does not support GICv3, as it * is only an optimization to reduce guest exits. */ - gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); } static void arch_cleanup_vm(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h index b217ea17cac5..53617b3f52cf 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic.h @@ -6,11 +6,20 @@ #ifndef SELFTEST_KVM_GIC_H #define SELFTEST_KVM_GIC_H +#include + enum gic_type { GIC_V3, GIC_TYPE_MAX, }; +#define GICD_BASE_GPA 0x8000000ULL +#define GICR_BASE_GPA (GICD_BASE_GPA + KVM_VGIC_V3_DIST_SIZE) + +/* The GIC is identity-mapped into the guest at the time of setup. */ +#define GICD_BASE_GVA ((volatile void *)GICD_BASE_GPA) +#define GICR_BASE_GVA ((volatile void *)GICR_BASE_GPA) + #define MIN_SGI 0 #define MIN_PPI 16 #define MIN_SPI 32 @@ -21,8 +30,7 @@ enum gic_type { #define INTID_IS_PPI(intid) (MIN_PPI <= (intid) && (intid) < MIN_SPI) #define INTID_IS_SPI(intid) (MIN_SPI <= (intid) && (intid) <= MAX_SPI) -void gic_init(enum gic_type type, unsigned int nr_cpus, - void *dist_base, void *redist_base); +void gic_init(enum gic_type type, unsigned int nr_cpus); void gic_irq_enable(unsigned int intid); void gic_irq_disable(unsigned int intid); unsigned int gic_get_and_ack_irq(void); diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h index 0ac6f05c63f9..ce19aa0a8360 100644 --- a/tools/testing/selftests/kvm/include/aarch64/vgic.h +++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h @@ -16,8 +16,7 @@ ((uint64_t)(flags) << 12) | \ index) -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, - uint64_t gicd_base_gpa, uint64_t gicr_base_gpa); +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs); #define VGIC_MAX_RESERVED 1023 diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic.c b/tools/testing/selftests/kvm/lib/aarch64/gic.c index 55668631d546..7abbf8866512 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic.c @@ -17,13 +17,12 @@ static const struct gic_common_ops *gic_common_ops; static struct spinlock gic_lock; -static void gic_cpu_init(unsigned int cpu, void *redist_base) +static void gic_cpu_init(unsigned int cpu) { - gic_common_ops->gic_cpu_init(cpu, redist_base); + gic_common_ops->gic_cpu_init(cpu); } -static void -gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) +static void gic_dist_init(enum gic_type type, unsigned int nr_cpus) { const struct gic_common_ops *gic_ops = NULL; @@ -40,7 +39,7 @@ gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) GUEST_ASSERT(gic_ops); - gic_ops->gic_init(nr_cpus, dist_base); + gic_ops->gic_init(nr_cpus); gic_common_ops = gic_ops; /* Make sure that the initialized data is visible to all the vCPUs */ @@ -49,18 +48,15 @@ gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) spin_unlock(&gic_lock); } -void gic_init(enum gic_type type, unsigned int nr_cpus, - void *dist_base, void *redist_base) +void gic_init(enum gic_type type, unsigned int nr_cpus) { uint32_t cpu = guest_get_vcpuid(); GUEST_ASSERT(type < GIC_TYPE_MAX); - GUEST_ASSERT(dist_base); - GUEST_ASSERT(redist_base); GUEST_ASSERT(nr_cpus); - gic_dist_init(type, nr_cpus, dist_base); - gic_cpu_init(cpu, redist_base); + gic_dist_init(type, nr_cpus); + gic_cpu_init(cpu); } void gic_irq_enable(unsigned int intid) diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h index 75d07313c893..d24e9ecc96c6 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h @@ -8,8 +8,8 @@ #define SELFTEST_KVM_GIC_PRIVATE_H struct gic_common_ops { - void (*gic_init)(unsigned int nr_cpus, void *dist_base); - void (*gic_cpu_init)(unsigned int cpu, void *redist_base); + void (*gic_init)(unsigned int nr_cpus); + void (*gic_cpu_init)(unsigned int cpu); void (*gic_irq_enable)(unsigned int intid); void (*gic_irq_disable)(unsigned int intid); uint64_t (*gic_read_iar)(void); diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c index cd8f0e209599..515335179045 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c @@ -24,8 +24,6 @@ #define ICC_PMR_DEF_PRIO 0xf0 struct gicv3_data { - void *dist_base; - void *redist_base[GICV3_MAX_CPUS]; unsigned int nr_cpus; unsigned int nr_spis; }; @@ -46,17 +44,23 @@ static void gicv3_gicd_wait_for_rwp(void) { unsigned int count = 100000; /* 1s */ - while (readl(gicv3_data.dist_base + GICD_CTLR) & GICD_CTLR_RWP) { + while (readl(GICD_BASE_GVA + GICD_CTLR) & GICD_CTLR_RWP) { GUEST_ASSERT(count--); udelay(10); } } -static void gicv3_gicr_wait_for_rwp(void *redist_base) +static inline volatile void *gicr_base_cpu(uint32_t cpu) +{ + /* Align all the redistributors sequentially */ + return GICR_BASE_GVA + cpu * SZ_64K * 2; +} + +static void gicv3_gicr_wait_for_rwp(uint32_t cpu) { unsigned int count = 100000; /* 1s */ - while (readl(redist_base + GICR_CTLR) & GICR_CTLR_RWP) { + while (readl(gicr_base_cpu(cpu) + GICR_CTLR) & GICR_CTLR_RWP) { GUEST_ASSERT(count--); udelay(10); } @@ -67,7 +71,7 @@ static void gicv3_wait_for_rwp(uint32_t cpu_or_dist) if (cpu_or_dist & DIST_BIT) gicv3_gicd_wait_for_rwp(); else - gicv3_gicr_wait_for_rwp(gicv3_data.redist_base[cpu_or_dist]); + gicv3_gicr_wait_for_rwp(cpu_or_dist); } static enum gicv3_intid_range get_intid_range(unsigned int intid) @@ -127,15 +131,15 @@ static void gicv3_set_eoi_split(bool split) uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset) { - void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base - : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); return readl(base + offset); } void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val) { - void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base - : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); writel(reg_val, base + offset); } @@ -274,7 +278,7 @@ static bool gicv3_irq_get_pending(uint32_t intid) return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1); } -static void gicv3_enable_redist(void *redist_base) +static void gicv3_enable_redist(volatile void *redist_base) { uint32_t val = readl(redist_base + GICR_WAKER); unsigned int count = 100000; /* 1s */ @@ -289,21 +293,15 @@ static void gicv3_enable_redist(void *redist_base) } } -static inline void *gicr_base_cpu(void *redist_base, uint32_t cpu) +static void gicv3_cpu_init(unsigned int cpu) { - /* Align all the redistributors sequentially */ - return redist_base + cpu * SZ_64K * 2; -} - -static void gicv3_cpu_init(unsigned int cpu, void *redist_base) -{ - void *sgi_base; + volatile void *sgi_base; unsigned int i; - void *redist_base_cpu; + volatile void *redist_base_cpu; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); - redist_base_cpu = gicr_base_cpu(redist_base, cpu); + redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); gicv3_enable_redist(redist_base_cpu); @@ -321,7 +319,7 @@ static void gicv3_cpu_init(unsigned int cpu, void *redist_base) writel(GICD_INT_DEF_PRI_X4, sgi_base + GICR_IPRIORITYR0 + i); - gicv3_gicr_wait_for_rwp(redist_base_cpu); + gicv3_gicr_wait_for_rwp(cpu); /* Enable the GIC system register (ICC_*) access */ write_sysreg_s(read_sysreg_s(SYS_ICC_SRE_EL1) | ICC_SRE_EL1_SRE, @@ -332,17 +330,14 @@ static void gicv3_cpu_init(unsigned int cpu, void *redist_base) /* Enable non-secure Group-1 interrupts */ write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); - - gicv3_data.redist_base[cpu] = redist_base_cpu; } static void gicv3_dist_init(void) { - void *dist_base = gicv3_data.dist_base; unsigned int i; /* Disable the distributor until we set things up */ - writel(0, dist_base + GICD_CTLR); + writel(0, GICD_BASE_GVA + GICD_CTLR); gicv3_gicd_wait_for_rwp(); /* @@ -350,33 +345,32 @@ static void gicv3_dist_init(void) * Also, deactivate and disable them. */ for (i = 32; i < gicv3_data.nr_spis; i += 32) { - writel(~0, dist_base + GICD_IGROUPR + i / 8); - writel(~0, dist_base + GICD_ICACTIVER + i / 8); - writel(~0, dist_base + GICD_ICENABLER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_IGROUPR + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICACTIVER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICENABLER + i / 8); } /* Set a default priority for all the SPIs */ for (i = 32; i < gicv3_data.nr_spis; i += 4) writel(GICD_INT_DEF_PRI_X4, - dist_base + GICD_IPRIORITYR + i); + GICD_BASE_GVA + GICD_IPRIORITYR + i); /* Wait for the settings to sync-in */ gicv3_gicd_wait_for_rwp(); /* Finally, enable the distributor globally with ARE */ writel(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | - GICD_CTLR_ENABLE_G1, dist_base + GICD_CTLR); + GICD_CTLR_ENABLE_G1, GICD_BASE_GVA + GICD_CTLR); gicv3_gicd_wait_for_rwp(); } -static void gicv3_init(unsigned int nr_cpus, void *dist_base) +static void gicv3_init(unsigned int nr_cpus) { GUEST_ASSERT(nr_cpus <= GICV3_MAX_CPUS); gicv3_data.nr_cpus = nr_cpus; - gicv3_data.dist_base = dist_base; gicv3_data.nr_spis = GICD_TYPER_SPIS( - readl(gicv3_data.dist_base + GICD_TYPER)); + readl(GICD_BASE_GVA + GICD_TYPER)); if (gicv3_data.nr_spis > 1020) gicv3_data.nr_spis = 1020; diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 184378d593e9..7738fdb0cea1 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -19,8 +19,6 @@ * Input args: * vm - KVM VM * nr_vcpus - Number of vCPUs supported by this VM - * gicd_base_gpa - Guest Physical Address of the Distributor region - * gicr_base_gpa - Guest Physical Address of the Redistributor region * * Output args: None * @@ -30,11 +28,10 @@ * redistributor regions of the guest. Since it depends on the number of * vCPUs for the VM, it must be called after all the vCPUs have been created. */ -int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, - uint64_t gicd_base_gpa, uint64_t gicr_base_gpa) +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs) { int gic_fd; - uint64_t redist_attr; + uint64_t attr; struct list_head *iter; unsigned int nr_gic_pages, nr_vcpus_created = 0; @@ -60,18 +57,19 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + attr = GICD_BASE_GPA; kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &gicd_base_gpa); + KVM_VGIC_V3_ADDR_TYPE_DIST, &attr); nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); - virt_map(vm, gicd_base_gpa, gicd_base_gpa, nr_gic_pages); + virt_map(vm, GICD_BASE_GPA, GICD_BASE_GPA, nr_gic_pages); /* Redistributor setup */ - redist_attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, gicr_base_gpa, 0, 0); + attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, GICR_BASE_GPA, 0, 0); kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &redist_attr); + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &attr); nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); - virt_map(vm, gicr_base_gpa, gicr_base_gpa, nr_gic_pages); + virt_map(vm, GICR_BASE_GPA, GICR_BASE_GPA, nr_gic_pages); kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); -- cgit v1.2.3 From 232269eb7dd5242877abfab1d47a1eb049a44b95 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 22 Apr 2024 20:01:54 +0000 Subject: KVM: selftests: Add quadword MMIO accessors The base registers in the GIC ITS and redistributor for LPIs are 64 bits wide. Add quadword accessors to poke at them. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240422200158.2606761-16-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/aarch64/processor.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 9e518b562827..f129a1152985 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -177,11 +177,28 @@ static __always_inline u32 __raw_readl(const volatile void *addr) return val; } +static __always_inline void __raw_writeq(u64 val, volatile void *addr) +{ + asm volatile("str %0, [%1]" : : "rZ" (val), "r" (addr)); +} + +static __always_inline u64 __raw_readq(const volatile void *addr) +{ + u64 val; + asm volatile("ldr %0, [%1]" : "=r" (val) : "r" (addr)); + return val; +} + #define writel_relaxed(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) #define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) +#define writeq_relaxed(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) +#define readq_relaxed(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) #define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c));}) #define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(__v); __v; }) +#define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c));}) +#define readq(c) ({ u64 __v = readq_relaxed(c); __iormb(__v); __v; }) + static inline void local_irq_enable(void) { -- cgit v1.2.3 From be26db61e880b3892f189e9ef54b7b80599245bf Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 22 Apr 2024 20:01:55 +0000 Subject: KVM: selftests: Add a minimal library for interacting with an ITS A prerequisite of testing LPI injection performance is of course instantiating an ITS for the guest. Add a small library for creating an ITS and interacting with it from the guest. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240422200158.2606761-17-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/include/aarch64/gic.h | 8 +- .../selftests/kvm/include/aarch64/gic_v3_its.h | 19 ++ tools/testing/selftests/kvm/include/aarch64/vgic.h | 2 + .../testing/selftests/kvm/lib/aarch64/gic_v3_its.c | 248 +++++++++++++++++++++ tools/testing/selftests/kvm/lib/aarch64/vgic.c | 18 ++ 6 files changed, 295 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h create mode 100644 tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 741c7dc16afc..4335e5744cc6 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -45,6 +45,7 @@ LIBKVM_x86_64 += lib/x86_64/vmx.c LIBKVM_aarch64 += lib/aarch64/gic.c LIBKVM_aarch64 += lib/aarch64/gic_v3.c +LIBKVM_aarch64 += lib/aarch64/gic_v3_its.c LIBKVM_aarch64 += lib/aarch64/handlers.S LIBKVM_aarch64 += lib/aarch64/processor.c LIBKVM_aarch64 += lib/aarch64/spinlock.c diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h index 53617b3f52cf..6d03188435e4 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic.h @@ -13,10 +13,16 @@ enum gic_type { GIC_TYPE_MAX, }; -#define GICD_BASE_GPA 0x8000000ULL +/* + * Note that the redistributor frames are at the end, as the range scales + * with the number of vCPUs in the VM. + */ +#define GITS_BASE_GPA 0x8000000ULL +#define GICD_BASE_GPA (GITS_BASE_GPA + KVM_VGIC_V3_ITS_SIZE) #define GICR_BASE_GPA (GICD_BASE_GPA + KVM_VGIC_V3_DIST_SIZE) /* The GIC is identity-mapped into the guest at the time of setup. */ +#define GITS_BASE_GVA ((volatile void *)GITS_BASE_GPA) #define GICD_BASE_GVA ((volatile void *)GICD_BASE_GPA) #define GICR_BASE_GVA ((volatile void *)GICR_BASE_GPA) diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h new file mode 100644 index 000000000000..3722ed9c8f96 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTESTS_GIC_V3_ITS_H__ +#define __SELFTESTS_GIC_V3_ITS_H__ + +#include + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size); + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid); +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid); +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid); +void its_send_invall_cmd(void *cmdq_base, u32 collection_id); + +#endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h index ce19aa0a8360..c481d0c00a5d 100644 --- a/tools/testing/selftests/kvm/include/aarch64/vgic.h +++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h @@ -32,4 +32,6 @@ void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); #define KVM_IRQCHIP_NUM_PINS (1020 - 32) +int vgic_its_setup(struct kvm_vm *vm); + #endif // SELFTEST_KVM_VGIC_H diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c new file mode 100644 index 000000000000..09f270545646 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest ITS library, generously donated by drivers/irqchip/irq-gic-v3-its.c + * over in the kernel tree. + */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" +#include "processor.h" + +static u64 its_read_u64(unsigned long offset) +{ + return readq_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u64(unsigned long offset, u64 val) +{ + writeq_relaxed(val, GITS_BASE_GVA + offset); +} + +static u32 its_read_u32(unsigned long offset) +{ + return readl_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u32(unsigned long offset, u32 val) +{ + writel_relaxed(val, GITS_BASE_GVA + offset); +} + +static unsigned long its_find_baser(unsigned int type) +{ + int i; + + for (i = 0; i < GITS_BASER_NR_REGS; i++) { + u64 baser; + unsigned long offset = GITS_BASER + (i * sizeof(baser)); + + baser = its_read_u64(offset); + if (GITS_BASER_TYPE(baser) == type) + return offset; + } + + GUEST_FAIL("Couldn't find an ITS BASER of type %u", type); + return -1; +} + +static void its_install_table(unsigned int type, vm_paddr_t base, size_t size) +{ + unsigned long offset = its_find_baser(type); + u64 baser; + + baser = ((size / SZ_64K) - 1) | + GITS_BASER_PAGE_SIZE_64K | + GITS_BASER_InnerShareable | + base | + GITS_BASER_RaWaWb | + GITS_BASER_VALID; + + its_write_u64(offset, baser); +} + +static void its_install_cmdq(vm_paddr_t base, size_t size) +{ + u64 cbaser; + + cbaser = ((size / SZ_4K) - 1) | + GITS_CBASER_InnerShareable | + base | + GITS_CBASER_RaWaWb | + GITS_CBASER_VALID; + + its_write_u64(GITS_CBASER, cbaser); +} + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size) +{ + u32 ctlr; + + its_install_table(GITS_BASER_TYPE_COLLECTION, coll_tbl, coll_tbl_sz); + its_install_table(GITS_BASER_TYPE_DEVICE, device_tbl, device_tbl_sz); + its_install_cmdq(cmdq, cmdq_size); + + ctlr = its_read_u32(GITS_CTLR); + ctlr |= GITS_CTLR_ENABLE; + its_write_u32(GITS_CTLR, ctlr); +} + +struct its_cmd_block { + union { + u64 raw_cmd[4]; + __le64 raw_cmd_le[4]; + }; +}; + +static inline void its_fixup_cmd(struct its_cmd_block *cmd) +{ + /* Let's fixup BE commands */ + cmd->raw_cmd_le[0] = cpu_to_le64(cmd->raw_cmd[0]); + cmd->raw_cmd_le[1] = cpu_to_le64(cmd->raw_cmd[1]); + cmd->raw_cmd_le[2] = cpu_to_le64(cmd->raw_cmd[2]); + cmd->raw_cmd_le[3] = cpu_to_le64(cmd->raw_cmd[3]); +} + +static void its_mask_encode(u64 *raw_cmd, u64 val, int h, int l) +{ + u64 mask = GENMASK_ULL(h, l); + *raw_cmd &= ~mask; + *raw_cmd |= (val << l) & mask; +} + +static void its_encode_cmd(struct its_cmd_block *cmd, u8 cmd_nr) +{ + its_mask_encode(&cmd->raw_cmd[0], cmd_nr, 7, 0); +} + +static void its_encode_devid(struct its_cmd_block *cmd, u32 devid) +{ + its_mask_encode(&cmd->raw_cmd[0], devid, 63, 32); +} + +static void its_encode_event_id(struct its_cmd_block *cmd, u32 id) +{ + its_mask_encode(&cmd->raw_cmd[1], id, 31, 0); +} + +static void its_encode_phys_id(struct its_cmd_block *cmd, u32 phys_id) +{ + its_mask_encode(&cmd->raw_cmd[1], phys_id, 63, 32); +} + +static void its_encode_size(struct its_cmd_block *cmd, u8 size) +{ + its_mask_encode(&cmd->raw_cmd[1], size, 4, 0); +} + +static void its_encode_itt(struct its_cmd_block *cmd, u64 itt_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 51, 8); +} + +static void its_encode_valid(struct its_cmd_block *cmd, int valid) +{ + its_mask_encode(&cmd->raw_cmd[2], !!valid, 63, 63); +} + +static void its_encode_target(struct its_cmd_block *cmd, u64 target_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16); +} + +static void its_encode_collection(struct its_cmd_block *cmd, u16 col) +{ + its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); +} + +#define GITS_CMDQ_POLL_ITERATIONS 0 + +static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) +{ + u64 cwriter = its_read_u64(GITS_CWRITER); + struct its_cmd_block *dst = cmdq_base + cwriter; + u64 cbaser = its_read_u64(GITS_CBASER); + size_t cmdq_size; + u64 next; + int i; + + cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K; + + its_fixup_cmd(cmd); + + WRITE_ONCE(*dst, *cmd); + dsb(ishst); + next = (cwriter + sizeof(*cmd)) % cmdq_size; + its_write_u64(GITS_CWRITER, next); + + /* + * Polling isn't necessary considering KVM's ITS emulation at the time + * of writing this, as the CMDQ is processed synchronously after a write + * to CWRITER. + */ + for (i = 0; its_read_u64(GITS_CREADR) != next; i++) { + __GUEST_ASSERT(i < GITS_CMDQ_POLL_ITERATIONS, + "ITS didn't process command at offset %lu after %d iterations\n", + cwriter, i); + + cpu_relax(); + } +} + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPD); + its_encode_devid(&cmd, device_id); + its_encode_size(&cmd, ilog2(itt_size) - 1); + its_encode_itt(&cmd, itt_base); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPC); + its_encode_collection(&cmd, collection_id); + its_encode_target(&cmd, vcpu_id); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPTI); + its_encode_devid(&cmd, device_id); + its_encode_event_id(&cmd, event_id); + its_encode_phys_id(&cmd, intid); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_invall_cmd(void *cmdq_base, u32 collection_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_INVALL); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 7738fdb0cea1..5e8f0d5382c2 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -166,3 +166,21 @@ void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) { vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); } + +int vgic_its_setup(struct kvm_vm *vm) +{ + int its_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_ITS); + u64 attr; + + attr = GITS_BASE_GPA; + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &attr); + + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + virt_map(vm, GITS_BASE_GPA, GITS_BASE_GPA, + vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_ITS_SIZE)); + + return its_fd; +} -- cgit v1.2.3 From 03e560ab539009856266b0cf8c100c9f7d1f8fee Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 22 Apr 2024 20:01:56 +0000 Subject: KVM: selftests: Add helper for enabling LPIs on a redistributor The selftests GIC library presently does not support LPIs. Add a userspace helper for configuring a redistributor for LPIs, installing an LPI configuration table and LPI pending table. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240422200158.2606761-18-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/aarch64/gic.h | 3 +++ tools/testing/selftests/kvm/lib/aarch64/gic_v3.c | 24 +++++++++++++++++++++++ tools/testing/selftests/kvm/lib/aarch64/vgic.c | 2 ++ 3 files changed, 29 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h index 6d03188435e4..baeb3c859389 100644 --- a/tools/testing/selftests/kvm/include/aarch64/gic.h +++ b/tools/testing/selftests/kvm/include/aarch64/gic.h @@ -58,4 +58,7 @@ void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table); + #endif /* SELFTEST_KVM_GIC_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c index 515335179045..66d05506f78b 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c @@ -401,3 +401,27 @@ const struct gic_common_ops gicv3_ops = { .gic_irq_get_pending = gicv3_irq_get_pending, .gic_irq_set_config = gicv3_irq_set_config, }; + +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table) +{ + volatile void *rdist_base = gicr_base_cpu(guest_get_vcpuid()); + + u32 ctlr; + u64 val; + + val = (cfg_table | + GICR_PROPBASER_InnerShareable | + GICR_PROPBASER_RaWaWb | + ((ilog2(cfg_table_size) - 1) & GICR_PROPBASER_IDBITS_MASK)); + writeq_relaxed(val, rdist_base + GICR_PROPBASER); + + val = (pend_table | + GICR_PENDBASER_InnerShareable | + GICR_PENDBASER_RaWaWb); + writeq_relaxed(val, rdist_base + GICR_PENDBASER); + + ctlr = readl_relaxed(rdist_base + GICR_CTLR); + ctlr |= GICR_CTLR_ENABLE_LPIS; + writel_relaxed(ctlr, rdist_base + GICR_CTLR); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 5e8f0d5382c2..4427f43f73ea 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -3,8 +3,10 @@ * ARM Generic Interrupt Controller (GIC) v3 host support */ +#include #include #include +#include #include #include -- cgit v1.2.3 From c3c369b508d9a447436b7abb2fded9aec18953ff Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 22 Apr 2024 20:01:57 +0000 Subject: KVM: selftests: Use MPIDR_HWID_BITMASK from cputype.h No need for a home-rolled definition, just rely on the common header. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240422200158.2606761-19-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 ++ tools/testing/selftests/kvm/include/aarch64/processor.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 9b004905d1d3..9fa3578d47d5 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -13,7 +13,9 @@ #define _GNU_SOURCE +#include #include +#include #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index f129a1152985..331ff6b2dbe2 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -58,8 +58,6 @@ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) -#define MPIDR_HWID_BITMASK (0xff00fffffful) - void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu_init *init, void *guest_code); -- cgit v1.2.3 From 96d36ad95b03c89857d405b3317efb0188ac59cb Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 22 Apr 2024 20:01:58 +0000 Subject: KVM: selftests: Add stress test for LPI injection Now that all the infrastructure is in place, add a test to stress KVM's LPI injection. Keep a 1:1 mapping of device IDs to signalling threads, allowing the user to scale up/down the sender side of an LPI. Make use of the new VM stats for the translation cache to estimate the translation hit rate. Since the primary focus of the test is on performance, you'll notice that the guest code is not pedantic about the LPIs it receives. Counting the number of LPIs would require synchronization between the device and vCPU threads to avoid coalescing and would get in the way of performance numbers. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240422200158.2606761-20-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/vgic_lpi_stress.c | 410 +++++++++++++++++++++ 2 files changed, 411 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4335e5744cc6..e78cac712229 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -158,6 +158,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq +TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer diff --git a/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c new file mode 100644 index 000000000000..fc4fe52fb6f8 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic_lpi_stress - Stress test for KVM's ITS emulation + * + * Copyright (c) 2024 Google LLC + */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "gic.h" +#include "gic_v3.h" +#include "gic_v3_its.h" +#include "processor.h" +#include "ucall.h" +#include "vgic.h" + +#define TEST_MEMSLOT_INDEX 1 + +#define GIC_LPI_OFFSET 8192 + +static size_t nr_iterations = 1000; +static vm_paddr_t gpa_base; + +static struct kvm_vm *vm; +static struct kvm_vcpu **vcpus; +static int gic_fd, its_fd; + +static struct test_data { + bool request_vcpus_stop; + u32 nr_cpus; + u32 nr_devices; + u32 nr_event_ids; + + vm_paddr_t device_table; + vm_paddr_t collection_table; + vm_paddr_t cmdq_base; + void *cmdq_base_va; + vm_paddr_t itt_tables; + + vm_paddr_t lpi_prop_table; + vm_paddr_t lpi_pend_tables; +} test_data = { + .nr_cpus = 1, + .nr_devices = 1, + .nr_event_ids = 16, +}; + +static void guest_irq_handler(struct ex_regs *regs) +{ + u32 intid = gic_get_and_ack_irq(); + + if (intid == IAR_SPURIOUS) + return; + + GUEST_ASSERT(intid >= GIC_LPI_OFFSET); + gic_set_eoi(intid); +} + +static void guest_setup_its_mappings(void) +{ + u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET; + u32 nr_events = test_data.nr_event_ids; + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + + for (coll_id = 0; coll_id < nr_cpus; coll_id++) + its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true); + + /* Round-robin the LPIs to all of the vCPUs in the VM */ + coll_id = 0; + for (device_id = 0; device_id < nr_devices; device_id++) { + vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K); + + its_send_mapd_cmd(test_data.cmdq_base_va, device_id, + itt_base, SZ_64K, true); + + for (event_id = 0; event_id < nr_events; event_id++) { + its_send_mapti_cmd(test_data.cmdq_base_va, device_id, + event_id, coll_id, intid++); + + coll_id = (coll_id + 1) % test_data.nr_cpus; + } + } +} + +static void guest_invalidate_all_rdists(void) +{ + int i; + + for (i = 0; i < test_data.nr_cpus; i++) + its_send_invall_cmd(test_data.cmdq_base_va, i); +} + +static void guest_setup_gic(void) +{ + static atomic_int nr_cpus_ready = 0; + u32 cpuid = guest_get_vcpuid(); + + gic_init(GIC_V3, test_data.nr_cpus); + gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K, + test_data.lpi_pend_tables + (cpuid * SZ_64K)); + + atomic_fetch_add(&nr_cpus_ready, 1); + + if (cpuid > 0) + return; + + while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus) + cpu_relax(); + + its_init(test_data.collection_table, SZ_64K, + test_data.device_table, SZ_64K, + test_data.cmdq_base, SZ_64K); + + guest_setup_its_mappings(); + guest_invalidate_all_rdists(); +} + +static void guest_code(size_t nr_lpis) +{ + guest_setup_gic(); + + GUEST_SYNC(0); + + /* + * Don't use WFI here to avoid blocking the vCPU thread indefinitely and + * never getting the stop signal. + */ + while (!READ_ONCE(test_data.request_vcpus_stop)) + cpu_relax(); + + GUEST_DONE(); +} + +static void setup_memslot(void) +{ + size_t pages; + size_t sz; + + /* + * For the ITS: + * - A single level device table + * - A single level collection table + * - The command queue + * - An ITT for each device + */ + sz = (3 + test_data.nr_devices) * SZ_64K; + + /* + * For the redistributors: + * - A shared LPI configuration table + * - An LPI pending table for each vCPU + */ + sz += (1 + test_data.nr_cpus) * SZ_64K; + + pages = sz / vm->page_size; + gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz; + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base, + TEST_MEMSLOT_INDEX, pages, 0); +} + +#define LPI_PROP_DEFAULT_PRIO 0xa0 + +static void configure_lpis(void) +{ + size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids; + u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table); + size_t i; + + for (i = 0; i < nr_lpis; i++) { + tbl[i] = LPI_PROP_DEFAULT_PRIO | + LPI_PROP_GROUP1 | + LPI_PROP_ENABLED; + } +} + +static void setup_test_data(void) +{ + size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K); + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + vm_paddr_t cmdq_base; + + test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, + TEST_MEMSLOT_INDEX); + virt_map(vm, cmdq_base, cmdq_base, pages_per_64k); + test_data.cmdq_base = cmdq_base; + test_data.cmdq_base_va = (void *)cmdq_base; + + test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices, + gpa_base, TEST_MEMSLOT_INDEX); + + test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, TEST_MEMSLOT_INDEX); + configure_lpis(); + + test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus, + gpa_base, TEST_MEMSLOT_INDEX); + + sync_global_to_guest(vm, test_data); +} + +static void setup_gic(void) +{ + gic_fd = vgic_v3_setup(vm, test_data.nr_cpus, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create GICv3"); + + its_fd = vgic_its_setup(vm); +} + +static void signal_lpi(u32 device_id, u32 event_id) +{ + vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER; + + struct kvm_msi msi = { + .address_lo = db_addr, + .address_hi = db_addr >> 32, + .data = event_id, + .devid = device_id, + .flags = KVM_MSI_VALID_DEVID, + }; + + /* + * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM, + * which for arm64 implies having a valid translation in the ITS. + */ + TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1, + "KVM_SIGNAL_MSI ioctl failed"); +} + +static pthread_barrier_t test_setup_barrier; + +static void *lpi_worker_thread(void *data) +{ + u32 device_id = (size_t)data; + u32 event_id; + size_t i; + + pthread_barrier_wait(&test_setup_barrier); + + for (i = 0; i < nr_iterations; i++) + for (event_id = 0; event_id < test_data.nr_event_ids; event_id++) + signal_lpi(device_id, event_id); + + return NULL; +} + +static void *vcpu_worker_thread(void *data) +{ + struct kvm_vcpu *vcpu = data; + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + pthread_barrier_wait(&test_setup_barrier); + continue; + case UCALL_DONE: + return NULL; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unknown ucall: %lu", uc.cmd); + } + } + + return NULL; +} + +static void report_stats(struct timespec delta) +{ + double nr_lpis; + double time; + + nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations; + + time = delta.tv_sec; + time += ((double)delta.tv_nsec) / NSEC_PER_SEC; + + pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time); +} + +static void run_test(void) +{ + u32 nr_devices = test_data.nr_devices; + u32 nr_vcpus = test_data.nr_cpus; + pthread_t *lpi_threads = malloc(nr_devices * sizeof(pthread_t)); + pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t)); + struct timespec start, delta; + size_t i; + + TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays"); + + pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1); + + for (i = 0; i < nr_vcpus; i++) + pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]); + + for (i = 0; i < nr_devices; i++) + pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i); + + pthread_barrier_wait(&test_setup_barrier); + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (i = 0; i < nr_devices; i++) + pthread_join(lpi_threads[i], NULL); + + delta = timespec_elapsed(start); + write_guest_global(vm, test_data.request_vcpus_stop, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i], NULL); + + report_stats(delta); +} + +static void setup_vm(void) +{ + int i; + + vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); + + vm_init_descriptor_tables(vm); + for (i = 0; i < test_data.nr_cpus; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + setup_memslot(); + + setup_gic(); + + setup_test_data(); +} + +static void destroy_vm(void) +{ + close(its_fd); + close(gic_fd); + kvm_vm_free(vm); + free(vcpus); +} + +static void pr_usage(const char *name) +{ + pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] -h\n", name); + pr_info(" -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus); + pr_info(" -d:\tnumber of devices (default: %u)\n", test_data.nr_devices); + pr_info(" -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids); + pr_info(" -i:\tnumber of iterations (default: %lu)\n", nr_iterations); +} + +int main(int argc, char **argv) +{ + u32 nr_threads; + int c; + + while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) { + switch (c) { + case 'v': + test_data.nr_cpus = atoi(optarg); + break; + case 'd': + test_data.nr_devices = atoi(optarg); + break; + case 'e': + test_data.nr_event_ids = atoi(optarg); + break; + case 'i': + nr_iterations = strtoul(optarg, NULL, 0); + break; + case 'h': + default: + pr_usage(argv[0]); + return 1; + } + } + + nr_threads = test_data.nr_cpus + test_data.nr_devices; + if (nr_threads > get_nprocs()) + pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n", + nr_threads, get_nprocs()); + + setup_vm(); + + run_test(); + + destroy_vm(); + + return 0; +} -- cgit v1.2.3 From a9e59f712582eb5f1d04554c588699c8b47e3870 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 24 Apr 2024 11:58:20 +0200 Subject: tools: testing: selftests: prefer TEST_PROGS for conntrack_dump_flush Currently conntrack_dump_flush test program always runs when passing TEST_PROGS argument: % make -C tools/testing/selftests TARGETS=net/netfilter \ TEST_PROGS=conntrack_ipip_mtu.sh run_tests make: Entering [..] TAP version 13 1..2 [..] selftests: net/netfilter: conntrack_dump_flush [..] Move away from TEST_CUSTOM_PROGS to avoid this. After this, above command will only run the program specified in TEST_PROGS. Link: https://lore.kernel.org/netdev/20240423191609.70c14c42@kernel.org/ Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240424095824.5555-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 68e4780edfdc..72c6001964a6 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -28,10 +28,9 @@ TEST_PROGS += nft_zones_many.sh TEST_PROGS += rpath.sh TEST_PROGS += xt_string.sh -TEST_CUSTOM_PROGS += conntrack_dump_flush +TEST_GEN_PROGS = conntrack_dump_flush TEST_GEN_FILES = audit_logread -TEST_GEN_FILES += conntrack_dump_flush TEST_GEN_FILES += connect_close nf_queue TEST_GEN_FILES += sctp_collision -- cgit v1.2.3 From 8ec3bf5c31d2a59c320ff59397f6ac362341d9d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 24 Apr 2024 15:55:29 -0700 Subject: bpf: Add bpf_guard_preempt() convenience macro Add bpf_guard_preempt() macro that uses newly introduced bpf_preempt_disable/enable() kfuncs to guard a critical section. Signed-off-by: Alexei Starovoitov Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240424225529.16782-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/bpf_experimental.h | 22 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/preempt_lock.c | 7 ++----- 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 93c5a6c446b3..8b9cc87be4c4 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -397,6 +397,28 @@ l_true: \ , [as]"i"((dst_as << 16) | src_as)); #endif +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; + +typedef struct { +} __bpf_preempt_t; + +static inline __bpf_preempt_t __bpf_preempt_constructor(void) +{ + __bpf_preempt_t ret = {}; + + bpf_preempt_disable(); + return ret; +} +static inline void __bpf_preempt_destructor(__bpf_preempt_t *t) +{ + bpf_preempt_enable(); +} +#define bpf_guard_preempt() \ + __bpf_preempt_t ___bpf_apply(preempt, __COUNTER__) \ + __attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) = \ + __bpf_preempt_constructor() + /* Description * Assert that a conditional expression is true. * Returns diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c index 6c637ee01ec4..672fc368d9c4 100644 --- a/tools/testing/selftests/bpf/progs/preempt_lock.c +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -3,9 +3,7 @@ #include #include #include "bpf_misc.h" - -void bpf_preempt_disable(void) __ksym; -void bpf_preempt_enable(void) __ksym; +#include "bpf_experimental.h" SEC("?tc") __failure __msg("1 bpf_preempt_enable is missing") @@ -92,8 +90,7 @@ static __noinline void preempt_balance_subprog(void) SEC("?tc") __success int preempt_balance(struct __sk_buff *ctx) { - bpf_preempt_disable(); - bpf_preempt_enable(); + bpf_guard_preempt(); return 0; } -- cgit v1.2.3 From 638a485c4996be1d38303cf25ea8d12dfd16011b Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 25 Apr 2024 16:06:27 +0200 Subject: selftests/bpf: Add ring_buffer__consume_n test. Add a testcase for the ring_buffer__consume_n() API. The test produces multiple samples in a ring buffer, using a sys_getpid() fentry prog, and consumes them from user-space in batches, rather than consuming all of them greedily, like ring_buffer__consume() does. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/lkml/CAEf4BzaR4zqUpDmj44KNLdpJ=Tpa97GrvzuzVNO5nM6b7oWd1w@mail.gmail.com Link: https://lore.kernel.org/bpf/20240425140627.112728-1-andrea.righi@canonical.com --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 65 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_ringbuf_n.c | 47 ++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_n.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ff1af05add6e..ca8b73f7c774 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -456,7 +456,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_map_key.c + test_ringbuf_n.c test_ringbuf_map_key.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 48c5695b7abf..4c6f42dae409 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -13,6 +13,7 @@ #include #include #include "test_ringbuf.lskel.h" +#include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #define EDONE 7777 @@ -326,6 +327,68 @@ cleanup: test_ringbuf_lskel__destroy(skel); } +/* + * Test ring_buffer__consume_n() by producing N_TOT_SAMPLES samples in the ring + * buffer, via getpid(), and consuming them in chunks of N_SAMPLES. + */ +#define N_TOT_SAMPLES 32 +#define N_SAMPLES 4 + +/* Sample value to verify the callback validity */ +#define SAMPLE_VALUE 42L + +static int process_n_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + ASSERT_EQ(s->value, SAMPLE_VALUE, "sample_value"); + + return 0; +} + +static void ringbuf_n_subtest(void) +{ + struct test_ringbuf_n_lskel *skel_n; + int err, i; + + skel_n = test_ringbuf_n_lskel__open(); + if (!ASSERT_OK_PTR(skel_n, "test_ringbuf_n_lskel__open")) + return; + + skel_n->maps.ringbuf.max_entries = getpagesize(); + skel_n->bss->pid = getpid(); + + err = test_ringbuf_n_lskel__load(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__load")) + goto cleanup; + + ringbuf = ring_buffer__new(skel_n->maps.ringbuf.map_fd, + process_n_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new")) + goto cleanup; + + err = test_ringbuf_n_lskel__attach(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__attach")) + goto cleanup_ringbuf; + + /* Produce N_TOT_SAMPLES samples in the ring buffer by calling getpid() */ + skel_n->bss->value = SAMPLE_VALUE; + for (i = 0; i < N_TOT_SAMPLES; i++) + syscall(__NR_getpgid); + + /* Consume all samples from the ring buffer in batches of N_SAMPLES */ + for (i = 0; i < N_TOT_SAMPLES; i += err) { + err = ring_buffer__consume_n(ringbuf, N_SAMPLES); + if (!ASSERT_EQ(err, N_SAMPLES, "rb_consume")) + goto cleanup_ringbuf; + } + +cleanup_ringbuf: + ring_buffer__free(ringbuf); +cleanup: + test_ringbuf_n_lskel__destroy(skel_n); +} + static int process_map_key_sample(void *ctx, void *data, size_t len) { struct sample *s; @@ -384,6 +447,8 @@ void test_ringbuf(void) { if (test__start_subtest("ringbuf")) ringbuf_subtest(); + if (test__start_subtest("ringbuf_n")) + ringbuf_n_subtest(); if (test__start_subtest("ringbuf_map_key")) ringbuf_map_key_subtest(); } diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_n.c b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c new file mode 100644 index 000000000000..8669eb42dbe0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Andrea Righi + +#include +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +#define TASK_COMM_LEN 16 + +struct sample { + int pid; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +int pid = 0; +long value = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_n(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) + return 0; + + sample->pid = pid; + sample->value = value; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + + bpf_ringbuf_submit(sample, 0); + + return 0; +} -- cgit v1.2.3 From dbc8fc9d6de17b618591d4d2a6227dee27fe0f51 Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Sat, 27 Jan 2024 12:57:00 -0600 Subject: powerpc/papr_scm: Move duplicate definitions to common header files papr_scm and ndtest share common PDSM payload structs like nd_papr_pdsm_health. Presently these structs are duplicated across papr_pdsm.h and ndtest.h header files. Since 'ndtest' is essentially arch independent and can run on platforms other than PPC64, a way needs to be deviced to avoid redundancy and duplication of PDSM structs in future. So the patch proposes moving the PDSM header from arch/powerpc/include- -/uapi/ to the generic include/uapi/linux directory. Also, there are some #defines common between papr_scm and ndtest which are not exported to the user space. So, move them to a header file which can be shared across ndtest and papr_scm via newly introduced include/linux/papr_scm.h. Signed-off-by: Shivaprasad G Bhat Signed-off-by: Vaibhav Jain Suggested-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/170638176942.112443.2937254675538057083.stgit@ltcd48-lp2.aus.stglab.ibm.com Signed-off-by: Ira Weiny --- MAINTAINERS | 2 + arch/powerpc/include/uapi/asm/papr_pdsm.h | 165 ------------------------------ arch/powerpc/platforms/pseries/papr_scm.c | 43 +------- include/linux/papr_scm.h | 49 +++++++++ include/uapi/linux/papr_pdsm.h | 165 ++++++++++++++++++++++++++++++ tools/testing/nvdimm/test/ndtest.c | 2 + tools/testing/nvdimm/test/ndtest.h | 31 ------ 7 files changed, 220 insertions(+), 237 deletions(-) delete mode 100644 arch/powerpc/include/uapi/asm/papr_pdsm.h create mode 100644 include/linux/papr_scm.h create mode 100644 include/uapi/linux/papr_pdsm.h (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index ebf03f5f0619..0960c519f949 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12497,6 +12497,8 @@ F: drivers/rtc/rtc-opal.c F: drivers/scsi/ibmvscsi/ F: drivers/tty/hvc/hvc_opal.c F: drivers/watchdog/wdrtas.c +F: include/linux/papr_scm.h +F: include/uapi/linux/papr_pdsm.h F: tools/testing/selftests/powerpc N: /pmac N: powermac diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h b/arch/powerpc/include/uapi/asm/papr_pdsm.h deleted file mode 100644 index 17439925045c..000000000000 --- a/arch/powerpc/include/uapi/asm/papr_pdsm.h +++ /dev/null @@ -1,165 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * PAPR nvDimm Specific Methods (PDSM) and structs for libndctl - * - * (C) Copyright IBM 2020 - * - * Author: Vaibhav Jain - */ - -#ifndef _UAPI_ASM_POWERPC_PAPR_PDSM_H_ -#define _UAPI_ASM_POWERPC_PAPR_PDSM_H_ - -#include -#include - -/* - * PDSM Envelope: - * - * The ioctl ND_CMD_CALL exchange data between user-space and kernel via - * envelope which consists of 2 headers sections and payload sections as - * illustrated below: - * +-----------------+---------------+---------------------------+ - * | 64-Bytes | 8-Bytes | Max 184-Bytes | - * +-----------------+---------------+---------------------------+ - * | ND-HEADER | PDSM-HEADER | PDSM-PAYLOAD | - * +-----------------+---------------+---------------------------+ - * | nd_family | | | - * | nd_size_out | cmd_status | | - * | nd_size_in | reserved | nd_pdsm_payload | - * | nd_command | payload --> | | - * | nd_fw_size | | | - * | nd_payload ---> | | | - * +---------------+-----------------+---------------------------+ - * - * ND Header: - * This is the generic libnvdimm header described as 'struct nd_cmd_pkg' - * which is interpreted by libnvdimm before passed on to papr_scm. Important - * member fields used are: - * 'nd_family' : (In) NVDIMM_FAMILY_PAPR_SCM - * 'nd_size_in' : (In) PDSM-HEADER + PDSM-IN-PAYLOAD (usually 0) - * 'nd_size_out' : (In) PDSM-HEADER + PDSM-RETURN-PAYLOAD - * 'nd_command' : (In) One of PAPR_PDSM_XXX - * 'nd_fw_size' : (Out) PDSM-HEADER + size of actual payload returned - * - * PDSM Header: - * This is papr-scm specific header that precedes the payload. This is defined - * as nd_cmd_pdsm_pkg. Following fields aare available in this header: - * - * 'cmd_status' : (Out) Errors if any encountered while servicing PDSM. - * 'reserved' : Not used, reserved for future and should be set to 0. - * 'payload' : A union of all the possible payload structs - * - * PDSM Payload: - * - * The layout of the PDSM Payload is defined by various structs shared between - * papr_scm and libndctl so that contents of payload can be interpreted. As such - * its defined as a union of all possible payload structs as - * 'union nd_pdsm_payload'. Based on the value of 'nd_cmd_pkg.nd_command' - * appropriate member of the union is accessed. - */ - -/* Max payload size that we can handle */ -#define ND_PDSM_PAYLOAD_MAX_SIZE 184 - -/* Max payload size that we can handle */ -#define ND_PDSM_HDR_SIZE \ - (sizeof(struct nd_pkg_pdsm) - ND_PDSM_PAYLOAD_MAX_SIZE) - -/* Various nvdimm health indicators */ -#define PAPR_PDSM_DIMM_HEALTHY 0 -#define PAPR_PDSM_DIMM_UNHEALTHY 1 -#define PAPR_PDSM_DIMM_CRITICAL 2 -#define PAPR_PDSM_DIMM_FATAL 3 - -/* struct nd_papr_pdsm_health.extension_flags field flags */ - -/* Indicate that the 'dimm_fuel_gauge' field is valid */ -#define PDSM_DIMM_HEALTH_RUN_GAUGE_VALID 1 - -/* Indicate that the 'dimm_dsc' field is valid */ -#define PDSM_DIMM_DSC_VALID 2 - -/* - * Struct exchanged between kernel & ndctl in for PAPR_PDSM_HEALTH - * Various flags indicate the health status of the dimm. - * - * extension_flags : Any extension fields present in the struct. - * dimm_unarmed : Dimm not armed. So contents wont persist. - * dimm_bad_shutdown : Previous shutdown did not persist contents. - * dimm_bad_restore : Contents from previous shutdown werent restored. - * dimm_scrubbed : Contents of the dimm have been scrubbed. - * dimm_locked : Contents of the dimm cant be modified until CEC reboot - * dimm_encrypted : Contents of dimm are encrypted. - * dimm_health : Dimm health indicator. One of PAPR_PDSM_DIMM_XXXX - * dimm_fuel_gauge : Life remaining of DIMM as a percentage from 0-100 - */ -struct nd_papr_pdsm_health { - union { - struct { - __u32 extension_flags; - __u8 dimm_unarmed; - __u8 dimm_bad_shutdown; - __u8 dimm_bad_restore; - __u8 dimm_scrubbed; - __u8 dimm_locked; - __u8 dimm_encrypted; - __u16 dimm_health; - - /* Extension flag PDSM_DIMM_HEALTH_RUN_GAUGE_VALID */ - __u16 dimm_fuel_gauge; - - /* Extension flag PDSM_DIMM_DSC_VALID */ - __u64 dimm_dsc; - }; - __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; - }; -}; - -/* Flags for injecting specific smart errors */ -#define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0) -#define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1) - -struct nd_papr_pdsm_smart_inject { - union { - struct { - /* One or more of PDSM_SMART_INJECT_ */ - __u32 flags; - __u8 fatal_enable; - __u8 unsafe_shutdown_enable; - }; - __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; - }; -}; - -/* - * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel - * via 'nd_cmd_pkg.nd_command' member of the ioctl struct - */ -enum papr_pdsm { - PAPR_PDSM_MIN = 0x0, - PAPR_PDSM_HEALTH, - PAPR_PDSM_SMART_INJECT, - PAPR_PDSM_MAX, -}; - -/* Maximal union that can hold all possible payload types */ -union nd_pdsm_payload { - struct nd_papr_pdsm_health health; - struct nd_papr_pdsm_smart_inject smart_inject; - __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; -} __packed; - -/* - * PDSM-header + payload expected with ND_CMD_CALL ioctl from libnvdimm - * Valid member of union 'payload' is identified via 'nd_cmd_pkg.nd_command' - * that should always precede this struct when sent to papr_scm via CMD_CALL - * interface. - */ -struct nd_pkg_pdsm { - __s32 cmd_status; /* Out: Sub-cmd status returned back */ - __u16 reserved[2]; /* Ignored and to be set as '0' */ - union nd_pdsm_payload payload; -} __packed; - -#endif /* _UAPI_ASM_POWERPC_PAPR_PDSM_H_ */ diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index c233f9db039b..9b6420eb3567 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -16,7 +16,8 @@ #include #include -#include +#include +#include #include #include #include @@ -29,46 +30,6 @@ (1ul << ND_CMD_SET_CONFIG_DATA) | \ (1ul << ND_CMD_CALL)) -/* DIMM health bitmap indicators */ -/* SCM device is unable to persist memory contents */ -#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) -/* SCM device failed to persist memory contents */ -#define PAPR_PMEM_SHUTDOWN_DIRTY (1ULL << (63 - 1)) -/* SCM device contents are persisted from previous IPL */ -#define PAPR_PMEM_SHUTDOWN_CLEAN (1ULL << (63 - 2)) -/* SCM device contents are not persisted from previous IPL */ -#define PAPR_PMEM_EMPTY (1ULL << (63 - 3)) -/* SCM device memory life remaining is critically low */ -#define PAPR_PMEM_HEALTH_CRITICAL (1ULL << (63 - 4)) -/* SCM device will be garded off next IPL due to failure */ -#define PAPR_PMEM_HEALTH_FATAL (1ULL << (63 - 5)) -/* SCM contents cannot persist due to current platform health status */ -#define PAPR_PMEM_HEALTH_UNHEALTHY (1ULL << (63 - 6)) -/* SCM device is unable to persist memory contents in certain conditions */ -#define PAPR_PMEM_HEALTH_NON_CRITICAL (1ULL << (63 - 7)) -/* SCM device is encrypted */ -#define PAPR_PMEM_ENCRYPTED (1ULL << (63 - 8)) -/* SCM device has been scrubbed and locked */ -#define PAPR_PMEM_SCRUBBED_AND_LOCKED (1ULL << (63 - 9)) - -/* Bits status indicators for health bitmap indicating unarmed dimm */ -#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -/* Bits status indicators for health bitmap indicating unflushed dimm */ -#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY) - -/* Bits status indicators for health bitmap indicating unrestored dimm */ -#define PAPR_PMEM_BAD_RESTORE_MASK (PAPR_PMEM_EMPTY) - -/* Bit status indicators for smart event notification */ -#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \ - PAPR_PMEM_HEALTH_FATAL | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS) -#define PAPR_SCM_PERF_STATS_VERSION 0x1 - /* Struct holding a single performance metric */ struct papr_scm_perf_stat { u8 stat_id[8]; diff --git a/include/linux/papr_scm.h b/include/linux/papr_scm.h new file mode 100644 index 000000000000..eb36453813db --- /dev/null +++ b/include/linux/papr_scm.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __LINUX_PAPR_SCM_H +#define __LINUX_PAPR_SCM_H + +/* DIMM health bitmap indicators */ +/* SCM device is unable to persist memory contents */ +#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) +/* SCM device failed to persist memory contents */ +#define PAPR_PMEM_SHUTDOWN_DIRTY (1ULL << (63 - 1)) +/* SCM device contents are persisted from previous IPL */ +#define PAPR_PMEM_SHUTDOWN_CLEAN (1ULL << (63 - 2)) +/* SCM device contents are not persisted from previous IPL */ +#define PAPR_PMEM_EMPTY (1ULL << (63 - 3)) +/* SCM device memory life remaining is critically low */ +#define PAPR_PMEM_HEALTH_CRITICAL (1ULL << (63 - 4)) +/* SCM device will be garded off next IPL due to failure */ +#define PAPR_PMEM_HEALTH_FATAL (1ULL << (63 - 5)) +/* SCM contents cannot persist due to current platform health status */ +#define PAPR_PMEM_HEALTH_UNHEALTHY (1ULL << (63 - 6)) +/* SCM device is unable to persist memory contents in certain conditions */ +#define PAPR_PMEM_HEALTH_NON_CRITICAL (1ULL << (63 - 7)) +/* SCM device is encrypted */ +#define PAPR_PMEM_ENCRYPTED (1ULL << (63 - 8)) +/* SCM device has been scrubbed and locked */ +#define PAPR_PMEM_SCRUBBED_AND_LOCKED (1ULL << (63 - 9)) + +#define PAPR_PMEM_SAVE_FAILED (1ULL << (63 - 10)) + +/* Bits status indicators for health bitmap indicating unarmed dimm */ +#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED | \ + PAPR_PMEM_HEALTH_UNHEALTHY) + +/* Bits status indicators for health bitmap indicating unflushed dimm */ +#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY) + +/* Bits status indicators for health bitmap indicating unrestored dimm */ +#define PAPR_PMEM_BAD_RESTORE_MASK (PAPR_PMEM_EMPTY) + +/* Bit status indicators for smart event notification */ +#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \ + PAPR_PMEM_HEALTH_FATAL | \ + PAPR_PMEM_HEALTH_UNHEALTHY) + +#define PAPR_PMEM_SAVE_MASK (PAPR_PMEM_SAVE_FAILED) + +#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS) +#define PAPR_SCM_PERF_STATS_VERSION 0x1 + +#endif /* __LINUX_PAPR_SCM_H */ diff --git a/include/uapi/linux/papr_pdsm.h b/include/uapi/linux/papr_pdsm.h new file mode 100644 index 000000000000..17439925045c --- /dev/null +++ b/include/uapi/linux/papr_pdsm.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * PAPR nvDimm Specific Methods (PDSM) and structs for libndctl + * + * (C) Copyright IBM 2020 + * + * Author: Vaibhav Jain + */ + +#ifndef _UAPI_ASM_POWERPC_PAPR_PDSM_H_ +#define _UAPI_ASM_POWERPC_PAPR_PDSM_H_ + +#include +#include + +/* + * PDSM Envelope: + * + * The ioctl ND_CMD_CALL exchange data between user-space and kernel via + * envelope which consists of 2 headers sections and payload sections as + * illustrated below: + * +-----------------+---------------+---------------------------+ + * | 64-Bytes | 8-Bytes | Max 184-Bytes | + * +-----------------+---------------+---------------------------+ + * | ND-HEADER | PDSM-HEADER | PDSM-PAYLOAD | + * +-----------------+---------------+---------------------------+ + * | nd_family | | | + * | nd_size_out | cmd_status | | + * | nd_size_in | reserved | nd_pdsm_payload | + * | nd_command | payload --> | | + * | nd_fw_size | | | + * | nd_payload ---> | | | + * +---------------+-----------------+---------------------------+ + * + * ND Header: + * This is the generic libnvdimm header described as 'struct nd_cmd_pkg' + * which is interpreted by libnvdimm before passed on to papr_scm. Important + * member fields used are: + * 'nd_family' : (In) NVDIMM_FAMILY_PAPR_SCM + * 'nd_size_in' : (In) PDSM-HEADER + PDSM-IN-PAYLOAD (usually 0) + * 'nd_size_out' : (In) PDSM-HEADER + PDSM-RETURN-PAYLOAD + * 'nd_command' : (In) One of PAPR_PDSM_XXX + * 'nd_fw_size' : (Out) PDSM-HEADER + size of actual payload returned + * + * PDSM Header: + * This is papr-scm specific header that precedes the payload. This is defined + * as nd_cmd_pdsm_pkg. Following fields aare available in this header: + * + * 'cmd_status' : (Out) Errors if any encountered while servicing PDSM. + * 'reserved' : Not used, reserved for future and should be set to 0. + * 'payload' : A union of all the possible payload structs + * + * PDSM Payload: + * + * The layout of the PDSM Payload is defined by various structs shared between + * papr_scm and libndctl so that contents of payload can be interpreted. As such + * its defined as a union of all possible payload structs as + * 'union nd_pdsm_payload'. Based on the value of 'nd_cmd_pkg.nd_command' + * appropriate member of the union is accessed. + */ + +/* Max payload size that we can handle */ +#define ND_PDSM_PAYLOAD_MAX_SIZE 184 + +/* Max payload size that we can handle */ +#define ND_PDSM_HDR_SIZE \ + (sizeof(struct nd_pkg_pdsm) - ND_PDSM_PAYLOAD_MAX_SIZE) + +/* Various nvdimm health indicators */ +#define PAPR_PDSM_DIMM_HEALTHY 0 +#define PAPR_PDSM_DIMM_UNHEALTHY 1 +#define PAPR_PDSM_DIMM_CRITICAL 2 +#define PAPR_PDSM_DIMM_FATAL 3 + +/* struct nd_papr_pdsm_health.extension_flags field flags */ + +/* Indicate that the 'dimm_fuel_gauge' field is valid */ +#define PDSM_DIMM_HEALTH_RUN_GAUGE_VALID 1 + +/* Indicate that the 'dimm_dsc' field is valid */ +#define PDSM_DIMM_DSC_VALID 2 + +/* + * Struct exchanged between kernel & ndctl in for PAPR_PDSM_HEALTH + * Various flags indicate the health status of the dimm. + * + * extension_flags : Any extension fields present in the struct. + * dimm_unarmed : Dimm not armed. So contents wont persist. + * dimm_bad_shutdown : Previous shutdown did not persist contents. + * dimm_bad_restore : Contents from previous shutdown werent restored. + * dimm_scrubbed : Contents of the dimm have been scrubbed. + * dimm_locked : Contents of the dimm cant be modified until CEC reboot + * dimm_encrypted : Contents of dimm are encrypted. + * dimm_health : Dimm health indicator. One of PAPR_PDSM_DIMM_XXXX + * dimm_fuel_gauge : Life remaining of DIMM as a percentage from 0-100 + */ +struct nd_papr_pdsm_health { + union { + struct { + __u32 extension_flags; + __u8 dimm_unarmed; + __u8 dimm_bad_shutdown; + __u8 dimm_bad_restore; + __u8 dimm_scrubbed; + __u8 dimm_locked; + __u8 dimm_encrypted; + __u16 dimm_health; + + /* Extension flag PDSM_DIMM_HEALTH_RUN_GAUGE_VALID */ + __u16 dimm_fuel_gauge; + + /* Extension flag PDSM_DIMM_DSC_VALID */ + __u64 dimm_dsc; + }; + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; + }; +}; + +/* Flags for injecting specific smart errors */ +#define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0) +#define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1) + +struct nd_papr_pdsm_smart_inject { + union { + struct { + /* One or more of PDSM_SMART_INJECT_ */ + __u32 flags; + __u8 fatal_enable; + __u8 unsafe_shutdown_enable; + }; + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; + }; +}; + +/* + * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel + * via 'nd_cmd_pkg.nd_command' member of the ioctl struct + */ +enum papr_pdsm { + PAPR_PDSM_MIN = 0x0, + PAPR_PDSM_HEALTH, + PAPR_PDSM_SMART_INJECT, + PAPR_PDSM_MAX, +}; + +/* Maximal union that can hold all possible payload types */ +union nd_pdsm_payload { + struct nd_papr_pdsm_health health; + struct nd_papr_pdsm_smart_inject smart_inject; + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; +} __packed; + +/* + * PDSM-header + payload expected with ND_CMD_CALL ioctl from libnvdimm + * Valid member of union 'payload' is identified via 'nd_cmd_pkg.nd_command' + * that should always precede this struct when sent to papr_scm via CMD_CALL + * interface. + */ +struct nd_pkg_pdsm { + __s32 cmd_status; /* Out: Sub-cmd status returned back */ + __u16 reserved[2]; /* Ignored and to be set as '0' */ + union nd_pdsm_payload payload; +} __packed; + +#endif /* _UAPI_ASM_POWERPC_PAPR_PDSM_H_ */ diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c index b8419f460368..fa9d1327fc71 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "../watermark.h" #include "nfit_test.h" diff --git a/tools/testing/nvdimm/test/ndtest.h b/tools/testing/nvdimm/test/ndtest.h index 2c54c9cbb90c..8f27ad6f7319 100644 --- a/tools/testing/nvdimm/test/ndtest.h +++ b/tools/testing/nvdimm/test/ndtest.h @@ -5,37 +5,6 @@ #include #include -/* SCM device is unable to persist memory contents */ -#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) -/* SCM device failed to persist memory contents */ -#define PAPR_PMEM_SHUTDOWN_DIRTY (1ULL << (63 - 1)) -/* SCM device contents are not persisted from previous IPL */ -#define PAPR_PMEM_EMPTY (1ULL << (63 - 3)) -#define PAPR_PMEM_HEALTH_CRITICAL (1ULL << (63 - 4)) -/* SCM device will be garded off next IPL due to failure */ -#define PAPR_PMEM_HEALTH_FATAL (1ULL << (63 - 5)) -/* SCM contents cannot persist due to current platform health status */ -#define PAPR_PMEM_HEALTH_UNHEALTHY (1ULL << (63 - 6)) - -/* Bits status indicators for health bitmap indicating unarmed dimm */ -#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_PMEM_SAVE_FAILED (1ULL << (63 - 10)) - -/* Bits status indicators for health bitmap indicating unflushed dimm */ -#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY) - -/* Bits status indicators for health bitmap indicating unrestored dimm */ -#define PAPR_PMEM_BAD_RESTORE_MASK (PAPR_PMEM_EMPTY) - -/* Bit status indicators for smart event notification */ -#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \ - PAPR_PMEM_HEALTH_FATAL | \ - PAPR_PMEM_HEALTH_UNHEALTHY) - -#define PAPR_PMEM_SAVE_MASK (PAPR_PMEM_SAVE_FAILED) - struct ndtest_config; struct ndtest_priv { -- cgit v1.2.3 From 57456adef68db15ab81578d42848bae5b542999a Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 8 Mar 2024 09:51:22 +0100 Subject: ndtest: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Reviewed-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/c04bfc941a9f5d249b049572c1ae122fe551ee5d.1709886922.git.u.kleine-koenig@pengutronix.de Signed-off-by: Ira Weiny --- tools/testing/nvdimm/test/ndtest.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c index fa9d1327fc71..b438f3d053ee 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -832,12 +832,11 @@ static int ndtest_bus_register(struct ndtest_priv *p) return 0; } -static int ndtest_remove(struct platform_device *pdev) +static void ndtest_remove(struct platform_device *pdev) { struct ndtest_priv *p = to_ndtest_priv(&pdev->dev); nvdimm_bus_unregister(p->bus); - return 0; } static int ndtest_probe(struct platform_device *pdev) @@ -884,7 +883,7 @@ static const struct platform_device_id ndtest_id[] = { static struct platform_driver ndtest_driver = { .probe = ndtest_probe, - .remove = ndtest_remove, + .remove_new = ndtest_remove, .driver = { .name = KBUILD_MODNAME, }, -- cgit v1.2.3 From 3b3b84aacb4420226576c9732e7b539ca7b79633 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:18 -0700 Subject: selftests/bpf: adjust dummy_st_ops_success to detect additional error As reported by Jose E. Marchesi in off-list discussion, GCC and LLVM generate slightly different code for dummy_st_ops_success/test_1(): SEC("struct_ops/test_1") int BPF_PROG(test_1, struct bpf_dummy_ops_state *state) { int ret; if (!state) return 0xf2f3f4f5; ret = state->val; state->val = 0x5a; return ret; } GCC-generated LLVM-generated ---------------------------- --------------------------- 0: r1 = *(u64 *)(r1 + 0x0) 0: w0 = -0xd0c0b0b 1: if r1 == 0x0 goto 5f 1: r1 = *(u64 *)(r1 + 0x0) 2: r0 = *(s32 *)(r1 + 0x0) 2: if r1 == 0x0 goto 6f 3: *(u32 *)(r1 + 0x0) = 0x5a 3: r0 = *(u32 *)(r1 + 0x0) 4: exit 4: w2 = 0x5a 5: r0 = -0xd0c0b0b 5: *(u32 *)(r1 + 0x0) = r2 6: exit 6: exit If the 'state' argument is not marked as nullable in net/bpf/bpf_dummy_struct_ops.c, the verifier would assume that 'r1 == 0x0' is never true: - for the GCC version, this means that instructions #5-6 would be marked as dead and removed; - for the LLVM version, all instructions would be marked as live. The test dummy_st_ops/dummy_init_ret_value actually sets the 'state' parameter to NULL. Therefore, when the 'state' argument is not marked as nullable, the GCC-generated version of the code would trigger a NULL pointer dereference at instruction #3. This patch updates the test_1() test case to always follow a shape similar to the GCC-generated version above, in order to verify whether the 'state' nullability is marked correctly. Reported-by: Jose E. Marchesi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/dummy_st_ops_success.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c index 1efa746c25dc..cc7b69b001aa 100644 --- a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c +++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c @@ -11,8 +11,17 @@ int BPF_PROG(test_1, struct bpf_dummy_ops_state *state) { int ret; - if (!state) - return 0xf2f3f4f5; + /* Check that 'state' nullable status is detected correctly. + * If 'state' argument would be assumed non-null by verifier + * the code below would be deleted as dead (which it shouldn't). + * Hide it from the compiler behind 'asm' block to avoid + * unnecessary optimizations. + */ + asm volatile ( + "if %[state] != 0 goto +2;" + "r0 = 0xf2f3f4f5;" + "exit;" + ::[state]"p"(state)); ret = state->val; state->val = 0x5a; -- cgit v1.2.3 From f612210d456a0b969a0adca91e68dbea0e0ea301 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:19 -0700 Subject: selftests/bpf: do not pass NULL for non-nullable params in dummy_st_ops dummy_st_ops.test_2 and dummy_st_ops.test_sleepable do not have their 'state' parameter marked as nullable. Update dummy_st_ops.c to avoid passing NULL for such parameters, as the next patch would allow kernel to enforce this restriction. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c | 7 +++++-- tools/testing/selftests/bpf/progs/dummy_st_ops_success.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c index f43fcb13d2c4..dd926c00f414 100644 --- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c @@ -98,7 +98,8 @@ done: static void test_dummy_multiple_args(void) { - __u64 args[5] = {0, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; + struct bpf_dummy_ops_state st = { 7 }; + __u64 args[5] = {(__u64)&st, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), @@ -115,6 +116,7 @@ static void test_dummy_multiple_args(void) fd = bpf_program__fd(skel->progs.test_2); err = bpf_prog_test_run_opts(fd, &attr); ASSERT_OK(err, "test_run"); + args[0] = 7; for (i = 0; i < ARRAY_SIZE(args); i++) { snprintf(name, sizeof(name), "arg %zu", i); ASSERT_EQ(skel->bss->test_2_args[i], args[i], name); @@ -125,7 +127,8 @@ static void test_dummy_multiple_args(void) static void test_dummy_sleepable(void) { - __u64 args[1] = {0}; + struct bpf_dummy_ops_state st; + __u64 args[1] = {(__u64)&st}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c index cc7b69b001aa..ec0c595d47af 100644 --- a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c +++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c @@ -34,7 +34,7 @@ SEC("struct_ops/test_2") int BPF_PROG(test_2, struct bpf_dummy_ops_state *state, int a1, unsigned short a2, char a3, unsigned long a4) { - test_2_args[0] = (unsigned long)state; + test_2_args[0] = state->val; test_2_args[1] = a1; test_2_args[2] = a2; test_2_args[3] = a3; -- cgit v1.2.3 From 6a2d30d3c5bf9f088dcfd5f3746b04d84f2fab83 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:21 -0700 Subject: selftests/bpf: dummy_st_ops should reject 0 for non-nullable params Check if BPF_PROG_TEST_RUN for bpf_dummy_struct_ops programs rejects execution if NULL is passed for non-nullable parameter. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/dummy_st_ops.c | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c index dd926c00f414..d3d94596ab79 100644 --- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c @@ -147,6 +147,31 @@ static void test_dummy_sleepable(void) dummy_st_ops_success__destroy(skel); } +/* dummy_st_ops.test_sleepable() parameter is not marked as nullable, + * thus bpf_prog_test_run_opts() below should be rejected as it tries + * to pass NULL for this parameter. + */ +static void test_dummy_sleepable_reject_null(void) +{ + __u64 args[1] = {0}; + LIBBPF_OPTS(bpf_test_run_opts, attr, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + struct dummy_st_ops_success *skel; + int fd, err; + + skel = dummy_st_ops_success__open_and_load(); + if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load")) + return; + + fd = bpf_program__fd(skel->progs.test_sleepable); + err = bpf_prog_test_run_opts(fd, &attr); + ASSERT_EQ(err, -EINVAL, "test_run"); + + dummy_st_ops_success__destroy(skel); +} + void test_dummy_st_ops(void) { if (test__start_subtest("dummy_st_ops_attach")) @@ -159,6 +184,8 @@ void test_dummy_st_ops(void) test_dummy_multiple_args(); if (test__start_subtest("dummy_sleepable")) test_dummy_sleepable(); + if (test__start_subtest("dummy_sleepable_reject_null")) + test_dummy_sleepable_reject_null(); RUN_TESTS(dummy_st_ops_fail); } -- cgit v1.2.3 From 48e2cd3e3dcfe04f212df4fb189fa04c2a87b980 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 26 Apr 2024 00:17:23 +0800 Subject: bpf: add mrtt and srtt as BPF_SOCK_OPS_RTT_CB args Two important arguments in RTT estimation, mrtt and srtt, are passed to tcp_bpf_rtt(), so that bpf programs get more information about RTT computation in BPF_SOCK_OPS_RTT_CB. The difference between bpf_sock_ops->srtt_us and the srtt here is: the former is an old rtt before update, while srtt passed by tcp_bpf_rtt() is that after update. Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240425161724.73707-2-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- include/net/tcp.h | 4 ++-- include/uapi/linux/bpf.h | 2 ++ net/ipv4/tcp_input.c | 4 ++-- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ae35199d3b3..0f75d03287c2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2706,10 +2706,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e4ae83550fb3..d94a72593ead 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6947,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d874817a78d..d1115d7c3936 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -911,7 +911,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ @@ -921,7 +921,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e4ae83550fb3..d94a72593ead 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6947,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle -- cgit v1.2.3 From 7eb4f66b38069eec9c86c9d115f0bba1cf73ef2c Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 26 Apr 2024 00:17:24 +0800 Subject: selftests/bpf: extend BPF_SOCK_OPS_RTT_CB test for srtt and mrtt_us Because srtt and mrtt_us are added as args in bpf_sock_ops at BPF_SOCK_OPS_RTT_CB, a simple check is added to make sure they are both non-zero. $ ./test_progs -t tcp_rtt #373 tcp_rtt:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Suggested-by: Stanislav Fomichev Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240425161724.73707-3-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 14 ++++++++++++++ tools/testing/selftests/bpf/progs/tcp_rtt.c | 6 ++++++ 2 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index 8fe84da1b9b4..f2b99d95d916 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; static void send_byte(int fd) @@ -83,6 +86,17 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, err++; } + /* Precise values of mrtt and srtt are unavailable, just make sure they are nonzero */ + if (val.mrtt_us == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[0] (mrtt_us) %u == 0", msg, val.mrtt_us); + err++; + } + + if (val.srtt == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[1] (srtt) %u == 0", msg, val.srtt); + err++; + } + return err; } diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c index 0988d79f1587..42c729f85524 100644 --- a/tools/testing/selftests/bpf/progs/tcp_rtt.c +++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; struct { @@ -55,5 +58,8 @@ int _sockops(struct bpf_sock_ops *ctx) storage->delivered_ce = tcp_sk->delivered_ce; storage->icsk_retransmits = tcp_sk->icsk_retransmits; + storage->mrtt_us = ctx->args[0]; + storage->srtt = ctx->args[1]; + return 1; } -- cgit v1.2.3 From d85465f2773da69e2838505ca3575aa3b22dba69 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Mar 2024 09:36:29 +0000 Subject: KVM: selftests: Remove second semicolon There is a statement with two semicolons. Remove the second one, it is redundant. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240315093629.2431491-1-colin.i.king@gmail.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index d2ea0435f4f7..7d707d8068a4 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -125,7 +125,7 @@ struct compat_vcpu_runstate_info { uint32_t state; uint64_t state_entry_time; uint64_t time[5]; -} __attribute__((__packed__));; +} __attribute__((__packed__)); struct arch_vcpu_info { unsigned long cr2; -- cgit v1.2.3 From 201142d160104c9ce4afc861d4cfbd7a432c5aa8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 6 Feb 2024 16:19:50 +0100 Subject: KVM: selftests: Compare wall time from xen shinfo against KVM_GET_CLOCK xen_shinfo_test is observed to be flaky failing sporadically with "VM time too old". With min_ts/max_ts debug print added: Wall clock (v 3269818) 1704906491.986255664 Time info 1: v 1282712 tsc 33530585736 time 14014430025 mul 3587552223 shift 4294967295 flags 1 Time info 2: v 1282712 tsc 33530585736 time 14014430025 mul 3587552223 shift 4294967295 flags 1 min_ts: 1704906491.986312153 max_ts: 1704906506.001006963 ==== Test Assertion Failure ==== x86_64/xen_shinfo_test.c:1003: cmp_timespec(&min_ts, &vm_ts) <= 0 pid=32724 tid=32724 errno=4 - Interrupted system call 1 0x00000000004030ad: main at xen_shinfo_test.c:1003 2 0x00007fca6b23feaf: ?? ??:0 3 0x00007fca6b23ff5f: ?? ??:0 4 0x0000000000405e04: _start at ??:? VM time too old The test compares wall clock data from shinfo (which is the output of kvm_get_wall_clock_epoch()) against clock_gettime(CLOCK_REALTIME) in the host system before the VM is created. In the example above, it compares shinfo: 1704906491.986255664 vs min_ts: 1704906491.986312153 and fails as the later is greater than the former. While this sounds like a sane test, it doesn't pass reality check: kvm_get_wall_clock_epoch() calculates guest's epoch (realtime when the guest was created) by subtracting kvmclock from the current realtime and the calculation happens when shinfo is setup. The problem is that kvmclock is a raw clock and realtime clock is affected by NTP. This means that if realtime ticks with a slightly reduced frequency, "guest's epoch" calculated by kvm_get_wall_clock_epoch() will actually tick backwards! This is not a big issue from guest's perspective as the guest can't really observe this but this epoch can't be compared with a fixed clock_gettime() on the host. Replace the check with comparing wall clock data from shinfo to KVM_GET_CLOCK. The later gives both realtime and kvmclock so guest's epoch can be calculated by subtraction. Note, CLOCK_REALTIME is susceptible to leap seconds jumps but there's no better alternative in KVM at this moment. Leave a comment and accept 1s delta. Reported-by: Jan Richter Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240206151950.31174-1-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 50 ++++++++++++---------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 7d707d8068a4..1e44494fa4fb 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -380,20 +380,6 @@ wait_for_timer: GUEST_SYNC(TEST_DONE); } -static int cmp_timespec(struct timespec *a, struct timespec *b) -{ - if (a->tv_sec > b->tv_sec) - return 1; - else if (a->tv_sec < b->tv_sec) - return -1; - else if (a->tv_nsec > b->tv_nsec) - return 1; - else if (a->tv_nsec < b->tv_nsec) - return -1; - else - return 0; -} - static struct shared_info *shinfo; static struct vcpu_info *vinfo; static struct kvm_vcpu *vcpu; @@ -449,7 +435,6 @@ static void *juggle_shinfo_state(void *arg) int main(int argc, char *argv[]) { - struct timespec min_ts, max_ts, vm_ts; struct kvm_xen_hvm_attr evt_reset; struct kvm_vm *vm; pthread_t thread; @@ -468,8 +453,6 @@ int main(int argc, char *argv[]) bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA); - clock_gettime(CLOCK_REALTIME, &min_ts); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); /* Map a region for the shared_info page */ @@ -1010,7 +993,6 @@ int main(int argc, char *argv[]) vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); alarm(0); - clock_gettime(CLOCK_REALTIME, &max_ts); /* * Just a *really* basic check that things are being put in the @@ -1019,6 +1001,8 @@ int main(int argc, char *argv[]) */ struct pvclock_wall_clock *wc; struct pvclock_vcpu_time_info *ti, *ti2; + struct kvm_clock_data kcdata; + long long delta; wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); @@ -1034,12 +1018,34 @@ int main(int argc, char *argv[]) ti2->tsc_shift, ti2->flags); } - vm_ts.tv_sec = wc->sec; - vm_ts.tv_nsec = wc->nsec; TEST_ASSERT(wc->version && !(wc->version & 1), "Bad wallclock version %x", wc->version); - TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); - TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); + + vm_ioctl(vm, KVM_GET_CLOCK, &kcdata); + + if (kcdata.flags & KVM_CLOCK_REALTIME) { + if (verbose) { + printf("KVM_GET_CLOCK clock: %lld.%09lld\n", + kcdata.clock / NSEC_PER_SEC, kcdata.clock % NSEC_PER_SEC); + printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", + kcdata.realtime / NSEC_PER_SEC, kcdata.realtime % NSEC_PER_SEC); + } + + delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock); + + /* + * KVM_GET_CLOCK gives CLOCK_REALTIME which jumps on leap seconds updates but + * unfortunately KVM doesn't currently offer a CLOCK_TAI alternative. Accept 1s + * delta as testing clock accuracy is not the goal here. The test just needs to + * check that the value in shinfo is somewhat sane. + */ + TEST_ASSERT(llabs(delta) < NSEC_PER_SEC, + "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld", + wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC, + (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC); + } else { + pr_info("Missing KVM_CLOCK_REALTIME, skipping shinfo epoch sanity check\n"); + } TEST_ASSERT(ti->version && !(ti->version & 1), "Bad time_info version %x", ti->version); -- cgit v1.2.3 From 72cd4de01d8b106182741132d5e0b61c3f55d636 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Thu, 11 Apr 2024 14:02:37 -0700 Subject: KVM: selftests: Make monitor_mwait require MONITOR/MWAIT feature If this feature is not supported or is disabled by IA32_MISC_ENABLE on the host, executing MONITOR or MWAIT instruction from the guest doesn't cause monitor/mwait VM exits, but a #UD. So, we need to skip this test if CPUID.01H:ECX[3] is cleared. Signed-off-by: Zide Chen Link: https://lore.kernel.org/r/20240411210237.34646-1-zide.chen@intel.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 853802641e1e..cdbfcf7cac5c 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -75,6 +75,7 @@ int main(int argc, char *argv[]) struct ucall uc; int testcase; + TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); -- cgit v1.2.3 From 13e860961fd4505d7247ba69e8516f577c2dee5a Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 14 Mar 2024 17:52:50 +0530 Subject: selftests/mm: virtual_address_range: Switch to ksft_exit_fail_msg mmap() must not succeed in validate_lower_address_hint(), for if it does, it is a bug in mmap() itself. Reflect this behaviour with ksft_exit_fail_msg(). While at it, do some formatting changes. Link: https://lkml.kernel.org/r/20240314122250.68534-1-dev.jain@arm.com Signed-off-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Cc: Anshuman Khandual Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 7bcf8d48256a..426ddfc345fb 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -85,7 +85,7 @@ static int validate_lower_address_hint(void) char *ptr; ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) return 0; @@ -105,13 +105,11 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_LOW; i++) { ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } + if (validate_lower_address_hint()) + ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); break; } @@ -127,7 +125,7 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_HIGH; i++) { hint = hind_addr(); hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (hptr[i] == MAP_FAILED) break; -- cgit v1.2.3 From 0104096498858346e3ac0668add1bf682461d4aa Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 21 Mar 2024 16:05:22 +0530 Subject: selftests/mm: confirm VA exhaustion without reliance on correctness of mmap() Currently, VA exhaustion is being checked by passing a hint to mmap() and expecting it to fail. While populating the lower VA space, mmap() fails because we have exhausted the space. Then, in validate_lower_address_hint(), because mmap() fails, we confirm that we have indeed exhausted the space. There is a circular logic involved here. Assume that there is a bug in mmap(), also assume that it exists independent of whether you pass a hint address or not; that for some reason it is not able to find a 1GB chunk. My idea is to assert the exhaustion against some other method. This patch makes a stricter test by successful write() calls from /proc/self/maps to a dump file, confirming that a free chunk is indeed not available. [dev.jain@arm.com: replace SZ_1GB with MAP_CHUNK_SIZE, tidy-up] Link: https://lkml.kernel.org/r/20240325042653.867055-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20240321103522.516097-1-dev.jain@arm.com Signed-off-by: Dev Jain Cc: Anshuman Khandual Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 426ddfc345fb..4e4c1e311247 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -12,6 +12,8 @@ #include #include #include +#include + #include "../kselftest.h" /* @@ -93,6 +95,66 @@ static int validate_lower_address_hint(void) return 1; } +static int validate_complete_va_space(void) +{ + unsigned long start_addr, end_addr, prev_end_addr; + char line[400]; + char prot[6]; + FILE *file; + int fd; + + fd = open("va_dump", O_CREAT | O_WRONLY, 0600); + unlink("va_dump"); + if (fd < 0) { + ksft_test_result_skip("cannot create or open dump file\n"); + ksft_finished(); + } + + file = fopen("/proc/self/maps", "r"); + if (file == NULL) + ksft_exit_fail_msg("cannot open /proc/self/maps\n"); + + prev_end_addr = 0; + while (fgets(line, sizeof(line), file)) { + unsigned long hop; + + if (sscanf(line, "%lx-%lx %s[rwxp-]", + &start_addr, &end_addr, prot) != 3) + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + + /* end of userspace mappings; ignore vsyscall mapping */ + if (start_addr & (1UL << 63)) + return 0; + + /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ + if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) + return 1; + + prev_end_addr = end_addr; + + if (prot[0] != 'r') + continue; + + /* + * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. + * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 + * addresses after that. If the address was not held by this + * process, write would fail with errno set to EFAULT. + * Anyways, if write returns anything apart from 1, exit the + * program since that would mean a bug in /proc/self/maps. + */ + hop = 0; + while (start_addr + hop < end_addr) { + if (write(fd, (void *)(start_addr + hop), 1) != 1) + return 1; + lseek(fd, 0, SEEK_SET); + + hop += MAP_CHUNK_SIZE; + } + } + return 0; +} + int main(int argc, char *argv[]) { char *ptr[NR_CHUNKS_LOW]; @@ -133,6 +195,10 @@ int main(int argc, char *argv[]) validate_addr(hptr[i], 1); } hchunks = i; + if (validate_complete_va_space()) { + ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); + ksft_finished(); + } for (i = 0; i < lchunks; i++) munmap(ptr[i], MAP_CHUNK_SIZE); -- cgit v1.2.3 From 2e47a445d7b3904d0dd6a1401357e3d5ac1a6440 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 21 Mar 2024 17:50:47 -0400 Subject: selftests/mm: run_vmtests.sh: fix hugetlb mem size calculation The script calculates a mininum required size of hugetlb memories, but it'll stop working with <1MB huge page sizes, reporting all zeros even if huge pages are available. In reality, the calculation doesn't really need to be as complicated either. Make it simpler and work for KB-level hugepages too. [peterx@redhat.com: run_vmtests.sh: fix hugetlb mem size calculation] Link: https://lkml.kernel.org/r/20240403200324.1603493-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20240321215047.678172-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Reviewed-by: Muhammad Usama Anjum Tested-by: Ryan Roberts Cc: Nico Pache Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 4bdb3a0c7a60..3157204b9047 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -152,9 +152,13 @@ done < /proc/meminfo # both of these requirements into account and attempt to increase # number of huge pages available. nr_cpus=$(nproc) -hpgsize_MB=$((hpgsize_KB / 1024)) -half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) -needmem_KB=$((half_ufd_size_MB * 2 * 1024)) +uffd_min_KB=$((hpgsize_KB * nr_cpus * 2)) +hugetlb_min_KB=$((256 * 1024)) +if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then + needmem_KB=$uffd_min_KB +else + needmem_KB=$hugetlb_min_KB +fi # set proper nr_hugepages if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then @@ -294,7 +298,8 @@ CATEGORY="userfaultfd" run_test ./uffd-unit-tests uffd_stress_bin=./uffd-stress CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 # Hugetlb tests require source and destination huge pages. Pass in half -# the size ($half_ufd_size_MB), which is used for *each*. +# the size of the free pages we have, which is used for *each*. +half_ufd_size_MB=$((freepgs / 2)) CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 -- cgit v1.2.3 From 02d7d31ae47030919f421ce43d71abca150365f6 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 22 Mar 2024 17:35:51 +0530 Subject: selftests/mm: parse VMA range in one go Use sscanf() to directly parse the VMA range. No functional change is intended. Link: https://lkml.kernel.org/r/20240322120551.818764-1-dev.jain@arm.com Signed-off-by: Dev Jain Cc: Anshuman Khandual Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mlock2-tests.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 26f744188ad0..7f0d50fa361d 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -20,8 +20,6 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) FILE *file; int ret = 1; char line[1024] = {0}; - char *end_addr; - char *stop; unsigned long start; unsigned long end; @@ -37,21 +35,10 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) memset(area, 0, sizeof(struct vm_boundaries)); while(fgets(line, 1024, file)) { - end_addr = strchr(line, '-'); - if (!end_addr) { + if (sscanf(line, "%lx-%lx", &start, &end) != 2) { ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } - *end_addr = '\0'; - end_addr++; - stop = strchr(end_addr, ' '); - if (!stop) { - ksft_print_msg("cannot parse /proc/self/maps\n"); - goto out; - } - - sscanf(line, "%lx", &start); - sscanf(end_addr, "%lx", &end); if (start <= addr && end > addr) { area->start = start; -- cgit v1.2.3 From c139ca42f5740f0c94a001eee82dafada72c19ee Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 26 Mar 2024 15:32:09 +0100 Subject: selftests/memfd_secret: add vmsplice() test Let's add a simple reproducer for a scenario where GUP-fast could succeed on secretmem folios, making vmsplice() succeed instead of failing. The reproducer is based on a reproducer [1] by Miklos Szeredi. We want to perform two tests: vmsplice() when a fresh page was just faulted in, and vmsplice() on an existing page after munmap() that would drain certain LRU caches/batches in the kernel. In an ideal world, we could use fallocate(FALLOC_FL_PUNCH_HOLE) / MADV_REMOVE to remove any existing page. As that is currently not possible, run the test before any other tests that would allocate memory in the secretmem fd. Perform the ftruncate() only once, and check the return value. [1] https://lkml.kernel.org/r/CAJfpegt3UCsMmxd0taOY11Uaw5U=eS1fE5dn0wZX3HF0oy8-oQ@mail.gmail.com Link: https://lkml.kernel.org/r/20240326143210.291116-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Cc: Lorenzo Stoakes Cc: Miklos Szeredi Cc: xingwei lee Cc: yue sun Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/memfd_secret.c | 51 +++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 9b298f6a04b3..9a0597310a76 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -83,6 +84,45 @@ static void test_mlock_limit(int fd) pass("mlock limit is respected\n"); } +static void test_vmsplice(int fd, const char *desc) +{ + ssize_t transferred; + struct iovec iov; + int pipefd[2]; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + goto close_pipe; + } + + /* + * vmsplice() may use GUP-fast, which must also fail. Prefault the + * page table, so GUP-fast could find it. + */ + memset(mem, PATTERN, page_size); + + iov.iov_base = mem; + iov.iov_len = page_size; + transferred = vmsplice(pipefd[1], &iov, 1, 0); + + if (transferred < 0 && errno == EFAULT) + pass("vmsplice is blocked as expected with %s\n", desc); + else + fail("vmsplice: unexpected memory access with %s\n", desc); + + munmap(mem, page_size); +close_pipe: + close(pipefd[0]); + close(pipefd[1]); +} + static void try_process_vm_read(int fd, int pipefd[2]) { struct iovec liov, riov; @@ -187,7 +227,6 @@ static void test_remote_access(int fd, const char *name, return; } - ftruncate(fd, page_size); memset(mem, PATTERN, page_size); if (write(pipefd[1], &mem, sizeof(mem)) < 0) { @@ -258,7 +297,7 @@ static void prepare(void) strerror(errno)); } -#define NUM_TESTS 4 +#define NUM_TESTS 6 int main(int argc, char *argv[]) { @@ -277,9 +316,17 @@ int main(int argc, char *argv[]) ksft_exit_fail_msg("memfd_secret failed: %s\n", strerror(errno)); } + if (ftruncate(fd, page_size)) + ksft_exit_fail_msg("ftruncate failed: %s\n", strerror(errno)); test_mlock_limit(fd); test_file_apis(fd); + /* + * We have to run the first vmsplice test before any secretmem page was + * allocated for this fd. + */ + test_vmsplice(fd, "fresh page"); + test_vmsplice(fd, "existing page"); test_process_vm_read(fd); test_ptrace(fd); -- cgit v1.2.3 From dee3d0bef2b00772be430425832ead6aa9d707f9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 Mar 2024 17:10:32 +0000 Subject: proc: rewrite stable_page_flags() Reduce the usage of PageFlag tests and reduce the number of compound_head() calls. For multi-page folios, we'll now show all pages as having the flags that apply to them, e.g. if it's dirty, all pages will have the dirty flag set instead of just the head page. The mapped flag is still per page, as is the hwpoison flag. [willy@infradead.org: fix up some bits vs masks] Link: https://lkml.kernel.org/r/20240403173112.1450721-1-willy@infradead.org [willy@infradead.org: fix warnings] Link: https://lkml.kernel.org/r/ZhBPtCYfSuFuUMEz@casper.infradead.org Link: https://lkml.kernel.org/r/20240326171045.410737-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Svetly Todorov Signed-off-by: Andrew Morton --- fs/proc/page.c | 69 ++++++++++++++++++++++-------------------- include/linux/huge_mm.h | 4 +-- include/linux/page-flags.h | 2 +- tools/cgroup/memcg_slabinfo.py | 5 ++- 4 files changed, 42 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/fs/proc/page.c b/fs/proc/page.c index 9223856c934b..05120263af2a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -107,10 +107,13 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) return ((kflags >> kbit) & 1) << ubit; } -u64 stable_page_flags(struct page *page) +u64 stable_page_flags(const struct page *page) { - u64 k; - u64 u; + const struct folio *folio; + unsigned long k; + unsigned long mapping; + bool is_anon; + u64 u = 0; /* * pseudo flag: KPF_NOPAGE @@ -118,49 +121,47 @@ u64 stable_page_flags(struct page *page) */ if (!page) return 1 << KPF_NOPAGE; + folio = page_folio(page); - k = page->flags; - u = 0; + k = folio->flags; + mapping = (unsigned long)folio->mapping; + is_anon = mapping & PAGE_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages */ if (page_mapped(page)) u |= 1 << KPF_MMAP; - if (PageAnon(page)) + if (is_anon) { u |= 1 << KPF_ANON; - if (PageKsm(page)) - u |= 1 << KPF_KSM; + if (mapping & PAGE_MAPPING_KSM) + u |= 1 << KPF_KSM; + } /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ - if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; - if (PageTail(page)) + if (page == &folio->page) + u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head); + else u |= 1 << KPF_COMPOUND_TAIL; - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) u |= 1 << KPF_HUGE; /* - * PageTransCompound can be true for non-huge compound pages (slab - * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * We need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - if (PageLRU(head) || PageAnon(head)) + else if (folio_test_large(folio)) { + if ((k & (1 << PG_lru)) || is_anon) u |= 1 << KPF_THP; - else if (is_huge_zero_page(head)) { + else if (is_huge_zero_page(&folio->page)) { u |= 1 << KPF_ZERO_PAGE; u |= 1 << KPF_THP; } } else if (is_zero_pfn(page_to_pfn(page))) u |= 1 << KPF_ZERO_PAGE; - /* * Caveats on high order pages: PG_buddy and PG_slab will only be set * on the head page. @@ -174,16 +175,17 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; + if (folio_test_slab(folio)) + u |= 1 << KPF_SLAB; - if (page_is_idle(page)) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) + u |= kpf_copy_bit(k, KPF_IDLE, PG_idle); +#else + if (folio_test_idle(folio)) u |= 1 << KPF_IDLE; +#endif u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - - u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - if (PageTail(page) && PageSlab(page)) - u |= 1 << KPF_SLAB; - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); @@ -194,7 +196,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); - if (PageSwapCache(page)) +#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache)) + if ((k & SWAPCACHE) == SWAPCACHE) u |= 1 << KPF_SWAPCACHE; u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); @@ -202,7 +205,10 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); #ifdef CONFIG_MEMORY_FAILURE - u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + if (u & (1 << KPF_HUGE)) + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); + else + u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED @@ -228,7 +234,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; - struct page *ppage; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; @@ -245,9 +250,9 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. */ - ppage = pfn_to_online_page(pfn); + struct page *page = pfn_to_online_page(pfn); - if (put_user(stable_page_flags(ppage), out)) { + if (put_user(stable_page_flags(page), out)) { ret = -EFAULT; break; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7576025db55d..1540a1481daf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -351,7 +351,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; -static inline bool is_huge_zero_page(struct page *page) +static inline bool is_huge_zero_page(const struct page *page) { return READ_ONCE(huge_zero_page) == page; } @@ -480,7 +480,7 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) return 0; } -static inline bool is_huge_zero_page(struct page *page) +static inline bool is_huge_zero_page(const struct page *page) { return false; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index eaecf544039f..888353c209c0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -734,7 +734,7 @@ static __always_inline bool PageKsm(const struct page *page) TESTPAGEFLAG_FALSE(Ksm, ksm) #endif -u64 stable_page_flags(struct page *page); +u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index 1d3a90d93fe2..270c28a0d098 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -146,12 +146,11 @@ def detect_kernel_config(): def for_each_slab(prog): - PGSlab = 1 << prog.constant('PG_slab') - PGHead = 1 << prog.constant('PG_head') + PGSlab = ~prog.constant('PG_slab') for page in for_each_page(prog): try: - if page.flags.value_() & PGSlab: + if page.page_type.value_() == PGSlab: yield cast('struct slab *', page) except FaultError: pass -- cgit v1.2.3 From a9bc15cb1cbd62b498b55958e92a90d0ea52a4b8 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 25 Mar 2024 19:16:56 -0700 Subject: selftests/x86: add placement guard gap test for shstk The existing shadow stack test for guard gaps just checks that new mappings are not placed in an existing mapping's guard gap. Add one that checks that new mappings are not placed such that preexisting mappings are in the new mappings guard gap. Link: https://lkml.kernel.org/r/20240326021656.202649-15-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov (AMD) Cc: Christophe Leroy Cc: Dan Williams Cc: Dave Hansen Cc: Deepak Gupta Cc: Guo Ren Cc: Helge Deller Cc: H. Peter Anvin (Intel) Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Liam R. Howlett Cc: Mark Brown Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- tools/testing/selftests/x86/test_shadow_stack.c | 67 +++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/x86/test_shadow_stack.c b/tools/testing/selftests/x86/test_shadow_stack.c index 757e6527f67e..ee909a7927f9 100644 --- a/tools/testing/selftests/x86/test_shadow_stack.c +++ b/tools/testing/selftests/x86/test_shadow_stack.c @@ -556,7 +556,7 @@ struct node { * looked at the shadow stack gaps. * 5. See if it landed in the gap. */ -int test_guard_gap(void) +int test_guard_gap_other_gaps(void) { void *free_area, *shstk, *test_map = (void *)0xFFFFFFFFFFFFFFFF; struct node *head = NULL, *cur; @@ -593,11 +593,64 @@ int test_guard_gap(void) if (shstk - test_map - PAGE_SIZE != PAGE_SIZE) return 1; - printf("[OK]\tGuard gap test\n"); + printf("[OK]\tGuard gap test, other mapping's gaps\n"); return 0; } +/* Tests respecting the guard gap of the mapping getting placed */ +int test_guard_gap_new_mappings_gaps(void) +{ + void *free_area, *shstk_start, *test_map = (void *)0xFFFFFFFFFFFFFFFF; + struct node *head = NULL, *cur; + int ret = 0; + + free_area = mmap(0, PAGE_SIZE * 4, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + munmap(free_area, PAGE_SIZE * 4); + + /* Test letting map_shadow_stack find a free space */ + shstk_start = mmap(free_area, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (shstk_start == MAP_FAILED || shstk_start != free_area) + return 1; + + while (test_map > shstk_start) { + test_map = (void *)syscall(__NR_map_shadow_stack, 0, PAGE_SIZE, 0); + if (test_map == MAP_FAILED) { + printf("[INFO]\tmap_shadow_stack MAP_FAILED\n"); + ret = 1; + break; + } + + cur = malloc(sizeof(*cur)); + cur->mapping = test_map; + + cur->next = head; + head = cur; + + if (test_map == free_area + PAGE_SIZE) { + printf("[INFO]\tNew mapping has other mapping in guard gap!\n"); + ret = 1; + break; + } + } + + while (head) { + cur = head; + head = cur->next; + munmap(cur->mapping, PAGE_SIZE); + free(cur); + } + + munmap(shstk_start, PAGE_SIZE); + + if (!ret) + printf("[OK]\tGuard gap test, placement mapping's gaps\n"); + + return ret; +} + /* * Too complicated to pull it out of the 32 bit header, but also get the * 64 bit one needed above. Just define a copy here. @@ -850,9 +903,15 @@ int main(int argc, char *argv[]) goto out; } - if (test_guard_gap()) { + if (test_guard_gap_other_gaps()) { + ret = 1; + printf("[FAIL]\tGuard gap test, other mappings' gaps\n"); + goto out; + } + + if (test_guard_gap_new_mappings_gaps()) { ret = 1; - printf("[FAIL]\tGuard gap test\n"); + printf("[FAIL]\tGuard gap test, placement mapping's gaps\n"); goto out; } -- cgit v1.2.3 From 7abaacb8e5f55c6adaabc1b6bcbc1915e159fd1d Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 28 Mar 2024 19:10:09 +0800 Subject: selftest/mm: ksm_functional_tests: refactor mmap_and_merge_range() In order to extend test_prctl_fork() and test_prctl_fork_exec() to make sure that deduplication really happens, mmap_and_merge_range() needs to be refactored. Firstly, mmap_and_merge_range() will be called with no need to call enable KSM by madvise or prctl. So, switch the 'bool use_prctl' parameter to enum ksm_merge_mode. Secondly, mmap_and_merge_range() will be called in child process in the two testcases, it isn't appropriate to call ksft_test_result_{fail, skip}, because the global variables ksft_{fail, skip} aren't consistent with the parent process. Thus, convert calls of ksft_test_result_{fail, skip} to ksft_print_msg(), return differrent error according to the two cases, and rename mmap_and_merge_range() to __mmap_and_merge_range(). For existing callers, introduce new mmap_and_merge_range() to handle different return values of __mmap_and_merge_range(). Link: https://lkml.kernel.org/r/20240328111010.1502191-3-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Kefeng Wang Cc: Nanyong Sun Cc: Rik van Riel Cc: Stefan Roesch Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 86 ++++++++++++++++------- 1 file changed, 61 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d615767e396b..5462873df84c 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -28,6 +28,15 @@ #define MiB (1024 * KiB) #define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child" +#define MAP_MERGE_FAIL ((void *)-1) +#define MAP_MERGE_SKIP ((void *)-2) + +enum ksm_merge_mode { + KSM_MERGE_PRCTL, + KSM_MERGE_MADVISE, + KSM_MERGE_NONE, /* PRCTL already set */ +}; + static int mem_fd; static int ksm_fd; static int ksm_full_scans_fd; @@ -146,33 +155,34 @@ static int ksm_unmerge(void) return 0; } -static char *mmap_and_merge_range(char val, unsigned long size, int prot, - bool use_prctl) +static char *__mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) { char *map; + char *err_map = MAP_MERGE_FAIL; int ret; /* Stabilize accounting by disabling KSM completely. */ if (ksm_unmerge()) { - ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); - return MAP_FAILED; + ksft_print_msg("Disabling (unmerging) KSM failed\n"); + return err_map; } if (get_my_merging_pages() > 0) { - ksft_test_result_fail("Still pages merged\n"); - return MAP_FAILED; + ksft_print_msg("Still pages merged\n"); + return err_map; } map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (map == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return MAP_FAILED; + ksft_print_msg("mmap() failed\n"); + return err_map; } /* Don't use THP. Ignore if THP are not around on a kernel. */ if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + ksft_print_msg("MADV_NOHUGEPAGE failed\n"); goto unmap; } @@ -180,27 +190,36 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, memset(map, val, size); if (mprotect(map, size, prot)) { - ksft_test_result_skip("mprotect() failed\n"); + ksft_print_msg("mprotect() failed\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } - if (use_prctl) { + switch (mode) { + case KSM_MERGE_PRCTL: ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { - ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE not supported\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } else if (ret) { - ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE=1 failed\n"); goto unmap; } - } else if (madvise(map, size, MADV_MERGEABLE)) { - ksft_test_result_fail("MADV_MERGEABLE failed\n"); - goto unmap; + break; + case KSM_MERGE_MADVISE: + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_print_msg("MADV_MERGEABLE failed\n"); + goto unmap; + } + break; + case KSM_MERGE_NONE: + break; } /* Run KSM to trigger merging and wait. */ if (ksm_merge()) { - ksft_test_result_fail("Running KSM failed\n"); + ksft_print_msg("Running KSM failed\n"); goto unmap; } @@ -209,14 +228,31 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, * accounted differently (depending on kernel support). */ if (val && !get_my_merging_pages()) { - ksft_test_result_fail("No pages got merged\n"); + ksft_print_msg("No pages got merged\n"); goto unmap; } return map; unmap: munmap(map, size); - return MAP_FAILED; + return err_map; +} + +static char *mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) +{ + char *map; + char *ret = MAP_FAILED; + + map = __mmap_and_merge_range(val, size, prot, mode); + if (map == MAP_MERGE_FAIL) + ksft_test_result_fail("Merging memory failed"); + else if (map == MAP_MERGE_SKIP) + ksft_test_result_skip("Merging memory skipped"); + else + ret = map; + + return ret; } static void test_unmerge(void) @@ -226,7 +262,7 @@ static void test_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -264,7 +300,7 @@ static void test_unmerge_zero_pages(void) } /* Let KSM deduplicate zero pages. */ - map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -312,7 +348,7 @@ static void test_unmerge_discarded(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -344,7 +380,7 @@ static void test_unmerge_uffd_wp(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -545,7 +581,7 @@ static void test_prctl_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, true); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_PRCTL); if (map == MAP_FAILED) return; @@ -568,7 +604,7 @@ static void test_prot_none(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0x11, size, PROT_NONE, false); + map = mmap_and_merge_range(0x11, size, PROT_NONE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) goto unmap; -- cgit v1.2.3 From 6c47de3be3a021d8b28d127802d590a49598a514 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 28 Mar 2024 19:10:10 +0800 Subject: selftest/mm: ksm_functional_tests: extend test case for ksm fork/exec This extends test_prctl_fork() and test_prctl_fork_exec() to make sure that deduplication really happens, instead of only testing the MMF_VM_MERGE_ANY flag is set. [colin.i.king@gmail.com: fix spelling mistake in ksft_test_result_skip message] Link: https://lkml.kernel.org/r/20240402081537.1365939-1-colin.i.king@gmail.com Link: https://lkml.kernel.org/r/20240328111010.1502191-4-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Signed-off-by: Colin Ian King Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Kefeng Wang Cc: Nanyong Sun Cc: Rik van Riel Cc: Stefan Roesch Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 49 ++++++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 5462873df84c..db845dca8d19 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -475,6 +475,36 @@ static void test_prctl(void) ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n"); } +static int test_child_ksm(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + /* Test if KSM is enabled for the process. */ + if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1) + return -1; + + /* Test if merge could really happen. */ + map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE); + if (map == MAP_MERGE_FAIL) + return -2; + else if (map == MAP_MERGE_SKIP) + return -3; + + munmap(map, size); + return 0; +} + +static void test_child_ksm_err(int status) +{ + if (status == -1) + ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + else if (status == -2) + ksft_test_result_fail("Merge in child failed\n"); + else if (status == -3) + ksft_test_result_skip("Merge in child skipped\n"); +} + /* Verify that prctl ksm flag is inherited. */ static void test_prctl_fork(void) { @@ -494,7 +524,7 @@ static void test_prctl_fork(void) child_pid = fork(); if (!child_pid) { - exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0)); + exit(test_child_ksm()); } else if (child_pid < 0) { ksft_test_result_fail("fork() failed\n"); return; @@ -503,8 +533,11 @@ static void test_prctl_fork(void) if (waitpid(child_pid, &status, 0) < 0) { ksft_test_result_fail("waitpid() failed\n"); return; - } else if (WEXITSTATUS(status) != 1) { - ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + } + + status = WEXITSTATUS(status); + if (status) { + test_child_ksm_err(status); return; } @@ -516,12 +549,6 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } -static int ksm_fork_exec_child(void) -{ - /* Test if KSM is enabled for the process. */ - return prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) == 1; -} - static void test_prctl_fork_exec(void) { int ret, status; @@ -554,7 +581,7 @@ static void test_prctl_fork_exec(void) if (WIFEXITED(status)) { status = WEXITSTATUS(status); if (status) { - ksft_test_result_fail("KSM not enabled\n"); + test_child_ksm_err(status); return; } } else { @@ -635,7 +662,7 @@ int main(int argc, char **argv) int err; if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { - exit(ksm_fork_exec_child() == 1 ? 0 : 1); + exit(test_child_ksm()); } #ifdef __NR_userfaultfd -- cgit v1.2.3 From c7876a0cc6a0a74b407b10f9afe5d77920d803ba Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Sat, 30 Mar 2024 23:05:55 +0530 Subject: selftests/mm: mremap_test: optimize using pre-filled random array and memcpy Patch series "selftests/mm: mremap_test: Optimizations and style fixes". The mremap_test, in a worst case controlled by the -t flag, does a for loop iteration in orders of GB. Without compromising on the stdout report, the aim is to reduce this time. A pre-filled random buffer is allocated based on the seed, replacing repetitive rand() calls. The byte pattern in the memory locations is set through memcpy() from the random buffer. Replacing the loop for printing the mismatch index to stdout, employ an efficient algorithm by breaking the comparison into chunks, use the highly optimized memcmp() library function, and when a mismatch does occur, only then do a brute force iteration. Also, use sscanf() to parse /proc/self/maps for consistency across files. Execution time results (x86 system): ./mremap_test Original: 3 seconds After change: 0.8 seconds ./mremap_test -t100 Original: 17 seconds After change: 2 seconds ./mremap_test -t0 (worst case): Original: 9:40 minutes After change: 45 seconds This patch (of 3): Allocate a pre-filled random buffer using the seed. Replace iterative copying of the random sequence to buffers using the highly optimized library function memcpy(). Link: https://lkml.kernel.org/r/20240330173557.2697684-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20240330173557.2697684-2-dev.jain@arm.com Signed-off-by: Dev Jain Cc: Anshuman Khandual Cc: John Hubbard Cc: Kalesh Singh Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 78 ++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 2f8b991f78cb..7fed9cc3911e 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -23,6 +23,7 @@ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) #define SIZE_KB(k) ((size_t)k * 1024) @@ -296,7 +297,7 @@ out: * * |DDDDddddSSSSssss| */ -static void mremap_move_within_range(char pattern_seed) +static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr) { char *test_name = "mremap mremap move within range"; void *src, *dest; @@ -316,10 +317,7 @@ static void mremap_move_within_range(char pattern_seed) src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1)); /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); dest = src - SIZE_MB(2); @@ -357,7 +355,7 @@ out: /* Returns the time taken for the remap on success else returns -1. */ static long long remap_region(struct config c, unsigned int threshold_mb, - char pattern_seed) + unsigned int pattern_seed, char *rand_addr) { void *addr, *src_addr, *dest_addr, *dest_preamble_addr; int d; @@ -378,9 +376,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) - memset((char *) src_addr + t, (char) rand(), 1); + memcpy(src_addr, rand_addr, threshold); /* Mask to zero out lower bits of address for alignment */ align_mask = ~(c.dest_alignment - 1); @@ -420,9 +416,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for the dest preamble block. */ - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) - memset((char *) dest_preamble_addr + d, (char) rand(), 1); + memcpy(dest_preamble_addr, rand_addr, c.dest_preamble_size); } clock_gettime(CLOCK_MONOTONIC, &t_start); @@ -494,7 +488,8 @@ out: * the beginning of the mapping just because the aligned * down address landed on a mapping that maybe does not exist. */ -static void mremap_move_1mb_from_start(char pattern_seed) +static void mremap_move_1mb_from_start(unsigned int pattern_seed, + char *rand_addr) { char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; void *src = NULL, *dest = NULL; @@ -520,10 +515,7 @@ static void mremap_move_1mb_from_start(char pattern_seed) } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); /* * Unmap the beginning of dest so that the aligned address @@ -568,10 +560,10 @@ out: static void run_mremap_test_case(struct test test_case, int *failures, unsigned int threshold_mb, - unsigned int pattern_seed) + unsigned int pattern_seed, char *rand_addr) { long long remap_time = remap_region(test_case.config, threshold_mb, - pattern_seed); + pattern_seed, rand_addr); if (remap_time < 0) { if (test_case.expect_failure) @@ -642,7 +634,15 @@ int main(int argc, char **argv) int failures = 0; int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + + /* hard-coded test configs */ + size_t max_test_variable_region_size = _2GB; + size_t max_test_constant_region_size = _2MB; + size_t dest_preamble_size = 10 * _4MB; + unsigned int pattern_seed; + char *rand_addr; + size_t rand_size; int num_expand_tests = 2; int num_misc_tests = 2; struct test test_cases[MAX_TEST] = {}; @@ -659,6 +659,31 @@ int main(int argc, char **argv) ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", threshold_mb, pattern_seed); + /* + * set preallocated random array according to test configs; see the + * functions for the logic of setting the size + */ + if (!threshold_mb) + rand_size = MAX(max_test_variable_region_size, + max_test_constant_region_size); + else + rand_size = MAX(MIN(threshold_mb * _1MB, + max_test_variable_region_size), + max_test_constant_region_size); + rand_size = MAX(dest_preamble_size, rand_size); + + rand_addr = (char *)mmap(NULL, rand_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (rand_addr == MAP_FAILED) { + perror("mmap"); + ksft_exit_fail_msg("cannot mmap rand_addr\n"); + } + + /* fill stream of random bytes */ + srand(pattern_seed); + for (unsigned long i = 0; i < rand_size; ++i) + rand_addr[i] = (char) rand(); + page_size = sysconf(_SC_PAGESIZE); /* Expected mremap failures */ @@ -730,13 +755,13 @@ int main(int argc, char **argv) for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed); + pattern_seed, rand_addr); maps_fp = fopen("/proc/self/maps", "r"); if (maps_fp == NULL) { - ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); - exit(KSFT_FAIL); + munmap(rand_addr, rand_size); + ksft_exit_fail_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); } mremap_expand_merge(maps_fp, page_size); @@ -744,17 +769,20 @@ int main(int argc, char **argv) fclose(maps_fp); - mremap_move_within_range(pattern_seed); - mremap_move_1mb_from_start(pattern_seed); + mremap_move_within_range(pattern_seed, rand_addr); + mremap_move_1mb_from_start(pattern_seed, rand_addr); if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed); + threshold_mb, pattern_seed, + rand_addr); } + munmap(rand_addr, rand_size); + if (failures > 0) ksft_exit_fail(); else -- cgit v1.2.3 From 7033c6cc96203eec738ff192829a18a4e12308dc Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Sat, 30 Mar 2024 23:05:56 +0530 Subject: selftests/mm: mremap_test: optimize execution time from minutes to seconds using chunkwise memcmp Mismatch index is currently being checked by a brute force iteration over the buffer. Instead, break the comparison into O(sqrt(n)) number of chunks, with the chunk size of this order only, where n is the size of the buffer. Do a brute-force iteration to print to stdout only when the highly optimized memcmp() library function returns a mismatch in the chunk. The time complexity of this algorithm is O(sqrt(n)) * t, where t is the time taken by memcmp(); for our test conditions, it is safe to assume t to be small. Link: https://lkml.kernel.org/r/20240330173557.2697684-3-dev.jain@arm.com Signed-off-by: Dev Jain Cc: Anshuman Khandual Cc: John Hubbard Cc: Kalesh Singh Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 112 +++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 7fed9cc3911e..678c79d5b8ef 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -70,6 +70,27 @@ enum { .expect_failure = should_fail \ } +/* compute square root using binary search */ +static unsigned long get_sqrt(unsigned long val) +{ + unsigned long low = 1; + + /* assuming rand_size is less than 1TB */ + unsigned long high = (1UL << 20); + + while (low <= high) { + unsigned long mid = low + (high - low) / 2; + unsigned long temp = mid * mid; + + if (temp == val) + return mid; + if (temp < val) + low = mid + 1; + high = mid - 1; + } + return low; +} + /* * Returns false if the requested remap region overlaps with an * existing mapping (e.g text, stack) else returns true. @@ -355,14 +376,14 @@ out: /* Returns the time taken for the remap on success else returns -1. */ static long long remap_region(struct config c, unsigned int threshold_mb, - unsigned int pattern_seed, char *rand_addr) + char *rand_addr) { void *addr, *src_addr, *dest_addr, *dest_preamble_addr; - int d; - unsigned long long t; + unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; unsigned long long threshold; + unsigned long num_chunks; if (threshold_mb == VALIDATION_NO_THRESHOLD) threshold = c.region_size; @@ -430,15 +451,42 @@ static long long remap_region(struct config c, unsigned int threshold_mb, goto clean_up_dest_preamble; } - /* Verify byte pattern after remapping */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) { - char c = (char) rand(); + /* + * Verify byte pattern after remapping. Employ an algorithm with a + * square root time complexity in threshold: divide the range into + * chunks, if memcmp() returns non-zero, only then perform an + * iteration in that chunk to find the mismatch index. + */ + num_chunks = get_sqrt(threshold); + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = threshold / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size)) + continue; + + /* brute force iteration only over mismatch segment */ + for (t = shift; t < shift + chunk_size; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { + ksft_print_msg("Data after remap doesn't match at offset %llu\n", + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, + ((char *) dest_addr)[t] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + } - if (((char *) dest_addr)[t] != c) { + /* + * if threshold is not divisible by num_chunks, then check the + * last chunk + */ + for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, ((char *) dest_addr)[t] & 0xff); ret = -1; goto clean_up_dest; @@ -446,22 +494,44 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Verify the dest preamble byte pattern after remapping */ - if (c.dest_preamble_size) { - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) { - char c = (char) rand(); - - if (((char *) dest_preamble_addr)[d] != c) { - ksft_print_msg("Preamble data after remap doesn't match at offset %d\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); + if (!c.dest_preamble_size) + goto no_preamble; + + num_chunks = get_sqrt(c.dest_preamble_size); + + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = c.dest_preamble_size / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_preamble_addr + shift, rand_addr + shift, + chunk_size)) + continue; + + /* brute force iteration only over mismatched segment */ + for (d = shift; d < shift + chunk_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); ret = -1; goto clean_up_dest; } } } + for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + +no_preamble: start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; ret = end_ns - start_ns; @@ -563,7 +633,7 @@ static void run_mremap_test_case(struct test test_case, int *failures, unsigned int pattern_seed, char *rand_addr) { long long remap_time = remap_region(test_case.config, threshold_mb, - pattern_seed, rand_addr); + rand_addr); if (remap_time < 0) { if (test_case.expect_failure) -- cgit v1.2.3 From e1e13262f0d6e58a6439d437c386bb3392aaca0b Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Sat, 30 Mar 2024 23:05:57 +0530 Subject: selftests/mm: mremap_test: use sscanf to parse /proc/self/maps Enforce consistency across files by avoiding two separate functions to parse /proc/self/maps, replacing them with a simple sscanf(). Link: https://lkml.kernel.org/r/20240330173557.2697684-4-dev.jain@arm.com Signed-off-by: Dev Jain Cc: Anshuman Khandual Cc: John Hubbard Cc: Kalesh Singh Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 678c79d5b8ef..1b03bcfaefdf 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -148,19 +148,21 @@ static unsigned long long get_mmap_min_addr(void) * Using /proc/self/maps, assert that the specified address range is contained * within a single mapping. */ -static bool is_range_mapped(FILE *maps_fp, void *start, void *end) +static bool is_range_mapped(FILE *maps_fp, unsigned long start, + unsigned long end) { char *line = NULL; size_t len = 0; bool success = false; + unsigned long first_val, second_val; rewind(maps_fp); while (getline(&line, &len, maps_fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); + if (sscanf(line, "%lx-%lx", &first_val, &second_val) != 2) { + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + break; + } if (first_val <= start && second_val >= end) { success = true; @@ -255,7 +257,8 @@ static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) goto out; } - success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: @@ -294,7 +297,8 @@ static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) goto out; } - success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: -- cgit v1.2.3 From e076eaca59065a48b02384346e7bc110ca37b90b Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 27 Mar 2024 20:34:17 -0700 Subject: selftests: break the dependency upon local header files Patch series "Fix selftests/mm build without requiring "make headers"". As mentioned in each patch, this implements the solution that we discussed in December 2023, in [1]. This turned out to be very clean and easy. It should also be quite easy to maintain. This should also make Peter Zijlstra happy, because it directly addresses the root cause of his "NAK NAK NAK" reply [2]. :) [1] https://lore.kernel.org/all/783a4178-1dec-4e30-989a-5174b8176b09@redhat.com/ [2] https://lore.kernel.org/lkml/20231103121652.GA6217@noisy.programming.kicks-ass.net/ This patch (of 2): Use tools/include/uapi/ files instead. These are obtained by taking a snapshot: run "make headers" at the top level, then copy the desired header file into the appropriate subdir in tools/uapi/. This was discussed and solved in [1]. However, even before copying any additional files there, there are already quite a few in tools/include/uapi already. And these will immediately fix a number of selftests/mm build failures. So this patch: a) Adds TOOLS_INCLUDES to selftests/lib.mk, so that all selftests can immediately and easily include the snapshotted header files. b) Uses $(TOOLS_INCLUDES) in the selftests/mm build. On today's Arch Linux, this already fixes all build errors except for a few userfaultfd.h (those will be addressed in a subsequent patch). [1] https://lore.kernel.org/all/783a4178-1dec-4e30-989a-5174b8176b09@redhat.com/ Link: https://lkml.kernel.org/r/20240328033418.203790-1-jhubbard@nvidia.com Link: https://lkml.kernel.org/r/20240328033418.203790-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Mark Brown Cc: Muhammad Usama Anjum Cc: Suren Baghdasaryan Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/lib.mk | 9 +++++++++ tools/testing/selftests/mm/Makefile | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index da2cade3bab0..1dae4a02957f 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -48,6 +48,15 @@ ifeq ($(KHDR_INCLUDES),) KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include endif +# In order to use newer items that haven't yet been added to the user's system +# header files, add $(TOOLS_INCLUDES) to the compiler invocation in each +# each selftest. +# You may need to add files to that location, or to refresh an existing file. In +# order to do that, run "make headers" from $(top_srcdir), then copy the +# header file that you want from $(top_srcdir)/usr/include/... , to the matching +# subdir in $(TOOLS_INCLUDE). +TOOLS_INCLUDES := -isystem $(top_srcdir)/tools/include/uapi + # The following are built by lib.mk common compile rules. # TEST_CUSTOM_PROGS should be used by tests that require # custom build rule and prevent common build rule use. diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eb5f39a2668b..7ca9186a0639 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -32,7 +32,7 @@ endif # LDLIBS. MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) LDLIBS = -lrt -lpthread -lm TEST_GEN_FILES = cow -- cgit v1.2.3 From 580ea358af0ade1176326e76edcf60dd50938808 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 27 Mar 2024 20:34:18 -0700 Subject: selftests/mm: fix additional build errors for selftests These build errors only occur if one fails to first run "make headers". However, that is a non-obvious and instrusive requirement, and so there was a discussion on how to get rid of it [1]. This uses that solution. These two files were created by taking a snapshot of the generated header files that are created via "make headers". These two files were copied from ./usr/include/linux/ to ./tools/include/uapi/linux/ . That fixes the selftests/mm build on today's Arch Linux (which required the userfaultfd.h) and Ubuntu 23.04 (which additionally required memfd.h). [1] https://lore.kernel.org/all/783a4178-1dec-4e30-989a-5174b8176b09@redhat.com/ Link: https://lkml.kernel.org/r/20240328033418.203790-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Mark Brown Cc: Muhammad Usama Anjum Cc: Suren Baghdasaryan Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/include/uapi/linux/memfd.h | 39 ++++ tools/include/uapi/linux/userfaultfd.h | 386 +++++++++++++++++++++++++++++++++ 2 files changed, 425 insertions(+) create mode 100644 tools/include/uapi/linux/memfd.h create mode 100644 tools/include/uapi/linux/userfaultfd.h (limited to 'tools') diff --git a/tools/include/uapi/linux/memfd.h b/tools/include/uapi/linux/memfd.h new file mode 100644 index 000000000000..01c0324e7733 --- /dev/null +++ b/tools/include/uapi/linux/memfd.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_MEMFD_H +#define _LINUX_MEMFD_H + +#include + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U +#define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. */ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U + +/* + * Huge page size encoding when MFD_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB +#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB +#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + +#endif /* _LINUX_MEMFD_H */ diff --git a/tools/include/uapi/linux/userfaultfd.h b/tools/include/uapi/linux/userfaultfd.h new file mode 100644 index 000000000000..4283de22d5b6 --- /dev/null +++ b/tools/include/uapi/linux/userfaultfd.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include + +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ +#define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. + */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) +#define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) + +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + union { + __u32 ptid; + } feat; + } pagefault; + + struct { + __u32 ufd; + } fork; + + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + + struct { + __u64 start; + __u64 end; + } remove; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __attribute__((packed)); + +/* + * Start at 0x12 and not at 0 to be more strict against bugs. + */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ + +struct uffdio_api { + /* userland asks for an API number and the features to enable */ + __u64 api; + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). + * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. + */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) + __u64 features; + + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. + */ + __u64 ioctls; +}; + +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. + */ + __s64 zeropage; +}; + +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. + */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + +#endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.2.3 From 9408a23fac62d31de067bd2b0099eb9151395345 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:33 -0700 Subject: KVM: riscv: selftests: Move sbi definitions to its own header file The SBI definitions will continue to grow. Move the sbi related definitions to its own header file from processor.h Suggested-by: Andrew Jones Reviewed-by: Andrew Jones Signed-off-by: Atish Patra Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20240420151741.962500-18-atishp@rivosinc.com Signed-off-by: Anup Patel --- .../selftests/kvm/include/riscv/processor.h | 39 ----------------- tools/testing/selftests/kvm/include/riscv/sbi.h | 50 ++++++++++++++++++++++ tools/testing/selftests/kvm/include/riscv/ucall.h | 1 + tools/testing/selftests/kvm/steal_time.c | 4 +- 4 files changed, 54 insertions(+), 40 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/riscv/sbi.h (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index ce473fe251dd..3b9cb39327ff 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -154,45 +154,6 @@ void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handle #define PGTBL_PAGE_SIZE PGTBL_L0_BLOCK_SIZE #define PGTBL_PAGE_SIZE_SHIFT PGTBL_L0_BLOCK_SHIFT -/* SBI return error codes */ -#define SBI_SUCCESS 0 -#define SBI_ERR_FAILURE -1 -#define SBI_ERR_NOT_SUPPORTED -2 -#define SBI_ERR_INVALID_PARAM -3 -#define SBI_ERR_DENIED -4 -#define SBI_ERR_INVALID_ADDRESS -5 -#define SBI_ERR_ALREADY_AVAILABLE -6 -#define SBI_ERR_ALREADY_STARTED -7 -#define SBI_ERR_ALREADY_STOPPED -8 - -#define SBI_EXT_EXPERIMENTAL_START 0x08000000 -#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF - -#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END -#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 -#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 - -enum sbi_ext_id { - SBI_EXT_BASE = 0x10, - SBI_EXT_STA = 0x535441, -}; - -enum sbi_ext_base_fid { - SBI_EXT_BASE_PROBE_EXT = 3, -}; - -struct sbiret { - long error; - long value; -}; - -struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, - unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, - unsigned long arg5); - -bool guest_sbi_probe_extension(int extid, long *out_val); - static inline void local_irq_enable(void) { csr_set(CSR_SSTATUS, SR_SIE); diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h new file mode 100644 index 000000000000..ba04f2dec7b5 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RISC-V SBI specific definitions + * + * Copyright (C) 2024 Rivos Inc. + */ + +#ifndef SELFTEST_KVM_SBI_H +#define SELFTEST_KVM_SBI_H + +/* SBI return error codes */ +#define SBI_SUCCESS 0 +#define SBI_ERR_FAILURE -1 +#define SBI_ERR_NOT_SUPPORTED -2 +#define SBI_ERR_INVALID_PARAM -3 +#define SBI_ERR_DENIED -4 +#define SBI_ERR_INVALID_ADDRESS -5 +#define SBI_ERR_ALREADY_AVAILABLE -6 +#define SBI_ERR_ALREADY_STARTED -7 +#define SBI_ERR_ALREADY_STOPPED -8 + +#define SBI_EXT_EXPERIMENTAL_START 0x08000000 +#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF + +#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END +#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 +#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 + +enum sbi_ext_id { + SBI_EXT_BASE = 0x10, + SBI_EXT_STA = 0x535441, +}; + +enum sbi_ext_base_fid { + SBI_EXT_BASE_PROBE_EXT = 3, +}; + +struct sbiret { + long error; + long value; +}; + +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5); + +bool guest_sbi_probe_extension(int extid, long *out_val); + +#endif /* SELFTEST_KVM_SBI_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/ucall.h b/tools/testing/selftests/kvm/include/riscv/ucall.h index be46eb32ec27..a695ae36f3e0 100644 --- a/tools/testing/selftests/kvm/include/riscv/ucall.h +++ b/tools/testing/selftests/kvm/include/riscv/ucall.h @@ -3,6 +3,7 @@ #define SELFTEST_KVM_UCALL_H #include "processor.h" +#include "sbi.h" #define UCALL_EXIT_REASON KVM_EXIT_RISCV_SBI diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index bae0c5026f82..2ff82c7fd926 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -11,7 +11,9 @@ #include #include #include -#ifndef __riscv +#ifdef __riscv +#include "sbi.h" +#else #include #endif -- cgit v1.2.3 From 97be675bfdb5086a6ad1aeeaa43df78b1afd5a0d Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:34 -0700 Subject: KVM: riscv: selftests: Add helper functions for extension checks __vcpu_has_ext can check both SBI and ISA extensions when the first argument is properly converted to SBI/ISA extension IDs. Introduce two helper functions to make life easier for developers so they don't have to worry about the conversions. Replace the current usages as well with new helpers. Reviewed-by: Andrew Jones Signed-off-by: Atish Patra Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20240420151741.962500-19-atishp@rivosinc.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/include/riscv/processor.h | 10 ++++++++++ tools/testing/selftests/kvm/riscv/arch_timer.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index 3b9cb39327ff..5f389166338c 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -50,6 +50,16 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype, bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext); +static inline bool __vcpu_has_isa_ext(struct kvm_vcpu *vcpu, uint64_t isa_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_ISA_EXT_REG(isa_ext)); +} + +static inline bool __vcpu_has_sbi_ext(struct kvm_vcpu *vcpu, uint64_t sbi_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_SBI_EXT_REG(sbi_ext)); +} + struct ex_regs { unsigned long ra; unsigned long sp; diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index 0f9cabd99fd4..735b78569021 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -85,7 +85,7 @@ struct kvm_vm *test_vm_create(void) int nr_vcpus = test_args.nr_vcpus; vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - __TEST_REQUIRE(__vcpu_has_ext(vcpus[0], RISCV_ISA_EXT_REG(KVM_RISCV_ISA_EXT_SSTC)), + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpus[0], KVM_RISCV_ISA_EXT_SSTC), "SSTC not available, skipping test\n"); vm_init_vector_tables(vm); -- cgit v1.2.3 From 3a21b37c47f8ea7e7efcb7173d82e0ab147bbcb1 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:35 -0700 Subject: KVM: riscv: selftests: Add Sscofpmf to get-reg-list test The KVM RISC-V allows Sscofpmf extension for Guest/VM so let us add this extension to get-reg-list test. Reviewed-by: Anup Patel Reviewed-by: Andrew Jones Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20240420151741.962500-20-atishp@rivosinc.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/get-reg-list.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index b882b7b9b785..222198dd6d04 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -43,6 +43,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_V: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT: @@ -408,6 +409,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(V), KVM_ISA_EXT_ARR(SMSTATEEN), KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVNAPOT), @@ -931,6 +933,7 @@ KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F); KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D); KVM_ISA_EXT_SIMPLE_CONFIG(h, H); KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN); +KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF); KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC); KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL); KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT); @@ -986,6 +989,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_fp_d, &config_h, &config_smstateen, + &config_sscofpmf, &config_sstc, &config_svinval, &config_svnapot, -- cgit v1.2.3 From 3203b9474356503ac5a28030b5d46d85a6491c5f Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:36 -0700 Subject: KVM: riscv: selftests: Add SBI PMU extension definitions The SBI PMU extension definition is required for upcoming SBI PMU selftests. Reviewed-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20240420151741.962500-21-atishp@rivosinc.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/include/riscv/sbi.h | 66 +++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h index ba04f2dec7b5..6675ca673c77 100644 --- a/tools/testing/selftests/kvm/include/riscv/sbi.h +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -29,17 +29,83 @@ enum sbi_ext_id { SBI_EXT_BASE = 0x10, SBI_EXT_STA = 0x535441, + SBI_EXT_PMU = 0x504D55, }; enum sbi_ext_base_fid { SBI_EXT_BASE_PROBE_EXT = 3, }; +enum sbi_ext_pmu_fid { + SBI_EXT_PMU_NUM_COUNTERS = 0, + SBI_EXT_PMU_COUNTER_GET_INFO, + SBI_EXT_PMU_COUNTER_CFG_MATCH, + SBI_EXT_PMU_COUNTER_START, + SBI_EXT_PMU_COUNTER_STOP, + SBI_EXT_PMU_COUNTER_FW_READ, + SBI_EXT_PMU_COUNTER_FW_READ_HI, + SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, +}; + +union sbi_pmu_ctr_info { + unsigned long value; + struct { + unsigned long csr:12; + unsigned long width:6; +#if __riscv_xlen == 32 + unsigned long reserved:13; +#else + unsigned long reserved:45; +#endif + unsigned long type:1; + }; +}; struct sbiret { long error; long value; }; +/** General pmu event codes specified in SBI PMU extension */ +enum sbi_pmu_hw_generic_events_t { + SBI_PMU_HW_NO_EVENT = 0, + SBI_PMU_HW_CPU_CYCLES = 1, + SBI_PMU_HW_INSTRUCTIONS = 2, + SBI_PMU_HW_CACHE_REFERENCES = 3, + SBI_PMU_HW_CACHE_MISSES = 4, + SBI_PMU_HW_BRANCH_INSTRUCTIONS = 5, + SBI_PMU_HW_BRANCH_MISSES = 6, + SBI_PMU_HW_BUS_CYCLES = 7, + SBI_PMU_HW_STALLED_CYCLES_FRONTEND = 8, + SBI_PMU_HW_STALLED_CYCLES_BACKEND = 9, + SBI_PMU_HW_REF_CPU_CYCLES = 10, + + SBI_PMU_HW_GENERAL_MAX, +}; + +/* SBI PMU counter types */ +enum sbi_pmu_ctr_type { + SBI_PMU_CTR_TYPE_HW = 0x0, + SBI_PMU_CTR_TYPE_FW, +}; + +/* Flags defined for config matching function */ +#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0) +#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1) +#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2) +#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3) +#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4) +#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5) +#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6) +#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7) + +/* Flags defined for counter start function */ +#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0) +#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1) + +/* Flags defined for counter stop function */ +#define SBI_PMU_STOP_FLAG_RESET BIT(0) +#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1) + struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, -- cgit v1.2.3 From 158cb9e61cb7f9ed07384584fe34fb9c39590293 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:37 -0700 Subject: KVM: riscv: selftests: Add SBI PMU selftest This test implements basic sanity test and cycle/instret event counting tests. Reviewed-by: Anup Patel Reviewed-by: Andrew Jones Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20240420151741.962500-22-atishp@rivosinc.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 369 +++++++++++++++++++++++ 2 files changed, 370 insertions(+) create mode 100644 tools/testing/selftests/kvm/riscv/sbi_pmu_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 7f4430242c9e..6a356c06f2fc 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -189,6 +189,7 @@ TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS_s390x += kvm_binary_stats_test +TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer TEST_GEN_PROGS_riscv += demand_paging_test diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c new file mode 100644 index 000000000000..7c81691e39c5 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sbi_pmu_test.c - Tests the riscv64 SBI PMU functionality. + * + * Copyright (c) 2024, Rivos Inc. + */ + +#include +#include +#include +#include +#include +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" +#include "sbi.h" + +/* Maximum counters(firmware + hardware) */ +#define RISCV_MAX_PMU_COUNTERS 64 +union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS]; + +/* Cache the available counters in a bitmask */ +static unsigned long counter_mask_available; + +static bool illegal_handler_invoked; + +unsigned long pmu_csr_read_num(int csr_num) +{ +#define switchcase_csr_read(__csr_num, __val) {\ + case __csr_num: \ + __val = csr_read(__csr_num); \ + break; } +#define switchcase_csr_read_2(__csr_num, __val) {\ + switchcase_csr_read(__csr_num + 0, __val) \ + switchcase_csr_read(__csr_num + 1, __val)} +#define switchcase_csr_read_4(__csr_num, __val) {\ + switchcase_csr_read_2(__csr_num + 0, __val) \ + switchcase_csr_read_2(__csr_num + 2, __val)} +#define switchcase_csr_read_8(__csr_num, __val) {\ + switchcase_csr_read_4(__csr_num + 0, __val) \ + switchcase_csr_read_4(__csr_num + 4, __val)} +#define switchcase_csr_read_16(__csr_num, __val) {\ + switchcase_csr_read_8(__csr_num + 0, __val) \ + switchcase_csr_read_8(__csr_num + 8, __val)} +#define switchcase_csr_read_32(__csr_num, __val) {\ + switchcase_csr_read_16(__csr_num + 0, __val) \ + switchcase_csr_read_16(__csr_num + 16, __val)} + + unsigned long ret = 0; + + switch (csr_num) { + switchcase_csr_read_32(CSR_CYCLE, ret) + switchcase_csr_read_32(CSR_CYCLEH, ret) + default : + break; + } + + return ret; +#undef switchcase_csr_read_32 +#undef switchcase_csr_read_16 +#undef switchcase_csr_read_8 +#undef switchcase_csr_read_4 +#undef switchcase_csr_read_2 +#undef switchcase_csr_read +} + +static inline void dummy_func_loop(uint64_t iter) +{ + int i = 0; + + while (i < iter) { + asm volatile("nop"); + i++; + } +} + +static void start_counter(unsigned long counter, unsigned long start_flags, + unsigned long ival) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, counter, 1, start_flags, + ival, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to start counter %ld\n", counter); +} + +/* This should be invoked only for reset counter use case */ +static void stop_reset_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, + stop_flags | SBI_PMU_STOP_FLAG_RESET, 0, 0, 0); + __GUEST_ASSERT(ret.error == SBI_ERR_ALREADY_STOPPED, + "Unable to stop counter %ld\n", counter); +} + +static void stop_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, stop_flags, + 0, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to stop counter %ld error %ld\n", + counter, ret.error); +} + +static void guest_illegal_exception_handler(struct ex_regs *regs) +{ + __GUEST_ASSERT(regs->cause == EXC_INST_ILLEGAL, + "Unexpected exception handler %lx\n", regs->cause); + + illegal_handler_invoked = true; + /* skip the trapping instruction */ + regs->epc += 4; +} + +static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask, + unsigned long cflags, + unsigned long event) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, cmask, + cflags, event, 0, 0); + __GUEST_ASSERT(ret.error == 0, "config matching failed %ld\n", ret.error); + GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS); + GUEST_ASSERT(BIT(ret.value) & counter_mask_available); + + return ret.value; +} + +static unsigned long get_num_counters(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_NUM_COUNTERS, 0, 0, 0, 0, 0, 0); + + __GUEST_ASSERT(ret.error == 0, "Unable to retrieve number of counters from SBI PMU"); + __GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS, + "Invalid number of counters %ld\n", ret.value); + + return ret.value; +} + +static void update_counter_info(int num_counters) +{ + int i = 0; + struct sbiret ret; + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo_arr[i].value = ret.value; + counter_mask_available |= BIT(i); + } + + GUEST_ASSERT(counter_mask_available > 0); +} + +static unsigned long read_fw_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, idx, 0, 0, 0, 0, 0); + GUEST_ASSERT(ret.error == 0); + return ret.value; +} + +static unsigned long read_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + unsigned long counter_val = 0; + + __GUEST_ASSERT(ctrinfo.type < 2, "Invalid counter type %d", ctrinfo.type); + + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) + counter_val = pmu_csr_read_num(ctrinfo.csr); + else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) + counter_val = read_fw_counter(idx, ctrinfo); + + return counter_val; +} + +static void test_pmu_event(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. + */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, ULONG_MAX/2); + stop_counter(counter, 0); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + stop_counter(counter, 0); + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_invalid_event(void) +{ + struct sbiret ret; + unsigned long event = 0x1234; /* A random event */ + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, 0, + counter_mask_available, 0, event, 0, 0); + GUEST_ASSERT_EQ(ret.error, SBI_ERR_NOT_SUPPORTED); +} + +static void test_pmu_events(void) +{ + int num_counters = 0; + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Sanity testing for any random invalid event */ + test_invalid_event(); + + /* Only these two events are guaranteed to be present */ + test_pmu_event(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_basic_sanity(void) +{ + long out_val = 0; + bool probe; + struct sbiret ret; + int num_counters = 0, i; + union sbi_pmu_ctr_info ctrinfo; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + num_counters = get_num_counters(); + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, + 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo.value = ret.value; + + /** + * Accessibility check of hardware and read capability of firmware counters. + * The spec doesn't mandate any initial value. No need to check any value. + */ + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) { + pmu_csr_read_num(ctrinfo.csr); + GUEST_ASSERT(illegal_handler_invoked); + } else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) { + read_fw_counter(i, ctrinfo); + } + } + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + case UCALL_SYNC: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + break; + } +} + +void test_vm_destroy(struct kvm_vm *vm) +{ + memset(ctrinfo_arr, 0, sizeof(union sbi_pmu_ctr_info) * RISCV_MAX_PMU_COUNTERS); + counter_mask_available = 0; + kvm_vm_free(vm); +} + +static void test_vm_basic_test(void *guest_code) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + vm_init_vector_tables(vm); + /* Illegal instruction handler is required to verify read access without configuration */ + vm_install_exception_handler(vm, EXC_INST_ILLEGAL, guest_illegal_exception_handler); + + vcpu_init_vector_tables(vcpu); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu = NULL; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +int main(void) +{ + test_vm_basic_test(test_pmu_basic_sanity); + pr_info("SBI PMU basic test : PASS\n"); + + test_vm_events_test(test_pmu_events); + pr_info("SBI PMU event verification test : PASS\n"); + + return 0; +} -- cgit v1.2.3 From 13cb706e28d9d4d3259954eb08c57b990e4429ea Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:38 -0700 Subject: KVM: riscv: selftests: Add a test for PMU snapshot functionality Verify PMU snapshot functionality by setting up the shared memory correctly and reading the counter values from the shared memory instead of the CSR. Reviewed-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20240420151741.962500-23-atishp@rivosinc.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/include/riscv/sbi.h | 25 ++++ tools/testing/selftests/kvm/lib/riscv/processor.c | 12 ++ tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 144 ++++++++++++++++++++++ 3 files changed, 181 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h index 6675ca673c77..046b432ae896 100644 --- a/tools/testing/selftests/kvm/include/riscv/sbi.h +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -8,6 +8,12 @@ #ifndef SELFTEST_KVM_SBI_H #define SELFTEST_KVM_SBI_H +/* SBI spec version fields */ +#define SBI_SPEC_VERSION_DEFAULT 0x1 +#define SBI_SPEC_VERSION_MAJOR_SHIFT 24 +#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f +#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff + /* SBI return error codes */ #define SBI_SUCCESS 0 #define SBI_ERR_FAILURE -1 @@ -33,6 +39,9 @@ enum sbi_ext_id { }; enum sbi_ext_base_fid { + SBI_EXT_BASE_GET_SPEC_VERSION = 0, + SBI_EXT_BASE_GET_IMP_ID, + SBI_EXT_BASE_GET_IMP_VERSION, SBI_EXT_BASE_PROBE_EXT = 3, }; enum sbi_ext_pmu_fid { @@ -60,6 +69,12 @@ union sbi_pmu_ctr_info { }; }; +struct riscv_pmu_snapshot_data { + u64 ctr_overflow_mask; + u64 ctr_values[64]; + u64 reserved[447]; +}; + struct sbiret { long error; long value; @@ -113,4 +128,14 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, bool guest_sbi_probe_extension(int extid, long *out_val); +/* Make SBI version */ +static inline unsigned long sbi_mk_version(unsigned long major, + unsigned long minor) +{ + return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT) + | (minor & SBI_SPEC_VERSION_MINOR_MASK); +} + +unsigned long get_host_sbi_spec_version(void); + #endif /* SELFTEST_KVM_SBI_H */ diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index e8211f5d6863..ccb35573749c 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -502,3 +502,15 @@ bool guest_sbi_probe_extension(int extid, long *out_val) return true; } + +unsigned long get_host_sbi_spec_version(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_GET_SPEC_VERSION, 0, + 0, 0, 0, 0, 0); + + GUEST_ASSERT(!ret.error); + + return ret.value; +} diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index 7c81691e39c5..9002ff451abf 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -19,6 +19,11 @@ #define RISCV_MAX_PMU_COUNTERS 64 union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS]; +/* Snapshot shared memory data */ +#define PMU_SNAPSHOT_GPA_BASE BIT(30) +static void *snapshot_gva; +static vm_paddr_t snapshot_gpa; + /* Cache the available counters in a bitmask */ static unsigned long counter_mask_available; @@ -186,6 +191,32 @@ static unsigned long read_counter(int idx, union sbi_pmu_ctr_info ctrinfo) return counter_val; } +static inline void verify_sbi_requirement_assert(void) +{ + long out_val = 0; + bool probe; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + if (get_host_sbi_spec_version() < sbi_mk_version(2, 0)) + __GUEST_ASSERT(0, "SBI implementation version doesn't support PMU Snapshot"); +} + +static void snapshot_set_shmem(vm_paddr_t gpa, unsigned long flags) +{ + unsigned long lo = (unsigned long)gpa; +#if __riscv_xlen == 32 + unsigned long hi = (unsigned long)(gpa >> 32); +#else + unsigned long hi = gpa == -1 ? -1 : 0; +#endif + struct sbiret ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, + lo, hi, flags, 0, 0, 0); + + GUEST_ASSERT(ret.value == 0 && ret.error == 0); +} + static void test_pmu_event(unsigned long event) { unsigned long counter; @@ -234,6 +265,59 @@ static void test_pmu_event(unsigned long event) stop_reset_counter(counter, 0); } +static void test_pmu_event_snapshot(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + /* The counter value is updated w.r.t relative index of cbase */ + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. + */ + WRITE_ONCE(snapshot_data->ctr_values[0], ULONG_MAX/2); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_pre = READ_ONCE(snapshot_data->ctr_values[0]); + + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + static void test_invalid_event(void) { struct sbiret ret; @@ -301,6 +385,34 @@ static void test_pmu_basic_sanity(void) GUEST_DONE(); } +static void test_pmu_events_snaphost(void) +{ + int num_counters = 0; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + int i; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Validate shared memory access */ + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_overflow_mask), 0); + for (i = 0; i < num_counters; i++) { + if (counter_mask_available & (BIT(i))) + GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_values[i]), 0); + } + /* Only these two events are guranteed to be present */ + test_pmu_event_snapshot(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event_snapshot(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + static void run_vcpu(struct kvm_vcpu *vcpu) { struct ucall uc; @@ -357,6 +469,35 @@ static void test_vm_events_test(void *guest_code) test_vm_destroy(vm); } +static void test_vm_setup_snapshot_mem(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + /* PMU Snapshot requires single page only */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, PMU_SNAPSHOT_GPA_BASE, 1, 1, 0); + /* PMU_SNAPSHOT_GPA_BASE is identity mapped */ + virt_map(vm, PMU_SNAPSHOT_GPA_BASE, PMU_SNAPSHOT_GPA_BASE, 1); + + snapshot_gva = (void *)(PMU_SNAPSHOT_GPA_BASE); + snapshot_gpa = addr_gva2gpa(vcpu->vm, (vm_vaddr_t)snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gpa); +} + +static void test_vm_events_snapshot_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + int main(void) { test_vm_basic_test(test_pmu_basic_sanity); @@ -365,5 +506,8 @@ int main(void) test_vm_events_test(test_pmu_events); pr_info("SBI PMU event verification test : PASS\n"); + test_vm_events_snapshot_test(test_pmu_events_snaphost); + pr_info("SBI PMU event verification with snapshot test : PASS\n"); + return 0; } -- cgit v1.2.3 From 4ace2573d13ee22ebc5bb90efca6c2c9b27b2ef8 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:39 -0700 Subject: KVM: riscv: selftests: Add a test for counter overflow Add a test for verifying overflow interrupt. Currently, it relies on overflow support on cycle/instret events. This test works for cycle/ instret events which support sampling via hpmcounters on the platform. There are no ISA extensions to detect if a platform supports that. Thus, this test will fail on platform with virtualization but doesn't support overflow on these two events. Reviewed-by: Anup Patel Reviewed-by: Andrew Jones Signed-off-by: Atish Patra Link: https://lore.kernel.org/r/20240420151741.962500-24-atishp@rivosinc.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 113 +++++++++++++++++++++++ 1 file changed, 113 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index 9002ff451abf..0fd9b76ae838 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -14,6 +14,7 @@ #include "test_util.h" #include "processor.h" #include "sbi.h" +#include "arch_timer.h" /* Maximum counters(firmware + hardware) */ #define RISCV_MAX_PMU_COUNTERS 64 @@ -24,6 +25,9 @@ union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS]; static void *snapshot_gva; static vm_paddr_t snapshot_gpa; +static int vcpu_shared_irq_count; +static int counter_in_use; + /* Cache the available counters in a bitmask */ static unsigned long counter_mask_available; @@ -120,6 +124,31 @@ static void guest_illegal_exception_handler(struct ex_regs *regs) regs->epc += 4; } +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int irq_num = regs->cause & ~CAUSE_IRQ_FLAG; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + unsigned long overflown_mask; + unsigned long counter_val = 0; + + /* Validate that we are in the correct irq handler */ + GUEST_ASSERT_EQ(irq_num, IRQ_PMU_OVF); + + /* Stop all counters first to avoid further interrupts */ + stop_counter(counter_in_use, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + csr_clear(CSR_SIP, BIT(IRQ_PMU_OVF)); + + overflown_mask = READ_ONCE(snapshot_data->ctr_overflow_mask); + GUEST_ASSERT(overflown_mask & 0x01); + + WRITE_ONCE(vcpu_shared_irq_count, vcpu_shared_irq_count+1); + + counter_val = READ_ONCE(snapshot_data->ctr_values[0]); + /* Now start the counter to mimick the real driver behavior */ + start_counter(counter_in_use, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_val); +} + static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask, unsigned long cflags, unsigned long event) @@ -318,6 +347,33 @@ static void test_pmu_event_snapshot(unsigned long event) stop_reset_counter(counter, 0); } +static void test_pmu_event_overflow(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_post; + unsigned long counter_init_value = ULONG_MAX - 10000; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_in_use = counter; + + /* The counter value is updated w.r.t relative index of cbase passed to start/stop */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + udelay(msecs_to_usecs(2000)); + /* irq handler should have stopped the counter */ + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + /* The counter value after stopping should be less the init value due to overflow */ + __GUEST_ASSERT(counter_value_post < counter_init_value, + "counter_value_post %lx counter_init_value %lx for counter\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + static void test_invalid_event(void) { struct sbiret ret; @@ -413,6 +469,34 @@ static void test_pmu_events_snaphost(void) GUEST_DONE(); } +static void test_pmu_events_overflow(void) +{ + int num_counters = 0; + + /* Verify presence of SBI PMU and minimum requrired SBI version */ + verify_sbi_requirement_assert(); + + snapshot_set_shmem(snapshot_gpa, 0); + csr_set(CSR_IE, BIT(IRQ_PMU_OVF)); + local_irq_enable(); + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* + * Qemu supports overflow for cycle/instruction. + * This test may fail on any platform that do not support overflow for these two events. + */ + test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 1); + + test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, 2); + + GUEST_DONE(); +} + static void run_vcpu(struct kvm_vcpu *vcpu) { struct ucall uc; @@ -498,6 +582,32 @@ static void test_vm_events_snapshot_test(void *guest_code) test_vm_destroy(vm); } +static void test_vm_events_overflow(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpu, KVM_RISCV_ISA_EXT_SSCOFPMF), + "Sscofpmf is not available, skipping overflow test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + vm_init_vector_tables(vm); + vm_install_interrupt_handler(vm, guest_irq_handler); + + vcpu_init_vector_tables(vcpu); + /* Initialize guest timer frequency. */ + vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency), &timer_freq); + sync_global_to_guest(vm, timer_freq); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + int main(void) { test_vm_basic_test(test_pmu_basic_sanity); @@ -509,5 +619,8 @@ int main(void) test_vm_events_snapshot_test(test_pmu_events_snaphost); pr_info("SBI PMU event verification with snapshot test : PASS\n"); + test_vm_events_overflow(test_pmu_events_overflow); + pr_info("SBI PMU event verification with overflow test : PASS\n"); + return 0; } -- cgit v1.2.3 From 5ef2f3d4e747c7851678ad2b70e37be886a8c9eb Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sat, 20 Apr 2024 08:17:40 -0700 Subject: KVM: riscv: selftests: Add commandline option for SBI PMU test SBI PMU test comprises of multiple tests and user may want to run only a subset depending on the platform. The most common case would be to run all to validate all the tests. However, some platform may not support all events or all ISA extensions. The commandline option allows user to disable any set of tests if they want to. Suggested-by: Andrew Jones Signed-off-by: Atish Patra Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20240420151741.962500-25-atishp@rivosinc.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/sbi_pmu_test.c | 73 +++++++++++++++++++++--- 1 file changed, 64 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index 0fd9b76ae838..69bb94e6b227 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -33,6 +33,13 @@ static unsigned long counter_mask_available; static bool illegal_handler_invoked; +#define SBI_PMU_TEST_BASIC BIT(0) +#define SBI_PMU_TEST_EVENTS BIT(1) +#define SBI_PMU_TEST_SNAPSHOT BIT(2) +#define SBI_PMU_TEST_OVERFLOW BIT(3) + +static int disabled_tests; + unsigned long pmu_csr_read_num(int csr_num) { #define switchcase_csr_read(__csr_num, __val) {\ @@ -608,19 +615,67 @@ static void test_vm_events_overflow(void *guest_code) test_vm_destroy(vm); } -int main(void) +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-d ]\n", name); + pr_info("\t-d: Test to disable. Available tests are 'basic', 'events', 'snapshot', 'overflow'\n"); + pr_info("\t-h: print this help screen\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "hd:")) != -1) { + switch (opt) { + case 'd': + if (!strncmp("basic", optarg, 5)) + disabled_tests |= SBI_PMU_TEST_BASIC; + else if (!strncmp("events", optarg, 6)) + disabled_tests |= SBI_PMU_TEST_EVENTS; + else if (!strncmp("snapshot", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_SNAPSHOT; + else if (!strncmp("overflow", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_OVERFLOW; + else + goto done; + break; + case 'h': + default: + goto done; + } + } + + return true; +done: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) { - test_vm_basic_test(test_pmu_basic_sanity); - pr_info("SBI PMU basic test : PASS\n"); + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (!(disabled_tests & SBI_PMU_TEST_BASIC)) { + test_vm_basic_test(test_pmu_basic_sanity); + pr_info("SBI PMU basic test : PASS\n"); + } - test_vm_events_test(test_pmu_events); - pr_info("SBI PMU event verification test : PASS\n"); + if (!(disabled_tests & SBI_PMU_TEST_EVENTS)) { + test_vm_events_test(test_pmu_events); + pr_info("SBI PMU event verification test : PASS\n"); + } - test_vm_events_snapshot_test(test_pmu_events_snaphost); - pr_info("SBI PMU event verification with snapshot test : PASS\n"); + if (!(disabled_tests & SBI_PMU_TEST_SNAPSHOT)) { + test_vm_events_snapshot_test(test_pmu_events_snaphost); + pr_info("SBI PMU event verification with snapshot test : PASS\n"); + } - test_vm_events_overflow(test_pmu_events_overflow); - pr_info("SBI PMU event verification with overflow test : PASS\n"); + if (!(disabled_tests & SBI_PMU_TEST_OVERFLOW)) { + test_vm_events_overflow(test_pmu_events_overflow); + pr_info("SBI PMU event verification with overflow test : PASS\n"); + } return 0; } -- cgit v1.2.3 From 680fda4f671452d99401f294f20b60b3fdf24191 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 23 Apr 2024 14:49:05 +0200 Subject: test: hsr: Remove script code already implemented in lib.sh Some parts (like netns creation and cleanup) of hsr_ping.sh script are already implemented in ../lib.sh common script, so can be replaced by it. Signed-off-by: Lukasz Majewski Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/hsr/hsr_ping.sh | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh index 1c6457e54625..bd7c4b8f07b8 100755 --- a/tools/testing/selftests/net/hsr/hsr_ping.sh +++ b/tools/testing/selftests/net/hsr/hsr_ping.sh @@ -1,6 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +source ../lib.sh ret=0 ksft_skip=4 ipv6=true @@ -27,19 +28,7 @@ while getopts "$optstring" option;do esac done -sec=$(date +%s) -rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) -ns1="ns1-$rndh" -ns2="ns2-$rndh" -ns3="ns3-$rndh" - -cleanup() -{ - local netns - for netns in "$ns1" "$ns2" "$ns3" ;do - ip netns del $netns - done -} +setup_ns ns1 ns2 ns3 # $1: IP address is_v6() @@ -254,21 +243,10 @@ if [ $? -ne 0 ];then exit $ksft_skip fi -trap cleanup EXIT - -for i in "$ns1" "$ns2" "$ns3" ;do - ip netns add $i || exit $ksft_skip - ip -net $i link set lo up -done +trap cleanup_all_ns EXIT setup_hsr_interfaces 0 do_complete_ping_test -cleanup - -for i in "$ns1" "$ns2" "$ns3" ;do - ip netns add $i || exit $ksft_skip - ip -net $i link set lo up -done setup_hsr_interfaces 1 do_complete_ping_test -- cgit v1.2.3 From 154a82cb64be2f6f88f8eabd0d0353ea66adae43 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 23 Apr 2024 14:49:06 +0200 Subject: test: hsr: Move common code to hsr_common.sh file Some of the code already present in the hsr_ping.sh test program can be moved to a separate script file, so it can be reused by other HSR functionality (like HSR-SAN) tests. Signed-off-by: Lukasz Majewski Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/hsr/Makefile | 1 + tools/testing/selftests/net/hsr/hsr_common.sh | 83 +++++++++++++++++++++++++++ tools/testing/selftests/net/hsr/hsr_ping.sh | 82 ++------------------------ 3 files changed, 88 insertions(+), 78 deletions(-) create mode 100644 tools/testing/selftests/net/hsr/hsr_common.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile index 92c1d9d080cd..c782ad19e011 100644 --- a/tools/testing/selftests/net/hsr/Makefile +++ b/tools/testing/selftests/net/hsr/Makefile @@ -3,5 +3,6 @@ top_srcdir = ../../../../.. TEST_PROGS := hsr_ping.sh +TEST_FILES += hsr_common.sh include ../../lib.mk diff --git a/tools/testing/selftests/net/hsr/hsr_common.sh b/tools/testing/selftests/net/hsr/hsr_common.sh new file mode 100644 index 000000000000..be4ad07ff355 --- /dev/null +++ b/tools/testing/selftests/net/hsr/hsr_common.sh @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: GPL-2.0 +# Common code for HSR testing scripts + +source ../lib.sh +ret=0 +ksft_skip=4 + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} + +do_ping() +{ + local netns="$1" + local connect_addr="$2" + local ping_args="-q -c 2" + + if is_v6 "${connect_addr}"; then + $ipv6 || return 0 + ping_args="${ping_args} -6" + fi + + ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null + if [ $? -ne 0 ] ; then + echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2 + ret=1 + return 1 + fi + + return 0 +} + +do_ping_long() +{ + local netns="$1" + local connect_addr="$2" + local ping_args="-q -c 10" + + if is_v6 "${connect_addr}"; then + $ipv6 || return 0 + ping_args="${ping_args} -6" + fi + + OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)" + if [ $? -ne 0 ] ; then + echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2 + ret=1 + return 1 + fi + + VAL="$(echo $OUT | cut -d' ' -f1-8)" + if [ "$VAL" != "10 packets transmitted, 10 received, 0% packet loss," ] + then + echo "$netns -> $connect_addr ping TEST [ FAIL ]" + echo "Expect to send and receive 10 packets and no duplicates." + echo "Full message: ${OUT}." + ret=1 + return 1 + fi + + return 0 +} + +stop_if_error() +{ + local msg="$1" + + if [ ${ret} -ne 0 ]; then + echo "FAIL: ${msg}" 1>&2 + exit ${ret} + fi +} + +check_prerequisites() +{ + ip -Version > /dev/null 2>&1 + if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip + fi +} diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh index bd7c4b8f07b8..790294c8af83 100755 --- a/tools/testing/selftests/net/hsr/hsr_ping.sh +++ b/tools/testing/selftests/net/hsr/hsr_ping.sh @@ -1,11 +1,10 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -source ../lib.sh -ret=0 -ksft_skip=4 ipv6=true +source ./hsr_common.sh + optstring="h4" usage() { echo "Usage: $0 [OPTION]" @@ -28,76 +27,6 @@ while getopts "$optstring" option;do esac done -setup_ns ns1 ns2 ns3 - -# $1: IP address -is_v6() -{ - [ -z "${1##*:*}" ] -} - -do_ping() -{ - local netns="$1" - local connect_addr="$2" - local ping_args="-q -c 2" - - if is_v6 "${connect_addr}"; then - $ipv6 || return 0 - ping_args="${ping_args} -6" - fi - - ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null - if [ $? -ne 0 ] ; then - echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2 - ret=1 - return 1 - fi - - return 0 -} - -do_ping_long() -{ - local netns="$1" - local connect_addr="$2" - local ping_args="-q -c 10" - - if is_v6 "${connect_addr}"; then - $ipv6 || return 0 - ping_args="${ping_args} -6" - fi - - OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)" - if [ $? -ne 0 ] ; then - echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2 - ret=1 - return 1 - fi - - VAL="$(echo $OUT | cut -d' ' -f1-8)" - if [ "$VAL" != "10 packets transmitted, 10 received, 0% packet loss," ] - then - echo "$netns -> $connect_addr ping TEST [ FAIL ]" - echo "Expect to send and receive 10 packets and no duplicates." - echo "Full message: ${OUT}." - ret=1 - return 1 - fi - - return 0 -} - -stop_if_error() -{ - local msg="$1" - - if [ ${ret} -ne 0 ]; then - echo "FAIL: ${msg}" 1>&2 - exit ${ret} - fi -} - do_complete_ping_test() { echo "INFO: Initial validation ping." @@ -237,11 +166,8 @@ setup_hsr_interfaces() ip -net "$ns3" link set hsr3 up } -ip -Version > /dev/null 2>&1 -if [ $? -ne 0 ];then - echo "SKIP: Could not run test without ip tool" - exit $ksft_skip -fi +check_prerequisites +setup_ns ns1 ns2 ns3 trap cleanup_all_ns EXIT -- cgit v1.2.3 From 40b90bf60ce10310baa7c4c2537cb3ee6ab622b9 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 23 Apr 2024 14:49:07 +0200 Subject: test: hsr: Extract version agnostic information from ping command output Current code checks if ping command output match hardcoded pattern: "10 packets transmitted, 10 received, 0% packet loss,". Such approach will work only from one ping program version (for which this test has been originally written). This patch address problem when ping with different summary output like "10 packets transmitted, 10 packets received, 0% packet" is used to run this test - for example one from busybox (as the test system runs in QEMU with rootfs created with buildroot). The fix is to modify output of ping command to be agnostic to ping version used on the platform. Signed-off-by: Lukasz Majewski Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/hsr/hsr_common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/hsr/hsr_common.sh b/tools/testing/selftests/net/hsr/hsr_common.sh index be4ad07ff355..8e97b1f2e7e5 100644 --- a/tools/testing/selftests/net/hsr/hsr_common.sh +++ b/tools/testing/selftests/net/hsr/hsr_common.sh @@ -51,7 +51,8 @@ do_ping_long() fi VAL="$(echo $OUT | cut -d' ' -f1-8)" - if [ "$VAL" != "10 packets transmitted, 10 received, 0% packet loss," ] + SED_VAL="$(echo ${VAL} | sed -r -e 's/([0-9]{2}).*([0-9]{2}).*[[:space:]]([0-9]+%).*/\1 transmitted \2 received \3 loss/')" + if [ "${SED_VAL}" != "10 transmitted 10 received 0% loss" ] then echo "$netns -> $connect_addr ping TEST [ FAIL ]" echo "Expect to send and receive 10 packets and no duplicates." -- cgit v1.2.3 From 542e645c4a4d7dc9a6ef6bbf0e4249fc113728d1 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 23 Apr 2024 14:49:08 +0200 Subject: test: hsr: Add test for HSR RedBOX (HSR-SAN) mode of operation This patch adds hsr_redbox.sh script to test if HSR-SAN mode of operation works correctly. Signed-off-by: Lukasz Majewski Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/hsr/Makefile | 2 +- tools/testing/selftests/net/hsr/hsr_redbox.sh | 92 +++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/hsr/hsr_redbox.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile index c782ad19e011..884cd2cc0681 100644 --- a/tools/testing/selftests/net/hsr/Makefile +++ b/tools/testing/selftests/net/hsr/Makefile @@ -2,7 +2,7 @@ top_srcdir = ../../../../.. -TEST_PROGS := hsr_ping.sh +TEST_PROGS := hsr_ping.sh hsr_redbox.sh TEST_FILES += hsr_common.sh include ../../lib.mk diff --git a/tools/testing/selftests/net/hsr/hsr_redbox.sh b/tools/testing/selftests/net/hsr/hsr_redbox.sh new file mode 100755 index 000000000000..52e0412c32e6 --- /dev/null +++ b/tools/testing/selftests/net/hsr/hsr_redbox.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ipv6=false + +source ./hsr_common.sh + +do_complete_ping_test() +{ + echo "INFO: Initial validation ping (HSR-SAN/RedBox)." + # Each node has to be able each one. + do_ping "${ns1}" 100.64.0.2 + do_ping "${ns2}" 100.64.0.1 + # Ping from SAN to hsr1 (via hsr2) + do_ping "${ns3}" 100.64.0.1 + do_ping "${ns1}" 100.64.0.3 + stop_if_error "Initial validation failed." + + # Wait for MGNT HSR frames being received and nodes being + # merged. + sleep 5 + + echo "INFO: Longer ping test (HSR-SAN/RedBox)." + # Ping from SAN to hsr1 (via hsr2) + do_ping_long "${ns3}" 100.64.0.1 + # Ping from hsr1 (via hsr2) to SAN + do_ping_long "${ns1}" 100.64.0.3 + stop_if_error "Longer ping test failed." + + echo "INFO: All good." +} + +setup_hsr_interfaces() +{ + local HSRv="$1" + + echo "INFO: preparing interfaces for HSRv${HSRv} (HSR-SAN/RedBox)." + +# |NS1 | +# | | +# | /-- hsr1 --\ | +# | ns1eth1 ns1eth2 | +# |------------------------| +# | | +# | | +# | | +# |------------------------| |-----------| +# | ns2eth1 ns2eth2 | | | +# | \-- hsr2 --/ | | | +# | \ | | | +# | ns2eth3 |--------| ns3eth1 | +# | (interlink)| | | +# |NS2 (RedBOX) | |NS3 (SAN) | +# + # Check if iproute2 supports adding interlink port to hsrX device + ip link help hsr | grep -q INTERLINK + [ $? -ne 0 ] && { echo "iproute2: HSR interlink interface not supported!"; exit 0; } + + # Create interfaces for name spaces + ip link add ns1eth1 netns "${ns1}" type veth peer name ns2eth1 netns "${ns2}" + ip link add ns1eth2 netns "${ns1}" type veth peer name ns2eth2 netns "${ns2}" + ip link add ns3eth1 netns "${ns3}" type veth peer name ns2eth3 netns "${ns2}" + + sleep 1 + + ip -n "${ns1}" link set ns1eth1 up + ip -n "${ns1}" link set ns1eth2 up + + ip -n "${ns2}" link set ns2eth1 up + ip -n "${ns2}" link set ns2eth2 up + ip -n "${ns2}" link set ns2eth3 up + + ip -n "${ns3}" link set ns3eth1 up + + ip -net "${ns1}" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version ${HSRv} proto 0 + ip -net "${ns2}" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 interlink ns2eth3 supervision 45 version ${HSRv} proto 0 + + ip -n "${ns1}" addr add 100.64.0.1/24 dev hsr1 + ip -n "${ns2}" addr add 100.64.0.2/24 dev hsr2 + ip -n "${ns3}" addr add 100.64.0.3/24 dev ns3eth1 + + ip -n "${ns1}" link set hsr1 up + ip -n "${ns2}" link set hsr2 up +} + +check_prerequisites +setup_ns ns1 ns2 ns3 + +setup_hsr_interfaces 1 +do_complete_ping_test + +exit $ret -- cgit v1.2.3 From 41ad836e393aa834039de48c84305dfe7d6aceef Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 24 Apr 2024 12:40:46 +0200 Subject: selftests: forwarding: add ability to assemble NETIFS array by driver name Allow driver tests to work without specifying the netdevice names. Introduce a possibility to search for available netdevices according to set driver name. Allow test to specify the name by setting NETIF_FIND_DRIVER variable. Note that user overrides this either by passing netdevice names on the command line or by declaring NETIFS array in custom forwarding.config configuration file. Signed-off-by: Jiri Pirko Reviewed-by: Petr Machata Reviewed-by: Benjamin Poirier Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 37 +++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 7913c6ee418d..9d6802c6c023 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -84,6 +84,43 @@ declare -A NETIFS=( # e.g. a low-power board. : "${KSFT_MACHINE_SLOW:=no}" +############################################################################## +# Find netifs by test-specified driver name + +driver_name_get() +{ + local dev=$1; shift + local driver_path="/sys/class/net/$dev/device/driver" + + if [[ -L $driver_path ]]; then + basename `realpath $driver_path` + fi +} + +netif_find_driver() +{ + local ifnames=`ip -j link show | jq -r ".[].ifname"` + local count=0 + + for ifname in $ifnames + do + local driver_name=`driver_name_get $ifname` + if [[ ! -z $driver_name && $driver_name == $NETIF_FIND_DRIVER ]]; then + count=$((count + 1)) + NETIFS[p$count]="$ifname" + fi + done +} + +# Whether to find netdevice according to the driver speficied by the importer +: "${NETIF_FIND_DRIVER:=}" + +if [[ $NETIF_FIND_DRIVER ]]; then + unset NETIFS + declare -A NETIFS + netif_find_driver +fi + net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") if [[ -f $net_forwarding_dir/forwarding.config ]]; then -- cgit v1.2.3 From 617198cbc69d94c6b5130a97e51598428398a7d0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 24 Apr 2024 12:40:47 +0200 Subject: selftests: forwarding: add check_driver() helper Add a helper to be used to check if the netdevice is backed by specified driver. Signed-off-by: Jiri Pirko Reviewed-by: Benjamin Poirier Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 9d6802c6c023..2d57912d3973 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -283,6 +283,18 @@ if [[ "$(id -u)" -ne 0 ]]; then exit $ksft_skip fi +check_driver() +{ + local dev=$1; shift + local expected=$1; shift + local driver_name=`driver_name_get $dev` + + if [[ $driver_name != $expected ]]; then + echo "SKIP: expected driver $expected for $dev, got $driver_name instead" + exit $ksft_skip + fi +} + if [[ "$CHECK_TC" = "yes" ]]; then check_tc_version fi -- cgit v1.2.3 From dae9dd5fd9f35f0e57599148d3655e9d473c8e24 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 24 Apr 2024 12:40:48 +0200 Subject: selftests: forwarding: add wait_for_dev() helper The existing setup_wait*() helper family check the status of the interface to be up. Introduce wait_for_dev() to wait for the netdevice to appear, for example after test script does manual device bind. Signed-off-by: Jiri Pirko Reviewed-by: Petr Machata Reviewed-by: Benjamin Poirier Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 2d57912d3973..3353a1745946 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -738,6 +738,19 @@ setup_wait() sleep $WAIT_TIME } +wait_for_dev() +{ + local dev=$1; shift + local timeout=${1:-$WAIT_TIMEOUT}; shift + + slowwait $timeout ip link show dev $dev &> /dev/null + if (( $? )); then + check_err 1 + log_test wait_for_dev "Interface $dev did not appear." + exit $EXIT_STATUS + fi +} + cmd_jq() { local cmd=$1 -- cgit v1.2.3 From ccfaed04db5e0f372986baac051b20fbd9e69096 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 24 Apr 2024 12:40:49 +0200 Subject: selftests: virtio_net: add initial tests Introduce initial tests for virtio_net driver. Focus on feature testing leveraging previously introduced debugfs feature filtering infrastructure. Add very basic ping and F_MAC feature tests. To run this, do: $ make -C tools/testing/selftests/ TARGETS=drivers/net/virtio_net/ run_tests Run it on a system with 2 virtio_net devices connected back-to-back on the hypervisor. Signed-off-by: Jiri Pirko Reviewed-by: Petr Machata Tested-by: Benjamin Poirier Signed-off-by: Paolo Abeni --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + .../selftests/drivers/net/virtio_net/Makefile | 15 +++ .../drivers/net/virtio_net/basic_features.sh | 131 +++++++++++++++++++++ .../selftests/drivers/net/virtio_net/config | 2 + .../drivers/net/virtio_net/virtio_net_common.sh | 99 ++++++++++++++++ 6 files changed, 249 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/virtio_net/Makefile create mode 100755 tools/testing/selftests/drivers/net/virtio_net/basic_features.sh create mode 100644 tools/testing/selftests/drivers/net/virtio_net/config create mode 100644 tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 209143bac37a..ab89edc6974d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23466,6 +23466,7 @@ F: include/linux/virtio*.h F: include/linux/vringh.h F: include/uapi/linux/virtio_*.h F: tools/virtio/ +F: tools/testing/selftests/drivers/net/virtio_net/ VIRTIO CRYPTO DRIVER M: Gonglei diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index c785b6256a45..2c940e9c4ced 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -20,6 +20,7 @@ TARGETS += drivers/s390x/uvdevice TARGETS += drivers/net TARGETS += drivers/net/bonding TARGETS += drivers/net/team +TARGETS += drivers/net/virtio_net TARGETS += dt TARGETS += efivarfs TARGETS += exec diff --git a/tools/testing/selftests/drivers/net/virtio_net/Makefile b/tools/testing/selftests/drivers/net/virtio_net/Makefile new file mode 100644 index 000000000000..7ec7cd3ab2cc --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0+ OR MIT + +TEST_PROGS = basic_features.sh \ + # + +TEST_FILES = \ + virtio_net_common.sh \ + # + +TEST_INCLUDES = \ + ../../../net/forwarding/lib.sh \ + ../../../net/lib.sh \ + # + +include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh new file mode 100755 index 000000000000..cf8cf816ed48 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# See virtio_net_common.sh comments for more details about assumed setup + +ALL_TESTS=" + initial_ping_test + f_mac_test +" + +source virtio_net_common.sh + +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +h1=${NETIFS[p1]} +h2=${NETIFS[p2]} + +h1_create() +{ + simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h1_destroy() +{ + simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h2_create() +{ + simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +h2_destroy() +{ + simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +initial_ping_test() +{ + setup_cleanup + setup_prepare + ping_test $h1 $H2_IPV4 " simple" +} + +f_mac_test() +{ + RET=0 + local test_name="mac feature filtered" + + virtio_feature_present $h1 $VIRTIO_NET_F_MAC + if [ $? -ne 0 ]; then + log_test_skip "$test_name" "Device $h1 is missing feature $VIRTIO_NET_F_MAC." + return 0 + fi + virtio_feature_present $h1 $VIRTIO_NET_F_MAC + if [ $? -ne 0 ]; then + log_test_skip "$test_name" "Device $h2 is missing feature $VIRTIO_NET_F_MAC." + return 0 + fi + + setup_cleanup + setup_prepare + + grep -q 0 /sys/class/net/$h1/addr_assign_type + check_err $? "Permanent address assign type for $h1 is not set" + grep -q 0 /sys/class/net/$h2/addr_assign_type + check_err $? "Permanent address assign type for $h2 is not set" + + setup_cleanup + virtio_filter_feature_add $h1 $VIRTIO_NET_F_MAC + virtio_filter_feature_add $h2 $VIRTIO_NET_F_MAC + setup_prepare + + grep -q 0 /sys/class/net/$h1/addr_assign_type + check_fail $? "Permanent address assign type for $h1 is set when F_MAC feature is filtered" + grep -q 0 /sys/class/net/$h2/addr_assign_type + check_fail $? "Permanent address assign type for $h2 is set when F_MAC feature is filtered" + + ping_do $h1 $H2_IPV4 + check_err $? "Ping failed" + + log_test "$test_name" +} + +setup_prepare() +{ + virtio_device_rebind $h1 + virtio_device_rebind $h2 + wait_for_dev $h1 + wait_for_dev $h2 + + vrf_prepare + + h1_create + h2_create +} + +setup_cleanup() +{ + h2_destroy + h1_destroy + + vrf_cleanup + + virtio_filter_features_clear $h1 + virtio_filter_features_clear $h2 + virtio_device_rebind $h1 + virtio_device_rebind $h2 + wait_for_dev $h1 + wait_for_dev $h2 +} + +cleanup() +{ + pre_cleanup + setup_cleanup +} + +check_driver $h1 "virtio_net" +check_driver $h2 "virtio_net" +check_virtio_debugfs $h1 +check_virtio_debugfs $h2 + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/virtio_net/config b/tools/testing/selftests/drivers/net/virtio_net/config new file mode 100644 index 000000000000..f35de0542b60 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/config @@ -0,0 +1,2 @@ +CONFIG_VIRTIO_NET=y +CONFIG_VIRTIO_DEBUG=y diff --git a/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh new file mode 100644 index 000000000000..57bd8055e2e5 --- /dev/null +++ b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This assumes running on a host with two virtio interfaces connected +# back to back. Example script to do such wire-up of tap devices would +# look like this: +# +# ======================================================================================================= +# #!/bin/bash +# +# DEV1="$1" +# DEV2="$2" +# +# sudo tc qdisc add dev $DEV1 clsact +# sudo tc qdisc add dev $DEV2 clsact +# sudo tc filter add dev $DEV1 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV2 +# sudo tc filter add dev $DEV2 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV1 +# sudo ip link set $DEV1 up +# sudo ip link set $DEV2 up +# ======================================================================================================= + +REQUIRE_MZ="no" +NETIF_CREATE="no" +NETIF_FIND_DRIVER="virtio_net" +NUM_NETIFS=2 + +H1_IPV4="192.0.2.1" +H2_IPV4="192.0.2.2" +H1_IPV6="2001:db8:1::1" +H2_IPV6="2001:db8:1::2" + +VIRTIO_NET_F_MAC=5 + +virtio_device_get() +{ + local dev=$1; shift + local device_path="/sys/class/net/$dev/device/" + + basename `realpath $device_path` +} + +virtio_device_rebind() +{ + local dev=$1; shift + local device=`virtio_device_get $dev` + + echo "$device" > /sys/bus/virtio/drivers/virtio_net/unbind + echo "$device" > /sys/bus/virtio/drivers/virtio_net/bind +} + +virtio_debugfs_get() +{ + local dev=$1; shift + local device=`virtio_device_get $dev` + + echo /sys/kernel/debug/virtio/$device/ +} + +check_virtio_debugfs() +{ + local dev=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + if [ ! -f "$debugfs/device_features" ] || + [ ! -f "$debugfs/filter_feature_add" ] || + [ ! -f "$debugfs/filter_feature_del" ] || + [ ! -f "$debugfs/filter_features" ] || + [ ! -f "$debugfs/filter_features_clear" ]; then + echo "SKIP: not possible to access debugfs for $dev" + exit $ksft_skip + fi +} + +virtio_feature_present() +{ + local dev=$1; shift + local feature=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + cat $debugfs/device_features |grep "^$feature$" &> /dev/null + return $? +} + +virtio_filter_features_clear() +{ + local dev=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + echo "1" > $debugfs/filter_features_clear +} + +virtio_filter_feature_add() +{ + local dev=$1; shift + local feature=$1; shift + local debugfs=`virtio_debugfs_get $dev` + + echo "$feature" > $debugfs/filter_feature_add +} -- cgit v1.2.3 From 6e25bcf06af0341691f7058e17e04800f6a19e26 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Fri, 26 Apr 2024 16:51:58 +0200 Subject: bpf_helpers.h: Define bpf_tail_call_static when building with GCC The definition of bpf_tail_call_static in tools/lib/bpf/bpf_helpers.h is guarded by a preprocessor check to assure that clang is recent enough to support it. This patch updates the guard so the function is compiled when using GCC 13 or later as well. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240426145158.14409-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_helpers.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cd17f6d0791f..62e1c0cc4a59 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -137,7 +137,8 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12) +#if defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -165,6 +166,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } #endif +#endif enum libbpf_pin_type { LIBBPF_PIN_NONE, -- cgit v1.2.3 From 7cd6750d9a560fa69bb640a7280479d6a67999ad Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:10 +0000 Subject: selftests/bpf: Test PROBE_MEM of VSYSCALL_ADDR on x86-64 The vsyscall is a legacy API for fast execution of system calls. It maps a page at address VSYSCALL_ADDR into the userspace program. This address is in the top 10MB of the address space: ffffffffff600000 - ffffffffff600fff | 4 kB | legacy vsyscall ABI The last commit fixes the x86-64 BPF JIT to skip accessing addresses in this memory region. Add this address to bpf_testmod_return_ptr() so we can make sure that it is fixed. After this change and without the previous commit, subprogs_extable selftest will crash the kernel. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 39ad96a18123..edcd26106557 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -205,6 +205,9 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg) case 5: return (void *)~(1ull << 30); /* trigger extable */ case 6: return &f; /* valid addr */ case 7: return (void *)((long)&f | 1); /* kernel tricks */ +#ifdef CONFIG_X86_64 + case 8: return (void *)VSYSCALL_ADDR; /* vsyscall page address */ +#endif default: return NULL; } } -- cgit v1.2.3 From d7bd0aeb5ab6609b71ca87db4ec6d8bba24b2209 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 26 Apr 2024 17:48:50 +0800 Subject: ACPI: tools: pfrut: Print the update_cap field during capability query There is request from the end user to print this field to better query what type of update capability is supported on this platform. Signed-off-by: Chen Yu Signed-off-by: Rafael J. Wysocki --- tools/power/acpi/tools/pfrut/pfrut.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c index 388c9e3ad040..44a9ecbd91e8 100644 --- a/tools/power/acpi/tools/pfrut/pfrut.c +++ b/tools/power/acpi/tools/pfrut/pfrut.c @@ -174,6 +174,8 @@ void print_cap(struct pfru_update_cap_info *cap) exit(1); } + printf("update capability:%d\n", cap->update_cap); + uuid_unparse(cap->code_type, uuid); printf("code injection image type:%s\n", uuid); printf("fw_version:%d\n", cap->fw_version); -- cgit v1.2.3 From f8ac9b0fab33ed138da80b95b94d8be16b07c6fd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2024 15:23:39 -0700 Subject: selftests: drv-net: extend the README with more info and example Add more info to the README. It's also now copied to GitHub for increased visibility: https://github.com/linux-netdev/nipa/wiki/Running-driver-tests Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240425222341.309778-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/README.rst | 97 ++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index 0cbab33dad1f..3b6a29e6564b 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -1,18 +1,42 @@ -Running tests -============= +.. SPDX-License-Identifier: GPL-2.0 -Tests are executed within kselftest framework like any other tests. -By default tests execute against software drivers such as netdevsim. -All tests must support running against a real device (SW-only tests -should instead be placed in net/ or drivers/net/netdevsim, HW-only -tests in drivers/net/hw). +Running driver tests +==================== -Set appropriate variables to point the tests at a real device. +Networking driver tests are executed within kselftest framework like any +other tests. They support testing both real device drivers and emulated / +software drivers (latter mostly to test the core parts of the stack). + +SW mode +~~~~~~~ + +By default, when no extra parameters are set or exported, tests execute +against software drivers such as netdevsim. No extra preparation is required +the software devices are created and destroyed as part of the test. +In this mode the tests are indistinguishable from other selftests and +(for example) can be run under ``virtme-ng`` like the core networking selftests. + +HW mode +~~~~~~~ + +Executing tests against a real device requires external preparation. +The netdevice against which tests will be run must exist, be running +(in UP state) and be configured with an IP address. + +Refer to list of :ref:`Variables` later in this file to set up running +the tests against a real device. + +Both modes required +~~~~~~~~~~~~~~~~~~~ + +All tests in drivers/net must support running both against a software device +and a real device. SW-only tests should instead be placed in net/ or +drivers/net/netdevsim, HW-only tests in drivers/net/hw. Variables ========= -Variables can be set in the environment or by creating a net.config +The variables can be set in the environment or by creating a net.config file in the same directory as this README file. Example:: $ NETIF=eth0 ./some_test.sh @@ -23,9 +47,9 @@ or:: # Variable set in a file NETIF=eth0 -Please note that the config parser is very simple, if there are -any non-alphanumeric characters in the value it needs to be in -double quotes. +Local test (which don't require endpoint for sending / receiving traffic) +need only the ``NETIF`` variable. Remaining variables define the endpoint +and communication method. NETIF ~~~~~ @@ -61,3 +85,52 @@ Communication channel dependent:: for netns - name of the "remote" namespace for ssh - name/address of the remote host + +Example +======= + +Build the selftests:: + + # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw" + +"Install" the tests and copy them over to the target machine:: + + # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw" \ + install INSTALL_PATH=/tmp/ksft-net-drv + + # rsync -ra --delete /tmp/ksft-net-drv root@192.168.1.1:/root/ + +On the target machine, running the tests will use netdevsim by default:: + + [/root] # ./ksft-net-drv/run_kselftest.sh -t drivers/net:ping.py + TAP version 13 + 1..1 + # timeout set to 45 + # selftests: drivers/net: ping.py + # KTAP version 1 + # 1..3 + # ok 1 ping.test_v4 + # ok 2 ping.test_v6 + # ok 3 ping.test_tcp + # # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 + ok 1 selftests: drivers/net: ping.py + +Create a config with remote info:: + + [/root] # cat > ./ksft-net-drv/drivers/net/net.config < Date: Thu, 25 Apr 2024 15:23:40 -0700 Subject: selftests: drv-net: reimplement the config parser The shell lexer is not helping much, do very basic parsing manually. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240425222341.309778-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 26 ++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index a3db1bb1afeb..6f57bd5c0ed7 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 import os -import shlex from pathlib import Path from lib.py import KsftSkipEx from lib.py import cmd, ip @@ -16,17 +15,20 @@ def _load_env_file(src_path): if not (src_dir / "net.config").exists(): return env - lexer = shlex.shlex(open((src_dir / "net.config").as_posix(), 'r').read()) - k = None - for token in lexer: - if k is None: - k = token - env[k] = "" - elif token == "=": - pass - else: - env[k] = token - k = None + with open((src_dir / "net.config").as_posix(), 'r') as fp: + for line in fp.readlines(): + full_file = line + # Strip comments + pos = line.find("#") + if pos >= 0: + line = line[:pos] + line = line.strip() + if not line: + continue + pair = line.split('=', maxsplit=1) + if len(pair) != 2: + raise Exception("Can't parse configuration line:", full_file) + env[pair[0]] = pair[1] return env -- cgit v1.2.3 From 340ab206ce5c673aab23d0197d3a0e2bccb86d74 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2024 15:23:41 -0700 Subject: selftests: drv-net: validate the environment Throw a slightly more helpful exception when env variables are partially populated. Prior to this change we'd get a dictionary key exception somewhere later on. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240425222341.309778-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 25 +++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 6f57bd5c0ed7..e2ab637e56dc 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -88,6 +88,7 @@ class NetDrvEpEnv: self._ns_peer = None if "NETIF" in self.env: + self._check_env() self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] self.v4 = self.env.get("LOCAL_V4") @@ -143,6 +144,30 @@ class NetDrvEpEnv: ip(f"-6 addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v6_pfx}2/64 nodad", ns=self._netns) ip(f" link set dev {self._ns_peer.nsims[0].ifname} up", ns=self._netns) + def _check_env(self): + vars_needed = [ + ["LOCAL_V4", "LOCAL_V6"], + ["REMOTE_V4", "REMOTE_V6"], + ["REMOTE_TYPE"], + ["REMOTE_ARGS"] + ] + missing = [] + + for choice in vars_needed: + for entry in choice: + if entry in self.env: + break + else: + missing.append(choice) + # Make sure v4 / v6 configs are symmetric + if ("LOCAL_V6" in self.env) != ("REMOTE_V6" in self.env): + missing.append(["LOCAL_V6", "REMOTE_V6"]) + if ("LOCAL_V4" in self.env) != ("REMOTE_V4" in self.env): + missing.append(["LOCAL_V4", "REMOTE_V4"]) + if missing: + raise Exception("Invalid environment, missing configuration:", missing, + "Please see tools/testing/selftests/drivers/net/README.rst") + def __enter__(self): return self -- cgit v1.2.3 From 5c4c0edca68a5841a8d53ccd49596fe199c8334c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2024 17:31:11 -0700 Subject: tools: ynl: don't append doc of missing type directly to the type When using YNL in tests appending the doc string to the type name makes it harder to check that we got the correct error. Put the doc under a separate key. Reviewed-by: Donald Hunter Link: https://lore.kernel.org/r/20240426003111.359285-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 35f82a2c2247..35e666928119 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -233,10 +233,9 @@ class NlMsg: miss_type = self.extack['miss-type'] if miss_type in attr_space.attrs_by_val: spec = attr_space.attrs_by_val[miss_type] - desc = spec['name'] + self.extack['miss-type'] = spec['name'] if 'doc' in spec: - desc += f" ({spec['doc']})" - self.extack['miss-type'] = desc + self.extack['miss-type-doc'] = spec['doc'] def _decode_policy(self, raw): policy = {} -- cgit v1.2.3 From 7255fcc80d4b525cc10cfaaf7f485830d4ed2000 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Apr 2024 16:45:37 -0300 Subject: perf tests shell kprobes: Add missing description as used by 'perf test' output Before: root@x1:~# perf test 76 76: SPDX-License-Identifier: GPL-2.0 : Ok root@x1:~# After: root@x1:~# perf test 76 76: Add 'perf probe's, list and remove them. : Ok root@x1:~# Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Athira Rajeev Cc: Jiri Olsa Cc: Kajol Jain Cc: Michael Petlan Cc: Namhyung Kim Cc: Veronika Molnarova Link: https://lore.kernel.org/lkml/ZigRDKUGkcDqD-yW@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/base_probe/test_adding_kernel.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh index a5d707efad85..63bb8974b38e 100755 --- a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh +++ b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Add 'perf probe's, list and remove them # SPDX-License-Identifier: GPL-2.0 # -- cgit v1.2.3 From cd88c11c6d89bcd1851736d853eca57bbde1b042 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Apr 2024 17:23:27 -0300 Subject: tools lib rbtree: Pick some improvements from the kernel rbtree code The tools/lib/rbtree.c code came from the kernel, removing the EXPORT_SYMBOL() that make sense only there, unfortunately it is not being checked with tools/perf/check_headers.sh, will try to remedy this, till then pick the improvements from: b0687c1119b4e8c8 ("lib/rbtree: use '+' instead of '|' for setting color.") That I noticed by doing: diff -u tools/lib/rbtree.c lib/rbtree.c diff -u tools/include/linux/rbtree_augmented.h include/linux/rbtree_augmented.h There is one other cases, but lets pick it in separate patches. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Andrew Morton Cc: Noah Goldstein Link: https://lore.kernel.org/lkml/ZigZzeFoukzRKG1Q@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/rbtree_augmented.h | 4 ++-- tools/lib/rbtree.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 570bb9794421..95483c7d81df 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -158,13 +158,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; + rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { - rb->__rb_parent_color = (unsigned long)p | color; + rb->__rb_parent_color = (unsigned long)p + color; } static inline void diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 727396de6be5..9e7307186b7f 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -58,7 +58,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color |= RB_BLACK; + rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) -- cgit v1.2.3 From e0c48bf9e80ceefb83d74999adeeddbdd95f4c1d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 23 Apr 2024 16:32:48 +0300 Subject: perf scripts python: Add a script to run instances of 'perf script' in parallel Add a Python script to run a perf script command multiple times in parallel, using perf script options --cpu and --time so that each job processes a different chunk of the data. Extend perf script tests to test also the new script. The script supports the use of normal 'perf script' options like --dlfilter and --script, so that the benefit of running parallel jobs naturally extends to them also. In addition, a command can be provided (refer --pipe-to option) to pipe standard output to a custom command. Refer to the script's own help text at the end of the patch for more details. The script is useful for Intel PT traces, that can be efficiently decoded by 'perf script' when split by CPU and/or time ranges. Running jobs in parallel can decrease the overall decoding time. Committer testing: Ian reported that shellcheck found some issues, I installed it as there are no warnings about it not being available, but when available it fails the build with: TEST /tmp/build/perf-tools-next/tests/shell/script.sh.shellcheck_log CC /tmp/build/perf-tools-next/util/header.o In tests/shell/script.sh line 20: rm -rf "${temp_dir}/"* ^-------------^ SC2115 (warning): Use "${var:?}" to ensure this never expands to /* . In tests/shell/script.sh line 83: output1_dir="${temp_dir}/output1" ^---------^ SC2034 (warning): output1_dir appears unused. Verify use (or export if used externally). In tests/shell/script.sh line 84: output2_dir="${temp_dir}/output2" ^---------^ SC2034 (warning): output2_dir appears unused. Verify use (or export if used externally). In tests/shell/script.sh line 86: python3 "${pp}" -o "${output_dir}" --jobs 4 --verbose -- perf script -i "${perf_data}" ^-----------^ SC2154 (warning): output_dir is referenced but not assigned (did you mean 'output1_dir'?). For more information: https://www.shellcheck.net/wiki/SC2034 -- output1_dir appears unused. Verif... https://www.shellcheck.net/wiki/SC2115 -- Use "${var:?}" to ensure this nev... https://www.shellcheck.net/wiki/SC2154 -- output_dir is referenced but not ... Did these fixes: - rm -rf "${temp_dir}/"* + rm -rf "${temp_dir:?}/"* And: @@ -83,8 +83,8 @@ test_parallel_perf() output1_dir="${temp_dir}/output1" output2_dir="${temp_dir}/output2" perf record -o "${perf_data}" --sample-cpu uname - python3 "${pp}" -o "${output_dir}" --jobs 4 --verbose -- perf script -i "${perf_data}" - python3 "${pp}" -o "${output_dir}" --jobs 4 --verbose --per-cpu -- perf script -i "${perf_data}" + python3 "${pp}" -o "${output1_dir}" --jobs 4 --verbose -- perf script -i "${perf_data}" + python3 "${pp}" -o "${output2_dir}" --jobs 4 --verbose --per-cpu -- perf script -i "${perf_data}" After that: root@number:~# perf test -vv "perf script tests" 97: perf script tests: --- start --- test child forked, pid 4084139 DB test [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.032 MB /tmp/perf-test-script.T4MJDr0L6J/perf.data (7 samples) ] DB test [Success] parallel-perf test Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.034 MB /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data (7 samples) ] Starting: perf script --time=,91898.301878499 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --time=91898.301878500,91898.301905999 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --time=91898.301906000,91898.301933499 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --time=91898.301933500, -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --time=91898.301878500,91898.301905999 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --time=91898.301906000,91898.301933499 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 4 jobs: 2 completed, 2 running Finished: perf script --time=,91898.301878499 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --time=91898.301933500, -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 4 jobs: 4 completed, 0 running All jobs finished successfully parallel-perf.py done Starting: perf script --cpu=0 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=1 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=2 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=3 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=0 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=1 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=2 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=3 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 4 completed, 0 running Starting: perf script --cpu=4 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=5 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=6 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=7 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=4 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=5 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=6 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=7 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 8 completed, 0 running Starting: perf script --cpu=8 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=9 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=10 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=11 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=8 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=9 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=10 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=11 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 12 completed, 0 running Starting: perf script --cpu=12 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=13 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=14 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=15 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=12 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=13 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=14 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=15 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 16 completed, 0 running Starting: perf script --cpu=16 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=17 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=18 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=19 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=16 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=17 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=18 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=19 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 20 completed, 0 running Starting: perf script --cpu=20 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=21 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=22 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=23 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=20 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=21 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=22 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=23 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 24 completed, 0 running Starting: perf script --cpu=24 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=25 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=26 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Starting: perf script --cpu=27 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=25 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=26 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data Finished: perf script --cpu=27 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 27 completed, 1 running Finished: perf script --cpu=24 -i /tmp/perf-test-script.T4MJDr0L6J/pp-perf.data There are 28 jobs: 28 completed, 0 running All jobs finished successfully parallel-perf.py done parallel-perf test [Success] --- Cleaning up --- ---- end(0) ---- 97: perf script tests : Ok root@number:~# Reviewed-by: Andi Kleen Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240423133248.10206-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/parallel-perf.py | 988 +++++++++++++++++++++++++++++ tools/perf/tests/shell/script.sh | 26 +- 2 files changed, 1013 insertions(+), 1 deletion(-) create mode 100755 tools/perf/scripts/python/parallel-perf.py (limited to 'tools') diff --git a/tools/perf/scripts/python/parallel-perf.py b/tools/perf/scripts/python/parallel-perf.py new file mode 100755 index 000000000000..21f32ec5ed46 --- /dev/null +++ b/tools/perf/scripts/python/parallel-perf.py @@ -0,0 +1,988 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Run a perf script command multiple times in parallel, using perf script +# options --cpu and --time so that each job processes a different chunk +# of the data. +# +# Copyright (c) 2024, Intel Corporation. + +import subprocess +import argparse +import pathlib +import shlex +import time +import copy +import sys +import os +import re + +glb_prog_name = "parallel-perf.py" +glb_min_interval = 10.0 +glb_min_samples = 64 + +class Verbosity(): + + def __init__(self, quiet=False, verbose=False, debug=False): + self.normal = True + self.verbose = verbose + self.debug = debug + self.self_test = True + if self.debug: + self.verbose = True + if self.verbose: + quiet = False + if quiet: + self.normal = False + +# Manage work (Start/Wait/Kill), as represented by a subprocess.Popen command +class Work(): + + def __init__(self, cmd, pipe_to, output_dir="."): + self.popen = None + self.consumer = None + self.cmd = cmd + self.pipe_to = pipe_to + self.output_dir = output_dir + self.cmdout_name = f"{output_dir}/cmd.txt" + self.stdout_name = f"{output_dir}/out.txt" + self.stderr_name = f"{output_dir}/err.txt" + + def Command(self): + sh_cmd = [ shlex.quote(x) for x in self.cmd ] + return " ".join(self.cmd) + + def Stdout(self): + return open(self.stdout_name, "w") + + def Stderr(self): + return open(self.stderr_name, "w") + + def CreateOutputDir(self): + pathlib.Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + def Start(self): + if self.popen: + return + self.CreateOutputDir() + with open(self.cmdout_name, "w") as f: + f.write(self.Command()) + f.write("\n") + stdout = self.Stdout() + stderr = self.Stderr() + if self.pipe_to: + self.popen = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=stderr) + args = shlex.split(self.pipe_to) + self.consumer = subprocess.Popen(args, stdin=self.popen.stdout, stdout=stdout, stderr=stderr) + else: + self.popen = subprocess.Popen(self.cmd, stdout=stdout, stderr=stderr) + + def RemoveEmptyErrFile(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) == 0: + os.unlink(self.stderr_name) + + def Errors(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) != 0: + return [ f"Non-empty error file {self.stderr_name}" ] + return [] + + def TidyUp(self): + self.RemoveEmptyErrFile() + + def RawPollWait(self, p, wait): + if wait: + return p.wait() + return p.poll() + + def Poll(self, wait=False): + if not self.popen: + return None + result = self.RawPollWait(self.popen, wait) + if self.consumer: + res = result + result = self.RawPollWait(self.consumer, wait) + if result != None and res == None: + self.popen.kill() + result = None + elif result == 0 and res != None and res != 0: + result = res + if result != None: + self.TidyUp() + return result + + def Wait(self): + return self.Poll(wait=True) + + def Kill(self): + if not self.popen: + return + self.popen.kill() + if self.consumer: + self.consumer.kill() + +def KillWork(worklist, verbosity): + for w in worklist: + w.Kill() + for w in worklist: + w.Wait() + +def NumberOfCPUs(): + return os.sysconf("SC_NPROCESSORS_ONLN") + +def NanoSecsToSecsStr(x): + if x == None: + return "" + x = str(x) + if len(x) < 10: + x = "0" * (10 - len(x)) + x + return x[:len(x) - 9] + "." + x[-9:] + +def InsertOptionAfter(cmd, option, after): + try: + pos = cmd.index(after) + cmd.insert(pos + 1, option) + except: + cmd.append(option) + +def CreateWorkList(cmd, pipe_to, output_dir, cpus, time_ranges_by_cpu): + max_len = len(str(cpus[-1])) + cpu_dir_fmt = f"cpu-%.{max_len}u" + worklist = [] + pos = 0 + for cpu in cpus: + if cpu >= 0: + cpu_dir = os.path.join(output_dir, cpu_dir_fmt % cpu) + cpu_option = f"--cpu={cpu}" + else: + cpu_dir = output_dir + cpu_option = None + + tr_dir_fmt = "time-range" + + if len(time_ranges_by_cpu) > 1: + time_ranges = time_ranges_by_cpu[pos] + tr_dir_fmt += f"-{pos}" + pos += 1 + else: + time_ranges = time_ranges_by_cpu[0] + + max_len = len(str(len(time_ranges))) + tr_dir_fmt += f"-%.{max_len}u" + + i = 0 + for r in time_ranges: + if r == [None, None]: + time_option = None + work_output_dir = cpu_dir + else: + time_option = "--time=" + NanoSecsToSecsStr(r[0]) + "," + NanoSecsToSecsStr(r[1]) + work_output_dir = os.path.join(cpu_dir, tr_dir_fmt % i) + i += 1 + work_cmd = list(cmd) + if time_option != None: + InsertOptionAfter(work_cmd, time_option, "script") + if cpu_option != None: + InsertOptionAfter(work_cmd, cpu_option, "script") + w = Work(work_cmd, pipe_to, work_output_dir) + worklist.append(w) + return worklist + +def DoRunWork(worklist, nr_jobs, verbosity): + nr_to_do = len(worklist) + not_started = list(worklist) + running = [] + done = [] + chg = False + while True: + nr_done = len(done) + if chg and verbosity.normal: + nr_run = len(running) + print(f"\rThere are {nr_to_do} jobs: {nr_done} completed, {nr_run} running", flush=True, end=" ") + if verbosity.verbose: + print() + chg = False + if nr_done == nr_to_do: + break + while len(running) < nr_jobs and len(not_started): + w = not_started.pop(0) + running.append(w) + if verbosity.verbose: + print("Starting:", w.Command()) + w.Start() + chg = True + if len(running): + time.sleep(0.1) + finished = [] + not_finished = [] + while len(running): + w = running.pop(0) + r = w.Poll() + if r == None: + not_finished.append(w) + continue + if r == 0: + if verbosity.verbose: + print("Finished:", w.Command()) + finished.append(w) + chg = True + continue + if verbosity.normal and not verbosity.verbose: + print() + print("Job failed!\n return code:", r, "\n command: ", w.Command()) + if w.pipe_to: + print(" piped to: ", w.pipe_to) + print("Killing outstanding jobs") + KillWork(not_finished, verbosity) + KillWork(running, verbosity) + return False + running = not_finished + done += finished + errorlist = [] + for w in worklist: + errorlist += w.Errors() + if len(errorlist): + print("Errors:") + for e in errorlist: + print(e) + elif verbosity.normal: + print("\r"," "*50, "\rAll jobs finished successfully", flush=True) + return True + +def RunWork(worklist, nr_jobs=NumberOfCPUs(), verbosity=Verbosity()): + try: + return DoRunWork(worklist, nr_jobs, verbosity) + except: + for w in worklist: + w.Kill() + raise + return True + +def ReadHeader(perf, file_name): + return subprocess.Popen([perf, "script", "--header-only", "--input", file_name], stdout=subprocess.PIPE).stdout.read().decode("utf-8") + +def ParseHeader(hdr): + result = {} + lines = hdr.split("\n") + for line in lines: + if ":" in line and line[0] == "#": + pos = line.index(":") + name = line[1:pos-1].strip() + value = line[pos+1:].strip() + if name in result: + orig_name = name + nr = 2 + while True: + name = f"{orig_name} {nr}" + if name not in result: + break + nr += 1 + result[name] = value + return result + +def HeaderField(hdr_dict, hdr_fld): + if hdr_fld not in hdr_dict: + raise Exception(f"'{hdr_fld}' missing from header information") + return hdr_dict[hdr_fld] + +# Represent the position of an option within a command string +# and provide the option value and/or remove the option +class OptPos(): + + def Init(self, opt_element=-1, value_element=-1, opt_pos=-1, value_pos=-1, error=None): + self.opt_element = opt_element # list element that contains option + self.value_element = value_element # list element that contains option value + self.opt_pos = opt_pos # string position of option + self.value_pos = value_pos # string position of value + self.error = error # error message string + + def __init__(self, args, short_name, long_name, default=None): + self.args = list(args) + self.default = default + n = 2 + len(long_name) + m = len(short_name) + pos = -1 + for opt in args: + pos += 1 + if m and opt[:2] == f"-{short_name}": + if len(opt) == 2: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, 0, 2) + return + if opt[:n] == f"--{long_name}": + if len(opt) == n: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"--{long_name} option missing value") + elif opt[n] == "=": + self.Init(pos, pos, 0, n + 1) + else: + self.Init(error = f"--{long_name} option expected '='") + return + if m and opt[:1] == "-" and opt[:2] != "--" and short_name in opt: + ipos = opt.index(short_name) + if "-" in opt[1:]: + hpos = opt[1:].index("-") + if hpos < ipos: + continue + if ipos + 1 == len(opt): + if pos + 1 < len(args): + self.Init(pos, pos + 1, ipos, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, ipos, ipos + 1) + return + self.Init() + + def Value(self): + if self.opt_element >= 0: + if self.opt_element != self.value_element: + return self.args[self.value_element] + else: + return self.args[self.value_element][self.value_pos:] + return self.default + + def Remove(self, args): + if self.opt_element == -1: + return + if self.opt_element != self.value_element: + del args[self.value_element] + if self.opt_pos: + args[self.opt_element] = args[self.opt_element][:self.opt_pos] + else: + del args[self.opt_element] + +def DetermineInputFileName(cmd): + p = OptPos(cmd, "i", "input", "perf.data") + if p.error: + raise Exception(f"perf command {p.error}") + file_name = p.Value() + if not os.path.exists(file_name): + raise Exception(f"perf command input file '{file_name}' not found") + return file_name + +def ReadOption(args, short_name, long_name, err_prefix, remove=False): + p = OptPos(args, short_name, long_name) + if p.error: + raise Exception(f"{err_prefix}{p.error}") + value = p.Value() + if remove: + p.Remove(args) + return value + +def ExtractOption(args, short_name, long_name, err_prefix): + return ReadOption(args, short_name, long_name, err_prefix, True) + +def ReadPerfOption(args, short_name, long_name): + return ReadOption(args, short_name, long_name, "perf command ") + +def ExtractPerfOption(args, short_name, long_name): + return ExtractOption(args, short_name, long_name, "perf command ") + +def PerfDoubleQuickCommands(cmd, file_name): + cpu_str = ReadPerfOption(cmd, "C", "cpu") + time_str = ReadPerfOption(cmd, "", "time") + # Use double-quick sampling to determine trace data density + times_cmd = ["perf", "script", "--ns", "--input", file_name, "--itrace=qqi"] + if cpu_str != None and cpu_str != "": + times_cmd.append(f"--cpu={cpu_str}") + if time_str != None and time_str != "": + times_cmd.append(f"--time={time_str}") + cnts_cmd = list(times_cmd) + cnts_cmd.append("-Fcpu") + times_cmd.append("-Fcpu,time") + return cnts_cmd, times_cmd + +class CPUTimeRange(): + def __init__(self, cpu): + self.cpu = cpu + self.sample_cnt = 0 + self.time_ranges = None + self.interval = 0 + self.interval_remaining = 0 + self.remaining = 0 + self.tr_pos = 0 + +def CalcTimeRangesByCPU(line, cpu, cpu_time_ranges, max_time): + cpu_time_range = cpu_time_ranges[cpu] + cpu_time_range.remaining -= 1 + cpu_time_range.interval_remaining -= 1 + if cpu_time_range.remaining == 0: + cpu_time_range.time_ranges[cpu_time_range.tr_pos][1] = max_time + return + if cpu_time_range.interval_remaining == 0: + time = TimeVal(line[1][:-1], 0) + time_ranges = cpu_time_range.time_ranges + time_ranges[cpu_time_range.tr_pos][1] = time - 1 + time_ranges.append([time, max_time]) + cpu_time_range.tr_pos += 1 + cpu_time_range.interval_remaining = cpu_time_range.interval + +def CountSamplesByCPU(line, cpu, cpu_time_ranges): + try: + cpu_time_ranges[cpu].sample_cnt += 1 + except: + print("exception") + print("cpu", cpu) + print("len(cpu_time_ranges)", len(cpu_time_ranges)) + raise + +def ProcessCommandOutputLines(cmd, per_cpu, fn, *x): + # Assume CPU number is at beginning of line and enclosed by [] + pat = re.compile(r"\s*\[[0-9]+\]") + p = subprocess.Popen(cmd, stdout=subprocess.PIPE) + while True: + if line := p.stdout.readline(): + line = line.decode("utf-8") + if pat.match(line): + line = line.split() + if per_cpu: + # Assumes CPU number is enclosed by [] + cpu = int(line[0][1:-1]) + else: + cpu = 0 + fn(line, cpu, *x) + else: + break + p.wait() + +def IntersectTimeRanges(new_time_ranges, time_ranges): + pos = 0 + new_pos = 0 + # Can assume len(time_ranges) != 0 and len(new_time_ranges) != 0 + # Note also, there *must* be at least one intersection. + while pos < len(time_ranges) and new_pos < len(new_time_ranges): + # new end < old start => no intersection, remove new + if new_time_ranges[new_pos][1] < time_ranges[pos][0]: + del new_time_ranges[new_pos] + continue + # new start > old end => no intersection, check next + if new_time_ranges[new_pos][0] > time_ranges[pos][1]: + pos += 1 + if pos < len(time_ranges): + continue + # no next, so remove remaining + while new_pos < len(new_time_ranges): + del new_time_ranges[new_pos] + return + # Found an intersection + # new start < old start => adjust new start = old start + if new_time_ranges[new_pos][0] < time_ranges[pos][0]: + new_time_ranges[new_pos][0] = time_ranges[pos][0] + # new end > old end => keep the overlap, insert the remainder + if new_time_ranges[new_pos][1] > time_ranges[pos][1]: + r = [ time_ranges[pos][1] + 1, new_time_ranges[new_pos][1] ] + new_time_ranges[new_pos][1] = time_ranges[pos][1] + new_pos += 1 + new_time_ranges.insert(new_pos, r) + continue + # new [start, end] is within old [start, end] + new_pos += 1 + +def SplitTimeRangesByTraceDataDensity(time_ranges, cpus, nr, cmd, file_name, per_cpu, min_size, min_interval, verbosity): + if verbosity.normal: + print("\rAnalyzing...", flush=True, end=" ") + if verbosity.verbose: + print() + cnts_cmd, times_cmd = PerfDoubleQuickCommands(cmd, file_name) + + nr_cpus = cpus[-1] + 1 if per_cpu else 1 + if per_cpu: + nr_cpus = cpus[-1] + 1 + cpu_time_ranges = [ CPUTimeRange(cpu) for cpu in range(nr_cpus) ] + else: + nr_cpus = 1 + cpu_time_ranges = [ CPUTimeRange(-1) ] + + if verbosity.debug: + print("nr_cpus", nr_cpus) + print("cnts_cmd", cnts_cmd) + print("times_cmd", times_cmd) + + # Count the number of "double quick" samples per CPU + ProcessCommandOutputLines(cnts_cmd, per_cpu, CountSamplesByCPU, cpu_time_ranges) + + tot = 0 + mx = 0 + for cpu_time_range in cpu_time_ranges: + cnt = cpu_time_range.sample_cnt + tot += cnt + if cnt > mx: + mx = cnt + if verbosity.debug: + print("cpu:", cpu_time_range.cpu, "sample_cnt", cnt) + + if min_size < 1: + min_size = 1 + + if mx < min_size: + # Too little data to be worth splitting + if verbosity.debug: + print("Too little data to split by time") + if nr == 0: + nr = 1 + return [ SplitTimeRangesIntoN(time_ranges, nr, min_interval) ] + + if nr: + divisor = nr + min_size = 1 + else: + divisor = NumberOfCPUs() + + interval = int(round(tot / divisor, 0)) + if interval < min_size: + interval = min_size + + if verbosity.debug: + print("divisor", divisor) + print("min_size", min_size) + print("interval", interval) + + min_time = time_ranges[0][0] + max_time = time_ranges[-1][1] + + for cpu_time_range in cpu_time_ranges: + cnt = cpu_time_range.sample_cnt + if cnt == 0: + cpu_time_range.time_ranges = copy.deepcopy(time_ranges) + continue + # Adjust target interval for CPU to give approximately equal interval sizes + # Determine number of intervals, rounding to nearest integer + n = int(round(cnt / interval, 0)) + if n < 1: + n = 1 + # Determine interval size, rounding up + d, m = divmod(cnt, n) + if m: + d += 1 + cpu_time_range.interval = d + cpu_time_range.interval_remaining = d + cpu_time_range.remaining = cnt + # Init. time ranges for each CPU with the start time + cpu_time_range.time_ranges = [ [min_time, max_time] ] + + # Set time ranges so that the same number of "double quick" samples + # will fall into each time range. + ProcessCommandOutputLines(times_cmd, per_cpu, CalcTimeRangesByCPU, cpu_time_ranges, max_time) + + for cpu_time_range in cpu_time_ranges: + if cpu_time_range.sample_cnt: + IntersectTimeRanges(cpu_time_range.time_ranges, time_ranges) + + return [cpu_time_ranges[cpu].time_ranges for cpu in cpus] + +def SplitSingleTimeRangeIntoN(time_range, n): + if n <= 1: + return [time_range] + start = time_range[0] + end = time_range[1] + duration = int((end - start + 1) / n) + if duration < 1: + return [time_range] + time_ranges = [] + for i in range(n): + time_ranges.append([start, start + duration - 1]) + start += duration + time_ranges[-1][1] = end + return time_ranges + +def TimeRangeDuration(r): + return r[1] - r[0] + 1 + +def TotalDuration(time_ranges): + duration = 0 + for r in time_ranges: + duration += TimeRangeDuration(r) + return duration + +def SplitTimeRangesByInterval(time_ranges, interval): + new_ranges = [] + for r in time_ranges: + duration = TimeRangeDuration(r) + n = duration / interval + n = int(round(n, 0)) + new_ranges += SplitSingleTimeRangeIntoN(r, n) + return new_ranges + +def SplitTimeRangesIntoN(time_ranges, n, min_interval): + if n <= len(time_ranges): + return time_ranges + duration = TotalDuration(time_ranges) + interval = duration / n + if interval < min_interval: + interval = min_interval + return SplitTimeRangesByInterval(time_ranges, interval) + +def RecombineTimeRanges(tr): + new_tr = copy.deepcopy(tr) + n = len(new_tr) + i = 1 + while i < len(new_tr): + # if prev end + 1 == cur start, combine them + if new_tr[i - 1][1] + 1 == new_tr[i][0]: + new_tr[i][0] = new_tr[i - 1][0] + del new_tr[i - 1] + else: + i += 1 + return new_tr + +def OpenTimeRangeEnds(time_ranges, min_time, max_time): + if time_ranges[0][0] <= min_time: + time_ranges[0][0] = None + if time_ranges[-1][1] >= max_time: + time_ranges[-1][1] = None + +def BadTimeStr(time_str): + raise Exception(f"perf command bad time option: '{time_str}'\nCheck also 'time of first sample' and 'time of last sample' in perf script --header-only") + +def ValidateTimeRanges(time_ranges, time_str): + n = len(time_ranges) + for i in range(n): + start = time_ranges[i][0] + end = time_ranges[i][1] + if i != 0 and start <= time_ranges[i - 1][1]: + BadTimeStr(time_str) + if start > end: + BadTimeStr(time_str) + +def TimeVal(s, dflt): + s = s.strip() + if s == "": + return dflt + a = s.split(".") + if len(a) > 2: + raise Exception(f"Bad time value'{s}'") + x = int(a[0]) + if x < 0: + raise Exception("Negative time not allowed") + x *= 1000000000 + if len(a) > 1: + x += int((a[1] + "000000000")[:9]) + return x + +def BadCPUStr(cpu_str): + raise Exception(f"perf command bad cpu option: '{cpu_str}'\nCheck also 'nrcpus avail' in perf script --header-only") + +def ParseTimeStr(time_str, min_time, max_time): + if time_str == None or time_str == "": + return [[min_time, max_time]] + time_ranges = [] + for r in time_str.split(): + a = r.split(",") + if len(a) != 2: + BadTimeStr(time_str) + try: + start = TimeVal(a[0], min_time) + end = TimeVal(a[1], max_time) + except: + BadTimeStr(time_str) + time_ranges.append([start, end]) + ValidateTimeRanges(time_ranges, time_str) + return time_ranges + +def ParseCPUStr(cpu_str, nr_cpus): + if cpu_str == None or cpu_str == "": + return [-1] + cpus = [] + for r in cpu_str.split(","): + a = r.split("-") + if len(a) < 1 or len(a) > 2: + BadCPUStr(cpu_str) + try: + start = int(a[0].strip()) + if len(a) > 1: + end = int(a[1].strip()) + else: + end = start + except: + BadCPUStr(cpu_str) + if start < 0 or end < 0 or end < start or end >= nr_cpus: + BadCPUStr(cpu_str) + cpus.extend(range(start, end + 1)) + cpus = list(set(cpus)) # Remove duplicates + cpus.sort() + return cpus + +class ParallelPerf(): + + def __init__(self, a): + for arg_name in vars(a): + setattr(self, arg_name, getattr(a, arg_name)) + self.orig_nr = self.nr + self.orig_cmd = list(self.cmd) + self.perf = self.cmd[0] + if os.path.exists(self.output_dir): + raise Exception(f"Output '{self.output_dir}' already exists") + if self.jobs < 0 or self.nr < 0 or self.interval < 0: + raise Exception("Bad options (negative values): try -h option for help") + if self.nr != 0 and self.interval != 0: + raise Exception("Cannot specify number of time subdivisions and time interval") + if self.jobs == 0: + self.jobs = NumberOfCPUs() + if self.nr == 0 and self.interval == 0: + if self.per_cpu: + self.nr = 1 + else: + self.nr = self.jobs + + def Init(self): + if self.verbosity.debug: + print("cmd", self.cmd) + self.file_name = DetermineInputFileName(self.cmd) + self.hdr = ReadHeader(self.perf, self.file_name) + self.hdr_dict = ParseHeader(self.hdr) + self.cmd_line = HeaderField(self.hdr_dict, "cmdline") + + def ExtractTimeInfo(self): + self.min_time = TimeVal(HeaderField(self.hdr_dict, "time of first sample"), 0) + self.max_time = TimeVal(HeaderField(self.hdr_dict, "time of last sample"), 0) + self.time_str = ExtractPerfOption(self.cmd, "", "time") + self.time_ranges = ParseTimeStr(self.time_str, self.min_time, self.max_time) + if self.verbosity.debug: + print("time_ranges", self.time_ranges) + + def ExtractCPUInfo(self): + if self.per_cpu: + nr_cpus = int(HeaderField(self.hdr_dict, "nrcpus avail")) + self.cpu_str = ExtractPerfOption(self.cmd, "C", "cpu") + if self.cpu_str == None or self.cpu_str == "": + self.cpus = [ x for x in range(nr_cpus) ] + else: + self.cpus = ParseCPUStr(self.cpu_str, nr_cpus) + else: + self.cpu_str = None + self.cpus = [-1] + if self.verbosity.debug: + print("cpus", self.cpus) + + def IsIntelPT(self): + return self.cmd_line.find("intel_pt") >= 0 + + def SplitTimeRanges(self): + if self.IsIntelPT() and self.interval == 0: + self.split_time_ranges_for_each_cpu = \ + SplitTimeRangesByTraceDataDensity(self.time_ranges, self.cpus, self.orig_nr, + self.orig_cmd, self.file_name, self.per_cpu, + self.min_size, self.min_interval, self.verbosity) + elif self.nr: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesIntoN(self.time_ranges, self.nr, self.min_interval) ] + else: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesByInterval(self.time_ranges, self.interval) ] + + def CheckTimeRanges(self): + for tr in self.split_time_ranges_for_each_cpu: + # Re-combined time ranges should be the same + new_tr = RecombineTimeRanges(tr) + if new_tr != self.time_ranges: + if self.verbosity.debug: + print("tr", tr) + print("new_tr", new_tr) + raise Exception("Self test failed!") + + def OpenTimeRangeEnds(self): + for time_ranges in self.split_time_ranges_for_each_cpu: + OpenTimeRangeEnds(time_ranges, self.min_time, self.max_time) + + def CreateWorkList(self): + self.worklist = CreateWorkList(self.cmd, self.pipe_to, self.output_dir, self.cpus, self.split_time_ranges_for_each_cpu) + + def PerfDataRecordedPerCPU(self): + if "--per-thread" in self.cmd_line.split(): + return False + return True + + def DefaultToPerCPU(self): + # --no-per-cpu option takes precedence + if self.no_per_cpu: + return False + if not self.PerfDataRecordedPerCPU(): + return False + # Default to per-cpu for Intel PT data that was recorded per-cpu, + # because decoding can be done for each CPU separately. + if self.IsIntelPT(): + return True + return False + + def Config(self): + self.Init() + self.ExtractTimeInfo() + if not self.per_cpu: + self.per_cpu = self.DefaultToPerCPU() + if self.verbosity.debug: + print("per_cpu", self.per_cpu) + self.ExtractCPUInfo() + self.SplitTimeRanges() + if self.verbosity.self_test: + self.CheckTimeRanges() + # Prefer open-ended time range to starting / ending with min_time / max_time resp. + self.OpenTimeRangeEnds() + self.CreateWorkList() + + def Run(self): + if self.dry_run: + print(len(self.worklist),"jobs:") + for w in self.worklist: + print(w.Command()) + return True + result = RunWork(self.worklist, self.jobs, verbosity=self.verbosity) + if self.verbosity.verbose: + print(glb_prog_name, "done") + return result + +def RunParallelPerf(a): + pp = ParallelPerf(a) + pp.Config() + return pp.Run() + +def Main(args): + ap = argparse.ArgumentParser( + prog=glb_prog_name, formatter_class = argparse.RawDescriptionHelpFormatter, + description = +""" +Run a perf script command multiple times in parallel, using perf script options +--cpu and --time so that each job processes a different chunk of the data. +""", + epilog = +""" +Follow the options by '--' and then the perf script command e.g. + + $ perf record -a -- sleep 10 + $ parallel-perf.py --nr=4 -- perf script --ns + All jobs finished successfully + $ tree parallel-perf-output/ + parallel-perf-output/ + ├── time-range-0 + │   ├── cmd.txt + │   └── out.txt + ├── time-range-1 + │   ├── cmd.txt + │   └── out.txt + ├── time-range-2 + │   ├── cmd.txt + │   └── out.txt + └── time-range-3 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/time-range-0/cmd.txt:perf script --time=,9466.504461499 --ns + parallel-perf-output/time-range-1/cmd.txt:perf script --time=9466.504461500,9469.005396999 --ns + parallel-perf-output/time-range-2/cmd.txt:perf script --time=9469.005397000,9471.506332499 --ns + parallel-perf-output/time-range-3/cmd.txt:perf script --time=9471.506332500, --ns + +Any perf script command can be used, including the use of perf script options +--dlfilter and --script, so that the benefit of running parallel jobs +naturally extends to them also. + +If option --pipe-to is used, standard output is first piped through that +command. Beware, if the command fails (e.g. grep with no matches), it will be +considered a fatal error. + +Final standard output is redirected to files named out.txt in separate +subdirectories under the output directory. Similarly, standard error is +written to files named err.txt. In addition, files named cmd.txt contain the +corresponding perf script command. After processing, err.txt files are removed +if they are empty. + +If any job exits with a non-zero exit code, then all jobs are killed and no +more are started. A message is printed if any job results in a non-empty +err.txt file. + +There is a separate output subdirectory for each time range. If the --per-cpu +option is used, these are further grouped under cpu-n subdirectories, e.g. + + $ parallel-perf.py --per-cpu --nr=2 -- perf script --ns --cpu=0,1 + All jobs finished successfully + $ tree parallel-perf-output + parallel-perf-output/ + ├── cpu-0 + │   ├── time-range-0 + │   │   ├── cmd.txt + │   │   └── out.txt + │   └── time-range-1 + │   ├── cmd.txt + │   └── out.txt + └── cpu-1 + ├── time-range-0 + │   ├── cmd.txt + │   └── out.txt + └── time-range-1 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/cpu-0/time-range-0/cmd.txt:perf script --cpu=0 --time=,9469.005396999 --ns + parallel-perf-output/cpu-0/time-range-1/cmd.txt:perf script --cpu=0 --time=9469.005397000, --ns + parallel-perf-output/cpu-1/time-range-0/cmd.txt:perf script --cpu=1 --time=,9469.005396999 --ns + parallel-perf-output/cpu-1/time-range-1/cmd.txt:perf script --cpu=1 --time=9469.005397000, --ns + +Subdivisions of time range, and cpus if the --per-cpu option is used, are +expressed by the --time and --cpu perf script options respectively. If the +supplied perf script command has a --time option, then that time range is +subdivided, otherwise the time range given by 'time of first sample' to +'time of last sample' is used (refer perf script --header-only). Similarly, the +supplied perf script command may provide a --cpu option, and only those CPUs +will be processed. + +To prevent time intervals becoming too small, the --min-interval option can +be used. + +Note there is special handling for processing Intel PT traces. If an interval is +not specified and the perf record command contained the intel_pt event, then the +time range will be subdivided in order to produce subdivisions that contain +approximately the same amount of trace data. That is accomplished by counting +double-quick (--itrace=qqi) samples, and choosing time ranges that encompass +approximately the same number of samples. In that case, time ranges may not be +the same for each CPU processed. For Intel PT, --per-cpu is the default, but +that can be overridden by --no-per-cpu. Note, for Intel PT, double-quick +decoding produces 1 sample for each PSB synchronization packet, which in turn +come after a certain number of bytes output, determined by psb_period (refer +perf Intel PT documentation). The minimum number of double-quick samples that +will define a time range can be set by the --min_size option, which defaults to +64. +""") + ap.add_argument("-o", "--output-dir", default="parallel-perf-output", help="output directory (default 'parallel-perf-output')") + ap.add_argument("-j", "--jobs", type=int, default=0, help="maximum number of jobs to run in parallel at one time (default is the number of CPUs)") + ap.add_argument("-n", "--nr", type=int, default=0, help="number of time subdivisions (default is the number of jobs)") + ap.add_argument("-i", "--interval", type=float, default=0, help="subdivide the time range using this time interval (in seconds e.g. 0.1 for a tenth of a second)") + ap.add_argument("-c", "--per-cpu", action="store_true", help="process data for each CPU in parallel") + ap.add_argument("-m", "--min-interval", type=float, default=glb_min_interval, help=f"minimum interval (default {glb_min_interval} seconds)") + ap.add_argument("-p", "--pipe-to", help="command to pipe output to (optional)") + ap.add_argument("-N", "--no-per-cpu", action="store_true", help="do not process data for each CPU in parallel") + ap.add_argument("-b", "--min_size", type=int, default=glb_min_samples, help="minimum data size (for Intel PT in PSBs)") + ap.add_argument("-D", "--dry-run", action="store_true", help="do not run any jobs, just show the perf script commands") + ap.add_argument("-q", "--quiet", action="store_true", help="do not print any messages except errors") + ap.add_argument("-v", "--verbose", action="store_true", help="print more messages") + ap.add_argument("-d", "--debug", action="store_true", help="print debugging messages") + cmd_line = list(args) + try: + split_pos = cmd_line.index("--") + cmd = cmd_line[split_pos + 1:] + args = cmd_line[:split_pos] + except: + cmd = None + args = cmd_line + a = ap.parse_args(args=args[1:]) + a.cmd = cmd + a.verbosity = Verbosity(a.quiet, a.verbose, a.debug) + try: + if a.cmd == None: + if len(args) <= 1: + ap.print_help() + return True + raise Exception("Command line must contain '--' before perf command") + return RunParallelPerf(a) + except Exception as e: + print("Fatal error: ", str(e)) + if a.debug: + raise + return False + +if __name__ == "__main__": + if not Main(sys.argv): + sys.exit(1) diff --git a/tools/perf/tests/shell/script.sh b/tools/perf/tests/shell/script.sh index fa4d71e2e72a..c1a603653662 100755 --- a/tools/perf/tests/shell/script.sh +++ b/tools/perf/tests/shell/script.sh @@ -17,7 +17,7 @@ cleanup() sane=$(echo "${temp_dir}" | cut -b 1-21) if [ "${sane}" = "/tmp/perf-test-script" ] ; then echo "--- Cleaning up ---" - rm -f "${temp_dir}/"* + rm -rf "${temp_dir:?}/"* rmdir "${temp_dir}" fi } @@ -65,7 +65,31 @@ _end_of_file_ echo "DB test [Success]" } +test_parallel_perf() +{ + echo "parallel-perf test" + if ! python3 --version >/dev/null 2>&1 ; then + echo "SKIP: no python3" + err=2 + return + fi + pp=$(dirname "$0")/../../scripts/python/parallel-perf.py + if [ ! -f "${pp}" ] ; then + echo "SKIP: parallel-perf.py script not found " + err=2 + return + fi + perf_data="${temp_dir}/pp-perf.data" + output1_dir="${temp_dir}/output1" + output2_dir="${temp_dir}/output2" + perf record -o "${perf_data}" --sample-cpu uname + python3 "${pp}" -o "${output1_dir}" --jobs 4 --verbose -- perf script -i "${perf_data}" + python3 "${pp}" -o "${output2_dir}" --jobs 4 --verbose --per-cpu -- perf script -i "${perf_data}" + echo "parallel-perf test [Success]" +} + test_db +test_parallel_perf cleanup -- cgit v1.2.3 From 8b734eaa98ac3f22bbaa253273fd16f391b3d5f8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:17 -0700 Subject: perf parse-events: Factor out '/.../' parsing Factor out the case of an event or PMU name followed by a slash based term list. This is with a view to sharing the code with new legacy hardware parsing. Use early return to reduce indentation in the code. Make parse_events_add_pmu static now it doesn't need sharing with parse-events.y. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 70 +++++++++++++++++++++++++++++++++++++++- tools/perf/util/parse-events.h | 10 +++--- tools/perf/util/parse-events.y | 73 +++--------------------------------------- 3 files changed, 80 insertions(+), 73 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 6f8b0fa17689..a6f71165ee1a 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1385,7 +1385,7 @@ static bool config_term_percore(struct list_head *config_terms) return false; } -int parse_events_add_pmu(struct parse_events_state *parse_state, +static int parse_events_add_pmu(struct parse_events_state *parse_state, struct list_head *list, const char *name, const struct parse_events_terms *const_parsed_terms, bool auto_merge_stats, void *loc_) @@ -1618,6 +1618,74 @@ out_err: return ok ? 0 : -1; } +int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state, + const char *event_or_pmu, + const struct parse_events_terms *const_parsed_terms, + struct list_head **listp, + void *loc_) +{ + char *pattern = NULL; + YYLTYPE *loc = loc_; + struct perf_pmu *pmu = NULL; + int ok = 0; + char *help; + + *listp = malloc(sizeof(**listp)); + if (!*listp) + return -ENOMEM; + + INIT_LIST_HEAD(*listp); + + /* Attempt to add to list assuming event_or_pmu is a PMU name. */ + if (!parse_events_add_pmu(parse_state, *listp, event_or_pmu, const_parsed_terms, + /*auto_merge_stats=*/false, loc)) + return 0; + + /* Failed to add, try wildcard expansion of event_or_pmu as a PMU name. */ + if (asprintf(&pattern, "%s*", event_or_pmu) < 0) { + zfree(listp); + return -ENOMEM; + } + + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + const char *name = pmu->name; + + if (parse_events__filter_pmu(parse_state, pmu)) + continue; + + if (!strncmp(name, "uncore_", 7) && + strncmp(event_or_pmu, "uncore_", 7)) + name += 7; + if (!perf_pmu__match(pattern, name, event_or_pmu) || + !perf_pmu__match(pattern, pmu->alias_name, event_or_pmu)) { + bool auto_merge_stats = perf_pmu__auto_merge_stats(pmu); + + if (!parse_events_add_pmu(parse_state, *listp, pmu->name, + const_parsed_terms, + auto_merge_stats, loc)) { + ok++; + parse_state->wild_card_pmus = true; + } + } + } + zfree(&pattern); + if (ok) + return 0; + + /* Failure to add, assume event_or_pmu is an event name. */ + zfree(listp); + if (!parse_events_multi_pmu_add(parse_state, event_or_pmu, const_parsed_terms, listp, loc)) + return 0; + + if (asprintf(&help, "Unable to find PMU or event on a PMU of '%s'", event_or_pmu) < 0) + help = NULL; + parse_events_error__handle(parse_state->error, loc->first_column, + strdup("Bad event or PMU"), + help); + zfree(listp); + return -EINVAL; +} + int parse_events__modifier_group(struct list_head *list, char *event_mod) { diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 809359e8544e..a331b9f0da2b 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -209,10 +209,6 @@ int parse_events_add_breakpoint(struct parse_events_state *parse_state, struct list_head *list, u64 addr, char *type, u64 len, struct parse_events_terms *head_config); -int parse_events_add_pmu(struct parse_events_state *parse_state, - struct list_head *list, const char *name, - const struct parse_events_terms *const_parsed_terms, - bool auto_merge_stats, void *loc); struct evsel *parse_events__add_event(int idx, struct perf_event_attr *attr, const char *name, const char *metric_id, @@ -223,6 +219,12 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, const struct parse_events_terms *const_parsed_terms, struct list_head **listp, void *loc); +int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state, + const char *event_or_pmu, + const struct parse_events_terms *const_parsed_terms, + struct list_head **listp, + void *loc_); + void parse_events__set_leader(char *name, struct list_head *list); void parse_events_update_lists(struct list_head *list_event, struct list_head *list_all); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index d70f5d84af92..7764e5895210 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -273,78 +273,15 @@ event_def: event_pmu | event_pmu: PE_NAME opt_pmu_config { - struct parse_events_state *parse_state = _parse_state; /* List of created evsels. */ struct list_head *list = NULL; - char *pattern = NULL; - -#define CLEANUP \ - do { \ - parse_events_terms__delete($2); \ - free(list); \ - free($1); \ - free(pattern); \ - } while(0) + int err = parse_events_multi_pmu_add_or_add_pmu(_parse_state, $1, $2, &list, &@1); - list = alloc_list(); - if (!list) { - CLEANUP; - YYNOMEM; - } - /* Attempt to add to list assuming $1 is a PMU name. */ - if (parse_events_add_pmu(parse_state, list, $1, $2, /*auto_merge_stats=*/false, &@1)) { - struct perf_pmu *pmu = NULL; - int ok = 0; - - /* Failure to add, try wildcard expansion of $1 as a PMU name. */ - if (asprintf(&pattern, "%s*", $1) < 0) { - CLEANUP; - YYNOMEM; - } - - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - const char *name = pmu->name; - - if (parse_events__filter_pmu(parse_state, pmu)) - continue; - - if (!strncmp(name, "uncore_", 7) && - strncmp($1, "uncore_", 7)) - name += 7; - if (!perf_pmu__match(pattern, name, $1) || - !perf_pmu__match(pattern, pmu->alias_name, $1)) { - bool auto_merge_stats = perf_pmu__auto_merge_stats(pmu); - - if (!parse_events_add_pmu(parse_state, list, pmu->name, $2, - auto_merge_stats, &@1)) { - ok++; - parse_state->wild_card_pmus = true; - } - } - } - - if (!ok) { - /* Failure to add, assume $1 is an event name. */ - zfree(&list); - ok = !parse_events_multi_pmu_add(parse_state, $1, $2, &list, &@1); - } - if (!ok) { - struct parse_events_error *error = parse_state->error; - char *help; - - if (asprintf(&help, "Unable to find PMU or event on a PMU of '%s'", $1) < 0) - help = NULL; - parse_events_error__handle(error, @1.first_column, - strdup("Bad event or PMU"), - help); - CLEANUP; - YYABORT; - } - } + parse_events_terms__delete($2); + free($1); + if (err) + PE_ABORT(err); $$ = list; - list = NULL; - CLEANUP; -#undef CLEANUP } | PE_NAME sep_dc -- cgit v1.2.3 From 63dfcde9779bcab5cff909819e8c82d472ea117a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:18 -0700 Subject: perf parse-events: Directly pass PMU to parse_events_add_pmu() Avoid passing the name of a PMU then finding it again, just directly pass the PMU. parse_events_multi_pmu_add_or_add_pmu() is the only version that needs to find a PMU, so move the find there. Remove the error message as parse_events_multi_pmu_add_or_add_pmu will given an error at the end when a name isn't either a PMU name or event name. Without the error message being created the location in the input parameter (loc) can be removed. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 46 ++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index a6f71165ee1a..2d5a275dd257 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1386,32 +1386,18 @@ static bool config_term_percore(struct list_head *config_terms) } static int parse_events_add_pmu(struct parse_events_state *parse_state, - struct list_head *list, const char *name, - const struct parse_events_terms *const_parsed_terms, - bool auto_merge_stats, void *loc_) + struct list_head *list, struct perf_pmu *pmu, + const struct parse_events_terms *const_parsed_terms, + bool auto_merge_stats) { struct perf_event_attr attr; struct perf_pmu_info info; - struct perf_pmu *pmu; struct evsel *evsel; struct parse_events_error *err = parse_state->error; - YYLTYPE *loc = loc_; LIST_HEAD(config_terms); struct parse_events_terms parsed_terms; bool alias_rewrote_terms = false; - pmu = parse_state->fake_pmu ?: perf_pmus__find(name); - - if (!pmu) { - char *err_str; - - if (asprintf(&err_str, - "Cannot find PMU `%s'. Missing kernel support?", - name) >= 0) - parse_events_error__handle(err, loc->first_column, err_str, NULL); - return -EINVAL; - } - parse_events_terms__init(&parsed_terms); if (const_parsed_terms) { int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms); @@ -1425,9 +1411,9 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, strbuf_init(&sb, /*hint=*/ 0); if (pmu->selectable && list_empty(&parsed_terms.terms)) { - strbuf_addf(&sb, "%s//", name); + strbuf_addf(&sb, "%s//", pmu->name); } else { - strbuf_addf(&sb, "%s/", name); + strbuf_addf(&sb, "%s/", pmu->name); parse_events_terms__to_strbuf(&parsed_terms, &sb); strbuf_addch(&sb, '/'); } @@ -1469,7 +1455,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, strbuf_init(&sb, /*hint=*/ 0); parse_events_terms__to_strbuf(&parsed_terms, &sb); - fprintf(stderr, "..after resolving event: %s/%s/\n", name, sb.buf); + fprintf(stderr, "..after resolving event: %s/%s/\n", pmu->name, sb.buf); strbuf_release(&sb); } @@ -1583,8 +1569,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, continue; auto_merge_stats = perf_pmu__auto_merge_stats(pmu); - if (!parse_events_add_pmu(parse_state, list, pmu->name, - &parsed_terms, auto_merge_stats, loc)) { + if (!parse_events_add_pmu(parse_state, list, pmu, + &parsed_terms, auto_merge_stats)) { struct strbuf sb; strbuf_init(&sb, /*hint=*/ 0); @@ -1596,8 +1582,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } if (parse_state->fake_pmu) { - if (!parse_events_add_pmu(parse_state, list, event_name, &parsed_terms, - /*auto_merge_stats=*/true, loc)) { + if (!parse_events_add_pmu(parse_state, list, parse_state->fake_pmu, &parsed_terms, + /*auto_merge_stats=*/true)) { struct strbuf sb; strbuf_init(&sb, /*hint=*/ 0); @@ -1626,7 +1612,7 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state { char *pattern = NULL; YYLTYPE *loc = loc_; - struct perf_pmu *pmu = NULL; + struct perf_pmu *pmu; int ok = 0; char *help; @@ -1637,10 +1623,12 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state INIT_LIST_HEAD(*listp); /* Attempt to add to list assuming event_or_pmu is a PMU name. */ - if (!parse_events_add_pmu(parse_state, *listp, event_or_pmu, const_parsed_terms, - /*auto_merge_stats=*/false, loc)) + pmu = parse_state->fake_pmu ?: perf_pmus__find(event_or_pmu); + if (pmu && !parse_events_add_pmu(parse_state, *listp, pmu, const_parsed_terms, + /*auto_merge_stats=*/false)) return 0; + pmu = NULL; /* Failed to add, try wildcard expansion of event_or_pmu as a PMU name. */ if (asprintf(&pattern, "%s*", event_or_pmu) < 0) { zfree(listp); @@ -1660,9 +1648,9 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state !perf_pmu__match(pattern, pmu->alias_name, event_or_pmu)) { bool auto_merge_stats = perf_pmu__auto_merge_stats(pmu); - if (!parse_events_add_pmu(parse_state, *listp, pmu->name, + if (!parse_events_add_pmu(parse_state, *listp, pmu, const_parsed_terms, - auto_merge_stats, loc)) { + auto_merge_stats)) { ok++; parse_state->wild_card_pmus = true; } -- cgit v1.2.3 From 90b2c210a54e657ea8fcba3d724dba180c716edc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:19 -0700 Subject: perf parse-events: Avoid copying an empty list In parse_events_add_pmu, delay copying the list of terms until it is known the list contains terms. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2d5a275dd257..3b1f767039fa 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1398,29 +1398,21 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, struct parse_events_terms parsed_terms; bool alias_rewrote_terms = false; - parse_events_terms__init(&parsed_terms); - if (const_parsed_terms) { - int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms); - - if (ret) - return ret; - } - if (verbose > 1) { struct strbuf sb; strbuf_init(&sb, /*hint=*/ 0); - if (pmu->selectable && list_empty(&parsed_terms.terms)) { + if (pmu->selectable && const_parsed_terms && + list_empty(&const_parsed_terms->terms)) { strbuf_addf(&sb, "%s//", pmu->name); } else { strbuf_addf(&sb, "%s/", pmu->name); - parse_events_terms__to_strbuf(&parsed_terms, &sb); + parse_events_terms__to_strbuf(const_parsed_terms, &sb); strbuf_addch(&sb, '/'); } fprintf(stderr, "Attempt to add: %s\n", sb.buf); strbuf_release(&sb); } - fix_raw(&parsed_terms, pmu); memset(&attr, 0, sizeof(attr)); if (pmu->perf_event_attr_init_default) @@ -1428,7 +1420,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, attr.type = pmu->type; - if (list_empty(&parsed_terms.terms)) { + if (!const_parsed_terms || list_empty(&const_parsed_terms->terms)) { evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true, /*name=*/NULL, /*metric_id=*/NULL, pmu, @@ -1437,6 +1429,15 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, return evsel ? 0 : -ENOMEM; } + parse_events_terms__init(&parsed_terms); + if (const_parsed_terms) { + int ret = parse_events_terms__copy(const_parsed_terms, &parsed_terms); + + if (ret) + return ret; + } + fix_raw(&parsed_terms, pmu); + /* Configure attr/terms with a known PMU, this will set hardcoded terms. */ if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) { parse_events_terms__exit(&parsed_terms); -- cgit v1.2.3 From f91fa2ae63604e6fe9eed9ac05eb9ed3b23701cc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:20 -0700 Subject: perf pmu: Refactor perf_pmu__match() Move all implementation to pmu code. Don't allocate a fnmatch wildcard pattern, matching ignoring the suffix already handles this, and only use fnmatch if the given PMU name has a '*' in it. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 19 ++----------------- tools/perf/util/pmu.c | 27 +++++++++++++++++++-------- tools/perf/util/pmu.h | 2 +- 3 files changed, 22 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 3b1f767039fa..39548ec645ec 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1611,7 +1611,6 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state struct list_head **listp, void *loc_) { - char *pattern = NULL; YYLTYPE *loc = loc_; struct perf_pmu *pmu; int ok = 0; @@ -1631,22 +1630,9 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state pmu = NULL; /* Failed to add, try wildcard expansion of event_or_pmu as a PMU name. */ - if (asprintf(&pattern, "%s*", event_or_pmu) < 0) { - zfree(listp); - return -ENOMEM; - } - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - const char *name = pmu->name; - - if (parse_events__filter_pmu(parse_state, pmu)) - continue; - - if (!strncmp(name, "uncore_", 7) && - strncmp(event_or_pmu, "uncore_", 7)) - name += 7; - if (!perf_pmu__match(pattern, name, event_or_pmu) || - !perf_pmu__match(pattern, pmu->alias_name, event_or_pmu)) { + if (!parse_events__filter_pmu(parse_state, pmu) && + perf_pmu__match(pmu, event_or_pmu)) { bool auto_merge_stats = perf_pmu__auto_merge_stats(pmu); if (!parse_events_add_pmu(parse_state, *listp, pmu, @@ -1657,7 +1643,6 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state } } } - zfree(&pattern); if (ok) return 0; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ab30f22eaf10..74dd5bd49d9a 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -2064,18 +2064,29 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, name ?: "N/A", buf, config_name, config); } -int perf_pmu__match(const char *pattern, const char *name, const char *tok) +bool perf_pmu__match(const struct perf_pmu *pmu, const char *tok) { - if (!name) - return -1; + const char *name = pmu->name; + bool need_fnmatch = strchr(tok, '*') != NULL; - if (fnmatch(pattern, name, 0)) - return -1; + if (!strncmp(tok, "uncore_", 7)) + tok += 7; + if (!strncmp(name, "uncore_", 7)) + name += 7; - if (tok && !perf_pmu__match_ignoring_suffix(name, tok)) - return -1; + if (perf_pmu__match_ignoring_suffix(name, tok) || + (need_fnmatch && !fnmatch(tok, name, 0))) + return true; - return 0; + name = pmu->alias_name; + if (!name) + return false; + + if (!strncmp(name, "uncore_", 7)) + name += 7; + + return perf_pmu__match_ignoring_suffix(name, tok) || + (need_fnmatch && !fnmatch(tok, name, 0)); } double __weak perf_pmu__cpu_slots_per_cycle(void) diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 152700f78455..93d03bd3ecbe 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -263,7 +263,7 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, const char *config_name); void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu); -int perf_pmu__match(const char *pattern, const char *name, const char *tok); +bool perf_pmu__match(const struct perf_pmu *pmu, const char *tok); double perf_pmu__cpu_slots_per_cycle(void); int perf_pmu__event_source_devices_scnprintf(char *pathname, size_t size); -- cgit v1.2.3 From 78fae2071ff790bcc701dcc03a4bc7ad36375856 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:21 -0700 Subject: perf tests parse-events: Use "branches" rather than "cache-references" Switch from "cache-references" to "branches" in test as Intel has a sysfs event for "cache-references" and changing the priority for sysfs over legacy causes the test to fail. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 0b70451451b3..993e482f094c 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -942,8 +942,8 @@ static int test__group2(struct evlist *evlist) continue; } if (evsel->core.attr.type == PERF_TYPE_HARDWARE && - test_config(evsel, PERF_COUNT_HW_CACHE_REFERENCES)) { - /* cache-references + :u modifier */ + test_config(evsel, PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) { + /* branches + :u modifier */ TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -2032,7 +2032,7 @@ static const struct evlist_test test__events[] = { /* 8 */ }, { - .name = "{faults:k,cache-references}:u,cycles:k", + .name = "{faults:k,branches}:u,cycles:k", .check = test__group2, /* 9 */ }, -- cgit v1.2.3 From 62593394f66aaebc8bfc0058bb029cae84bd8748 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:22 -0700 Subject: perf parse-events: Legacy cache names on all PMUs and lower priority Prior behavior is to not look for legacy cache names in sysfs/JSON and to create events on all core PMUs. New behavior is to look for sysfs/JSON events first on all PMUs, for core PMUs add a legacy event if the sysfs/JSON event isn't present. This is done so that there is consistency with how event names in terms are handled and their prioritization of sysfs/JSON over legacy. It may make sense to use a legacy cache event name as an event name on a non-core PMU so we should allow it. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 39 +++++++++++++++++++++++++++++++-------- tools/perf/util/parse-events.h | 2 +- 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 39548ec645ec..1440a3b4b674 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -442,17 +442,21 @@ bool parse_events__filter_pmu(const struct parse_events_state *parse_state, return strcmp(parse_state->pmu_filter, pmu->name) != 0; } +static int parse_events_add_pmu(struct parse_events_state *parse_state, + struct list_head *list, struct perf_pmu *pmu, + const struct parse_events_terms *const_parsed_terms, + bool auto_merge_stats); + int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_state *parse_state, - struct parse_events_terms *head_config) + struct parse_events_terms *parsed_terms) { struct perf_pmu *pmu = NULL; bool found_supported = false; - const char *config_name = get_config_name(head_config); - const char *metric_id = get_config_metric_id(head_config); + const char *config_name = get_config_name(parsed_terms); + const char *metric_id = get_config_metric_id(parsed_terms); - /* Legacy cache events are only supported by core PMUs. */ - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { LIST_HEAD(config_terms); struct perf_event_attr attr; int ret; @@ -460,6 +464,24 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, if (parse_events__filter_pmu(parse_state, pmu)) continue; + if (perf_pmu__have_event(pmu, name)) { + /* + * The PMU has the event so add as not a legacy cache + * event. + */ + ret = parse_events_add_pmu(parse_state, list, pmu, + parsed_terms, + perf_pmu__auto_merge_stats(pmu)); + if (ret) + return ret; + continue; + } + + if (!pmu->is_core) { + /* Legacy cache events are only supported by core PMUs. */ + continue; + } + memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_HW_CACHE; @@ -469,11 +491,12 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, found_supported = true; - if (head_config) { - if (config_attr(&attr, head_config, parse_state->error, config_term_common)) + if (parsed_terms) { + if (config_attr(&attr, parsed_terms, parse_state->error, + config_term_common)) return -EINVAL; - if (get_config_terms(head_config, &config_terms)) + if (get_config_terms(parsed_terms, &config_terms)) return -ENOMEM; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index a331b9f0da2b..db47913e54bc 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -203,7 +203,7 @@ int parse_events_add_tool(struct parse_events_state *parse_state, int tool_event); int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_state *parse_state, - struct parse_events_terms *head_config); + struct parse_events_terms *parsed_terms); int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config); int parse_events_add_breakpoint(struct parse_events_state *parse_state, struct list_head *list, -- cgit v1.2.3 From 9d0dba2398ff4bdb2eced7eaa7abd927aadf442b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:23 -0700 Subject: perf parse-events: Handle PE_TERM_HW in name_or_raw Avoid duplicate logic for name_or_raw and PE_TERM_HW by having a rule to turn PE_TERM_HW into a name_or_raw. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.y | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 7764e5895210..254f8aeca461 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -603,6 +603,11 @@ event_term } name_or_raw: PE_RAW | PE_NAME | PE_LEGACY_CACHE +| +PE_TERM_HW +{ + $$ = $1.str; +} event_term: PE_RAW @@ -644,20 +649,6 @@ name_or_raw '=' PE_VALUE $$ = term; } | -name_or_raw '=' PE_TERM_HW -{ - struct parse_events_term *term; - int err = parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER, - $1, $3.str, &@1, &@3); - - if (err) { - free($1); - free($3.str); - PE_ABORT(err); - } - $$ = term; -} -| PE_LEGACY_CACHE { struct parse_events_term *term; @@ -710,18 +701,6 @@ PE_TERM '=' name_or_raw $$ = term; } | -PE_TERM '=' PE_TERM_HW -{ - struct parse_events_term *term; - int err = parse_events_term__str(&term, $1, /*config=*/NULL, $3.str, &@1, &@3); - - if (err) { - free($3.str); - PE_ABORT(err); - } - $$ = term; -} -| PE_TERM '=' PE_TERM { struct parse_events_term *term; -- cgit v1.2.3 From 5ccc4edfc2a98840d8fb1669963df42d2d433181 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:24 -0700 Subject: perf parse-events: Constify parse_events_add_numeric Allow the term list to be const so that other functions can pass const term lists. Add const as necessary to called functions. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 20 +++++++++++--------- tools/perf/util/parse-events.h | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1440a3b4b674..1b408e3dccc7 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -34,7 +34,8 @@ #ifdef PARSER_DEBUG extern int parse_events_debug; #endif -static int get_config_terms(struct parse_events_terms *head_config, struct list_head *head_terms); +static int get_config_terms(const struct parse_events_terms *head_config, + struct list_head *head_terms); static int parse_events_terms__copy(const struct parse_events_terms *src, struct parse_events_terms *dest); @@ -154,7 +155,7 @@ const char *event_type(int type) return "unknown"; } -static char *get_config_str(struct parse_events_terms *head_terms, +static char *get_config_str(const struct parse_events_terms *head_terms, enum parse_events__term_type type_term) { struct parse_events_term *term; @@ -169,12 +170,12 @@ static char *get_config_str(struct parse_events_terms *head_terms, return NULL; } -static char *get_config_metric_id(struct parse_events_terms *head_terms) +static char *get_config_metric_id(const struct parse_events_terms *head_terms) { return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_METRIC_ID); } -static char *get_config_name(struct parse_events_terms *head_terms) +static char *get_config_name(const struct parse_events_terms *head_terms) { return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME); } @@ -358,7 +359,7 @@ static int config_term_common(struct perf_event_attr *attr, struct parse_events_term *term, struct parse_events_error *err); static int config_attr(struct perf_event_attr *attr, - struct parse_events_terms *head, + const struct parse_events_terms *head, struct parse_events_error *err, config_term_func_t config_term); @@ -1108,7 +1109,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr, #endif static int config_attr(struct perf_event_attr *attr, - struct parse_events_terms *head, + const struct parse_events_terms *head, struct parse_events_error *err, config_term_func_t config_term) { @@ -1121,7 +1122,8 @@ static int config_attr(struct perf_event_attr *attr, return 0; } -static int get_config_terms(struct parse_events_terms *head_config, struct list_head *head_terms) +static int get_config_terms(const struct parse_events_terms *head_config, + struct list_head *head_terms) { #define ADD_CONFIG_TERM(__type, __weak) \ struct evsel_config_term *__t; \ @@ -1325,7 +1327,7 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, static int __parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, struct perf_pmu *pmu, u32 type, u32 extended_type, - u64 config, struct parse_events_terms *head_config) + u64 config, const struct parse_events_terms *head_config) { struct perf_event_attr attr; LIST_HEAD(config_terms); @@ -1361,7 +1363,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state, int parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, u32 type, u64 config, - struct parse_events_terms *head_config, + const struct parse_events_terms *head_config, bool wildcard) { struct perf_pmu *pmu = NULL; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index db47913e54bc..5005782766e9 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -196,7 +196,7 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, int parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, u32 type, u64 config, - struct parse_events_terms *head_config, + const struct parse_events_terms *head_config, bool wildcard); int parse_events_add_tool(struct parse_events_state *parse_state, struct list_head *list, -- cgit v1.2.3 From 617824a7f0f73e4de325cf8add58e55b28c12493 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:25 -0700 Subject: perf parse-events: Prefer sysfs/JSON hardware events over legacy It was requested that RISC-V be able to add events to the perf tool so the PMU driver didn't need to map legacy events to config encodings: https://lore.kernel.org/lkml/20240217005738.3744121-1-atishp@rivosinc.com/ This change makes the priority of events specified without a PMU the same as those specified with a PMU, namely sysfs and JSON events are checked first before using the legacy encoding. The hw_term is made more generic as a hardware_event that encodes a pair of string and int value, allowing parse_events_multi_pmu_add to fall back on a known encoding when the sysfs/JSON adding fails for core events. As this covers PE_VALUE_SYM_HW, that token is removed and related code simplified. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 31 ++++++++++++----- tools/perf/util/parse-events.h | 2 +- tools/perf/util/parse-events.l | 76 +++++++++++++++++++++--------------------- tools/perf/util/parse-events.y | 62 ++++++++++++++++++++++------------ 4 files changed, 103 insertions(+), 68 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1b408e3dccc7..805872c90a3e 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1543,7 +1543,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, } int parse_events_multi_pmu_add(struct parse_events_state *parse_state, - const char *event_name, + const char *event_name, u64 hw_config, const struct parse_events_terms *const_parsed_terms, struct list_head **listp, void *loc_) { @@ -1551,8 +1551,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, struct list_head *list = NULL; struct perf_pmu *pmu = NULL; YYLTYPE *loc = loc_; - int ok = 0; - const char *config; + int ok = 0, core_ok = 0; + const char *tmp; struct parse_events_terms parsed_terms; *listp = NULL; @@ -1565,15 +1565,15 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, return ret; } - config = strdup(event_name); - if (!config) + tmp = strdup(event_name); + if (!tmp) goto out_err; if (parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, - config, /*num=*/1, /*novalue=*/true, + tmp, /*num=*/1, /*novalue=*/true, loc, /*loc_val=*/NULL) < 0) { - zfree(&config); + zfree(&tmp); goto out_err; } list_add_tail(&term->list, &parsed_terms.terms); @@ -1604,6 +1604,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, pr_debug("%s -> %s/%s/\n", event_name, pmu->name, sb.buf); strbuf_release(&sb); ok++; + if (pmu->is_core) + core_ok++; } } @@ -1620,6 +1622,18 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } } + if (hw_config != PERF_COUNT_HW_MAX && !core_ok) { + /* + * The event wasn't found on core PMUs but it has a hardware + * config version to try. + */ + if (!parse_events_add_numeric(parse_state, list, + PERF_TYPE_HARDWARE, hw_config, + const_parsed_terms, + /*wildcard=*/true)) + ok++; + } + out_err: parse_events_terms__exit(&parsed_terms); if (ok) @@ -1673,7 +1687,8 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state /* Failure to add, assume event_or_pmu is an event name. */ zfree(listp); - if (!parse_events_multi_pmu_add(parse_state, event_or_pmu, const_parsed_terms, listp, loc)) + if (!parse_events_multi_pmu_add(parse_state, event_or_pmu, PERF_COUNT_HW_MAX, + const_parsed_terms, listp, loc)) return 0; if (asprintf(&help, "Unable to find PMU or event on a PMU of '%s'", event_or_pmu) < 0) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 5005782766e9..7e5afad3feb8 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -215,7 +215,7 @@ struct evsel *parse_events__add_event(int idx, struct perf_event_attr *attr, struct perf_pmu *pmu); int parse_events_multi_pmu_add(struct parse_events_state *parse_state, - const char *event_name, + const char *event_name, u64 hw_config, const struct parse_events_terms *const_parsed_terms, struct list_head **listp, void *loc); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index e86c45675e1d..6fe37003ab7b 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -100,12 +100,12 @@ do { \ yyless(0); \ } while (0) -static int sym(yyscan_t scanner, int type, int config) +static int sym(yyscan_t scanner, int config) { YYSTYPE *yylval = parse_events_get_lval(scanner); - yylval->num = (type << 16) + config; - return type == PERF_TYPE_HARDWARE ? PE_VALUE_SYM_HW : PE_VALUE_SYM_SW; + yylval->num = config; + return PE_VALUE_SYM_SW; } static int tool(yyscan_t scanner, enum perf_tool_event event) @@ -124,13 +124,13 @@ static int term(yyscan_t scanner, enum parse_events__term_type type) return PE_TERM; } -static int hw_term(yyscan_t scanner, int config) +static int hw(yyscan_t scanner, int config) { YYSTYPE *yylval = parse_events_get_lval(scanner); char *text = parse_events_get_text(scanner); - yylval->hardware_term.str = strdup(text); - yylval->hardware_term.num = PERF_TYPE_HARDWARE + config; + yylval->hardware_event.str = strdup(text); + yylval->hardware_event.num = config; return PE_TERM_HW; } @@ -246,16 +246,16 @@ percore { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); } aux-output { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); } aux-sample-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); } metric-id { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); } -cpu-cycles|cycles { return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } -stalled-cycles-frontend|idle-cycles-frontend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } -stalled-cycles-backend|idle-cycles-backend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } -instructions { return hw_term(yyscanner, PERF_COUNT_HW_INSTRUCTIONS); } -cache-references { return hw_term(yyscanner, PERF_COUNT_HW_CACHE_REFERENCES); } -cache-misses { return hw_term(yyscanner, PERF_COUNT_HW_CACHE_MISSES); } -branch-instructions|branches { return hw_term(yyscanner, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } -branch-misses { return hw_term(yyscanner, PERF_COUNT_HW_BRANCH_MISSES); } -bus-cycles { return hw_term(yyscanner, PERF_COUNT_HW_BUS_CYCLES); } -ref-cycles { return hw_term(yyscanner, PERF_COUNT_HW_REF_CPU_CYCLES); } +cpu-cycles|cycles { return hw(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } +stalled-cycles-frontend|idle-cycles-frontend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } +stalled-cycles-backend|idle-cycles-backend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } +instructions { return hw(yyscanner, PERF_COUNT_HW_INSTRUCTIONS); } +cache-references { return hw(yyscanner, PERF_COUNT_HW_CACHE_REFERENCES); } +cache-misses { return hw(yyscanner, PERF_COUNT_HW_CACHE_MISSES); } +branch-instructions|branches { return hw(yyscanner, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +branch-misses { return hw(yyscanner, PERF_COUNT_HW_BRANCH_MISSES); } +bus-cycles { return hw(yyscanner, PERF_COUNT_HW_BUS_CYCLES); } +ref-cycles { return hw(yyscanner, PERF_COUNT_HW_REF_CPU_CYCLES); } r{num_raw_hex} { return str(yyscanner, PE_RAW); } r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } , { return ','; } @@ -299,31 +299,31 @@ r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } <> { BEGIN(INITIAL); } } -cpu-cycles|cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); } -stalled-cycles-frontend|idle-cycles-frontend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } -stalled-cycles-backend|idle-cycles-backend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } -instructions { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); } -cache-references { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES); } -cache-misses { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); } -branch-instructions|branches { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } -branch-misses { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); } -bus-cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES); } -ref-cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES); } -cpu-clock { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK); } -task-clock { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK); } -page-faults|faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS); } -minor-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN); } -major-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ); } -context-switches|cs { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES); } -cpu-migrations|migrations { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS); } -alignment-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); } -emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); } -dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } +cpu-cycles|cycles { return hw(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } +stalled-cycles-frontend|idle-cycles-frontend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } +stalled-cycles-backend|idle-cycles-backend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } +instructions { return hw(yyscanner, PERF_COUNT_HW_INSTRUCTIONS); } +cache-references { return hw(yyscanner, PERF_COUNT_HW_CACHE_REFERENCES); } +cache-misses { return hw(yyscanner, PERF_COUNT_HW_CACHE_MISSES); } +branch-instructions|branches { return hw(yyscanner, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +branch-misses { return hw(yyscanner, PERF_COUNT_HW_BRANCH_MISSES); } +bus-cycles { return hw(yyscanner, PERF_COUNT_HW_BUS_CYCLES); } +ref-cycles { return hw(yyscanner, PERF_COUNT_HW_REF_CPU_CYCLES); } +cpu-clock { return sym(yyscanner, PERF_COUNT_SW_CPU_CLOCK); } +task-clock { return sym(yyscanner, PERF_COUNT_SW_TASK_CLOCK); } +page-faults|faults { return sym(yyscanner, PERF_COUNT_SW_PAGE_FAULTS); } +minor-faults { return sym(yyscanner, PERF_COUNT_SW_PAGE_FAULTS_MIN); } +major-faults { return sym(yyscanner, PERF_COUNT_SW_PAGE_FAULTS_MAJ); } +context-switches|cs { return sym(yyscanner, PERF_COUNT_SW_CONTEXT_SWITCHES); } +cpu-migrations|migrations { return sym(yyscanner, PERF_COUNT_SW_CPU_MIGRATIONS); } +alignment-faults { return sym(yyscanner, PERF_COUNT_SW_ALIGNMENT_FAULTS); } +emulation-faults { return sym(yyscanner, PERF_COUNT_SW_EMULATION_FAULTS); } +dummy { return sym(yyscanner, PERF_COUNT_SW_DUMMY); } duration_time { return tool(yyscanner, PERF_TOOL_DURATION_TIME); } user_time { return tool(yyscanner, PERF_TOOL_USER_TIME); } system_time { return tool(yyscanner, PERF_TOOL_SYSTEM_TIME); } -bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } -cgroup-switches { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CGROUP_SWITCHES); } +bpf-output { return sym(yyscanner, PERF_COUNT_SW_BPF_OUTPUT); } +cgroup-switches { return sym(yyscanner, PERF_COUNT_SW_CGROUP_SWITCHES); } {lc_type} { return str(yyscanner, PE_LEGACY_CACHE); } {lc_type}-{lc_op_result} { return str(yyscanner, PE_LEGACY_CACHE); } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 254f8aeca461..31fe8cf428ff 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -55,7 +55,7 @@ static void free_list_evsel(struct list_head* list_evsel) %} %token PE_START_EVENTS PE_START_TERMS -%token PE_VALUE PE_VALUE_SYM_HW PE_VALUE_SYM_SW PE_TERM +%token PE_VALUE PE_VALUE_SYM_SW PE_TERM %token PE_VALUE_SYM_TOOL %token PE_EVENT_NAME %token PE_RAW PE_NAME @@ -66,11 +66,9 @@ static void free_list_evsel(struct list_head* list_evsel) %token PE_DRV_CFG_TERM %token PE_TERM_HW %type PE_VALUE -%type PE_VALUE_SYM_HW %type PE_VALUE_SYM_SW %type PE_VALUE_SYM_TOOL %type PE_TERM -%type value_sym %type PE_RAW %type PE_NAME %type PE_LEGACY_CACHE @@ -87,6 +85,7 @@ static void free_list_evsel(struct list_head* list_evsel) %type opt_pmu_config %destructor { parse_events_terms__delete ($$); } %type event_pmu +%type event_legacy_hardware %type event_legacy_symbol %type event_legacy_cache %type event_legacy_mem @@ -104,8 +103,8 @@ static void free_list_evsel(struct list_head* list_evsel) %destructor { free_list_evsel ($$); } %type tracepoint_name %destructor { free ($$.sys); free ($$.event); } -%type PE_TERM_HW -%destructor { free ($$.str); } +%type PE_TERM_HW +%destructor { free ($$.str); } %union { @@ -119,10 +118,10 @@ static void free_list_evsel(struct list_head* list_evsel) char *sys; char *event; } tracepoint_name; - struct hardware_term { + struct hardware_event { char *str; u64 num; - } hardware_term; + } hardware_event; } %% @@ -263,6 +262,7 @@ PE_EVENT_NAME event_def event_def event_def: event_pmu | + event_legacy_hardware | event_legacy_symbol | event_legacy_cache sep_dc | event_legacy_mem sep_dc | @@ -289,7 +289,7 @@ PE_NAME sep_dc struct list_head *list; int err; - err = parse_events_multi_pmu_add(_parse_state, $1, NULL, &list, &@1); + err = parse_events_multi_pmu_add(_parse_state, $1, PERF_COUNT_HW_MAX, NULL, &list, &@1); if (err < 0) { struct parse_events_state *parse_state = _parse_state; struct parse_events_error *error = parse_state->error; @@ -305,24 +305,45 @@ PE_NAME sep_dc $$ = list; } -value_sym: -PE_VALUE_SYM_HW +event_legacy_hardware: +PE_TERM_HW opt_pmu_config +{ + /* List of created evsels. */ + struct list_head *list = NULL; + int err = parse_events_multi_pmu_add(_parse_state, $1.str, $1.num, $2, &list, &@1); + + free($1.str); + parse_events_terms__delete($2); + if (err) + PE_ABORT(err); + + $$ = list; +} | -PE_VALUE_SYM_SW +PE_TERM_HW sep_dc +{ + struct list_head *list; + int err; + + err = parse_events_multi_pmu_add(_parse_state, $1.str, $1.num, NULL, &list, &@1); + free($1.str); + if (err) + PE_ABORT(err); + $$ = list; +} event_legacy_symbol: -value_sym '/' event_config '/' +PE_VALUE_SYM_SW '/' event_config '/' { struct list_head *list; - int type = $1 >> 16; - int config = $1 & 255; int err; - bool wildcard = (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE); list = alloc_list(); if (!list) YYNOMEM; - err = parse_events_add_numeric(_parse_state, list, type, config, $3, wildcard); + err = parse_events_add_numeric(_parse_state, list, + /*type=*/PERF_TYPE_SOFTWARE, /*config=*/$1, + $3, /*wildcard=*/false); parse_events_terms__delete($3); if (err) { free_list_evsel(list); @@ -331,18 +352,17 @@ value_sym '/' event_config '/' $$ = list; } | -value_sym sep_slash_slash_dc +PE_VALUE_SYM_SW sep_slash_slash_dc { struct list_head *list; - int type = $1 >> 16; - int config = $1 & 255; - bool wildcard = (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE); int err; list = alloc_list(); if (!list) YYNOMEM; - err = parse_events_add_numeric(_parse_state, list, type, config, /*head_config=*/NULL, wildcard); + err = parse_events_add_numeric(_parse_state, list, + /*type=*/PERF_TYPE_SOFTWARE, /*config=*/$1, + /*head_config=*/NULL, /*wildcard=*/false); if (err) PE_ABORT(err); $$ = list; -- cgit v1.2.3 From 4e5484b4bfd53d4d12933ab9b19da66909f1598e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:26 -0700 Subject: perf parse-events: Inline parse_events_update_lists The helper function just wraps a splice and free. Making the free inline removes a comment, so then it just wraps a splice which we can make inline too. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 13 ------------- tools/perf/util/parse-events.h | 2 -- tools/perf/util/parse-events.y | 41 +++++++++++++++++++++++++---------------- 3 files changed, 25 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 805872c90a3e..7eba714f0d73 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1720,19 +1720,6 @@ void parse_events__set_leader(char *name, struct list_head *list) leader->group_name = name; } -/* list_event is assumed to point to malloc'ed memory */ -void parse_events_update_lists(struct list_head *list_event, - struct list_head *list_all) -{ - /* - * Called for single event definition. Update the - * 'all event' list, and reinit the 'single event' - * list, for next event definition. - */ - list_splice_tail(list_event, list_all); - free(list_event); -} - struct event_modifier { int eu; int ek; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 7e5afad3feb8..e8f2aebea10f 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -226,8 +226,6 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state void *loc_); void parse_events__set_leader(char *name, struct list_head *list); -void parse_events_update_lists(struct list_head *list_event, - struct list_head *list_all); void parse_events_evlist_error(struct parse_events_state *parse_state, int idx, const char *str); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 31fe8cf428ff..51490f0f8c50 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -125,6 +125,10 @@ static void free_list_evsel(struct list_head* list_evsel) } %% + /* + * Entry points. We are either parsing events or terminals. Just terminal + * parsing is used for parsing events in sysfs. + */ start: PE_START_EVENTS start_events | @@ -132,31 +136,36 @@ PE_START_TERMS start_terms start_events: groups { + /* Take the parsed events, groups.. and place into parse_state. */ + struct list_head *groups = $1; struct parse_events_state *parse_state = _parse_state; - /* frees $1 */ - parse_events_update_lists($1, &parse_state->list); + list_splice_tail(groups, &parse_state->list); + free(groups); } -groups: +groups: /* A list of groups or events. */ groups ',' group { - struct list_head *list = $1; - struct list_head *group = $3; + /* Merge group into the list of events/groups. */ + struct list_head *groups = $1; + struct list_head *group = $3; - /* frees $3 */ - parse_events_update_lists(group, list); - $$ = list; + list_splice_tail(group, groups); + free(group); + $$ = groups; } | groups ',' event { - struct list_head *list = $1; + /* Merge event into the list of events/groups. */ + struct list_head *groups = $1; struct list_head *event = $3; - /* frees $3 */ - parse_events_update_lists(event, list); - $$ = list; + + list_splice_tail(event, groups); + free(event); + $$ = groups; } | group @@ -206,12 +215,12 @@ PE_NAME '{' events '}' events: events ',' event { + struct list_head *events = $1; struct list_head *event = $3; - struct list_head *list = $1; - /* frees $3 */ - parse_events_update_lists(event, list); - $$ = list; + list_splice_tail(event, events); + free(event); + $$ = events; } | event -- cgit v1.2.3 From ba5c371edfd05411522c1fefb00e92b54c08c9db Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:27 -0700 Subject: perf parse-events: Improve error message for bad numbers Use the error handler from the parse_state to give a more informative error message. Before: $ perf stat -e 'cycles/period=99999999999999999999/' true event syntax error: 'cycles/period=99999999999999999999/' \___ parser error Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events After: $ perf stat -e 'cycles/period=99999999999999999999/' true event syntax error: 'cycles/period=99999999999999999999/' \___ parser error event syntax error: '..les/period=99999999999999999999/' \___ Bad base 10 number "99999999999999999999" Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.l | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 6fe37003ab7b..0cd68c9f0d4f 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -18,26 +18,34 @@ char *parse_events_get_text(yyscan_t yyscanner); YYSTYPE *parse_events_get_lval(yyscan_t yyscanner); +int parse_events_get_column(yyscan_t yyscanner); +int parse_events_get_leng(yyscan_t yyscanner); -static int __value(YYSTYPE *yylval, char *str, int base, int token) +static int get_column(yyscan_t scanner) { - u64 num; - - errno = 0; - num = strtoull(str, NULL, base); - if (errno) - return PE_ERROR; - - yylval->num = num; - return token; + return parse_events_get_column(scanner) - parse_events_get_leng(scanner); } -static int value(yyscan_t scanner, int base) +static int value(struct parse_events_state *parse_state, yyscan_t scanner, int base) { YYSTYPE *yylval = parse_events_get_lval(scanner); char *text = parse_events_get_text(scanner); + u64 num; - return __value(yylval, text, base, PE_VALUE); + errno = 0; + num = strtoull(text, NULL, base); + if (errno) { + struct parse_events_error *error = parse_state->error; + char *help = NULL; + + if (asprintf(&help, "Bad base %d number \"%s\"", base, text) > 0) + parse_events_error__handle(error, get_column(scanner), help , NULL); + + return PE_ERROR; + } + + yylval->num = num; + return PE_VALUE; } static int str(yyscan_t scanner, int token) @@ -283,8 +291,8 @@ r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } */ "/"/{digit} { return PE_BP_SLASH; } "/"/{non_digit} { BEGIN(config); return '/'; } -{num_dec} { return value(yyscanner, 10); } -{num_hex} { return value(yyscanner, 16); } +{num_dec} { return value(_parse_state, yyscanner, 10); } +{num_hex} { return value(_parse_state, yyscanner, 16); } /* * We need to separate 'mem:' scanner part, in order to get specific * modifier bits parsed out. Otherwise we would need to handle PE_NAME @@ -330,8 +338,8 @@ cgroup-switches { return sym(yyscanner, PERF_COUNT_SW_CGROUP_SWITCHES); } {lc_type}-{lc_op_result}-{lc_op_result} { return str(yyscanner, PE_LEGACY_CACHE); } mem: { BEGIN(mem); return PE_PREFIX_MEM; } r{num_raw_hex} { return str(yyscanner, PE_RAW); } -{num_dec} { return value(yyscanner, 10); } -{num_hex} { return value(yyscanner, 16); } +{num_dec} { return value(_parse_state, yyscanner, 10); } +{num_hex} { return value(_parse_state, yyscanner, 16); } {modifier_event} { return str(yyscanner, PE_MODIFIER_EVENT); } {name} { return str(yyscanner, PE_NAME); } -- cgit v1.2.3 From e18601d80ce1917f02396b8b0d85b7587a0d3af5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:28 -0700 Subject: perf parse-events: Inline parse_events_evlist_error Inline parse_events_evlist_error that is only used in parse_events_error. Modify parse_events_error to not report a parser error unless errors haven't already been reported. Make it clearer that the latter case only happens for unrecognized input. Before: $ perf stat -e 'cycles/period=99999999999999999999/' true event syntax error: 'cycles/period=99999999999999999999/' \___ parser error event syntax error: '..les/period=99999999999999999999/' \___ Bad base 10 number "99999999999999999999" Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events $ perf stat -e 'cycles:xyz' true event syntax error: 'cycles:xyz' \___ parser error Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events After: $ perf stat -e 'cycles/period=99999999999999999999/xyz' true event syntax error: '..les/period=99999999999999999999/xyz' \___ Bad base 10 number "99999999999999999999" Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events $ perf stat -e 'cycles:xyz' true event syntax error: 'cycles:xyz' \___ Unrecognized input Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 9 --------- tools/perf/util/parse-events.h | 2 -- tools/perf/util/parse-events.y | 10 ++++++++-- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 7eba714f0d73..ebada37ef98a 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2760,15 +2760,6 @@ int parse_events_terms__to_strbuf(const struct parse_events_terms *terms, struct return 0; } -void parse_events_evlist_error(struct parse_events_state *parse_state, - int idx, const char *str) -{ - if (!parse_state->error) - return; - - parse_events_error__handle(parse_state->error, idx, strdup(str), NULL); -} - static void config_terms_list(char *buf, size_t buf_sz) { int i; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e8f2aebea10f..290ae6c72ec5 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -226,8 +226,6 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state void *loc_); void parse_events__set_leader(char *name, struct list_head *list); -void parse_events_evlist_error(struct parse_events_state *parse_state, - int idx, const char *str); struct event_symbol { const char *symbol; diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 51490f0f8c50..2c4817e126c1 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -790,9 +790,15 @@ sep_slash_slash_dc: '/' '/' | ':' | %% -void parse_events_error(YYLTYPE *loc, void *parse_state, +void parse_events_error(YYLTYPE *loc, void *_parse_state, void *scanner __maybe_unused, char const *msg __maybe_unused) { - parse_events_evlist_error(parse_state, loc->last_column, "parser error"); + struct parse_events_state *parse_state = _parse_state; + + if (!parse_state->error || !list_empty(&parse_state->error->list)) + return; + + parse_events_error__handle(parse_state->error, loc->last_column, + strdup("Unrecognized input"), NULL); } -- cgit v1.2.3 From e30a7912f498c58062d0b75e59c181dd48f1cc56 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:29 -0700 Subject: perf parse-events: Improvements to modifier parsing Use a struct/bitmap rather than a copied string from lexer. In lexer give improved error message when too many precise flags are given or repeated modifiers. Before: $ perf stat -e 'cycles:kuk' true event syntax error: 'cycles:kuk' \___ Bad modifier ... $ perf stat -e 'cycles:pppp' true event syntax error: 'cycles:pppp' \___ Bad modifier ... $ perf stat -e '{instructions:p,cycles:pp}:pp' -a true event syntax error: '..cycles:pp}:pp' \___ Bad modifier ... After: $ perf stat -e 'cycles:kuk' true event syntax error: 'cycles:kuk' \___ Duplicate modifier 'k' (kernel) ... $ perf stat -e 'cycles:pppp' true event syntax error: 'cycles:pppp' \___ Maximum precise value is 3 ... $ perf stat -e '{instructions:p,cycles:pp}:pp' true event syntax error: '..cycles:pp}:pp' \___ Maximum combined precise value is 3, adding precision to "cycles:pp" ... Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 250 +++++++++++++++-------------------------- tools/perf/util/parse-events.h | 23 +++- tools/perf/util/parse-events.l | 75 ++++++++++++- tools/perf/util/parse-events.y | 28 ++--- 4 files changed, 194 insertions(+), 182 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index ebada37ef98a..3ab533d0e653 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1700,12 +1700,6 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state return -EINVAL; } -int parse_events__modifier_group(struct list_head *list, - char *event_mod) -{ - return parse_events__modifier_event(list, event_mod, true); -} - void parse_events__set_leader(char *name, struct list_head *list) { struct evsel *leader; @@ -1720,183 +1714,125 @@ void parse_events__set_leader(char *name, struct list_head *list) leader->group_name = name; } -struct event_modifier { - int eu; - int ek; - int eh; - int eH; - int eG; - int eI; - int precise; - int precise_max; - int exclude_GH; - int sample_read; - int pinned; - int weak; - int exclusive; - int bpf_counter; -}; +static int parse_events__modifier_list(struct parse_events_state *parse_state, + YYLTYPE *loc, + struct list_head *list, + struct parse_events_modifier mod, + bool group) +{ + struct evsel *evsel; -static int get_event_modifier(struct event_modifier *mod, char *str, - struct evsel *evsel) -{ - int eu = evsel ? evsel->core.attr.exclude_user : 0; - int ek = evsel ? evsel->core.attr.exclude_kernel : 0; - int eh = evsel ? evsel->core.attr.exclude_hv : 0; - int eH = evsel ? evsel->core.attr.exclude_host : 0; - int eG = evsel ? evsel->core.attr.exclude_guest : 0; - int eI = evsel ? evsel->core.attr.exclude_idle : 0; - int precise = evsel ? evsel->core.attr.precise_ip : 0; - int precise_max = 0; - int sample_read = 0; - int pinned = evsel ? evsel->core.attr.pinned : 0; - int exclusive = evsel ? evsel->core.attr.exclusive : 0; - - int exclude = eu | ek | eh; - int exclude_GH = evsel ? evsel->exclude_GH : 0; - int weak = 0; - int bpf_counter = 0; - - memset(mod, 0, sizeof(*mod)); - - while (*str) { - if (*str == 'u') { + if (!group && mod.weak) { + parse_events_error__handle(parse_state->error, loc->first_column, + strdup("Weak modifier is for use with groups"), NULL); + return -EINVAL; + } + + __evlist__for_each_entry(list, evsel) { + /* Translate modifiers into the equivalent evsel excludes. */ + int eu = group ? evsel->core.attr.exclude_user : 0; + int ek = group ? evsel->core.attr.exclude_kernel : 0; + int eh = group ? evsel->core.attr.exclude_hv : 0; + int eH = group ? evsel->core.attr.exclude_host : 0; + int eG = group ? evsel->core.attr.exclude_guest : 0; + int exclude = eu | ek | eh; + int exclude_GH = group ? evsel->exclude_GH : 0; + + if (mod.precise) { + /* use of precise requires exclude_guest */ + eG = 1; + } + if (mod.user) { if (!exclude) exclude = eu = ek = eh = 1; if (!exclude_GH && !perf_guest) eG = 1; eu = 0; - } else if (*str == 'k') { + } + if (mod.kernel) { if (!exclude) exclude = eu = ek = eh = 1; ek = 0; - } else if (*str == 'h') { + } + if (mod.hypervisor) { if (!exclude) exclude = eu = ek = eh = 1; eh = 0; - } else if (*str == 'G') { + } + if (mod.guest) { if (!exclude_GH) exclude_GH = eG = eH = 1; eG = 0; - } else if (*str == 'H') { + } + if (mod.host) { if (!exclude_GH) exclude_GH = eG = eH = 1; eH = 0; - } else if (*str == 'I') { - eI = 1; - } else if (*str == 'p') { - precise++; - /* use of precise requires exclude_guest */ - if (!exclude_GH) - eG = 1; - } else if (*str == 'P') { - precise_max = 1; - } else if (*str == 'S') { - sample_read = 1; - } else if (*str == 'D') { - pinned = 1; - } else if (*str == 'e') { - exclusive = 1; - } else if (*str == 'W') { - weak = 1; - } else if (*str == 'b') { - bpf_counter = 1; - } else - break; - - ++str; + } + evsel->core.attr.exclude_user = eu; + evsel->core.attr.exclude_kernel = ek; + evsel->core.attr.exclude_hv = eh; + evsel->core.attr.exclude_host = eH; + evsel->core.attr.exclude_guest = eG; + evsel->exclude_GH = exclude_GH; + + /* Simple modifiers copied to the evsel. */ + if (mod.precise) { + u8 precise = evsel->core.attr.precise_ip + mod.precise; + /* + * precise ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + if (precise > 3) { + char *help; + + if (asprintf(&help, + "Maximum combined precise value is 3, adding precision to \"%s\"", + evsel__name(evsel)) > 0) { + parse_events_error__handle(parse_state->error, + loc->first_column, + help, NULL); + } + return -EINVAL; + } + evsel->core.attr.precise_ip = precise; + } + if (mod.precise_max) + evsel->precise_max = 1; + if (mod.non_idle) + evsel->core.attr.exclude_idle = 1; + if (mod.sample_read) + evsel->sample_read = 1; + if (mod.pinned && evsel__is_group_leader(evsel)) + evsel->core.attr.pinned = 1; + if (mod.exclusive && evsel__is_group_leader(evsel)) + evsel->core.attr.exclusive = 1; + if (mod.weak) + evsel->weak_group = true; + if (mod.bpf) + evsel->bpf_counter = true; } - - /* - * precise ip: - * - * 0 - SAMPLE_IP can have arbitrary skid - * 1 - SAMPLE_IP must have constant skid - * 2 - SAMPLE_IP requested to have 0 skid - * 3 - SAMPLE_IP must have 0 skid - * - * See also PERF_RECORD_MISC_EXACT_IP - */ - if (precise > 3) - return -EINVAL; - - mod->eu = eu; - mod->ek = ek; - mod->eh = eh; - mod->eH = eH; - mod->eG = eG; - mod->eI = eI; - mod->precise = precise; - mod->precise_max = precise_max; - mod->exclude_GH = exclude_GH; - mod->sample_read = sample_read; - mod->pinned = pinned; - mod->weak = weak; - mod->bpf_counter = bpf_counter; - mod->exclusive = exclusive; - return 0; } -/* - * Basic modifier sanity check to validate it contains only one - * instance of any modifier (apart from 'p') present. - */ -static int check_modifier(char *str) +int parse_events__modifier_group(struct parse_events_state *parse_state, void *loc, + struct list_head *list, + struct parse_events_modifier mod) { - char *p = str; - - /* The sizeof includes 0 byte as well. */ - if (strlen(str) > (sizeof("ukhGHpppPSDIWeb") - 1)) - return -1; - - while (*p) { - if (*p != 'p' && strchr(p + 1, *p)) - return -1; - p++; - } - - return 0; + return parse_events__modifier_list(parse_state, loc, list, mod, /*group=*/true); } -int parse_events__modifier_event(struct list_head *list, char *str, bool add) +int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc, + struct list_head *list, + struct parse_events_modifier mod) { - struct evsel *evsel; - struct event_modifier mod; - - if (str == NULL) - return 0; - - if (check_modifier(str)) - return -EINVAL; - - if (!add && get_event_modifier(&mod, str, NULL)) - return -EINVAL; - - __evlist__for_each_entry(list, evsel) { - if (add && get_event_modifier(&mod, str, evsel)) - return -EINVAL; - - evsel->core.attr.exclude_user = mod.eu; - evsel->core.attr.exclude_kernel = mod.ek; - evsel->core.attr.exclude_hv = mod.eh; - evsel->core.attr.precise_ip = mod.precise; - evsel->core.attr.exclude_host = mod.eH; - evsel->core.attr.exclude_guest = mod.eG; - evsel->core.attr.exclude_idle = mod.eI; - evsel->exclude_GH = mod.exclude_GH; - evsel->sample_read = mod.sample_read; - evsel->precise_max = mod.precise_max; - evsel->weak_group = mod.weak; - evsel->bpf_counter = mod.bpf_counter; - - if (evsel__is_group_leader(evsel)) { - evsel->core.attr.pinned = mod.pinned; - evsel->core.attr.exclusive = mod.exclusive; - } - } - - return 0; + return parse_events__modifier_list(parse_state, loc, list, mod, /*group=*/false); } int parse_events_name(struct list_head *list, const char *name) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 290ae6c72ec5..f104faef1a78 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -186,8 +186,27 @@ void parse_events_terms__init(struct parse_events_terms *terms); void parse_events_terms__exit(struct parse_events_terms *terms); int parse_events_terms(struct parse_events_terms *terms, const char *str, FILE *input); int parse_events_terms__to_strbuf(const struct parse_events_terms *terms, struct strbuf *sb); -int parse_events__modifier_event(struct list_head *list, char *str, bool add); -int parse_events__modifier_group(struct list_head *list, char *event_mod); + +struct parse_events_modifier { + u8 precise; /* Number of repeated 'p' for precision. */ + bool precise_max : 1; /* 'P' */ + bool non_idle : 1; /* 'I' */ + bool sample_read : 1; /* 'S' */ + bool pinned : 1; /* 'D' */ + bool exclusive : 1; /* 'e' */ + bool weak : 1; /* 'W' */ + bool bpf : 1; /* 'b' */ + bool user : 1; /* 'u' */ + bool kernel : 1; /* 'k' */ + bool hypervisor : 1; /* 'h' */ + bool guest : 1; /* 'G' */ + bool host : 1; /* 'H' */ +}; + +int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc, + struct list_head *list, struct parse_events_modifier mod); +int parse_events__modifier_group(struct parse_events_state *parse_state, void *loc, + struct list_head *list, struct parse_events_modifier mod); int parse_events_name(struct list_head *list, const char *name); int parse_events_add_tracepoint(struct list_head *list, int *idx, const char *sys, const char *event, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 0cd68c9f0d4f..4aaf0c53d9b6 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -142,6 +142,77 @@ static int hw(yyscan_t scanner, int config) return PE_TERM_HW; } +static void modifiers_error(struct parse_events_state *parse_state, yyscan_t scanner, + int pos, char mod_char, const char *mod_name) +{ + struct parse_events_error *error = parse_state->error; + char *help = NULL; + + if (asprintf(&help, "Duplicate modifier '%c' (%s)", mod_char, mod_name) > 0) + parse_events_error__handle(error, get_column(scanner) + pos, help , NULL); +} + +static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner) +{ + YYSTYPE *yylval = parse_events_get_lval(scanner); + char *text = parse_events_get_text(scanner); + struct parse_events_modifier mod = { .precise = 0, }; + + for (size_t i = 0, n = strlen(text); i < n; i++) { +#define CASE(c, field) \ + case c: \ + if (mod.field) { \ + modifiers_error(parse_state, scanner, i, c, #field); \ + return PE_ERROR; \ + } \ + mod.field = true; \ + break + + switch (text[i]) { + CASE('u', user); + CASE('k', kernel); + CASE('h', hypervisor); + CASE('I', non_idle); + CASE('G', guest); + CASE('H', host); + case 'p': + mod.precise++; + /* + * precise ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + if (mod.precise > 3) { + struct parse_events_error *error = parse_state->error; + char *help = strdup("Maximum precise value is 3"); + + if (help) { + parse_events_error__handle(error, get_column(scanner) + i, + help , NULL); + } + return PE_ERROR; + } + break; + CASE('P', precise_max); + CASE('S', sample_read); + CASE('D', pinned); + CASE('W', weak); + CASE('e', exclusive); + CASE('b', bpf); + default: + return PE_ERROR; + } +#undef CASE + } + yylval->mod = mod; + return PE_MODIFIER_EVENT; +} + #define YY_USER_ACTION \ do { \ yylloc->last_column = yylloc->first_column; \ @@ -174,7 +245,7 @@ drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? * If you add a modifier you need to update check_modifier(). * Also, the letters in modifier_event must not be in modifier_bp. */ -modifier_event [ukhpPGHSDIWeb]+ +modifier_event [ukhpPGHSDIWeb]{1,15} modifier_bp [rwx]{1,3} lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node) lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss) @@ -341,7 +412,7 @@ r{num_raw_hex} { return str(yyscanner, PE_RAW); } {num_dec} { return value(_parse_state, yyscanner, 10); } {num_hex} { return value(_parse_state, yyscanner, 16); } -{modifier_event} { return str(yyscanner, PE_MODIFIER_EVENT); } +{modifier_event} { return modifiers(_parse_state, yyscanner); } {name} { return str(yyscanner, PE_NAME); } {name_tag} { return str(yyscanner, PE_NAME); } "/" { BEGIN(config); return '/'; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 2c4817e126c1..79f254189be6 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -68,11 +68,11 @@ static void free_list_evsel(struct list_head* list_evsel) %type PE_VALUE %type PE_VALUE_SYM_SW %type PE_VALUE_SYM_TOOL +%type PE_MODIFIER_EVENT %type PE_TERM %type PE_RAW %type PE_NAME %type PE_LEGACY_CACHE -%type PE_MODIFIER_EVENT %type PE_MODIFIER_BP %type PE_EVENT_NAME %type PE_DRV_CFG_TERM @@ -110,6 +110,7 @@ static void free_list_evsel(struct list_head* list_evsel) { char *str; u64 num; + struct parse_events_modifier mod; enum parse_events__term_type term_type; struct list_head *list_evsel; struct parse_events_terms *list_terms; @@ -175,20 +176,13 @@ event group: group_def ':' PE_MODIFIER_EVENT { + /* Apply the modifier to the events in the group_def. */ struct list_head *list = $1; int err; - err = parse_events__modifier_group(list, $3); - free($3); - if (err) { - struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; - - parse_events_error__handle(error, @3.first_column, - strdup("Bad modifier"), NULL); - free_list_evsel(list); + err = parse_events__modifier_group(_parse_state, &@3, list, $3); + if (err) YYABORT; - } $$ = list; } | @@ -238,17 +232,9 @@ event_name PE_MODIFIER_EVENT * (there could be more events added for multiple tracepoint * definitions via '*?'. */ - err = parse_events__modifier_event(list, $2, false); - free($2); - if (err) { - struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; - - parse_events_error__handle(error, @2.first_column, - strdup("Bad modifier"), NULL); - free_list_evsel(list); + err = parse_events__modifier_event(_parse_state, &@2, list, $2); + if (err) YYABORT; - } $$ = list; } | -- cgit v1.2.3 From 4a20e793652e7742d8c0608a32e6f8adcde9e272 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:30 -0700 Subject: perf parse-event: Constify event_symbol arrays Moves 352 bytes from .data to .data.rel.ro. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 4 ++-- tools/perf/util/parse-events.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 3ab533d0e653..8d3d692d219d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -39,7 +39,7 @@ static int get_config_terms(const struct parse_events_terms *head_config, static int parse_events_terms__copy(const struct parse_events_terms *src, struct parse_events_terms *dest); -struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { +const struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = { .symbol = "cpu-cycles", .alias = "cycles", @@ -82,7 +82,7 @@ struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { }, }; -struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { +const struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { [PERF_COUNT_SW_CPU_CLOCK] = { .symbol = "cpu-clock", .alias = "", diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index f104faef1a78..0bb5f0c80a5e 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -250,8 +250,8 @@ struct event_symbol { const char *symbol; const char *alias; }; -extern struct event_symbol event_symbols_hw[]; -extern struct event_symbol event_symbols_sw[]; +extern const struct event_symbol event_symbols_hw[]; +extern const struct event_symbol event_symbols_sw[]; char *parse_events_formats_error_string(char *additional_terms); -- cgit v1.2.3 From afd876bbdc97664257523bb5f2dcedbb5cef40d0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:31 -0700 Subject: perf parse-events: Minor grouping tidy up Add comments. Ensure leader->group_name is freed before overwriting it. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 1 + tools/perf/util/parse-events.y | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8d3d692d219d..1c1b1bcb78e8 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1711,6 +1711,7 @@ void parse_events__set_leader(char *name, struct list_head *list) leader = list_first_entry(list, struct evsel, core.node); __perf_evlist__set_leader(list, &leader->core); + zfree(&leader->group_name); leader->group_name = name; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 79f254189be6..6f1042272dda 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -193,7 +193,10 @@ PE_NAME '{' events '}' { struct list_head *list = $3; - /* Takes ownership of $1. */ + /* + * Set the first entry of list to be the leader. Set the group name on + * the leader to $1 taking ownership. + */ parse_events__set_leader($1, list); $$ = list; } @@ -202,6 +205,7 @@ PE_NAME '{' events '}' { struct list_head *list = $2; + /* Set the first entry of list to be the leader clearing the group name. */ parse_events__set_leader(NULL, list); $$ = list; } -- cgit v1.2.3 From bb65ff7810b654beb498e7afb4a4688e6ddf0340 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 Apr 2024 23:15:32 -0700 Subject: perf parse-events: Tidy the setting of the default event name Add comments. Pass ownership of the event name to save on a strdup. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Tested-by: Atish Patra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Beeman Strong Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240416061533.921723-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 9 ++++++--- tools/perf/util/parse-events.h | 2 +- tools/perf/util/parse-events.l | 5 +++++ tools/perf/util/parse-events.y | 10 +++++++--- 4 files changed, 19 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1c1b1bcb78e8..0f308b4db2b9 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1836,18 +1836,21 @@ int parse_events__modifier_event(struct parse_events_state *parse_state, void *l return parse_events__modifier_list(parse_state, loc, list, mod, /*group=*/false); } -int parse_events_name(struct list_head *list, const char *name) +int parse_events__set_default_name(struct list_head *list, char *name) { struct evsel *evsel; + bool used_name = false; __evlist__for_each_entry(list, evsel) { if (!evsel->name) { - evsel->name = strdup(name); + evsel->name = used_name ? strdup(name) : name; + used_name = true; if (!evsel->name) return -ENOMEM; } } - + if (!used_name) + free(name); return 0; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 0bb5f0c80a5e..5695308efab9 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -207,7 +207,7 @@ int parse_events__modifier_event(struct parse_events_state *parse_state, void *l struct list_head *list, struct parse_events_modifier mod); int parse_events__modifier_group(struct parse_events_state *parse_state, void *loc, struct list_head *list, struct parse_events_modifier mod); -int parse_events_name(struct list_head *list, const char *name); +int parse_events__set_default_name(struct list_head *list, char *name); int parse_events_add_tracepoint(struct list_head *list, int *idx, const char *sys, const char *event, struct parse_events_error *error, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 4aaf0c53d9b6..08ea2d845dc3 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -96,6 +96,11 @@ static int drv_str(yyscan_t scanner, int token) return token; } +/* + * Use yyless to return all the characaters to the input. Update the column for + * location debugging. If __alloc is non-zero set yylval to the text for the + * returned token's value. + */ #define REWIND(__alloc) \ do { \ YYSTYPE *__yylval = parse_events_get_lval(yyscanner); \ diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 6f1042272dda..68b3b06c7ff0 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -247,10 +247,14 @@ event_name event_name: PE_EVENT_NAME event_def { - int err; + /* + * When an event is parsed the text is rewound and the entire text of + * the event is set to the str of PE_EVENT_NAME token matched here. If + * no name was on an event via a term, set the name to the entire text + * taking ownership of the allocation. + */ + int err = parse_events__set_default_name($2, $1); - err = parse_events_name($2, $1); - free($1); if (err) { free_list_evsel($2); YYNOMEM; -- cgit v1.2.3 From 281bf8f63f20b73aafae16e0f1d02b141701b0ed Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 23 Apr 2024 17:12:31 -0700 Subject: perf test: Add a new test for 'perf annotate' Add a basic 'perf annotate' test: $ ./perf test annotate -vv 76: perf annotate basic tests: --- start --- test child forked, pid 846989 fbcd0-fbd55 l noploop perf does have symbol 'noploop' Basic perf annotate test : 0 0xfbcd0 : 0.00 : fbcd0: pushq %rbp 0.00 : fbcd1: movq %rsp, %rbp 0.00 : fbcd4: pushq %r12 0.00 : fbcd6: pushq %rbx 0.00 : fbcd7: movl $1, %ebx 0.00 : fbcdc: subq $0x10, %rsp 0.00 : fbce0: movq %fs:0x28, %rax 0.00 : fbce9: movq %rax, -0x18(%rbp) 0.00 : fbced: xorl %eax, %eax 0.00 : fbcef: testl %edi, %edi 0.00 : fbcf1: jle 0xfbd04 0.00 : fbcf3: movq (%rsi), %rdi 0.00 : fbcf6: movl $0xa, %edx 0.00 : fbcfb: xorl %esi, %esi 0.00 : fbcfd: callq 0x41920 0.00 : fbd02: movl %eax, %ebx 0.00 : fbd04: leaq -0x7b(%rip), %r12 # fbc90 0.00 : fbd0b: movl $2, %edi 0.00 : fbd10: movq %r12, %rsi 0.00 : fbd13: callq 0x40a00 0.00 : fbd18: movl $0xe, %edi 0.00 : fbd1d: movq %r12, %rsi 0.00 : fbd20: callq 0x40a00 0.00 : fbd25: movl %ebx, %edi 0.00 : fbd27: callq 0x407c0 0.10 : fbd2c: movl 0x89785e(%rip), %eax # 993590 0.00 : fbd32: testl %eax, %eax 99.90 : fbd34: je 0xfbd2c 0.00 : fbd36: movq -0x18(%rbp), %rax 0.00 : fbd3a: subq %fs:0x28, %rax 0.00 : fbd43: jne 0xfbd50 0.00 : fbd45: addq $0x10, %rsp 0.00 : fbd49: xorl %eax, %eax 0.00 : fbd4b: popq %rbx 0.00 : fbd4c: popq %r12 0.00 : fbd4e: popq %rbp 0.00 : fbd4f: retq 0.00 : fbd50: callq 0x407e0 0.00 : fbcd0: pushq %rbp 0.00 : fbcd1: movq %rsp, %rbp 0.00 : fbcd4: pushq %r12 0.00 : fbcd0: push %rbp 0.00 : fbcd1: mov %rsp,%rbp 0.00 : fbcd4: push %r12 Basic annotate test [Success] ---- end(0) ---- 76: perf annotate basic tests : Ok Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240424001231.849972-1-namhyung@kernel.org [ Improved a bit the error messages ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/annotate.sh | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 tools/perf/tests/shell/annotate.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/annotate.sh b/tools/perf/tests/shell/annotate.sh new file mode 100755 index 000000000000..1db1e8113d99 --- /dev/null +++ b/tools/perf/tests/shell/annotate.sh @@ -0,0 +1,83 @@ +#!/bin/sh +# perf annotate basic tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +shelldir=$(dirname "$0") + +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + +testsym="noploop" + +skip_test_missing_symbol ${testsym} + +err=0 +perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +testprog="perf test -w noploop" +# disassembly format: "percent : offset: instruction (operands ...)" +disasm_regex="[0-9]*\.[0-9]* *: *\w*: *\w*" + +cleanup() { + rm -rf "${perfdata}" + rm -rf "${perfdata}".old + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + +test_basic() { + echo "Basic perf annotate test" + if ! perf record -o "${perfdata}" ${testprog} 2> /dev/null + then + echo "Basic annotate [Failed: perf record]" + err=1 + return + fi + + # check if it has the target symbol + if ! perf annotate -i "${perfdata}" 2> /dev/null | grep "${testsym}" + then + echo "Basic annotate [Failed: missing target symbol]" + err=1 + return + fi + + # check if it has the disassembly lines + if ! perf annotate -i "${perfdata}" 2> /dev/null | grep "${disasm_regex}" + then + echo "Basic annotate [Failed: missing disasm output from default disassembler]" + err=1 + return + fi + + # check again with a target symbol name + if ! perf annotate -i "${perfdata}" "${testsym}" 2> /dev/null | \ + grep -m 3 "${disasm_regex}" + then + echo "Basic annotate [Failed: missing disasm output when specifying the target symbol]" + err=1 + return + fi + + # check one more with external objdump tool (forced by --objdump option) + if ! perf annotate -i "${perfdata}" --objdump=objdump 2> /dev/null | \ + grep -m 3 "${disasm_regex}" + then + echo "Basic annotate [Failed: missing disasm output from non default disassembler (using --objdump)]" + err=1 + return + fi + echo "Basic annotate test [Success]" +} + +test_basic + +cleanup +exit $err -- cgit v1.2.3 From 47557db99a5decba2192303ed34ec1add5177b51 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 Apr 2024 16:00:15 -0700 Subject: perf annotate-data: Check if 'struct annotation_source' was allocated on 'perf report' TUI As it removed the sample accounting for code when no symbol sort key is given for 'perf report' TUI, it might not have allocated the 'struct annotated_source' yet. Let's check if it's NULL first. Fixes: 6cdd977ec24e1538 ("perf report: Do not collect sample histogram unnecessarily") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240424230015.1054013-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index dca2c08ab8c5..f5b6b5e5e757 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -906,7 +906,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, if (parch) *parch = arch; - if (!list_empty(¬es->src->source)) + if (notes->src && !list_empty(¬es->src->source)) return 0; args.arch = arch; -- cgit v1.2.3 From f35847de2a65137e011e559f38a3de5902a5463f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 Apr 2024 17:51:56 -0700 Subject: perf annotate: Fallback disassemble to objdump when capstone fails I found some cases that capstone failed to disassemble. Probably my capstone is an old version but anyway there's a chance it can fail. And then it silently stopped in the middle. In my case, it didn't understand "RDPKRU" instruction. Let's check if the capstone disassemble reached the end of the function and fallback to objdump if not. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240425005157.1104789-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 92937809be85..412101f2cf2a 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1542,6 +1542,20 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, offset += insn[i].size; } + /* It failed in the middle: probably due to unknown instructions */ + if (offset != len) { + struct list_head *list = ¬es->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + out: if (needs_cs_close) cs_close(&handle); -- cgit v1.2.3 From 8f3ec810bb668d421f9d260e3de3bccca954b56f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 Apr 2024 17:51:57 -0700 Subject: perf annotate: Update DSO binary type when trying build-id dso__disassemble_filename() tries to get the filename for objdump (or capstone) using build-id. But I found sometimes it didn't disassemble some functions. It turned out that those functions belong to a DSO which has no binary type set. It seems it sets the binary type for some special files only - like kernel (kallsyms or kcore) or BPF images. And there's a logic to skip dso with DSO_BINARY_TYPE__NOT_FOUND. As it's checked the build-id cache link, it should set the binary type as DSO_BINARY_TYPE__BUILD_ID_CACHE. Fixes: 873a83731f1cc85c ("perf annotate: Skip DSOs not found") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240425005157.1104789-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 412101f2cf2a..6d1125e687b7 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1156,6 +1156,8 @@ fallback: } } mutex_unlock(&dso->lock); + } else if (dso->binary_type == DSO_BINARY_TYPE__NOT_FOUND) { + dso->binary_type = DSO_BINARY_TYPE__BUILD_ID_CACHE; } free(build_id_path); -- cgit v1.2.3 From 7cc72090fbbf87bdd075eeece7f72453cbc1103a Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 25 Apr 2024 14:04:27 +0800 Subject: perf record: Fix comment misspellings Fix comment misspellings Signed-off-by: Howard Chu Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240425060427.1800663-1-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/mmap.c | 2 +- tools/perf/builtin-record.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index 0c903c2372c9..c1a51d925e0e 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -279,7 +279,7 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map) if (!refcount_read(&map->refcnt)) return NULL; - /* non-overwirte doesn't pause the ringbuffer */ + /* non-overwrite doesn't pause the ringbuffer */ if (!map->overwrite) map->end = perf_mmap__read_head(map); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 2ff718d3e202..66a3de8ac661 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -332,7 +332,7 @@ static int record__aio_complete(struct mmap *md, struct aiocb *cblock) } else { /* * aio write request may require restart with the - * reminder if the kernel didn't write whole + * remainder if the kernel didn't write whole * chunk at once. */ rem_off = cblock->aio_offset + written; @@ -400,7 +400,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size * * Coping can be done in two steps in case the chunk of profiling data * crosses the upper bound of the kernel buffer. In this case we first move - * part of data from map->start till the upper bound and then the reminder + * part of data from map->start till the upper bound and then the remainder * from the beginning of the kernel buffer till the end of the data chunk. */ -- cgit v1.2.3 From e101a05f79fd4ee3e89d2f3fb716493c33a33708 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 26 Mar 2024 10:32:23 +0200 Subject: perf intel-pt: Fix unassigned instruction op (discovered by MemorySanitizer) MemorySanitizer discovered instances where the instruction op value was not assigned.: WARNING: MemorySanitizer: use-of-uninitialized-value #0 0x5581c00a76b3 in intel_pt_sample_flags tools/perf/util/intel-pt.c:1527:17 Uninitialized value was stored to memory at #0 0x5581c005ddf8 in intel_pt_walk_insn tools/perf/util/intel-pt-decoder/intel-pt-decoder.c:1256:25 The op value is used to set branch flags for branch instructions encountered when walking the code, so fix by setting op to INTEL_PT_OP_OTHER in other cases. Fixes: 4c761d805bb2d2ea ("perf intel-pt: Fix intel_pt_fup_event() assumptions about setting state type") Reported-by: Ian Rogers Signed-off-by: Adrian Hunter Tested-by: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Closes: https://lore.kernel.org/linux-perf-users/20240320162619.1272015-1-irogers@google.com/ Link: https://lore.kernel.org/r/20240326083223.10883-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 2 ++ tools/perf/util/intel-pt.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index b450178e3420..e733f6b1f7ac 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -1319,6 +1319,8 @@ static bool intel_pt_fup_event(struct intel_pt_decoder *decoder, bool no_tip) bool ret = false; decoder->state.type &= ~INTEL_PT_BRANCH; + decoder->state.insn_op = INTEL_PT_OP_OTHER; + decoder->state.insn_len = 0; if (decoder->set_fup_cfe_ip || decoder->set_fup_cfe) { bool ip = decoder->set_fup_cfe_ip; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index f38893e0b036..4db9a098f592 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -764,6 +764,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, addr_location__init(&al); intel_pt_insn->length = 0; + intel_pt_insn->op = INTEL_PT_OP_OTHER; if (to_ip && *ip == to_ip) goto out_no_cache; @@ -898,6 +899,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, if (to_ip && *ip == to_ip) { intel_pt_insn->length = 0; + intel_pt_insn->op = INTEL_PT_OP_OTHER; goto out_no_cache; } -- cgit v1.2.3 From 8524d71cebfa6ddcfbb89f0fe0e174c8d0477c6d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 20 Mar 2024 09:32:44 -0700 Subject: perf build: Pretend scandirat is missing with msan Memory sanitizer lacks an interceptor for scandirat, reporting all memory it allocates as uninitialized. Memory sanitizer has a scandir interceptor so use the fallback function in this case. This allows 'perf test' to run under memory sanitizer. Additional notes from Ian on running in this mode: Note, as msan needs to instrument memory allocations libraries need to be compiled with it. I lacked the msan built libraries and so built with: ``` $ make -C tools/perf O=/tmp/perf DEBUG=1 EXTRA_CFLAGS="-O0 -g -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" CC=clang CXX=clang++ HOSTCC=clang NO_LIBTRACEEVENT=1 NO_LIBELF=1 BUILD_BPF_SKEL=0 NO_LIBPFM=1 ``` oh, I disabled libbpf here as the bpf system call also lacks msan interceptors. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240320163244.1287780-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 7783479de691..7f1e016a9253 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -496,7 +496,10 @@ ifdef NO_DWARF endif ifeq ($(feature-scandirat), 1) - CFLAGS += -DHAVE_SCANDIRAT_SUPPORT + # Ignore having scandirat with memory sanitizer that lacks an interceptor. + ifeq ($(filter s% -fsanitize=memory%,$(EXTRA_CFLAGS),),) + CFLAGS += -DHAVE_SCANDIRAT_SUPPORT + endif endif ifeq ($(feature-sched_getcpu), 1) -- cgit v1.2.3 From 2b87383c885c52f051a90574fb857ca3b76b3f5c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 22 Apr 2024 19:06:43 -0700 Subject: perf annotate: Fix data type profiling on stdio The loop in hists__find_annotations() never set the 'nd' pointer to NULL and it makes stdio output repeating the last element forever. I think it doesn't set to NULL for TUI to prevent it from exiting unexpectedly. But it should just set on stdio mode. Fixes: d001c7a7f4736743 ("perf annotate-data: Add hist_entry__annotate_data_tui()") Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240423020643.740029-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 6f7104f06c42..83812b9d5363 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -491,7 +491,7 @@ find_next: return; } - if (next != NULL) + if (use_browser == 0 || next != NULL) nd = next; continue; -- cgit v1.2.3 From 8f21164321eea844ed2ed28d9db9f90f43d2bf7c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 25 Apr 2024 18:08:42 -0300 Subject: tools headers x86 cpufeatures: Sync with the kernel sources to pick BHI mitigation changes To pick the changes from: 95a6ccbdc7199a14 ("x86/bhi: Mitigate KVM by default") ec9404e40e8f3642 ("x86/bhi: Add BHI mitigation knob") be482ff9500999f5 ("x86/bhi: Enumerate Branch History Injection (BHI) bug") 0f4a837615ff925b ("x86/bhi: Define SPEC_CTRL_BHI_DIS_S") 7390db8aea0d64e9 ("x86/bhi: Add support for clearing branch history at syscall entry") This causes these perf files to be rebuilt and brings some X86_FEATURE that will be used when updating the copies of tools/arch/x86/lib/mem{cpy,set}_64.S with the kernel sources: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Adrian Hunter Cc: Daniel Sneddon Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Pawan Gupta Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/ZirIx4kPtJwGFZS0@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index a38f8f9ba657..3c7434329661 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -461,11 +461,15 @@ /* * Extended auxiliary flags: Linux defined - for features scattered in various - * CPUID levels like 0x80000022, etc. + * CPUID levels like 0x80000022, etc and Linux defined features. * * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ /* * BUG word(s) @@ -515,4 +519,5 @@ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ -- cgit v1.2.3 From 450f941ea9dce7256a485baf36f2b8d85a64e1c0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 26 Apr 2024 15:52:09 -0300 Subject: tools headers: Synchronize linux/bits.h with the kernel sources To pick up the changes in this cset: 3c7a8e190bc58081 ("uapi: introduce uapi-friendly macros for GENMASK") That just causes perf to rebuild. Its just some macros going to an uapi header that we now have to grab a copy into tools/ as well. This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/linux/bits.h include/linux/bits.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Paolo Bonzini Link: https://lore.kernel.org/lkml/ZiwJsFOBez0MS4r9@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/bits.h | 8 +------- tools/include/uapi/asm-generic/bitsperlong.h | 4 ++++ tools/include/uapi/linux/bits.h | 15 +++++++++++++++ tools/perf/check-headers.sh | 1 + 4 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 tools/include/uapi/linux/bits.h (limited to 'tools') diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 7c0cf5031abe..0eb24d21aac2 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -4,6 +4,7 @@ #include #include +#include #include #define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG)) @@ -30,15 +31,8 @@ #define GENMASK_INPUT_CHECK(h, l) 0 #endif -#define __GENMASK(h, l) \ - (((~UL(0)) - (UL(1) << (l)) + 1) & \ - (~UL(0) >> (BITS_PER_LONG - 1 - (h)))) #define GENMASK(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) - -#define __GENMASK_ULL(h, l) \ - (((~ULL(0)) - (ULL(1) << (l)) + 1) & \ - (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define GENMASK_ULL(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) diff --git a/tools/include/uapi/asm-generic/bitsperlong.h b/tools/include/uapi/asm-generic/bitsperlong.h index 352cb81947b8..fadb3f857f28 100644 --- a/tools/include/uapi/asm-generic/bitsperlong.h +++ b/tools/include/uapi/asm-generic/bitsperlong.h @@ -24,4 +24,8 @@ #endif #endif +#ifndef __BITS_PER_LONG_LONG +#define __BITS_PER_LONG_LONG 64 +#endif + #endif /* _UAPI__ASM_GENERIC_BITS_PER_LONG */ diff --git a/tools/include/uapi/linux/bits.h b/tools/include/uapi/linux/bits.h new file mode 100644 index 000000000000..3c2a101986a3 --- /dev/null +++ b/tools/include/uapi/linux/bits.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* bits.h: Macros for dealing with bitmasks. */ + +#ifndef _UAPI_LINUX_BITS_H +#define _UAPI_LINUX_BITS_H + +#define __GENMASK(h, l) \ + (((~_UL(0)) - (_UL(1) << (l)) + 1) & \ + (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) + +#define __GENMASK_ULL(h, l) \ + (((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \ + (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) + +#endif /* _UAPI_LINUX_BITS_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 9c84fd049070..672421b858ac 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -9,6 +9,7 @@ FILES=( "include/uapi/linux/const.h" "include/uapi/drm/drm.h" "include/uapi/drm/i915_drm.h" + "include/uapi/linux/bits.h" "include/uapi/linux/fadvise.h" "include/uapi/linux/fscrypt.h" "include/uapi/linux/kcmp.h" -- cgit v1.2.3 From 8c618b58c89ce4c24c0030bd42340938bdf8e29c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 26 Apr 2024 17:26:26 -0300 Subject: perf test: Reintroduce -p/--parallel and make -S/--sequential the default We can't default to doing parallel tests as there are tests that compete for the same resources and thus clash, for instance tests that put in place 'perf probe' probes, that clean the probes without regard to other tests needs, ARM64 coresight tests, Intel PT ones, etc. So reintroduce --p/--parallel and make -S/--sequential the default. We need to come up with infrastructure that state which tests can't run in parallel because they need exclusive access to some resource, something as simple as "probes" that would then avoid 'perf probe' tests from running while other such test is running, or make the tests more resilient, till then we can't use parallel mode as default. While at it, document all these options in the 'perf test' man page. Reported-by: Adrian Hunter Reported-by: Arnaldo Carvalho de Melo Reported-by: James Clark Reviewed-by: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Ziwm18BqIn_vc1vn@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-test.txt | 13 ++++++++++++- tools/perf/tests/builtin-test.c | 8 +++++++- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt index 951a2f262872..9acb8d1f6588 100644 --- a/tools/perf/Documentation/perf-test.txt +++ b/tools/perf/Documentation/perf-test.txt @@ -31,9 +31,20 @@ OPTIONS --verbose:: Be more verbose. +-S:: +--sequential:: + Run tests one after the other, this is the default mode. + +-p:: +--parallel:: + Run tests in parallel, speeds up the whole process but is not safe with + the current infrastructure, where some tests that compete for some resources, + for instance, 'perf probe' tests that add/remove probes or clean all probes, etc. + -F:: --dont-fork:: - Do not fork child for each test, run all tests within single process. + Do not fork child for each test, run all tests within single process, this + sets sequential mode. --dso:: Specify a DSO for the "Symbols" test. diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 73f53b02f733..c3d84b67ca8e 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -40,7 +40,10 @@ */ static bool dont_fork; /* Don't fork the tests in parallel and wait for their completion. */ -static bool sequential; +static bool sequential = true; +/* Do it in parallel, lacks infrastructure to avoid running tests that clash for resources, + * So leave it as the developers choice to enable while working on the needed infra */ +static bool parallel; const char *dso_to_test; const char *test_objdump_path = "objdump"; @@ -536,6 +539,7 @@ int cmd_test(int argc, const char **argv) "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('F', "dont-fork", &dont_fork, "Do not fork for testcase"), + OPT_BOOLEAN('p', "parallel", ¶llel, "Run the tests in parallel"), OPT_BOOLEAN('S', "sequential", &sequential, "Run the tests one after another rather than in parallel"), OPT_STRING('w', "workload", &workload, "work", "workload to run for testing"), @@ -566,6 +570,8 @@ int cmd_test(int argc, const char **argv) if (dont_fork) sequential = true; + else if (parallel) + sequential = false; symbol_conf.priv_size = sizeof(int); symbol_conf.try_vmlinux_path = true; -- cgit v1.2.3 From b7fab1b69b9c4c152c185af8fb375810e70cb606 Mon Sep 17 00:00:00 2001 From: xieming Date: Mon, 22 Apr 2024 09:57:30 +0800 Subject: kselftest/arm64: Remove unused parameters in abi test Remove unused parameter i in tpidr2.c main function. Signed-off-by: xieming Link: https://lore.kernel.org/r/20240422015730.89805-1-xieming@kylinos.cn Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/tpidr2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 02ee3a91b780..285c47dd42f6 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -262,7 +262,7 @@ static int write_clone_read(void) int main(int argc, char **argv) { - int ret, i; + int ret; putstr("TAP version 13\n"); putstr("1.."); -- cgit v1.2.3 From 12d712dc8e4f1a30b18f8c3789adfbc07f5eb050 Mon Sep 17 00:00:00 2001 From: Shiqi Liu Date: Sun, 21 Apr 2024 14:33:28 +0800 Subject: arm64/sysreg: Update PIE permission encodings Fix left shift overflow issue when the parameter idx is greater than or equal to 8 in the calculation of perm in PIRx_ELx_PERM macro. Fix this by modifying the encoding to use a long integer type. Signed-off-by: Shiqi Liu Acked-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20240421063328.29710-1-shiqiliu@hust.edu.cn Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 24 ++++++++++++------------ tools/arch/arm64/include/asm/sysreg.h | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9e8999592f3a..af3b206fa423 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1036,18 +1036,18 @@ * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). */ -#define PIE_NONE_O 0x0 -#define PIE_R_O 0x1 -#define PIE_X_O 0x2 -#define PIE_RX_O 0x3 -#define PIE_RW_O 0x5 -#define PIE_RWnX_O 0x6 -#define PIE_RWX_O 0x7 -#define PIE_R 0x8 -#define PIE_GCS 0x9 -#define PIE_RX 0xa -#define PIE_RW 0xc -#define PIE_RWX 0xe +#define PIE_NONE_O UL(0x0) +#define PIE_R_O UL(0x1) +#define PIE_X_O UL(0x2) +#define PIE_RX_O UL(0x3) +#define PIE_RW_O UL(0x5) +#define PIE_RWnX_O UL(0x6) +#define PIE_RWX_O UL(0x7) +#define PIE_R UL(0x8) +#define PIE_GCS UL(0x9) +#define PIE_RX UL(0xa) +#define PIE_RW UL(0xc) +#define PIE_RWX UL(0xe) #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index ccc13e991376..cd8420e8c3ad 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -701,18 +701,18 @@ * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). */ -#define PIE_NONE_O 0x0 -#define PIE_R_O 0x1 -#define PIE_X_O 0x2 -#define PIE_RX_O 0x3 -#define PIE_RW_O 0x5 -#define PIE_RWnX_O 0x6 -#define PIE_RWX_O 0x7 -#define PIE_R 0x8 -#define PIE_GCS 0x9 -#define PIE_RX 0xa -#define PIE_RW 0xc -#define PIE_RWX 0xe +#define PIE_NONE_O UL(0x0) +#define PIE_R_O UL(0x1) +#define PIE_X_O UL(0x2) +#define PIE_RX_O UL(0x3) +#define PIE_RW_O UL(0x5) +#define PIE_RWnX_O UL(0x6) +#define PIE_RWX_O UL(0x7) +#define PIE_R UL(0x8) +#define PIE_GCS UL(0x9) +#define PIE_RX UL(0xa) +#define PIE_RW UL(0xc) +#define PIE_RWX UL(0xe) #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) -- cgit v1.2.3 From 80164282b3620a3cb73de6ffda5592743e448d0e Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 23 Apr 2024 16:21:02 +0800 Subject: kselftest: arm64: Add a null pointer check There is a 'malloc' call, which can be unsuccessful. This patch will add the malloc failure checking to avoid possible null dereference and give more information about test fail reasons. Signed-off-by: Kunwu Chan Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240423082102.2018886-1-chentao@kylinos.cn Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/tags/tags_test.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/tags/tags_test.c b/tools/testing/selftests/arm64/tags/tags_test.c index 5701163460ef..955f87c1170d 100644 --- a/tools/testing/selftests/arm64/tags/tags_test.c +++ b/tools/testing/selftests/arm64/tags/tags_test.c @@ -6,6 +6,7 @@ #include #include #include +#include "../../kselftest.h" #define SHIFT_TAG(tag) ((uint64_t)(tag) << 56) #define SET_TAG(ptr, tag) (((uint64_t)(ptr) & ~SHIFT_TAG(0xff)) | \ @@ -21,6 +22,9 @@ int main(void) if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0) == 0) tbi_enabled = 1; ptr = (struct utsname *)malloc(sizeof(*ptr)); + if (!ptr) + ksft_exit_fail_msg("Failed to allocate utsname buffer\n"); + if (tbi_enabled) tag = 0x42; ptr = (struct utsname *)SET_TAG(ptr, tag); -- cgit v1.2.3 From a66f962f67ebbbdf7c82c6652180930c0169cf13 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 10 Apr 2024 19:58:52 -0700 Subject: tools/arch/x86/intel_sdsi: Fix maximum meter bundle length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The maximum number of bundles in the meter certificate was set to 8 which is much less than the maximum. Instead, since the bundles appear at the end of the file, set it based on the remaining file size from the bundle start position. Fixes: 7fdc03a7370f ("tools/arch/x86: intel_sdsi: Add support for reading meter certificates") Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240411025856.2782476-6-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- tools/arch/x86/intel_sdsi/intel_sdsi.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/intel_sdsi/intel_sdsi.c b/tools/arch/x86/intel_sdsi/intel_sdsi.c index 2cd92761f171..7eaffcbff788 100644 --- a/tools/arch/x86/intel_sdsi/intel_sdsi.c +++ b/tools/arch/x86/intel_sdsi/intel_sdsi.c @@ -43,7 +43,6 @@ #define METER_CERT_MAX_SIZE 4096 #define STATE_MAX_NUM_LICENSES 16 #define STATE_MAX_NUM_IN_BUNDLE (uint32_t)8 -#define METER_MAX_NUM_BUNDLES 8 #define __round_mask(x, y) ((__typeof__(x))((y) - 1)) #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) @@ -167,6 +166,11 @@ struct bundle_encoding_counter { uint32_t encoding; uint32_t counter; }; +#define METER_BUNDLE_SIZE sizeof(struct bundle_encoding_counter) +#define BUNDLE_COUNT(length) ((length) / METER_BUNDLE_SIZE) +#define METER_MAX_NUM_BUNDLES \ + ((METER_CERT_MAX_SIZE - sizeof(struct meter_certificate)) / \ + sizeof(struct bundle_encoding_counter)) struct sdsi_dev { struct sdsi_regs regs; @@ -386,9 +390,9 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) return -1; } - if (mc->bundle_length > METER_MAX_NUM_BUNDLES * 8) { - fprintf(stderr, "More than %d bundles: %d\n", - METER_MAX_NUM_BUNDLES, mc->bundle_length / 8); + if (mc->bundle_length > METER_MAX_NUM_BUNDLES * METER_BUNDLE_SIZE) { + fprintf(stderr, "More than %ld bundles: actual %ld\n", + METER_MAX_NUM_BUNDLES, BUNDLE_COUNT(mc->bundle_length)); return -1; } -- cgit v1.2.3 From 76f2bc17428c890754d11aa6aea14b332ba130c5 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 10 Apr 2024 19:58:53 -0700 Subject: tools/arch/x86/intel_sdsi: Fix meter_show display MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes sdsi_meter_cert_show() to correctly decode and display the meter certificate output. Adds and displays a missing version field, displays the ASCII name of the signature, and fixes the print alignment. Fixes: 7fdc03a7370f ("tools/arch/x86: intel_sdsi: Add support for reading meter certificates") Signed-off-by: David E. Box Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240411025856.2782476-7-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- tools/arch/x86/intel_sdsi/intel_sdsi.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/intel_sdsi/intel_sdsi.c b/tools/arch/x86/intel_sdsi/intel_sdsi.c index 7eaffcbff788..ae29214da102 100644 --- a/tools/arch/x86/intel_sdsi/intel_sdsi.c +++ b/tools/arch/x86/intel_sdsi/intel_sdsi.c @@ -153,11 +153,12 @@ struct bundle_encoding { }; struct meter_certificate { - uint32_t block_signature; - uint32_t counter_unit; + uint32_t signature; + uint32_t version; uint64_t ppin; + uint32_t counter_unit; uint32_t bundle_length; - uint32_t reserved; + uint64_t reserved; uint32_t mmrc_encoding; uint32_t mmrc_counter; }; @@ -338,6 +339,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) uint32_t count = 0; FILE *cert_ptr; int ret, size; + char name[4]; ret = sdsi_update_registers(s); if (ret) @@ -379,12 +381,19 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("\n"); printf("Meter certificate for device %s\n", s->dev_name); printf("\n"); - printf("Block Signature: 0x%x\n", mc->block_signature); - printf("Count Unit: %dms\n", mc->counter_unit); - printf("PPIN: 0x%lx\n", mc->ppin); - printf("Feature Bundle Length: %d\n", mc->bundle_length); - printf("MMRC encoding: %d\n", mc->mmrc_encoding); - printf("MMRC counter: %d\n", mc->mmrc_counter); + + get_feature(mc->signature, name); + printf("Signature: %.4s\n", name); + + printf("Version: %d\n", mc->version); + printf("Count Unit: %dms\n", mc->counter_unit); + printf("PPIN: 0x%lx\n", mc->ppin); + printf("Feature Bundle Length: %d\n", mc->bundle_length); + + get_feature(mc->mmrc_encoding, name); + printf("MMRC encoding: %.4s\n", name); + + printf("MMRC counter: %d\n", mc->mmrc_counter); if (mc->bundle_length % 8) { fprintf(stderr, "Invalid bundle length\n"); return -1; @@ -398,7 +407,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) bec = (void *)(mc) + sizeof(mc); - printf("Number of Feature Counters: %d\n", mc->bundle_length / 8); + printf("Number of Feature Counters: %ld\n", BUNDLE_COUNT(mc->bundle_length)); while (count++ < mc->bundle_length / 8) { char feature[5]; -- cgit v1.2.3 From 09d70ded6c566fd00886be32c26d0b2004ef239c Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 10 Apr 2024 19:58:54 -0700 Subject: tools/arch/x86/intel_sdsi: Fix meter_certificate decoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix errors in the calculation of the start position of the counters and in the display loop. While here, use a #define for the bundle count and size. Fixes: 7fdc03a7370f ("tools/arch/x86: intel_sdsi: Add support for reading meter certificates") Signed-off-by: David E. Box Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240411025856.2782476-8-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- tools/arch/x86/intel_sdsi/intel_sdsi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/intel_sdsi/intel_sdsi.c b/tools/arch/x86/intel_sdsi/intel_sdsi.c index ae29214da102..ba2a6b6645ae 100644 --- a/tools/arch/x86/intel_sdsi/intel_sdsi.c +++ b/tools/arch/x86/intel_sdsi/intel_sdsi.c @@ -394,7 +394,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("MMRC encoding: %.4s\n", name); printf("MMRC counter: %d\n", mc->mmrc_counter); - if (mc->bundle_length % 8) { + if (mc->bundle_length % METER_BUNDLE_SIZE) { fprintf(stderr, "Invalid bundle length\n"); return -1; } @@ -405,15 +405,16 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) return -1; } - bec = (void *)(mc) + sizeof(mc); + bec = (struct bundle_encoding_counter *)(mc + 1); printf("Number of Feature Counters: %ld\n", BUNDLE_COUNT(mc->bundle_length)); - while (count++ < mc->bundle_length / 8) { + while (count < BUNDLE_COUNT(mc->bundle_length)) { char feature[5]; feature[4] = '\0'; get_feature(bec[count].encoding, feature); printf(" %s: %d\n", feature, bec[count].counter); + ++count; } return 0; -- cgit v1.2.3 From 53310fe98c7b7a48b31e8e6e556a25a1238a7691 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 10 Apr 2024 19:58:55 -0700 Subject: tools/arch/x86/intel_sdsi: Simplify ascii printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add #define for feature length and move NUL assignment from callers to get_feature(). Signed-off-by: David E. Box Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240411025856.2782476-9-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- tools/arch/x86/intel_sdsi/intel_sdsi.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/intel_sdsi/intel_sdsi.c b/tools/arch/x86/intel_sdsi/intel_sdsi.c index ba2a6b6645ae..301213370740 100644 --- a/tools/arch/x86/intel_sdsi/intel_sdsi.c +++ b/tools/arch/x86/intel_sdsi/intel_sdsi.c @@ -43,6 +43,7 @@ #define METER_CERT_MAX_SIZE 4096 #define STATE_MAX_NUM_LICENSES 16 #define STATE_MAX_NUM_IN_BUNDLE (uint32_t)8 +#define FEAT_LEN 5 /* 4 plus NUL terminator */ #define __round_mask(x, y) ((__typeof__(x))((y) - 1)) #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) @@ -321,10 +322,11 @@ static char *content_type(uint32_t type) } } -static void get_feature(uint32_t encoding, char *feature) +static void get_feature(uint32_t encoding, char feature[5]) { char *name = (char *)&encoding; + feature[4] = '\0'; feature[3] = name[0]; feature[2] = name[1]; feature[1] = name[2]; @@ -339,7 +341,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) uint32_t count = 0; FILE *cert_ptr; int ret, size; - char name[4]; + char name[FEAT_LEN]; ret = sdsi_update_registers(s); if (ret) @@ -383,7 +385,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("\n"); get_feature(mc->signature, name); - printf("Signature: %.4s\n", name); + printf("Signature: %s\n", name); printf("Version: %d\n", mc->version); printf("Count Unit: %dms\n", mc->counter_unit); @@ -391,7 +393,7 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("Feature Bundle Length: %d\n", mc->bundle_length); get_feature(mc->mmrc_encoding, name); - printf("MMRC encoding: %.4s\n", name); + printf("MMRC encoding: %s\n", name); printf("MMRC counter: %d\n", mc->mmrc_counter); if (mc->bundle_length % METER_BUNDLE_SIZE) { @@ -409,9 +411,8 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) printf("Number of Feature Counters: %ld\n", BUNDLE_COUNT(mc->bundle_length)); while (count < BUNDLE_COUNT(mc->bundle_length)) { - char feature[5]; + char feature[FEAT_LEN]; - feature[4] = '\0'; get_feature(bec[count].encoding, feature); printf(" %s: %d\n", feature, bec[count].counter); ++count; @@ -494,7 +495,7 @@ static int sdsi_state_cert_show(struct sdsi_dev *s) sizeof(*lki) + // size of the license key info offset; // offset to this blob content struct bundle_encoding *bundle = (void *)(lbc) + sizeof(*lbc); - char feature[5]; + char feature[FEAT_LEN]; uint32_t i; printf(" Blob %d:\n", count - 1); @@ -507,8 +508,6 @@ static int sdsi_state_cert_show(struct sdsi_dev *s) printf(" Blob revision ID: %u\n", lbc->rev_id); printf(" Number of Features: %u\n", lbc->num_bundles); - feature[4] = '\0'; - for (i = 0; i < min(lbc->num_bundles, STATE_MAX_NUM_IN_BUNDLE); i++) { get_feature(bundle[i].encoding, feature); printf(" Feature %d: %s\n", i, feature); -- cgit v1.2.3 From f24644581ba92c901bbc2b69a1a6f6d178ee59d4 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 10 Apr 2024 19:58:56 -0700 Subject: tools/arch/x86/intel_sdsi: Add current meter support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to read the 'meter_current' file. The display is the same as the 'meter_certificate', but will show the current snapshot of the counters. Signed-off-by: David E. Box Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240411025856.2782476-10-david.e.box@linux.intel.com Signed-off-by: Hans de Goede --- tools/arch/x86/intel_sdsi/intel_sdsi.c | 49 +++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/intel_sdsi/intel_sdsi.c b/tools/arch/x86/intel_sdsi/intel_sdsi.c index 301213370740..766a5d26f534 100644 --- a/tools/arch/x86/intel_sdsi/intel_sdsi.c +++ b/tools/arch/x86/intel_sdsi/intel_sdsi.c @@ -185,6 +185,7 @@ struct sdsi_dev { enum command { CMD_SOCKET_INFO, CMD_METER_CERT, + CMD_METER_CURRENT_CERT, CMD_STATE_CERT, CMD_PROV_AKC, CMD_PROV_CAP, @@ -333,13 +334,14 @@ static void get_feature(uint32_t encoding, char feature[5]) feature[0] = name[3]; } -static int sdsi_meter_cert_show(struct sdsi_dev *s) +static int sdsi_meter_cert_show(struct sdsi_dev *s, bool show_current) { char buf[METER_CERT_MAX_SIZE] = {0}; struct bundle_encoding_counter *bec; struct meter_certificate *mc; uint32_t count = 0; FILE *cert_ptr; + char *cert_fname; int ret, size; char name[FEAT_LEN]; @@ -349,7 +351,6 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) if (!s->regs.en_features.sdsi) { fprintf(stderr, "SDSi feature is present but not enabled.\n"); - fprintf(stderr, " Unable to read meter certificate\n"); return -1; } @@ -364,15 +365,17 @@ static int sdsi_meter_cert_show(struct sdsi_dev *s) return ret; } - cert_ptr = fopen("meter_certificate", "r"); + cert_fname = show_current ? "meter_current" : "meter_certificate"; + cert_ptr = fopen(cert_fname, "r"); + if (!cert_ptr) { - perror("Could not open 'meter_certificate' file"); + fprintf(stderr, "Could not open '%s' file: %s", cert_fname, strerror(errno)); return -1; } size = fread(buf, 1, sizeof(buf), cert_ptr); if (!size) { - fprintf(stderr, "Could not read 'meter_certificate' file\n"); + fprintf(stderr, "Could not read '%s' file\n", cert_fname); fclose(cert_ptr); return -1; } @@ -738,7 +741,7 @@ static void sdsi_free_dev(struct sdsi_dev *s) static void usage(char *prog) { - printf("Usage: %s [-l] [-d DEVNO [-i] [-s] [-m] [-a FILE] [-c FILE]]\n", prog); + printf("Usage: %s [-l] [-d DEVNO [-i] [-s] [-m | -C] [-a FILE] [-c FILE]\n", prog); } static void show_help(void) @@ -747,8 +750,9 @@ static void show_help(void) printf(" %-18s\t%s\n", "-l, --list", "list available On Demand devices"); printf(" %-18s\t%s\n", "-d, --devno DEVNO", "On Demand device number"); printf(" %-18s\t%s\n", "-i, --info", "show socket information"); - printf(" %-18s\t%s\n", "-s, --state", "show state certificate"); - printf(" %-18s\t%s\n", "-m, --meter", "show meter certificate"); + printf(" %-18s\t%s\n", "-s, --state", "show state certificate data"); + printf(" %-18s\t%s\n", "-m, --meter", "show meter certificate data"); + printf(" %-18s\t%s\n", "-C, --meter_current", "show live unattested meter data"); printf(" %-18s\t%s\n", "-a, --akc FILE", "provision socket with AKC FILE"); printf(" %-18s\t%s\n", "-c, --cap FILE>", "provision socket with CAP FILE"); } @@ -764,21 +768,22 @@ int main(int argc, char *argv[]) int option_index = 0; static struct option long_options[] = { - {"akc", required_argument, 0, 'a'}, - {"cap", required_argument, 0, 'c'}, - {"devno", required_argument, 0, 'd'}, - {"help", no_argument, 0, 'h'}, - {"info", no_argument, 0, 'i'}, - {"list", no_argument, 0, 'l'}, - {"meter", no_argument, 0, 'm'}, - {"state", no_argument, 0, 's'}, - {0, 0, 0, 0 } + {"akc", required_argument, 0, 'a'}, + {"cap", required_argument, 0, 'c'}, + {"devno", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"info", no_argument, 0, 'i'}, + {"list", no_argument, 0, 'l'}, + {"meter", no_argument, 0, 'm'}, + {"meter_current", no_argument, 0, 'C'}, + {"state", no_argument, 0, 's'}, + {0, 0, 0, 0 } }; progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+a:c:d:hilms", long_options, + while ((opt = getopt_long_only(argc, argv, "+a:c:d:hilmCs", long_options, &option_index)) != -1) { switch (opt) { case 'd': @@ -794,6 +799,9 @@ int main(int argc, char *argv[]) case 'm': command = CMD_METER_CERT; break; + case 'C': + command = CMD_METER_CURRENT_CERT; + break; case 's': command = CMD_STATE_CERT; break; @@ -832,7 +840,10 @@ int main(int argc, char *argv[]) ret = sdsi_read_reg(s); break; case CMD_METER_CERT: - ret = sdsi_meter_cert_show(s); + ret = sdsi_meter_cert_show(s, false); + break; + case CMD_METER_CURRENT_CERT: + ret = sdsi_meter_cert_show(s, true); break; case CMD_STATE_CERT: ret = sdsi_state_cert_show(s); -- cgit v1.2.3 From 9ea48bdfd5b19a81edfd9dcc12b8af6bb319c6d8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 7 Mar 2024 15:45:54 -0800 Subject: tools/power/x86/intel-speed-select: Increase die count TPMI platform information supports up to 16 compute dies. So increase the range. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 4bddd3c66bf7..39ee75677c2c 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -80,7 +80,7 @@ #define DISP_FREQ_MULTIPLIER 100 #define MAX_PACKAGE_COUNT 32 -#define MAX_DIE_PER_PACKAGE 2 +#define MAX_DIE_PER_PACKAGE 16 #define MAX_PUNIT_PER_DIE 8 /* Unified structure to specific a CPU or a Power Domain */ -- cgit v1.2.3 From f9264471337e1db69ffc97525bff605b8c7c442f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 7 Mar 2024 15:47:11 -0800 Subject: tools/power/x86/intel-speed-select: Support multiple dies When the die id is same as punit compute die ID, treat them same. In this case, when for_each_online_power_domain_in_set() is called, then don't loop for each punit in a die. Just loop for all punits in a package. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index d865dc1f89ee..54c50124303e 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -46,6 +46,8 @@ static int force_online_offline; static int auto_mode; static int fact_enable_fail; static int cgroupv2; +static int max_die_id; +static int max_punit_id; /* clos related */ static int current_clos = -1; @@ -562,6 +564,18 @@ void for_each_online_power_domain_in_set(void (*callback)(struct isst_id *, void } for (i = 0; i < MAX_PACKAGE_COUNT; i++) { + if (max_die_id == max_punit_id) { + for (k = 0; k < MAX_PUNIT_PER_DIE && k < MAX_DIE_PER_PACKAGE; k++) { + id.cpu = cpus[i][k][k]; + id.pkg = i; + id.die = k; + id.punit = k; + if (isst_is_punit_valid(&id)) + callback(&id, arg1, arg2, arg3, arg4); + } + continue; + } + for (j = 0; j < MAX_DIE_PER_PACKAGE; j++) { /* * Fix me: @@ -795,6 +809,12 @@ static void create_cpu_map(void) cpu_cnt[pkg_id][die_id][punit_id]++; + if (max_die_id < die_id) + max_die_id = die_id; + + if (max_punit_id < cpu_map[i].punit_id) + max_punit_id = cpu_map[i].punit_id; + debug_printf( "map logical_cpu:%d core: %d die:%d pkg:%d punit:%d punit_cpu:%d punit_core:%d\n", i, cpu_map[i].core_id, cpu_map[i].die_id, -- cgit v1.2.3 From 55d5639bda6563fdfbdf0606c3c23cf5fceacd61 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 8 Mar 2024 16:12:33 -0800 Subject: tools/power/x86/intel-speed-select: Fix display for unsupported levels During call to "intel-speed-select turbo-freq info" some junk values are reported for unsupported levels. Initialize the structure fact_info with 0s, so that isst_fact_display_information() will skip "0" values in the frequency. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 54c50124303e..d7809b7c12d9 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -2074,6 +2074,7 @@ static void dump_fact_config_for_cpu(struct isst_id *id, void *arg1, void *arg2, struct isst_fact_info fact_info; int ret; + memset(&fact_info, 0, sizeof(fact_info)); ret = isst_get_fact_info(id, tdp_level, fact_bucket, &fact_info); if (ret) { isst_display_error_info_message(1, "Failed to get turbo-freq info at this level", 1, tdp_level); -- cgit v1.2.3 From 38fa152b3de09c01e3488debb903adb0a28580b9 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 8 Mar 2024 18:11:03 -0800 Subject: tools/power/x86/intel-speed-select: Present all TRL levels for turbo-freq For turbo-freq feature, only 3 levels of frequencies are displayed even if platform support more. Present all levels based on the CPU model. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-core-mbox.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-core-mbox.c b/tools/power/x86/intel-speed-select/isst-core-mbox.c index 24bea57f4ff5..c81ecd602bcf 100644 --- a/tools/power/x86/intel-speed-select/isst-core-mbox.c +++ b/tools/power/x86/intel-speed-select/isst-core-mbox.c @@ -746,6 +746,7 @@ static int mbox_set_pbf_fact_status(struct isst_id *id, int pbf, int enable) static int _get_fact_bucket_info(struct isst_id *id, int level, struct isst_fact_bucket_info *bucket_info) { + int trl_max_levels = isst_get_trl_max_levels(); unsigned int resp; int i, k, ret; @@ -769,7 +770,7 @@ static int _get_fact_bucket_info(struct isst_id *id, int level, } } - for (k = 0; k < 3; ++k) { + for (k = 0; k < trl_max_levels; ++k) { for (i = 0; i < 2; ++i) { int j; -- cgit v1.2.3 From 80a513e3f7fca0493e2760b2bb0526faa33a12cb Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 25 Mar 2024 13:39:36 -0700 Subject: tools/power/x86/intel-speed-select: Increase number of CPUs displayed Currently max 128 CPUs can be displayed in the enable CPU list. Double the range. Since the size is big for stack allocation, change to static. Here changing to static is fine as these functions are called in serial. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-display.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 14c9b037859a..e30f6355e417 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -199,8 +199,8 @@ static void _isst_pbf_display_information(struct isst_id *id, FILE *outf, int le struct isst_pbf_info *pbf_info, int disp_level) { - char header[256]; - char value[512]; + static char header[256]; + static char value[1024]; snprintf(header, sizeof(header), "speed-select-base-freq-properties"); format_and_print(outf, disp_level, header, NULL); @@ -338,8 +338,8 @@ void isst_ctdp_display_core_info(struct isst_id *id, FILE *outf, char *prefix, void isst_ctdp_display_information(struct isst_id *id, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { - char header[256]; - char value[512]; + static char header[256]; + static char value[1024]; static int level; int trl_max_levels = isst_get_trl_max_levels(); int i; -- cgit v1.2.3 From 1fcf670e50645f23f026d1dca82e59200115f925 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 26 Mar 2024 18:55:53 -0400 Subject: tools/power/x86/intel-speed-select: SST BF/TF support per level SST BF and TF can be enabled/disabled per level. So check the current level support from the mask of supported levels. This change from a single level to mask for info.sst_tf_support and info.sst_tf_support is indicated by API version change. Use as mask for API version above 2. In this way there is no change in behavior when running on older kernel with API version 2. Since the tool can support now API version 3, update the supported API version. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 2 +- tools/power/x86/intel-speed-select/isst-core-tpmi.c | 10 ++++++++-- tools/power/x86/intel-speed-select/isst-core.c | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index d7809b7c12d9..424e708b7b20 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -18,7 +18,7 @@ struct process_cmd_struct { static const char *version_str = "v1.18"; -static const int supported_api_ver = 2; +static const int supported_api_ver = 3; static struct isst_if_platform_info isst_platform_info; static char *progname; static int debug_flag; diff --git a/tools/power/x86/intel-speed-select/isst-core-tpmi.c b/tools/power/x86/intel-speed-select/isst-core-tpmi.c index 3458768562e5..32ea70c7dbd8 100644 --- a/tools/power/x86/intel-speed-select/isst-core-tpmi.c +++ b/tools/power/x86/intel-speed-select/isst-core-tpmi.c @@ -194,8 +194,14 @@ static int tpmi_get_ctdp_control(struct isst_id *id, int config_index, if (!(info.level_mask & level_mask)) return -1; - ctdp_level->fact_support = info.sst_tf_support; - ctdp_level->pbf_support = info.sst_bf_support; + if (api_version() > 2) { + ctdp_level->fact_support = info.sst_tf_support & BIT(config_index); + ctdp_level->pbf_support = info.sst_bf_support & BIT(config_index); + } else { + ctdp_level->fact_support = info.sst_tf_support; + ctdp_level->pbf_support = info.sst_bf_support; + } + ctdp_level->fact_enabled = !!(info.feature_state & BIT(1)); ctdp_level->pbf_enabled = !!(info.feature_state & BIT(0)); diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index f55fef4c13a7..05efffbca3b7 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -23,6 +23,7 @@ int isst_set_platform_ops(int api_version) isst_ops = mbox_get_platform_ops(); break; case 2: + case 3: isst_ops = tpmi_get_platform_ops(); break; default: -- cgit v1.2.3 From 8ebc39ace34e9b9944e29110598f8343f2d8c159 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 26 Apr 2024 14:39:44 -0700 Subject: tools/power/x86/intel-speed-select: Display CPU as None for -1 When there is no CPU in a power domain, display "None" instead of -1. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-display.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index e30f6355e417..07ebd08f3202 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -172,12 +172,19 @@ static int print_package_info(struct isst_id *id, FILE *outf) int level = 1; if (out_format_is_json()) { - if (api_version() > 1) - snprintf(header, sizeof(header), "package-%d:die-%d:powerdomain-%d:cpu-%d", - id->pkg, id->die, id->punit, id->cpu); - else + if (api_version() > 1) { + if (id->cpu < 0) + snprintf(header, sizeof(header), + "package-%d:die-%d:powerdomain-%d:cpu-None", + id->pkg, id->die, id->punit); + else + snprintf(header, sizeof(header), + "package-%d:die-%d:powerdomain-%d:cpu-%d", + id->pkg, id->die, id->punit, id->cpu); + } else { snprintf(header, sizeof(header), "package-%d:die-%d:cpu-%d", id->pkg, id->die, id->cpu); + } format_and_print(outf, level, header, NULL); return 1; } @@ -189,7 +196,12 @@ static int print_package_info(struct isst_id *id, FILE *outf) snprintf(header, sizeof(header), "powerdomain-%d", id->punit); format_and_print(outf, level++, header, NULL); } - snprintf(header, sizeof(header), "cpu-%d", id->cpu); + + if (id->cpu < 0) + snprintf(header, sizeof(header), "cpu-None"); + else + snprintf(header, sizeof(header), "cpu-%d", id->cpu); + format_and_print(outf, level, header, NULL); return level; -- cgit v1.2.3 From 5cfac5abb6e23d15669d0621d991d141f4f9b7d4 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 25 Apr 2024 17:36:08 -0700 Subject: tools/power/x86/intel-speed-select: v1.19 release This version addresses issues with: - Support of SST BF/TF support per level - Increase number of CPUs displayed - Present all TRL levels for turbo-freq - Fix display for unsupported levels - Support multiple dies - Increase die count - Change CPU display for non compute domain Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 424e708b7b20..5899c27c2e2e 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -16,7 +16,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.18"; +static const char *version_str = "v1.19"; static const int supported_api_ver = 3; static struct isst_if_platform_info isst_platform_info; -- cgit v1.2.3 From 37496845c812db2a470d51088a59ee38156e8058 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 29 Feb 2024 15:07:09 +0530 Subject: selftests/powerpc: Re-order *FLAGS to follow lib.mk In some powerpc/ sub-folder Makefiles, CFLAGS are defined before lib.mk include. Clean it up by re-ordering the flags to follow after the mk include. This is needed to support sub-folders in powerpc/ buildable on its own. Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240229093711.581230-1-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/benchmarks/Makefile | 4 ++-- tools/testing/selftests/powerpc/copyloops/Makefile | 20 ++++++++++---------- tools/testing/selftests/powerpc/nx-gzip/Makefile | 4 ++-- tools/testing/selftests/powerpc/pmu/ebb/Makefile | 20 ++++++++++---------- .../selftests/powerpc/pmu/event_code_tests/Makefile | 4 ++-- .../selftests/powerpc/pmu/sampling_tests/Makefile | 4 ++-- tools/testing/selftests/powerpc/primitives/Makefile | 4 ++-- tools/testing/selftests/powerpc/security/Makefile | 4 ++-- tools/testing/selftests/powerpc/signal/Makefile | 3 ++- tools/testing/selftests/powerpc/stringloops/Makefile | 10 +++++----- .../testing/selftests/powerpc/switch_endian/Makefile | 4 ++-- tools/testing/selftests/powerpc/syscalls/Makefile | 4 ++-- tools/testing/selftests/powerpc/vphn/Makefile | 4 ++-- 13 files changed, 45 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index a32a6ab89914..75f5232c3aec 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -4,11 +4,11 @@ TEST_GEN_FILES := exec_target TEST_FILES := settings -CFLAGS += -O2 - top_srcdir = ../../../../.. include ../../lib.mk +CFLAGS += -O2 + $(TEST_GEN_PROGS): ../harness.c $(OUTPUT)/context_switch: ../utils.c diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 77594e697f2f..72684ed589c0 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -1,14 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -m64 -CFLAGS += -I$(CURDIR) -CFLAGS += -D SELFTEST -CFLAGS += -maltivec -CFLAGS += -mcpu=power4 - -# Use our CFLAGS for the implicit .S rule & set the asm machine type -ASFLAGS = $(CFLAGS) -Wa,-mpower4 - TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ copyuser_p7_t0 copyuser_p7_t1 \ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ @@ -21,6 +11,16 @@ EXTRA_SOURCES := validate.c ../harness.c stubs.S top_srcdir = ../../../../.. include ../../lib.mk +# The loops are all 64-bit code +CFLAGS += -m64 +CFLAGS += -I$(CURDIR) +CFLAGS += -D SELFTEST +CFLAGS += -maltivec +CFLAGS += -mcpu=power4 + +# Use our CFLAGS for the implicit .S rule & set the asm machine type +ASFLAGS = $(CFLAGS) -Wa,-mpower4 + $(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) $(CC) $(CPPFLAGS) $(CFLAGS) \ -D COPY_LOOP=test___copy_tofrom_user_base \ diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index 0785c2e99d40..b40991f902b2 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -1,8 +1,8 @@ -CFLAGS = -O3 -m64 -I./include -I../include - TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk +CFLAGS = -O3 -m64 -I./include -I../include + $(TEST_GEN_FILES): gzip_vas.c ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index 010160690227..b3946ce17e0c 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -4,16 +4,6 @@ include ../../../../../build/Build.include noarg: $(MAKE) -C ../../ -# The EBB handler is 64-bit code and everything links against it -CFLAGS += -m64 - -TMPOUT = $(OUTPUT)/TMPDIR/ -# Toolchains may build PIE by default which breaks the assembly -no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) - -LDFLAGS += $(no-pie-option) - TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ cycles_with_freeze_test pmc56_overflow_test \ ebb_vs_cpu_event_test cpu_event_vs_ebb_test \ @@ -29,6 +19,16 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ top_srcdir = ../../../../../.. include ../../../lib.mk +# The EBB handler is 64-bit code and everything links against it +CFLAGS += -m64 + +TMPOUT = $(OUTPUT)/TMPDIR/ +# Toolchains may build PIE by default which breaks the assembly +no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) + +LDFLAGS += $(no-pie-option) + $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \ ebb.c ebb_handler.S trace.c busy_loop.S diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 4e07d7046457..509d4b235b9e 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \ group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \ group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \ @@ -12,4 +10,6 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te top_srcdir = ../../../../../.. include ../../../lib.mk +CFLAGS += -m64 + $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ../sampling_tests/misc.h ../sampling_tests/misc.c diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 9e67351fb252..d45892151e05 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test \ mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ @@ -12,4 +10,6 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk +CFLAGS += -m64 + $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c misc.c misc.h ../loop.S ../branch_loops.S diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 9b9491a63213..6dc5c5a42ca9 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -I$(CURDIR) - TEST_GEN_PROGS := load_unaligned_zeropad top_srcdir = ../../../../.. include ../../lib.mk +CFLAGS += -I$(CURDIR) + $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index e0d979ab0204..0a08386be969 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -5,10 +5,10 @@ TEST_PROGS := mitigation-patching.sh top_srcdir = ../../../../.. -CFLAGS += $(KHDR_INCLUDES) - include ../../lib.mk +CFLAGS += $(KHDR_INCLUDES) + $(TEST_GEN_PROGS): ../harness.c ../utils.c $(OUTPUT)/spectre_v2: CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index f679d260afc8..b15d5dbccc24 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -3,7 +3,6 @@ TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart TEST_GEN_PROGS += sigreturn_kernel TEST_GEN_PROGS += sigreturn_unaligned -CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm $(OUTPUT)/sigfuz: CFLAGS += -pthread -m64 @@ -12,4 +11,6 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +CFLAGS += -maltivec + $(TEST_GEN_PROGS): ../harness.c ../utils.c signal.S diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 9c39f55a58ff..87c8c8f238da 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -I$(CURDIR) - EXTRA_SOURCES := ../harness.c build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c >/dev/null 2>&1) then echo "1"; fi) @@ -27,9 +24,12 @@ $(OUTPUT)/strlen_32: CFLAGS += -m32 TEST_GEN_PROGS += strlen_32 endif -ASFLAGS = $(CFLAGS) - top_srcdir = ../../../../.. include ../../lib.mk +# The loops are all 64-bit code +CFLAGS += -I$(CURDIR) + +ASFLAGS = $(CFLAGS) + $(TEST_GEN_PROGS): $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index bdc081afedb0..8f0c2a1d3333 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -1,13 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := switch_endian_test -ASFLAGS += -O2 -Wall -g -nostdlib -m64 - EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. include ../../lib.mk +ASFLAGS += -O2 -Wall -g -nostdlib -m64 + $(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index ee1740ddfb0c..83dc33500773 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := ipc_unmuxed rtas_filter -CFLAGS += $(KHDR_INCLUDES) - top_srcdir = ../../../../.. include ../../lib.mk +CFLAGS += $(KHDR_INCLUDES) + $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index cf65cbf33085..ddc09a20b80f 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := test-vphn -CFLAGS += -m64 -I$(CURDIR) - top_srcdir = ../../../../.. include ../../lib.mk +CFLAGS += -m64 -I$(CURDIR) + $(TEST_GEN_PROGS): ../harness.c -- cgit v1.2.3 From 5553a79387e92ffd812a49fdcf679f392281f6a9 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 29 Feb 2024 15:07:10 +0530 Subject: selftests/powerpc: Add flags.mk to support pmu buildable When running `make -C powerpc/pmu run_tests` from top level selftests directory, currently this error is being reported: make: Entering directory '/home/maddy/linux/tools/testing/selftests/powerpc/pmu' Makefile:40: warning: overriding recipe for target 'emit_tests' ../../lib.mk:111: warning: ignoring old recipe for target 'emit_tests' gcc -m64 count_instructions.c ../harness.c event.c lib.c ../utils.c loop.S -o /home/maddy/selftest_output//count_instructions In file included from count_instructions.c:13: event.h:12:10: fatal error: utils.h: No such file or directory 12 | #include "utils.h" | ^~~~~~~~~ compilation terminated. This is due to missing of include path in CFLAGS. That is, CFLAGS and GIT_VERSION macros are defined in the powerpc/ folder Makefile which in this case is not involved. To address the failure in case of executing specific sub-folder test directly, a new rule file has been addded by the patch called "flags.mk" under selftest/powerpc/ folder and is linked to all the Makefile of powerpc/pmu sub-folders. Reported-by: Sachin Sant Signed-off-by: Madhavan Srinivasan Tested-by: Sachin Sant [mpe: Fixup ifeq, make GIT_VERSION simply expanded to avoid re-executing git describe] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240229093711.581230-2-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/flags.mk | 12 ++++++++++++ tools/testing/selftests/powerpc/pmu/Makefile | 1 + tools/testing/selftests/powerpc/pmu/ebb/Makefile | 1 + .../testing/selftests/powerpc/pmu/event_code_tests/Makefile | 1 + tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile | 1 + 5 files changed, 16 insertions(+) create mode 100644 tools/testing/selftests/powerpc/flags.mk (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/flags.mk b/tools/testing/selftests/powerpc/flags.mk new file mode 100644 index 000000000000..b909bee3cb2a --- /dev/null +++ b/tools/testing/selftests/powerpc/flags.mk @@ -0,0 +1,12 @@ +#This checks for any ENV variables and add those. + +ifeq ($(GIT_VERSION),) +GIT_VERSION := $(shell git describe --always --long --dirty || echo "unknown") +export GIT_VERSION +endif + +ifeq ($(CFLAGS),) +CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(selfdir)/powerpc/include $(CFLAGS) +export CFLAGS +endif + diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index a284fa874a9f..1fcacae1b188 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -7,6 +7,7 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index b3946ce17e0c..1b39af7c10db 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -18,6 +18,7 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk # The EBB handler is 64-bit code and everything links against it CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 509d4b235b9e..fdb080b3fa65 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -9,6 +9,7 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index d45892151e05..9f79bec5fce7 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -9,6 +9,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk CFLAGS += -m64 -- cgit v1.2.3 From 108e5e683333615023265a9a73a29d4c2fa16c70 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 29 Feb 2024 15:07:11 +0530 Subject: selftests/powerpc: make sub-folders buildable on their own Build breaks when executing make with run_tests for sub-folders under powerpc. This is because, CFLAGS and GIT_VERSION macros are defined in Makefile of toplevel powerpc folder. make: Entering directory '/home/maddy/linux/tools/testing/selftests/powerpc/mm' gcc hugetlb_vs_thp_test.c ../harness.c ../utils.c -o /home/maddy/selftest_output//hugetlb_vs_thp_test hugetlb_vs_thp_test.c:6:10: fatal error: utils.h: No such file or directory 6 | #include "utils.h" | ^~~~~~~~~ compilation terminated. Fix this by adding the flags.mk in each sub-folder Makefile. Also remove the CFLAGS and GIT_VERSION macros from powerpc/ folder Makefile since the same is definied in flags.mk Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240229093711.581230-3-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/Makefile | 7 +------ tools/testing/selftests/powerpc/alignment/Makefile | 1 + tools/testing/selftests/powerpc/benchmarks/Makefile | 1 + tools/testing/selftests/powerpc/cache_shape/Makefile | 1 + tools/testing/selftests/powerpc/copyloops/Makefile | 1 + tools/testing/selftests/powerpc/dexcr/Makefile | 1 + tools/testing/selftests/powerpc/dscr/Makefile | 1 + tools/testing/selftests/powerpc/eeh/Makefile | 1 + tools/testing/selftests/powerpc/math/Makefile | 1 + tools/testing/selftests/powerpc/mce/Makefile | 1 + tools/testing/selftests/powerpc/mm/Makefile | 1 + tools/testing/selftests/powerpc/nx-gzip/Makefile | 1 + tools/testing/selftests/powerpc/papr_attributes/Makefile | 3 ++- tools/testing/selftests/powerpc/papr_sysparm/Makefile | 1 + tools/testing/selftests/powerpc/papr_vpd/Makefile | 1 + tools/testing/selftests/powerpc/primitives/Makefile | 1 + tools/testing/selftests/powerpc/ptrace/Makefile | 1 + tools/testing/selftests/powerpc/security/Makefile | 1 + tools/testing/selftests/powerpc/signal/Makefile | 1 + tools/testing/selftests/powerpc/stringloops/Makefile | 1 + tools/testing/selftests/powerpc/switch_endian/Makefile | 1 + tools/testing/selftests/powerpc/syscalls/Makefile | 1 + tools/testing/selftests/powerpc/tm/Makefile | 1 + tools/testing/selftests/powerpc/vphn/Makefile | 1 + 24 files changed, 25 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index c376151982c4..2f299fd04d2d 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -7,12 +7,6 @@ ARCH := $(shell echo $(ARCH) | sed -e s/ppc.*/powerpc/) ifeq ($(ARCH),powerpc) -GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown") - -CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR)/include $(CFLAGS) - -export CFLAGS - SUB_DIRS = alignment \ benchmarks \ cache_shape \ @@ -46,6 +40,7 @@ $(SUB_DIRS): BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all include ../lib.mk +include ./flags.mk override define RUN_TESTS +@for TARGET in $(SUB_DIRS); do \ diff --git a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile index 93e9af37449d..66d5d7aaeb20 100644 --- a/tools/testing/selftests/powerpc/alignment/Makefile +++ b/tools/testing/selftests/powerpc/alignment/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := copy_first_unaligned alignment_handler top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index 75f5232c3aec..1321922038d0 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -6,6 +6,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += -O2 diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 689f6c8ebcd8..3a3ca956ac66 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := cache_shape top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 72684ed589c0..42940f92d832 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -10,6 +10,7 @@ EXTRA_SOURCES := validate.c ../harness.c stubs.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk # The loops are all 64-bit code CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 76210f2bcec3..523947a38d17 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -2,6 +2,7 @@ TEST_GEN_PROGS := hashchk_test TEST_GEN_FILES := lsdexcr include ../../lib.mk +include ../flags.mk $(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 9289d5febe1e..9fa9cb5bd989 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -5,6 +5,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread $(OUTPUT)/dscr_explicit_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile index ae963eb2dc5b..70797716f2b5 100644 --- a/tools/testing/selftests/powerpc/eeh/Makefile +++ b/tools/testing/selftests/powerpc/eeh/Makefile @@ -7,3 +7,4 @@ TEST_FILES := eeh-functions.sh settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile index 3948f7c510aa..b14fd2e0c6a8 100644 --- a/tools/testing/selftests/powerpc/math/Makefile +++ b/tools/testing/selftests/powerpc/math/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal fpu_denormal vmx_syscall vm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c $(TEST_GEN_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile index 2424513982d9..ce4ed679aaaf 100644 --- a/tools/testing/selftests/powerpc/mce/Makefile +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := inject-ra-err include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 4a6608beef0e..aab058ecb352 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -13,6 +13,7 @@ TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index b40991f902b2..480d8ba94cf7 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -2,6 +2,7 @@ TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk +include ../flags.mk CFLAGS = -O3 -m64 -I./include -I../include diff --git a/tools/testing/selftests/powerpc/papr_attributes/Makefile b/tools/testing/selftests/powerpc/papr_attributes/Makefile index e899712d49db..406429499022 100644 --- a/tools/testing/selftests/powerpc/papr_attributes/Makefile +++ b/tools/testing/selftests/powerpc/papr_attributes/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := attr_test top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -$(TEST_GEN_PROGS): ../harness.c ../utils.c \ No newline at end of file +$(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_sysparm/Makefile b/tools/testing/selftests/powerpc/papr_sysparm/Makefile index 7f79e437634a..fed4f2414dbf 100644 --- a/tools/testing/selftests/powerpc/papr_sysparm/Makefile +++ b/tools/testing/selftests/powerpc/papr_sysparm/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_sysparm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile b/tools/testing/selftests/powerpc/papr_vpd/Makefile index 06b719703bfd..b09852e40882 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/Makefile +++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_vpd top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 6dc5c5a42ca9..23bd9a7590dd 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := load_unaligned_zeropad top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += -I$(CURDIR) diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 1b39b86849da..59ca01d8567e 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -26,6 +26,7 @@ LOCAL_HDRS += $(patsubst %,$(selfdir)/powerpc/ptrace/%,$(wildcard *.h)) top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk TM_TESTS := $(patsubst %,$(OUTPUT)/%,$(TM_TESTS)) TESTS_64 := $(patsubst %,$(OUTPUT)/%,$(TESTS_64)) diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index 0a08386be969..33286039724a 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -6,6 +6,7 @@ TEST_PROGS := mitigation-patching.sh top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index b15d5dbccc24..ece95bd52be9 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -10,6 +10,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += -maltivec diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 87c8c8f238da..4c9d9a58c9d1 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -26,6 +26,7 @@ endif top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk # The loops are all 64-bit code CFLAGS += -I$(CURDIR) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index 8f0c2a1d3333..0da2e0a74264 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -5,6 +5,7 @@ EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk ASFLAGS += -O2 -Wall -g -nostdlib -m64 diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index 83dc33500773..3bc07af88f0e 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := ipc_unmuxed rtas_filter top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 3876805c2f31..f13f0ab36007 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -11,6 +11,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index ddc09a20b80f..61d519a076c6 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := test-vphn top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += -m64 -I$(CURDIR) -- cgit v1.2.3 From 822a04957cc5e675570645f506270797a1cf2865 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 22 Apr 2024 23:34:52 +1000 Subject: selftests/powerpc: Convert pmu Makefile to for loop style The pmu Makefile has grown more sub directories over the years. Rather than open coding the rules for each subdir, use for loops. Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422133453.1793988-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/pmu/Makefile | 43 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 1fcacae1b188..773933e5180e 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -9,7 +9,9 @@ top_srcdir = ../../../../.. include ../../lib.mk include ../flags.mk -all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests +SUB_DIRS := ebb sampling_tests event_code_tests + +all: $(TEST_GEN_PROGS) $(SUB_DIRS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) @@ -23,12 +25,16 @@ $(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) $(OUTPUT)/per_event_excludes: ../utils.c +$(SUB_DIRS): + BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all + DEFAULT_RUN_TESTS := $(RUN_TESTS) override define RUN_TESTS $(DEFAULT_RUN_TESTS) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests; \ + done; endef emit_tests: @@ -36,34 +42,29 @@ emit_tests: BASENAME_TEST=`basename $$TEST`; \ echo "$(COLLECTION):$$BASENAME_TEST"; \ done - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests; \ + done; DEFAULT_INSTALL_RULE := $(INSTALL_RULE) override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install; \ + done; endef DEFAULT_CLEAN := $(CLEAN) override define CLEAN $(DEFAULT_CLEAN) $(RM) $(TEST_GEN_PROGS) $(OUTPUT)/loop.o - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean; \ + done; endef -ebb: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -sampling_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -event_code_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all .PHONY: all run_tests ebb sampling_tests event_code_tests emit_tests -- cgit v1.2.3 From dda32e37d397f5937cc24a6e98b71d3645f51afa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 22 Apr 2024 23:34:53 +1000 Subject: selftests/powerpc: Install tests in sub-directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sources for the powerpc selftests are arranged into sub-directories. However when the tests are built and installed, the sub-directories are squashed, losing the structure. For example, with the current code the result of installing the selftests is: $ tree tools/testing/selftests/kselftest_install tools/testing/selftests/kselftest_install ├── kselftest │   ├── ktap_helpers.sh │   ├── module.sh │   ├── prefix.pl │   └── runner.sh ├── kselftest-list.txt ├── powerpc │   ├── alignment_handler │   ├── attr_test │   ├── back_to_back_ebbs_test │   ├── bad_accesses │   ├── bhrb_filter_map_test │   ├── bhrb_no_crash_wo_pmu_test │   ├── blacklisted_events_test │   ├── cache_shape │   ├── close_clears_pmcc_test │   ├── context_switch │   ├── copy_first_unaligned ... │   ├── settings ... │   └── wild_bctr └── run_kselftest.sh All the powerpc tests are squashed into the single powerpc directory. In particular, note that there is a single `settings` file, even though there are multiple settings files in the powerpc selftest sources. One of the settings files ends up installed, depending on install order, even if they have different contents. Similarly if there were two tests with the same name in different sub-directories they would clobber each other. Fix it by replicating the directory structure of the source tree into the install directory. The result being for example: $ tree tools/testing/selftests/kselftest_install tools/testing/selftests/kselftest_install ├── kselftest │   ├── ktap_helpers.sh │   ├── module.sh │   ├── prefix.pl │   └── runner.sh ├── kselftest-list.txt ├── powerpc │   ├── alignment │   │   ├── alignment_handler │   │   └── copy_first_unaligned │   ├── benchmarks │   │   ├── context_switch │   │   ├── exec_target │   │   ├── fork │   │   ├── futex_bench │   │   ├── gettimeofday │   │   ├── mmap_bench │   │   ├── null_syscall │   │   └── settings ... │   ├── eeh │   │   ├── eeh-basic.sh │   │   ├── eeh-functions.sh │   │   └── settings ... │   └── vphn │   └── test-vphn └── run_kselftest.sh Note multiple settings files in different sub-directories. This change also has the effect of changing the names of the tests from the point of view of the kselftest runner. Before the tests are named eg: powerpc:copy_first_unaligned powerpc:cache_shape powerpc:reg_access_test After, the test collection names include the sub-directory: powerpc/alignment:copy_first_unaligned powerpc/cache_shape:cache_shape powerpc/pmu/ebb:reg_access_test That means whereas previously all powerpc tests could be run with: $ ./run_kselftest.sh -c powerpc After the change it's necessary to pass a regex that matches all powerpc entries, eg: $ ./run_kselftest.sh -c "powerpc.*" The latter form also works before and after the change. Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422133453.1793988-2-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/Makefile | 4 ++-- tools/testing/selftests/powerpc/pmu/Makefile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 2f299fd04d2d..b175e94e1901 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -52,14 +52,14 @@ endef override define INSTALL_RULE +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install;\ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install;\ done; endef emit_tests: +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET $@;\ done; override define CLEAN diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 773933e5180e..7e9dbf3d0d09 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -44,7 +44,7 @@ emit_tests: done +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests; \ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET emit_tests; \ done; DEFAULT_INSTALL_RULE := $(INSTALL_RULE) @@ -52,7 +52,7 @@ override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install; \ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install; \ done; endef -- cgit v1.2.3 From 5d211c7090590033581175d6405ae40917ca3a06 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 26 Apr 2024 15:47:56 -0700 Subject: cxl: Fix cxl_endpoint_get_perf_coordinate() support for RCH Robert reported the following when booting a CXL host with Restricted CXL Host (RCH) topology: [ 39.815379] cxl_acpi ACPI0017:00: not a cxl_port device [ 39.827123] WARNING: CPU: 46 PID: 1754 at drivers/cxl/core/port.c:592 to_cxl_port+0x56/0x70 [cxl_core] ... plus some related subsequent NULL pointer dereference: [ 40.718708] BUG: kernel NULL pointer dereference, address: 00000000000002d8 The iterator to walk the PCIe path did not account for RCH topology. However RCH does not support hotplug and the memory exported by the Restricted CXL Device (RCD) should be covered by HMAT and therefore no access_coordinate is needed. Add check to see if the endpoint device is RCD and skip calculation. Also add a call to cxl_endpoint_get_perf_coordinates() in cxl_test in order to exercise the topology iterator. The dev_is_pci() check added is to help with this test and should be harmless for normal operation. Reported-by: Robert Richter Closes: https://lore.kernel.org/all/Ziv8GfSMSbvlBB0h@rric.localdomain/ Fixes: 592780b8391f ("cxl: Fix retrieving of access_coordinates in PCIe path") Reviewed-by: Dan Williams Tested-by: Robert Richter Reviewed-by: Robert Richter Link: https://lore.kernel.org/r/20240426224913.1027420-1-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 15 ++++++++++++++- tools/testing/cxl/test/cxl.c | 7 +++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 762783bb091a..887ed6e358fb 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2184,6 +2184,7 @@ static bool parent_port_is_cxl_root(struct cxl_port *port) int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord) { + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct access_coordinate c[] = { { .read_bandwidth = UINT_MAX, @@ -2197,12 +2198,20 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct cxl_port *iter = port; struct cxl_dport *dport; struct pci_dev *pdev; + struct device *dev; unsigned int bw; bool is_cxl_root; if (!is_cxl_endpoint(port)) return -EINVAL; + /* + * Skip calculation for RCD. Expectation is HMAT already covers RCD case + * since RCH does not support hotplug. + */ + if (cxlmd->cxlds->rcd) + return 0; + /* * Exit the loop when the parent port of the current iter port is cxl * root. The iterative loop starts at the endpoint and gathers the @@ -2232,8 +2241,12 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, return -EINVAL; cxl_coordinates_combine(c, c, dport->coord); + dev = port->uport_dev->parent; + if (!dev_is_pci(dev)) + return -ENODEV; + /* Get the calculated PCI paths bandwidth */ - pdev = to_pci_dev(port->uport_dev->parent); + pdev = to_pci_dev(dev); bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL); if (bw == 0) return -ENXIO; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 61c69297e797..3482248aa344 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1001,6 +1001,7 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; struct range pmem_range = { .start = cxlds->pmem_res.start, .end = cxlds->pmem_res.end, @@ -1020,6 +1021,12 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) dpa_perf_setup(port, &pmem_range, &mds->pmem_perf); cxl_memdev_update_perf(cxlmd); + + /* + * This function is here to only test the topology iterator. It serves + * no other purpose. + */ + cxl_endpoint_get_perf_coordinates(port, ep_c); } static struct cxl_mock_ops cxl_mock_ops = { -- cgit v1.2.3 From 0540193614eb6fadb140d87b076ee7094615f76d Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 15 Apr 2024 14:43:54 +0000 Subject: KVM: selftests: Avoid assuming "sudo" exists in NX hugepage test Writing various root-only files, omit "sudo" when already running as root to allow running the NX hugepage test on systems with a minimal rootfs, i.e. without sudo. Signed-off-by: Brendan Jackman Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240415-kvm-selftests-no-sudo-v1-1-95153ad5f470@google.com [sean: name the helper do_sudo() instead of maybe_sudo(), massage changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh index 7cbb409801ee..caad084b8bfd 100755 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh @@ -13,10 +13,21 @@ NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_reco NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms) HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) +# If we're already root, the host might not have sudo. +if [ $(whoami) == "root" ]; then + function do_sudo () { + "$@" + } +else + function do_sudo () { + sudo "$@" + } +fi + set +e function sudo_echo () { - echo "$1" | sudo tee -a "$2" > /dev/null + echo "$1" | do_sudo tee -a "$2" > /dev/null } NXECUTABLE="$(dirname $0)/nx_huge_pages_test" -- cgit v1.2.3 From 730cfa45b5f4f170095707b526dc7af99c9f0959 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 23 Apr 2024 12:03:08 -0700 Subject: KVM: selftests: Define _GNU_SOURCE for all selftests code Define _GNU_SOURCE is the base CFLAGS instead of relying on selftests to manually #define _GNU_SOURCE, which is repetitive and error prone. E.g. kselftest_harness.h requires _GNU_SOURCE for asprintf(), but if a selftest includes kvm_test_harness.h after stdio.h, the include guards result in the effective version of stdio.h consumed by kvm_test_harness.h not defining asprintf(): In file included from x86_64/fix_hypercall_test.c:12: In file included from include/kvm_test_harness.h:11: ../kselftest_harness.h:1169:2: error: call to undeclared function 'asprintf'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 1169 | asprintf(&test_name, "%s%s%s.%s", f->name, | ^ When including the rseq selftest's "library" code, #undef _GNU_SOURCE so that rseq.c controls whether or not it wants to build with _GNU_SOURCE. Reported-by: Muhammad Usama Anjum Acked-by: Claudio Imbrenda Acked-by: Oliver Upton Acked-by: Anup Patel Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240423190308.2883084-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 4 ++-- tools/testing/selftests/kvm/aarch64/arch_timer.c | 2 -- tools/testing/selftests/kvm/aarch64/page_fault_test.c | 1 - tools/testing/selftests/kvm/aarch64/psci_test.c | 3 --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 1 - tools/testing/selftests/kvm/arch_timer.c | 3 --- tools/testing/selftests/kvm/demand_paging_test.c | 3 --- tools/testing/selftests/kvm/dirty_log_test.c | 3 --- tools/testing/selftests/kvm/guest_memfd_test.c | 2 -- tools/testing/selftests/kvm/hardware_disable_test.c | 3 --- tools/testing/selftests/kvm/include/kvm_util_base.h | 12 ++++++------ tools/testing/selftests/kvm/include/userfaultfd_util.h | 3 --- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 2 -- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 -- tools/testing/selftests/kvm/kvm_page_table_test.c | 3 --- tools/testing/selftests/kvm/lib/assert.c | 3 --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 -- tools/testing/selftests/kvm/lib/memstress.c | 2 -- tools/testing/selftests/kvm/lib/test_util.c | 2 -- tools/testing/selftests/kvm/lib/userfaultfd_util.c | 3 --- tools/testing/selftests/kvm/lib/x86_64/sev.c | 1 - tools/testing/selftests/kvm/max_guest_memory_test.c | 2 -- .../testing/selftests/kvm/memslot_modification_stress_test.c | 3 --- tools/testing/selftests/kvm/riscv/arch_timer.c | 3 --- tools/testing/selftests/kvm/rseq_test.c | 12 +++++++++--- tools/testing/selftests/kvm/s390x/cmma_test.c | 2 -- tools/testing/selftests/kvm/s390x/sync_regs_test.c | 2 -- tools/testing/selftests/kvm/set_memory_region_test.c | 1 - tools/testing/selftests/kvm/steal_time.c | 1 - tools/testing/selftests/kvm/x86_64/amx_test.c | 2 -- .../selftests/kvm/x86_64/exit_on_emulation_failure_test.c | 3 --- tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c | 2 -- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 2 -- tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c | 1 - tools/testing/selftests/kvm/x86_64/hyperv_ipi.c | 2 -- tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c | 1 - tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c | 2 -- tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c | 2 -- tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c | 3 --- tools/testing/selftests/kvm/x86_64/platform_info_test.c | 2 -- tools/testing/selftests/kvm/x86_64/pmu_counters_test.c | 2 -- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 3 --- .../selftests/kvm/x86_64/private_mem_conversions_test.c | 1 - tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 1 - tools/testing/selftests/kvm/x86_64/set_sregs_test.c | 1 - .../selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c | 3 --- tools/testing/selftests/kvm/x86_64/smm_test.c | 1 - tools/testing/selftests/kvm/x86_64/state_test.c | 1 - tools/testing/selftests/kvm/x86_64/sync_regs_test.c | 2 -- tools/testing/selftests/kvm/x86_64/ucna_injection_test.c | 2 -- tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c | 2 -- tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c | 3 --- tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c | 1 - .../testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c | 1 - tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c | 2 -- tools/testing/selftests/kvm/x86_64/xapic_state_test.c | 1 - tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 2 -- 57 files changed, 17 insertions(+), 120 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 871e2de3eb05..6de9994971c9 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -226,8 +226,8 @@ LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ - -fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \ - -fno-builtin-strnlen \ + -D_GNU_SOURCE -fno-builtin-memcmp -fno-builtin-memcpy \ + -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ -I$( #include #include diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 9b004905d1d3..1c8c6f0c1ca3 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -10,9 +10,6 @@ * - A test for KVM's handling of PSCI SYSTEM_SUSPEND and the associated * KVM_SYSTEM_EVENT_SUSPEND UAPI. */ - -#define _GNU_SOURCE - #include #include "kvm_util.h" diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index eef816b80993..e93022870cac 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -4,7 +4,6 @@ * * Copyright (C) 2020, Red Hat, Inc. */ -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c index ae1f1a6d8312..fcebd8d81ce4 100644 --- a/tools/testing/selftests/kvm/arch_timer.c +++ b/tools/testing/selftests/kvm/arch_timer.c @@ -19,9 +19,6 @@ * * Copyright (c) 2021, Google LLC. */ - -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index bf3609f71854..0c4d3b6afbf8 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -6,9 +6,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019, Google, Inc. */ - -#define _GNU_SOURCE /* for pipe2 */ - #include #include #include diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index eaad5b20854c..bf1ebc29f22a 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Red Hat, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 92eae206baa6..309fe84b84ad 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -4,8 +4,6 @@ * * Author: Chao Peng */ - -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index decc521fc760..bce73bcb973c 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -4,9 +4,6 @@ * kvm_arch_hardware_disable is called and it attempts to unregister the user * return notifiers. */ - -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 8acca8237687..af02308e264e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -27,12 +27,12 @@ /* * Provide a version of static_assert() that is guaranteed to have an optional - * message param. If _ISOC11_SOURCE is defined, glibc (/usr/include/assert.h) - * #undefs and #defines static_assert() as a direct alias to _Static_assert(), - * i.e. effectively makes the message mandatory. Many KVM selftests #define - * _GNU_SOURCE for various reasons, and _GNU_SOURCE implies _ISOC11_SOURCE. As - * a result, static_assert() behavior is non-deterministic and may or may not - * require a message depending on #include order. + * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE + * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and + * #defines static_assert() as a direct alias to _Static_assert() (see + * usr/include/assert.h). Define a custom macro instead of redefining + * static_assert() to avoid creating non-deterministic behavior that is + * dependent on include order. */ #define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) #define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h index 877449c34592..a9d97c213584 100644 --- a/tools/testing/selftests/kvm/include/userfaultfd_util.h +++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h @@ -5,9 +5,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019-2022 Google LLC */ - -#define _GNU_SOURCE /* for pipe2 */ - #include #include #include diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 698c1cfa3111..f02355c3c4c2 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -6,8 +6,6 @@ * * Test the fd-based interface for KVM statistics. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index b9e23265e4b3..c78f34699f73 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -6,8 +6,6 @@ * * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index e0ba97ac1c56..7759c685086b 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -8,9 +8,6 @@ * page size have been pre-allocated on your system, if you are planning to * use hugepages to back the guest memory for testing. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index 2bd25b191d15..b49690658c60 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Google LLC. */ - -#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/ - #include "test_util.h" #include diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9da388100f3a..c4f12e272b38 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -4,8 +4,6 @@ * * Copyright (C) 2018, Google LLC. */ - -#define _GNU_SOURCE /* for program_invocation_name */ #include "test_util.h" #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index cf2c73971308..555e3932e529 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -2,8 +2,6 @@ /* * Copyright (C) 2020, Google LLC. */ -#define _GNU_SOURCE - #include #include diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 5a8f8becb129..8ed0b74ae837 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -4,8 +4,6 @@ * * Copyright (C) 2020, Google LLC. */ - -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index f4eef6eb2dc2..bd07568a5240 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -6,9 +6,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019-2022 Google LLC */ - -#define _GNU_SOURCE /* for pipe2 */ - #include #include #include diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c index d482029b6004..e9535ee20b7f 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/sev.c +++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 6628dc4dda89..9b7fc3908be6 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 156361966612..05fcf902e067 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -6,9 +6,6 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2020, Google, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index 0f9cabd99fd4..4b5004ef9c6b 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -7,9 +7,6 @@ * * Copyright (c) 2024, Intel Corporation. */ - -#define _GNU_SOURCE - #include "arch_timer.h" #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 28f97fb52044..0728b15b5d3a 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -1,5 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ + +/* + * Include rseq.c without _GNU_SOURCE defined, before including any headers, so + * that rseq.c is compiled with its configuration, not KVM selftests' config. + */ +#undef _GNU_SOURCE +#include "../rseq/rseq.c" +#define _GNU_SOURCE + #include #include #include @@ -20,8 +28,6 @@ #include "processor.h" #include "test_util.h" -#include "../rseq/rseq.c" - /* * Any bug related to task migration is likely to be timing-dependent; perform * a large number of migrations to reduce the odds of a false negative. diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c index 626a2b8a2037..84ba79c42ab1 100644 --- a/tools/testing/selftests/kvm/s390x/cmma_test.c +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -7,8 +7,6 @@ * Authors: * Nico Boehr */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 43fb25ddc3ec..53def355ccba 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -10,8 +10,6 @@ * * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 904d58793fc6..c6e438c9d851 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index bae0c5026f82..e9231387c589 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -4,7 +4,6 @@ * * Copyright (C) 2020, Red Hat, Inc. */ -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index eae521f050e0..8e5713e36d4b 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -6,8 +6,6 @@ * * Tests for amx #NM exception and save/restore. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c index 6c2e5e0ceb1f..9c21b6bccc38 100644 --- a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c +++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c @@ -4,9 +4,6 @@ * * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "flds_emulation.h" #include "test_util.h" diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c index df351ae17029..10b1b0ba374e 100644 --- a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c @@ -2,8 +2,6 @@ /* * Copyright (C) 2023, Google LLC. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "test_util.h" diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 5c27efbf405e..4f5881d4ef66 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -7,8 +7,6 @@ * This work is licensed under the terms of the GNU GPL, version 2. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c index 4c7257ecd2a6..4f3f3a9b038b 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -4,7 +4,6 @@ * * Tests for Enlightened VMCS, including nested guest state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c index f1617762c22f..8206f5ef42dd 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c @@ -5,8 +5,6 @@ * Copyright (C) 2022, Red Hat, Inc. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index c9b18707edc0..b987a3d79715 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -4,7 +4,6 @@ * * Tests for Hyper-V extensions to SVM. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c index 05b56095cf76..077cd0ec3040 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c @@ -5,8 +5,6 @@ * Copyright (C) 2022, Red Hat, Inc. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c index 3670331adf21..3eb0313ffa39 100644 --- a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c +++ b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "test_util.h" #include "kvm_util.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 17bbb96fc4df..e7efb2b35f8b 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -5,9 +5,6 @@ * * Copyright (C) 2022, Google LLC. */ - -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 87011965dc41..2165b1ad8b38 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -9,8 +9,6 @@ * Verifies expected behavior of controlling guest access to * MSR_PLATFORM_INFO. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 29609b52f8fa..842d87c8d6b6 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -2,8 +2,6 @@ /* * Copyright (C) 2023, Tencent, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "pmu.h" diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 3c85d1ae9893..5ce53b8c46e0 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -9,9 +9,6 @@ * Verifies the expected behavior of allow lists and deny lists for * virtual PMU events. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "kvm_util.h" #include "pmu.h" #include "processor.h" diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c index e0f642d2a3c4..82a8d88b5338 100644 --- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2022, Google LLC. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 366cf18600bc..d691d86e5bc3 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -4,7 +4,6 @@ * * Copyright (C) 2020, Red Hat, Inc. */ -#define _GNU_SOURCE /* for program_invocation_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 3610981d9162..c021c0795a96 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -10,7 +10,6 @@ * That bug allowed a user-mode program that called the KVM_SET_SREGS * ioctl to put a VCPU's local APIC into an invalid state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index 416207c38a17..362be40fc00d 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -5,9 +5,6 @@ * Test that KVM emulates instructions in response to EPT violations when * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ - #include "flds_emulation.h" #include "test_util.h" diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index e18b86666e1f..55c88d664a94 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -4,7 +4,6 @@ * * Tests for SMM. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 88b58aab7207..1c756db329e5 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -6,7 +6,6 @@ * * Tests for vCPU state save/restore, including nested guest state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index adb5593daf48..8fa3948b0170 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -8,8 +8,6 @@ * including requesting an invalid register set, updates to/from values * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index dcbb3c29fb8e..abe71946941f 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -17,8 +17,6 @@ * delivered into the guest or not. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index f4f61a2d2464..53afbea4df88 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -4,8 +4,6 @@ * * Tests for exiting into userspace on registered MSRs */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "kvm_test_harness.h" diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 7f6f5f23fb9b..a39cba19c058 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Red Hat, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index ea0cb3cae0f7..3b93f262b797 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -10,7 +10,6 @@ * and check it can be retrieved with KVM_GET_MSR, also test * the invalid LBR formats are rejected. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index affc32800158..00dd2ac07a61 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -9,7 +9,6 @@ * value instead of partially decayed timer value * */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 725c206ba0b9..c78e5f755116 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -19,8 +19,6 @@ * Migration is a command line option. When used on non-numa machines will * exit with error. Test is still usefull on non-numa for testing IPIs. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index ab75b873a4ad..69849acd95b0 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include #include diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 167c97abff1b..f331a4e9bae3 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -4,8 +4,6 @@ * * Tests for the IA32_XSS MSR. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include #include "test_util.h" -- cgit v1.2.3 From cb6c6914788f65efc8efa72b8a582e2aa2ccc386 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 11:54:54 -0700 Subject: KVM: selftests: Provide a global pseudo-RNG instance for all tests Add a global guest_random_state instance, i.e. a pseudo-RNG, so that an RNG is available for *all* tests. This will allow randomizing behavior in core library code, e.g. x86 will utilize the pRNG to conditionally force emulation of writes from within common guest code. To allow for deterministic runs, and to be compatible with existing tests, allow tests to override the seed used to initialize the pRNG. Note, the seed *must* be overwritten before a VM is created in order for the seed to take effect, though it's perfectly fine for a test to initialize multiple VMs with different seeds. And as evidenced by memstress_guest_code(), it's also a-ok to instantiate more RNGs using the global seed (or a modified version of it). The goal of the global RNG is purely to ensure that _a_ source of random numbers is available, it doesn't have to be the _only_ RNG. Link: https://lore.kernel.org/r/20240314185459.2439072-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 9 ++++----- tools/testing/selftests/kvm/dirty_log_test.c | 17 +---------------- tools/testing/selftests/kvm/include/memstress.h | 1 - tools/testing/selftests/kvm/include/test_util.h | 8 ++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 9 +++++++++ tools/testing/selftests/kvm/lib/memstress.c | 8 +------- 6 files changed, 23 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 504f6fe980e8..5cda9780c378 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -132,7 +132,6 @@ struct test_params { enum vm_mem_backing_src_type backing_src; int slots; uint32_t write_percent; - uint32_t random_seed; bool random_access; }; @@ -156,8 +155,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) p->slots, p->backing_src, p->partition_vcpu_memory_access); - pr_info("Random seed: %u\n", p->random_seed); - memstress_set_random_seed(vm, p->random_seed); memstress_set_write_percent(vm, p->write_percent); guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; @@ -346,11 +343,13 @@ int main(int argc, char *argv[]) .partition_vcpu_memory_access = true, .backing_src = DEFAULT_VM_MEM_SRC, .slots = 1, - .random_seed = 1, .write_percent = 100, }; int opt; + /* Override the seed to be deterministic by default. */ + guest_random_seed = 1; + dirty_log_manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | @@ -395,7 +394,7 @@ int main(int argc, char *argv[]) p.phys_offset = strtoull(optarg, NULL, 0); break; case 'r': - p.random_seed = atoi_positive("Random seed", optarg); + guest_random_seed = atoi_positive("Random seed", optarg); break; case 's': p.backing_src = parse_backing_src_type(optarg); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index bf1ebc29f22a..aaf6d61c08db 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -73,7 +73,6 @@ static uint64_t host_page_size; static uint64_t guest_page_size; static uint64_t guest_num_pages; -static uint64_t random_array[TEST_PAGES_PER_LOOP]; static uint64_t iteration; /* @@ -112,13 +111,12 @@ static void guest_code(void) while (true) { for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { addr = guest_test_virt_mem; - addr += (READ_ONCE(random_array[i]) % guest_num_pages) + addr += (guest_random_u64(&guest_rng) % guest_num_pages) * guest_page_size; addr = align_down(addr, host_page_size); *(uint64_t *)addr = READ_ONCE(iteration); } - /* Tell the host that we need more random numbers */ GUEST_SYNC(1); } } @@ -505,20 +503,10 @@ static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) mode->after_vcpu_run(vcpu, ret, err); } -static void generate_random_array(uint64_t *guest_array, uint64_t size) -{ - uint64_t i; - - for (i = 0; i < size; i++) - guest_array[i] = random(); -} - static void *vcpu_worker(void *data) { int ret; struct kvm_vcpu *vcpu = data; - struct kvm_vm *vm = vcpu->vm; - uint64_t *guest_array; uint64_t pages_count = 0; struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset) + sizeof(sigset_t)); @@ -537,11 +525,8 @@ static void *vcpu_worker(void *data) sigemptyset(sigset); sigaddset(sigset, SIG_IPI); - guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array); - while (!READ_ONCE(host_quit)) { /* Clear any existing kick signals */ - generate_random_array(guest_array, TEST_PAGES_PER_LOOP); pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the random pages */ ret = __vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h index ce4e603050ea..9071eb6dea60 100644 --- a/tools/testing/selftests/kvm/include/memstress.h +++ b/tools/testing/selftests/kvm/include/memstress.h @@ -62,7 +62,6 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, void memstress_destroy_vm(struct kvm_vm *vm); void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); -void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); void memstress_set_random_access(struct kvm_vm *vm, bool random_access); void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *)); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 8a6e30612c86..4b78fb7e539e 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -91,9 +91,17 @@ struct guest_random_state { uint32_t seed; }; +extern uint32_t guest_random_seed; +extern struct guest_random_state guest_rng; + struct guest_random_state new_guest_random_state(uint32_t seed); uint32_t guest_random_u32(struct guest_random_state *state); +static inline uint64_t guest_random_u64(struct guest_random_state *state) +{ + return ((uint64_t)guest_random_u32(state) << 32) | guest_random_u32(state); +} + enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index c4f12e272b38..d64276739513 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -18,6 +18,9 @@ #define KVM_UTIL_MIN_PFN 2 +uint32_t guest_random_seed; +struct guest_random_state guest_rng; + static int vcpu_mmap_sz(void); int open_path_or_exit(const char *path, int flags) @@ -430,6 +433,10 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, slot0 = memslot2region(vm, 0); ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + pr_info("Random seed: 0x%x\n", guest_random_seed); + guest_rng = new_guest_random_state(guest_random_seed); + sync_global_to_guest(vm, guest_rng); + kvm_arch_vm_post_create(vm); return vm; @@ -2303,6 +2310,8 @@ void __attribute((constructor)) kvm_selftest_init(void) /* Tell stdout not to buffer its content. */ setbuf(stdout, NULL); + guest_random_seed = random(); + kvm_selftest_arch_init(); } diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index 555e3932e529..3b7fe1778d7b 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -54,7 +54,7 @@ void memstress_guest_code(uint32_t vcpu_idx) uint64_t page; int i; - rand_state = new_guest_random_state(args->random_seed + vcpu_idx); + rand_state = new_guest_random_state(guest_random_seed + vcpu_idx); gva = vcpu_args->gva; pages = vcpu_args->pages; @@ -241,12 +241,6 @@ void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) sync_global_to_guest(vm, memstress_args.write_percent); } -void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) -{ - memstress_args.random_seed = random_seed; - sync_global_to_guest(vm, memstress_args.random_seed); -} - void memstress_set_random_access(struct kvm_vm *vm, bool random_access) { memstress_args.random_access = random_access; -- cgit v1.2.3 From 73369acd9fbdf6cbf3029cace886abcc626f46ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 11:54:55 -0700 Subject: KVM: selftests: Provide an API for getting a random bool from an RNG Move memstress' random bool logic into common code to avoid reinventing the wheel for basic yes/no decisions. Provide an outer wrapper to handle the basic/common case of just wanting a 50/50 chance of something happening. Link: https://lore.kernel.org/r/20240314185459.2439072-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/test_util.h | 11 +++++++++++ tools/testing/selftests/kvm/lib/memstress.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 4b78fb7e539e..3e473058849f 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -97,6 +97,17 @@ extern struct guest_random_state guest_rng; struct guest_random_state new_guest_random_state(uint32_t seed); uint32_t guest_random_u32(struct guest_random_state *state); +static inline bool __guest_random_bool(struct guest_random_state *state, + uint8_t percent) +{ + return (guest_random_u32(state) % 100) < percent; +} + +static inline bool guest_random_bool(struct guest_random_state *state) +{ + return __guest_random_bool(state, 50); +} + static inline uint64_t guest_random_u64(struct guest_random_state *state) { return ((uint64_t)guest_random_u32(state) << 32) | guest_random_u32(state); diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index 3b7fe1778d7b..aa0a01988150 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -74,7 +74,7 @@ void memstress_guest_code(uint32_t vcpu_idx) addr = gva + (page * args->guest_page_size); - if (guest_random_u32(&rand_state) % 100 < args->write_percent) + if (__guest_random_bool(&rand_state, args->write_percent)) *(uint64_t *)addr = 0x0123456789ABCDEF; else READ_ONCE(*(uint64_t *)addr); -- cgit v1.2.3 From e1ff11525d3c52159a8f262c209e5b9a9ef84918 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 11:54:56 -0700 Subject: KVM: selftests: Add global snapshot of kvm_is_forced_emulation_enabled() Add a global snapshot of kvm_is_forced_emulation_enabled() and sync it to all VMs by default so that core library code can force emulation, e.g. to allow for easier testing of the intersections between emulation and other features in KVM. Link: https://lore.kernel.org/r/20240314185459.2439072-4-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h | 2 ++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 3 +++ tools/testing/selftests/kvm/x86_64/pmu_counters_test.c | 3 --- tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c | 10 ++-------- 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h index 9f1725192aa2..41aba476640a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h @@ -5,6 +5,8 @@ #include #include +extern bool is_forced_emulation_enabled; + struct kvm_vm_arch { uint64_t c_bit; uint64_t s_bit; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 9f87ca8b7ab6..90aeacf114bf 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -23,6 +23,7 @@ vm_vaddr_t exception_handlers; bool host_cpu_is_amd; bool host_cpu_is_intel; +bool is_forced_emulation_enabled; static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { @@ -577,6 +578,7 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm) vm_create_irqchip(vm); sync_global_to_guest(vm, host_cpu_is_intel); sync_global_to_guest(vm, host_cpu_is_amd); + sync_global_to_guest(vm, is_forced_emulation_enabled); if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { struct kvm_sev_init init = { 0 }; @@ -1348,6 +1350,7 @@ void kvm_selftest_arch_init(void) { host_cpu_is_intel = this_cpu_is_intel(); host_cpu_is_amd = this_cpu_is_amd(); + is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); } bool sys_clocksource_is_based_on_tsc(void) diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 842d87c8d6b6..bad2ab1b9810 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -19,7 +19,6 @@ static uint8_t kvm_pmu_version; static bool kvm_has_perf_caps; -static bool is_forced_emulation_enabled; static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, void *guest_code, @@ -33,7 +32,6 @@ static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, vcpu_init_descriptor_tables(*vcpu); sync_global_to_guest(vm, kvm_pmu_version); - sync_global_to_guest(vm, is_forced_emulation_enabled); /* * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling @@ -610,7 +608,6 @@ int main(int argc, char *argv[]) kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM); - is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); test_intel_counters(); diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index 53afbea4df88..af9981a6642f 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -11,8 +11,6 @@ #include "kvm_util.h" #include "vmx.h" -static bool fep_available; - #define MSR_NON_EXISTENT 0x474f4f00 static u64 deny_bits = 0; @@ -256,7 +254,7 @@ static void guest_code_filter_allow(void) GUEST_ASSERT(data == 2); GUEST_ASSERT(guest_exception_count == 0); - if (fep_available) { + if (is_forced_emulation_enabled) { /* Let userspace know we aren't done. */ GUEST_SYNC(0); @@ -518,8 +516,6 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) uint64_t cmd; int rc; - sync_global_to_guest(vm, fep_available); - rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); @@ -549,7 +545,7 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) vcpu_run(vcpu); cmd = process_ucall(vcpu); - if (fep_available) { + if (is_forced_emulation_enabled) { TEST_ASSERT_EQ(cmd, UCALL_SYNC); vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); @@ -772,7 +768,5 @@ KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL) int main(int argc, char *argv[]) { - fep_available = kvm_is_forced_emulation_enabled(); - return test_harness_run(argc, argv); } -- cgit v1.2.3 From 2f2bc6af6aa8cc07f84291d625f7113fd13d68e5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 11:54:57 -0700 Subject: KVM: selftests: Add vcpu_arch_put_guest() to do writes from guest code Introduce a macro, vcpu_arch_put_guest(), for "putting" values to memory from guest code in "interesting" situations, e.g. when writing memory that is being dirty logged. Structure the macro so that arch code can provide a custom implementation, e.g. x86 will use the macro to force emulation of the access. Use the helper in dirty_log_test, which is of particular interest (see above), and in xen_shinfo_test, which isn't all that interesting, but provides a second usage of the macro with a different size operand (uint8_t versus uint64_t), i.e. to help verify that the macro works for more than just 64-bit values. Use "put" as the verb to align with the kernel's {get,put}_user() terminology. Link: https://lore.kernel.org/r/20240314185459.2439072-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/dirty_log_test.c | 5 +++-- tools/testing/selftests/kvm/include/kvm_util_base.h | 3 +++ tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 5 +++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index aaf6d61c08db..a83cbc7b7f0d 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -105,7 +105,7 @@ static void guest_code(void) */ for (i = 0; i < guest_num_pages; i++) { addr = guest_test_virt_mem + i * guest_page_size; - *(uint64_t *)addr = READ_ONCE(iteration); + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } while (true) { @@ -114,7 +114,8 @@ static void guest_code(void) addr += (guest_random_u64(&guest_rng) % guest_num_pages) * guest_page_size; addr = align_down(addr, host_page_size); - *(uint64_t *)addr = READ_ONCE(iteration); + + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } GUEST_SYNC(1); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index af02308e264e..e850269a3219 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -609,6 +609,9 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); +#ifndef vcpu_arch_put_guest +#define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0) +#endif static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) { diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index d2ea0435f4f7..1ba06551526b 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -171,8 +171,9 @@ static volatile bool guest_saw_irq; static void evtchn_handler(struct ex_regs *regs) { struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; - vi->evtchn_upcall_pending = 0; - vi->evtchn_pending_sel = 0; + + vcpu_arch_put_guest(vi->evtchn_upcall_pending, 0); + vcpu_arch_put_guest(vi->evtchn_pending_sel, 0); guest_saw_irq = true; GUEST_SYNC(TEST_GUEST_SAW_IRQ); -- cgit v1.2.3 From 87aa264cd89d068f2455fc6e240d4015f6234204 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 11:54:58 -0700 Subject: KVM: selftests: Randomly force emulation on x86 writes from guest code Override vcpu_arch_put_guest() to randomly force emulation on supported accesses. Force emulation of LOCK CMPXCHG as well as a regular MOV to stress KVM's emulation of atomic accesses, which has a unique path in KVM's emulator. Arbitrarily give all the decisions 50/50 odds; absent much, much more sophisticated infrastructure for generating random numbers, it's highly unlikely that doing more than a coin flip with affect selftests' ability to find KVM bugs. This is effectively a regression test for commit 910c57dfa4d1 ("KVM: x86: Mark target gfn of emulated atomic instruction as dirty"). Link: https://lore.kernel.org/r/20240314185459.2439072-6-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/kvm_util_arch.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h index 41aba476640a..d0b587c38e07 100644 --- a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h @@ -5,6 +5,8 @@ #include #include +#include "test_util.h" + extern bool is_forced_emulation_enabled; struct kvm_vm_arch { @@ -22,4 +24,23 @@ static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) #define vm_arch_has_protected_memory(vm) \ __vm_arch_has_protected_memory(&(vm)->arch) +#define vcpu_arch_put_guest(mem, __val) \ +do { \ + const typeof(mem) val = (__val); \ + \ + if (!is_forced_emulation_enabled || guest_random_bool(&guest_rng)) { \ + (mem) = val; \ + } else if (guest_random_bool(&guest_rng)) { \ + __asm__ __volatile__(KVM_FEP "mov %1, %0" \ + : "+m" (mem) \ + : "r" (val) : "memory"); \ + } else { \ + uint64_t __old = READ_ONCE(mem); \ + \ + __asm__ __volatile__(KVM_FEP LOCK_PREFIX "cmpxchg %[new], %[ptr]" \ + : [ptr] "+m" (mem), [old] "+a" (__old) \ + : [new]"r" (val) : "memory", "cc"); \ + } \ +} while (0) + #endif // SELFTEST_KVM_UTIL_ARCH_H -- cgit v1.2.3 From 2b7deea3ec7c81a92d4c17751d3bcd780d065ae4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:20 -0700 Subject: Revert "kvm: selftests: move base kvm_util.h declarations to kvm_util_base.h" Effectively revert the movement of code from kvm_util.h => kvm_util_base.h, as the TL;DR of the justification for the move was to avoid #idefs and/or circular dependencies between what ended up being ucall_common.h and what was (and now again, is), kvm_util.h. But avoiding #ifdef and circular includes is trivial: don't do that. The cost of removing kvm_util_base.h is a few extra includes of ucall_common.h, but that cost is practically nothing. On the other hand, having a "base" version of a header that is really just the header itself is confusing, and makes it weird/hard to choose names for headers that actually are "base" headers, e.g. to hold core KVM selftests typedefs. For all intents and purposes, this reverts commit 7d9a662ed9f0403e7b94940dceb81552b8edb931. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 1 + tools/testing/selftests/kvm/arch_timer.c | 1 + tools/testing/selftests/kvm/demand_paging_test.c | 1 + tools/testing/selftests/kvm/dirty_log_perf_test.c | 1 + tools/testing/selftests/kvm/dirty_log_test.c | 1 + tools/testing/selftests/kvm/guest_memfd_test.c | 2 +- tools/testing/selftests/kvm/guest_print_test.c | 1 + .../selftests/kvm/include/aarch64/processor.h | 2 + .../testing/selftests/kvm/include/aarch64/ucall.h | 2 +- tools/testing/selftests/kvm/include/kvm_util.h | 1128 +++++++++++++++++++- tools/testing/selftests/kvm/include/s390x/ucall.h | 2 +- .../selftests/kvm/include/x86_64/processor.h | 3 +- tools/testing/selftests/kvm/include/x86_64/ucall.h | 2 +- tools/testing/selftests/kvm/kvm_page_table_test.c | 1 + .../testing/selftests/kvm/lib/aarch64/processor.c | 2 + tools/testing/selftests/kvm/lib/kvm_util.c | 1 + tools/testing/selftests/kvm/lib/memstress.c | 1 + tools/testing/selftests/kvm/lib/riscv/processor.c | 1 + tools/testing/selftests/kvm/lib/ucall_common.c | 5 +- tools/testing/selftests/kvm/riscv/arch_timer.c | 1 + tools/testing/selftests/kvm/rseq_test.c | 1 + tools/testing/selftests/kvm/s390x/cmma_test.c | 1 + tools/testing/selftests/kvm/s390x/memop.c | 1 + tools/testing/selftests/kvm/s390x/tprot.c | 1 + tools/testing/selftests/kvm/steal_time.c | 1 + .../kvm/x86_64/dirty_log_page_splitting_test.c | 1 + .../kvm/x86_64/exit_on_emulation_failure_test.c | 2 +- .../selftests/kvm/x86_64/ucna_injection_test.c | 1 - 28 files changed, 1156 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 5369959e9fc2..02967318caf4 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -10,6 +10,7 @@ #include "gic.h" #include "processor.h" #include "timer_test.h" +#include "ucall_common.h" #include "vgic.h" #define GICD_BASE_GPA 0x8000000ULL diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c index fcebd8d81ce4..acb2cb596332 100644 --- a/tools/testing/selftests/kvm/arch_timer.c +++ b/tools/testing/selftests/kvm/arch_timer.c @@ -26,6 +26,7 @@ #include #include "timer_test.h" +#include "ucall_common.h" struct test_args test_args = { .nr_vcpus = NR_VCPUS_DEF, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 0c4d3b6afbf8..e3e56f107f0b 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -19,6 +19,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #include "userfaultfd_util.h" #ifdef __NR_userfaultfd diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 5cda9780c378..89fdb461aff8 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -18,6 +18,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #ifdef __aarch64__ #include "aarch64/vgic.h" diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index a83cbc7b7f0d..aacf80f57439 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -20,6 +20,7 @@ #include "test_util.h" #include "guest_modes.h" #include "processor.h" +#include "ucall_common.h" #define DIRTY_MEM_BITS 30 /* 1G */ #define PAGE_SHIFT_4K 12 diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 309fe84b84ad..ba0c8e996035 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -17,8 +17,8 @@ #include #include +#include "kvm_util.h" #include "test_util.h" -#include "kvm_util_base.h" static void test_file_read_write(int fd) { diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c index 3502caa3590c..8092c2d0f5d6 100644 --- a/tools/testing/selftests/kvm/guest_print_test.c +++ b/tools/testing/selftests/kvm/guest_print_test.c @@ -13,6 +13,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" struct guest_vals { uint64_t a; diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 9e518b562827..1814af7d8567 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -8,6 +8,8 @@ #define SELFTEST_KVM_PROCESSOR_H #include "kvm_util.h" +#include "ucall_common.h" + #include #include #include diff --git a/tools/testing/selftests/kvm/include/aarch64/ucall.h b/tools/testing/selftests/kvm/include/aarch64/ucall.h index 4b68f37efd36..4ec801f37f00 100644 --- a/tools/testing/selftests/kvm/include/aarch64/ucall.h +++ b/tools/testing/selftests/kvm/include/aarch64/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_MMIO diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index c9286811a4cb..c51babca552a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1,13 +1,1133 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * tools/testing/selftests/kvm/include/kvm_util.h - * * Copyright (C) 2018, Google LLC. */ #ifndef SELFTEST_KVM_UTIL_H #define SELFTEST_KVM_UTIL_H -#include "kvm_util_base.h" -#include "ucall_common.h" +#include "test_util.h" + +#include +#include "linux/hashtable.h" +#include "linux/list.h" +#include +#include +#include "linux/rbtree.h" +#include + +#include +#include + +#include + +#include "kvm_util_arch.h" +#include "sparsebit.h" + +/* + * Provide a version of static_assert() that is guaranteed to have an optional + * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE + * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and + * #defines static_assert() as a direct alias to _Static_assert() (see + * usr/include/assert.h). Define a custom macro instead of redefining + * static_assert() to avoid creating non-deterministic behavior that is + * dependent on include order. + */ +#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) +#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) + +#define KVM_DEV_PATH "/dev/kvm" +#define KVM_MAX_VCPUS 512 + +#define NSEC_PER_SEC 1000000000L + +typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ +typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ + +struct userspace_mem_region { + struct kvm_userspace_memory_region2 region; + struct sparsebit *unused_phy_pages; + struct sparsebit *protected_phy_pages; + int fd; + off_t offset; + enum vm_mem_backing_src_type backing_src_type; + void *host_mem; + void *host_alias; + void *mmap_start; + void *mmap_alias; + size_t mmap_size; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; +}; + +struct kvm_vcpu { + struct list_head list; + uint32_t id; + int fd; + struct kvm_vm *vm; + struct kvm_run *run; +#ifdef __x86_64__ + struct kvm_cpuid2 *cpuid; +#endif + struct kvm_dirty_gfn *dirty_gfns; + uint32_t fetch_index; + uint32_t dirty_gfns_count; +}; + +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + +enum kvm_mem_region_type { + MEM_REGION_CODE, + MEM_REGION_DATA, + MEM_REGION_PT, + MEM_REGION_TEST_DATA, + NR_MEM_REGIONS, +}; + +struct kvm_vm { + int mode; + unsigned long type; + int kvm_fd; + int fd; + unsigned int pgtable_levels; + unsigned int page_size; + unsigned int page_shift; + unsigned int pa_bits; + unsigned int va_bits; + uint64_t max_gfn; + struct list_head vcpus; + struct userspace_mem_regions regions; + struct sparsebit *vpages_valid; + struct sparsebit *vpages_mapped; + bool has_irqchip; + bool pgd_created; + vm_paddr_t ucall_mmio_addr; + vm_paddr_t pgd; + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + vm_vaddr_t handlers; + uint32_t dirty_ring_size; + uint64_t gpa_tag_mask; + + struct kvm_vm_arch arch; + + /* Cache of information for binary stats interface */ + int stats_fd; + struct kvm_stats_header stats_header; + struct kvm_stats_desc *stats_desc; + + /* + * KVM region slots. These are the default memslots used by page + * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] + * memslot. + */ + uint32_t memslots[NR_MEM_REGIONS]; +}; + +struct vcpu_reg_sublist { + const char *name; + long capability; + int feature; + int feature_type; + bool finalize; + __u64 *regs; + __u64 regs_n; + __u64 *rejects_set; + __u64 rejects_set_n; + __u64 *skips_set; + __u64 skips_set_n; +}; + +struct vcpu_reg_list { + char *name; + struct vcpu_reg_sublist sublists[]; +}; + +#define for_each_sublist(c, s) \ + for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) + +#define kvm_for_each_vcpu(vm, i, vcpu) \ + for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ + if (!((vcpu) = vm->vcpus[i])) \ + continue; \ + else + +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); + +static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, + enum kvm_mem_region_type type) +{ + assert(type < NR_MEM_REGIONS); + return memslot2region(vm, vm->memslots[type]); +} + +/* Minimum allocated guest virtual and physical addresses */ +#define KVM_UTIL_MIN_VADDR 0x2000 +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 +#define DEFAULT_STACK_PGS 5 + +enum vm_guest_mode { + VM_MODE_P52V48_4K, + VM_MODE_P52V48_16K, + VM_MODE_P52V48_64K, + VM_MODE_P48V48_4K, + VM_MODE_P48V48_16K, + VM_MODE_P48V48_64K, + VM_MODE_P40V48_4K, + VM_MODE_P40V48_16K, + VM_MODE_P40V48_64K, + VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, + VM_MODE_P44V64_4K, + VM_MODE_P36V48_4K, + VM_MODE_P36V48_16K, + VM_MODE_P36V48_64K, + VM_MODE_P36V47_16K, + NUM_VM_MODES, +}; + +struct vm_shape { + uint32_t type; + uint8_t mode; + uint8_t pad0; + uint16_t pad1; +}; + +kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); + +#define VM_TYPE_DEFAULT 0 + +#define VM_SHAPE(__mode) \ +({ \ + struct vm_shape shape = { \ + .mode = (__mode), \ + .type = VM_TYPE_DEFAULT \ + }; \ + \ + shape; \ +}) + +#if defined(__aarch64__) + +extern enum vm_guest_mode vm_mode_default; + +#define VM_MODE_DEFAULT vm_mode_default +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#elif defined(__x86_64__) + +#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#elif defined(__s390x__) + +#define VM_MODE_DEFAULT VM_MODE_P44V64_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 16) + +#elif defined(__riscv) + +#if __riscv_xlen == 32 +#error "RISC-V 32-bit kvm selftests not supported" +#endif + +#define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#endif + +#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) + +#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) +#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) + +struct vm_guest_mode_params { + unsigned int pa_bits; + unsigned int va_bits; + unsigned int page_size; + unsigned int page_shift; +}; +extern const struct vm_guest_mode_params vm_guest_mode_params[]; + +int open_path_or_exit(const char *path, int flags); +int open_kvm_dev_path_or_exit(void); + +bool get_kvm_param_bool(const char *param); +bool get_kvm_intel_param_bool(const char *param); +bool get_kvm_amd_param_bool(const char *param); + +int get_kvm_param_integer(const char *param); +int get_kvm_intel_param_integer(const char *param); +int get_kvm_amd_param_integer(const char *param); + +unsigned int kvm_check_cap(long cap); + +static inline bool kvm_has_cap(long cap) +{ + return kvm_check_cap(cap); +} + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* + * Use the "inner", double-underscore macro when reporting errors from within + * other macros so that the name of ioctl() and not its literal numeric value + * is printed on error. The "outer" macro is strongly preferred when reporting + * errors "directly", i.e. without an additional layer of macros, as it reduces + * the probability of passing in the wrong string. + */ +#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) +#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) + +#define kvm_do_ioctl(fd, cmd, arg) \ +({ \ + kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ + ioctl(fd, cmd, arg); \ +}) + +#define __kvm_ioctl(kvm_fd, cmd, arg) \ + kvm_do_ioctl(kvm_fd, cmd, arg) + +#define kvm_ioctl(kvm_fd, cmd, arg) \ +({ \ + int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ +}) + +static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } + +#define __vm_ioctl(vm, cmd, arg) \ +({ \ + static_assert_is_vm(vm); \ + kvm_do_ioctl((vm)->fd, cmd, arg); \ +}) + +/* + * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if + * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, + * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before + * selftests existed and (b) should never outright fail, i.e. is supposed to + * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the + * VM and its vCPUs, including KVM_CHECK_EXTENSION. + */ +#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ +do { \ + int __errno = errno; \ + \ + static_assert_is_vm(vm); \ + \ + if (cond) \ + break; \ + \ + if (errno == EIO && \ + __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ + TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ + TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ + } \ + errno = __errno; \ + TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ +} while (0) + +#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ + __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) + +#define vm_ioctl(vm, cmd, arg) \ +({ \ + int ret = __vm_ioctl(vm, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ +}) + +static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } + +#define __vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + static_assert_is_vcpu(vcpu); \ + kvm_do_ioctl((vcpu)->fd, cmd, arg); \ +}) + +#define vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + int ret = __vcpu_ioctl(vcpu, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ +}) + +/* + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. + */ +static inline int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); + + TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); + return ret; +} + +static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} +static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, + uint64_t size, uint64_t attributes) +{ + struct kvm_memory_attributes attr = { + .attributes = attributes, + .address = gpa, + .size = size, + .flags = 0, + }; + + /* + * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows + * need significant enhancements to support multiple attributes. + */ + TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, + "Update me to support multiple attributes!"); + + vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); +} + + +static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); +} + +static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, 0); +} + +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, + bool punch_hole); + +static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, true); +} + +static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, false); +} + +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); +const char *vm_guest_mode_string(uint32_t i); + +void kvm_vm_free(struct kvm_vm *vmp); +void kvm_vm_restart(struct kvm_vm *vmp); +void kvm_vm_release(struct kvm_vm *vmp); +int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, + size_t len); +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); +int kvm_memfd_alloc(size_t size, bool hugepages); + +void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + + vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); +} + +static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages) +{ + struct kvm_clear_dirty_log args = { + .dirty_bitmap = log, + .slot = slot, + .first_page = first_page, + .num_pages = num_pages + }; + + vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); +} + +static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) +{ + return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); +} + +static inline int vm_get_stats_fd(struct kvm_vm *vm) +{ + int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); + return fd; +} + +static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) +{ + ssize_t ret; + + ret = pread(stats_fd, header, sizeof(*header), 0); + TEST_ASSERT(ret == sizeof(*header), + "Failed to read '%lu' header bytes, ret = '%ld'", + sizeof(*header), ret); +} + +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header); + +static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) +{ + /* + * The base size of the descriptor is defined by KVM's ABI, but the + * size of the name field is variable, as far as KVM's ABI is + * concerned. For a given instance of KVM, the name field is the same + * size for all stats and is provided in the overall stats header. + */ + return sizeof(struct kvm_stats_desc) + header->name_size; +} + +static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, + int index, + struct kvm_stats_header *header) +{ + /* + * Note, size_desc includes the size of the name field, which is + * variable. i.e. this is NOT equivalent to &stats_desc[i]. + */ + return (void *)stats + index * get_stats_descriptor_size(header); +} + +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements); + +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements); + +static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) +{ + uint64_t data; + + __vm_get_stat(vm, stat_name, &data, 1); + return data; +} + +void vm_create_irqchip(struct kvm_vm *vm); + +static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + struct kvm_create_guest_memfd guest_memfd = { + .size = size, + .flags = flags, + }; + + return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); +} + +static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + int fd = __vm_create_guest_memfd(vm, size, flags); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); + return fd; +} + +void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); + +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags); +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); + +#ifndef vm_arch_has_protected_memory +static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) +{ + return false; +} +#endif + +void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vm_populate_vaddr_bitmap(struct kvm_vm *vm); +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); +vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); + +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + unsigned int npages); +void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); +void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); +vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); + +#ifndef vcpu_arch_put_guest +#define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0) +#endif + +static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) +{ + return gpa & ~vm->gpa_tag_mask; +} + +void vcpu_run(struct kvm_vcpu *vcpu); +int _vcpu_run(struct kvm_vcpu *vcpu); + +static inline int __vcpu_run(struct kvm_vcpu *vcpu) +{ + return __vcpu_ioctl(vcpu, KVM_RUN, NULL); +} + +void vcpu_run_complete_io(struct kvm_vcpu *vcpu); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); + +static inline void vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, + uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *debug) +{ + vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); +} + +static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); +} +static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); +} + +static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_GET_REGS, regs); +} + +static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_SET_REGS, regs); +} +static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); + +} +static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); +} +static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); +} + +static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); +} +static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); +} +static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); +} +static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); +} + +#ifdef __KVM_HAVE_VCPU_EVENTS +static inline void vcpu_events_get(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); +} +static inline void vcpu_events_set(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); +} +#endif +#ifdef __x86_64__ +static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); +} +static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} + +static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} +#endif +static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) +{ + int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); + return fd; +} + +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); + +static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + int ret = __kvm_has_device_attr(dev_fd, group, attr); + + TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); +} + +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_get(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_get(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); +} + +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_set(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_set(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); +} + +static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + return __kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); +int __kvm_create_device(struct kvm_vm *vm, uint64_t type); + +static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) +{ + int fd = __kvm_create_device(vm, type); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); + return fd; +} + +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); + +/* + * VM VCPU Args Set + * + * Input Args: + * vm - Virtual Machine + * num - number of arguments + * ... - arguments, each of type uint64_t + * + * Output Args: None + * + * Return: None + * + * Sets the first @num input parameters for the function at @vcpu's entry point, + * per the C calling convention of the architecture, to the values given as + * variable args. Each of the variable args is expected to be of type uint64_t. + * The maximum @num can be is specific to the architecture. + */ +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); + +void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); +int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); + +#define KVM_MAX_IRQ_ROUTES 4096 + +struct kvm_irq_routing *kvm_gsi_routing_create(void); +void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, + uint32_t gsi, uint32_t pin); +int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); +void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); + +const char *exit_reason_str(unsigned int exit_reason); + +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, + uint32_t memslot); +vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot, + bool protected); +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); + +static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot) +{ + /* + * By default, allocate memory as protected for VMs that support + * protected memory, as the majority of memory for such VMs is + * protected, i.e. using shared memory is effectively opt-in. + */ + return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, + vm_arch_has_protected_memory(vm)); +} + +/* + * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also + * loads the test binary into guest memory and creates an IRQ chip (x86 only). + * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to + * calculate the amount of memory needed for per-vCPU data, e.g. stacks. + */ +struct kvm_vm *____vm_create(struct vm_shape shape); +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages); + +static inline struct kvm_vm *vm_create_barebones(void) +{ + return ____vm_create(VM_SHAPE_DEFAULT); +} + +static inline struct kvm_vm *vm_create_barebones_type(unsigned long type) +{ + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = type, + }; + + return ____vm_create(shape); +} + +static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) +{ + return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); +} + +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, + uint64_t extra_mem_pages, + void *guest_code, struct kvm_vcpu *vcpus[]); + +static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, + void *guest_code, + struct kvm_vcpu *vcpus[]) +{ + return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, + guest_code, vcpus); +} + + +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code); + +/* + * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages + * additional pages of guest memory. Returns the VM and vCPU (via out param). + */ +static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, + extra_mem_pages, guest_code); +} + +static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_with_one_vcpu(vcpu, 0, guest_code); +} + +static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); +} + +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); + +void kvm_pin_this_task_to_pcpu(uint32_t pcpu); +void kvm_print_vcpu_pinning_help(void); +void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + int nr_vcpus); + +unsigned long vm_compute_max_gfn(struct kvm_vm *vm); +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); +unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); +unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); +static inline unsigned int +vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + unsigned int n; + n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); +#ifdef __s390x__ + /* s390 requires 1M aligned guest sizes */ + n = (n + 255) & ~255; +#endif + return n; +} + +#define sync_global_to_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(_p, &(g), sizeof(g)); \ +}) + +#define sync_global_from_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(&(g), _p, sizeof(g)); \ +}) + +/* + * Write a global value, but only in the VM's (guest's) domain. Primarily used + * for "globals" that hold per-VM values (VMs always duplicate code and global + * data into their own region of physical memory), but can be used anytime it's + * undesirable to change the host's copy of the global. + */ +#define write_guest_global(vm, g, val) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + typeof(g) _val = val; \ + \ + memcpy(_p, &(_val), sizeof(g)); \ +}) + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent); + +static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent) +{ + vcpu_arch_dump(stream, vcpu, indent); +} + +/* + * Adds a vCPU with reasonable defaults (e.g. a stack) + * + * Input Args: + * vm - Virtual Machine + * vcpu_id - The id of the VCPU to add to the VM. + */ +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); + +static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) +{ + struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); + + vcpu_arch_set_entry_point(vcpu, guest_code); + + return vcpu; +} + +/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */ +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); + +static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) +{ + return vm_arch_vcpu_recreate(vm, vcpu_id); +} + +void vcpu_arch_free(struct kvm_vcpu *vcpu); + +void virt_arch_pgd_alloc(struct kvm_vm *vm); + +static inline void virt_pgd_alloc(struct kvm_vm *vm) +{ + virt_arch_pgd_alloc(vm); +} + +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within @vm, creates a virtual translation for the page starting + * at @vaddr to the page starting at @paddr. + */ +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); + +static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + virt_arch_pg_map(vm, vaddr, paddr); +} + + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Returns the VM physical address of the translated VM virtual + * address given by @gva. + */ +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); + +static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return addr_arch_gva2gpa(vm, gva); +} + +/* + * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. + */ +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + virt_arch_dump(stream, vm, indent); +} + + +static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) +{ + return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); +} + +/* + * Arch hook that is invoked via a constructor, i.e. before exeucting main(), + * to allow for arch-specific setup that is common to all tests, e.g. computing + * the default guest "mode". + */ +void kvm_selftest_arch_init(void); + +void kvm_arch_vm_post_create(struct kvm_vm *vm); + +bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); + +uint32_t guest_get_vcpuid(void); #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/s390x/ucall.h b/tools/testing/selftests/kvm/include/s390x/ucall.h index b231bf2e49d6..8035a872a351 100644 --- a/tools/testing/selftests/kvm/include/s390x/ucall.h +++ b/tools/testing/selftests/kvm/include/s390x/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_S390_SIEIC diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 74a59c7ce7ed..2b654b65fe47 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -18,7 +18,8 @@ #include #include -#include "../kvm_util.h" +#include "kvm_util.h" +#include "ucall_common.h" extern bool host_cpu_is_intel; extern bool host_cpu_is_amd; diff --git a/tools/testing/selftests/kvm/include/x86_64/ucall.h b/tools/testing/selftests/kvm/include/x86_64/ucall.h index 06b244bd06ee..d3825dcc3cd9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/ucall.h +++ b/tools/testing/selftests/kvm/include/x86_64/ucall.h @@ -2,7 +2,7 @@ #ifndef SELFTEST_KVM_UCALL_H #define SELFTEST_KVM_UCALL_H -#include "kvm_util_base.h" +#include "kvm_util.h" #define UCALL_EXIT_REASON KVM_EXIT_IO diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 7759c685086b..dd8b12f626d3 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -18,6 +18,7 @@ #include "kvm_util.h" #include "processor.h" #include "guest_modes.h" +#include "ucall_common.h" #define TEST_MEM_SLOT_INDEX 1 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index a9eb17295be4..0ac7cc89f38c 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -11,6 +11,8 @@ #include "guest_modes.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" + #include #include diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index d64276739513..6b2158655baa 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -7,6 +7,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #include #include diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index aa0a01988150..313277486a1d 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -8,6 +8,7 @@ #include "kvm_util.h" #include "memstress.h" #include "processor.h" +#include "ucall_common.h" struct memstress_args memstress_args; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index e8211f5d6863..79b67e2627cb 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -10,6 +10,7 @@ #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN 0xac0000 diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c index f5af65a41c29..42151e571953 100644 --- a/tools/testing/selftests/kvm/lib/ucall_common.c +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -1,9 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only -#include "kvm_util.h" #include "linux/types.h" #include "linux/bitmap.h" #include "linux/atomic.h" +#include "kvm_util.h" +#include "ucall_common.h" + + #define GUEST_UCALL_FAILED -1 struct ucall_header { diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index 4b5004ef9c6b..82999e0f0221 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -11,6 +11,7 @@ #include "kvm_util.h" #include "processor.h" #include "timer_test.h" +#include "ucall_common.h" static int timer_irq = IRQ_S_TIMER; diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 0728b15b5d3a..a44df60cf189 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -27,6 +27,7 @@ #include "kvm_util.h" #include "processor.h" #include "test_util.h" +#include "ucall_common.h" /* * Any bug related to task migration is likely to be timing-dependent; perform diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c index 84ba79c42ab1..b39033844756 100644 --- a/tools/testing/selftests/kvm/s390x/cmma_test.c +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -16,6 +16,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" #define MAIN_PAGE_COUNT 512 diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 48cb910e660d..f2df7416be84 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -15,6 +15,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" enum mop_target { LOGICAL, diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index c73f948c9b63..7a742a673b7c 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -8,6 +8,7 @@ #include "test_util.h" #include "kvm_util.h" #include "kselftest.h" +#include "ucall_common.h" #define PAGE_SHIFT 12 #define PAGE_SIZE (1 << PAGE_SHIFT) diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index e9231387c589..d7a70fc191ef 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -17,6 +17,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #define NR_VCPUS 4 #define ST_GPA_BASE (1 << 30) diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c index ee3b384b991c..2929c067c207 100644 --- a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c +++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c @@ -17,6 +17,7 @@ #include "test_util.h" #include "memstress.h" #include "guest_modes.h" +#include "ucall_common.h" #define VCPUS 2 #define SLOTS 2 diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c index 9c21b6bccc38..81055476d394 100644 --- a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c +++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c @@ -5,8 +5,8 @@ * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. */ #include "flds_emulation.h" - #include "test_util.h" +#include "ucall_common.h" #define MMIO_GPA 0x700000000 #define MMIO_GVA MMIO_GPA diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index abe71946941f..fb976d6a1969 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -22,7 +22,6 @@ #include #include -#include "kvm_util_base.h" #include "kvm_util.h" #include "mce.h" #include "processor.h" -- cgit v1.2.3 From f54884f93898e2e8b62784153a4c12d12a081f96 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:21 -0700 Subject: KVM: sefltests: Add kvm_util_types.h to hold common types, e.g. vm_vaddr_t Move the base types unique to KVM selftests out of kvm_util.h and into a new header, kvm_util_types.h. This will allow kvm_util_arch.h, i.e. core arch headers, to reference common types, e.g. vm_vaddr_t and vm_paddr_t. No functional change intended. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 16 +--------------- tools/testing/selftests/kvm/include/kvm_util_types.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/kvm_util_types.h (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index c51babca552a..f5a1eaeaf0c8 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -21,28 +21,14 @@ #include #include "kvm_util_arch.h" +#include "kvm_util_types.h" #include "sparsebit.h" -/* - * Provide a version of static_assert() that is guaranteed to have an optional - * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE - * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and - * #defines static_assert() as a direct alias to _Static_assert() (see - * usr/include/assert.h). Define a custom macro instead of redefining - * static_assert() to avoid creating non-deterministic behavior that is - * dependent on include order. - */ -#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) -#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) - #define KVM_DEV_PATH "/dev/kvm" #define KVM_MAX_VCPUS 512 #define NSEC_PER_SEC 1000000000L -typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ -typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ - struct userspace_mem_region { struct kvm_userspace_memory_region2 region; struct sparsebit *unused_phy_pages; diff --git a/tools/testing/selftests/kvm/include/kvm_util_types.h b/tools/testing/selftests/kvm/include/kvm_util_types.h new file mode 100644 index 000000000000..ec787b97cf18 --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_util_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_TYPES_H +#define SELFTEST_KVM_UTIL_TYPES_H + +/* + * Provide a version of static_assert() that is guaranteed to have an optional + * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE + * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and + * #defines static_assert() as a direct alias to _Static_assert() (see + * usr/include/assert.h). Define a custom macro instead of redefining + * static_assert() to avoid creating non-deterministic behavior that is + * dependent on include order. + */ +#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) +#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) + +typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ +typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ + +#endif /* SELFTEST_KVM_UTIL_TYPES_H */ -- cgit v1.2.3 From 3a085fbf8228cfcdbf48ded8915618e10226f2e3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:22 -0700 Subject: KVM: selftests: Move GDT, IDT, and TSS fields to x86's kvm_vm_arch Now that kvm_vm_arch exists, move the GDT, IDT, and TSS fields to x86's implementation, as the structures are firmly x86-only. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-4-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 3 --- .../selftests/kvm/include/x86_64/kvm_util_arch.h | 5 +++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 22 +++++++++++----------- .../kvm/x86_64/svm_nested_shutdown_test.c | 2 +- .../kvm/x86_64/svm_nested_soft_inject_test.c | 2 +- 5 files changed, 18 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index f5a1eaeaf0c8..63c2aaae51f3 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -93,9 +93,6 @@ struct kvm_vm { bool pgd_created; vm_paddr_t ucall_mmio_addr; vm_paddr_t pgd; - vm_vaddr_t gdt; - vm_vaddr_t tss; - vm_vaddr_t idt; vm_vaddr_t handlers; uint32_t dirty_ring_size; uint64_t gpa_tag_mask; diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h index d0b587c38e07..972bb1c4ab4c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h @@ -5,11 +5,16 @@ #include #include +#include "kvm_util_types.h" #include "test_util.h" extern bool is_forced_emulation_enabled; struct kvm_vm_arch { + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + uint64_t c_bit; uint64_t s_bit; int sev_fd; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 90aeacf114bf..f2506b9edb37 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -418,7 +418,7 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp) static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) { - void *gdt = addr_gva2hva(vm, vm->gdt); + void *gdt = addr_gva2hva(vm, vm->arch.gdt); struct desc64 *desc = gdt + (segp->selector >> 3) * 8; desc->limit0 = segp->limit & 0xFFFF; @@ -519,21 +519,21 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) { - if (!vm->gdt) - vm->gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + if (!vm->arch.gdt) + vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - dt->base = vm->gdt; + dt->base = vm->arch.gdt; dt->limit = getpagesize(); } static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, int selector) { - if (!vm->tss) - vm->tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + if (!vm->arch.tss) + vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); memset(segp, 0, sizeof(*segp)); - segp->base = vm->tss; + segp->base = vm->arch.tss; segp->limit = 0x67; segp->selector = selector; segp->type = 0xb; @@ -1097,7 +1097,7 @@ static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, int dpl, unsigned short selector) { struct idt_entry *base = - (struct idt_entry *)addr_gva2hva(vm, vm->idt); + (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); struct idt_entry *e = &base[vector]; memset(e, 0, sizeof(*e)); @@ -1150,7 +1150,7 @@ void vm_init_descriptor_tables(struct kvm_vm *vm) extern void *idt_handlers; int i; - vm->idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); /* Handlers have the same address in both address spaces.*/ for (i = 0; i < NUM_INTERRUPTS; i++) @@ -1164,9 +1164,9 @@ void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) struct kvm_sregs sregs; vcpu_sregs_get(vcpu, &sregs); - sregs.idt.base = vm->idt; + sregs.idt.base = vm->arch.idt; sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; - sregs.gdt.base = vm->gdt; + sregs.gdt.base = vm->arch.gdt; sregs.gdt.limit = getpagesize() - 1; kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); vcpu_sregs_set(vcpu, &sregs); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c index d6fcdcc3af31..f4a1137e04ab 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c @@ -53,7 +53,7 @@ int main(int argc, char *argv[]) vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vcpu, 2, svm_gva, vm->idt); + vcpu_args_set(vcpu, 2, svm_gva, vm->arch.idt); vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 0c7ce3d4e83a..2478a9e50743 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -166,7 +166,7 @@ static void run_test(bool is_nmi) idt_alt_vm = vm_vaddr_alloc_page(vm); idt_alt = addr_gva2hva(vm, idt_alt_vm); - idt = addr_gva2hva(vm, vm->idt); + idt = addr_gva2hva(vm, vm->arch.idt); memcpy(idt_alt, idt, getpagesize()); } else { idt_alt_vm = 0; -- cgit v1.2.3 From 0d95817e075314706b3e4086080a9bbb1421634c Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Thu, 14 Mar 2024 16:26:23 -0700 Subject: KVM: selftests: Fix off-by-one initialization of GDT limit Fix an off-by-one bug in the initialization of the GDT limit, which as defined in the SDM is inclusive, not exclusive. Note, vcpu_init_descriptor_tables() gets the limit correct, it's only vcpu_setup() that is broken, i.e. only tests that _don't_ invoke vcpu_init_descriptor_tables() can have problems. And the fact that KVM effectively initializes the GDT twice will be cleaned up in the near future. Signed-off-by: Ackerley Tng [sean: rewrite changelog] Link: https://lore.kernel.org/r/20240314232637.2538648-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f2506b9edb37..a04c2e623d09 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -523,7 +523,7 @@ static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); dt->base = vm->arch.gdt; - dt->limit = getpagesize(); + dt->limit = getpagesize() - 1; } static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, -- cgit v1.2.3 From 53635ec253c025aeb0e4401e586a8f7827cd7817 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:24 -0700 Subject: KVM: selftests: Move platform_info_test's main assert into guest code As a first step toward gracefully handling the expected #GP on RDMSR in platform_info_test, move the test's assert on the non-faulting RDMSR result into the guest itself. This will allow using a unified flow for the host userspace side of things. Link: https://lore.kernel.org/r/20240314232637.2538648-6-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/platform_info_test.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 2165b1ad8b38..a6be98dbe654 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -27,7 +27,9 @@ static void guest_code(void) for (;;) { msr_platform_info = rdmsr(MSR_PLATFORM_INFO); - GUEST_SYNC(msr_platform_info); + GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + GUEST_SYNC(0); asm volatile ("inc %r11"); } } @@ -40,13 +42,15 @@ static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu) vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - get_ucall(vcpu, &uc); - TEST_ASSERT(uc.cmd == UCALL_SYNC, - "Received ucall other than UCALL_SYNC: %lu", uc.cmd); - TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == - MSR_PLATFORM_INFO_MAX_TURBO_RATIO, - "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", - MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + break; + } } static void test_msr_platform_info_disabled(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From dec79eab2b48e4ae71a3e688342dce19da4212c2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:25 -0700 Subject: KVM: selftests: Rework platform_info_test to actually verify #GP Rework platform_info_test to actually handle and verify the expected #GP on RDMSR when the associated KVM capability is disabled. Currently, the test _deliberately_ doesn't handle the #GP, and instead lets it escalated to a triple fault shutdown. In addition to verifying that KVM generates the correct fault, handling the #GP will be necessary (without even more shenanigans) when a future change to the core KVM selftests library configures the IDT and exception handlers by default (the test subtly relies on the IDT limit being '0'). Link: https://lore.kernel.org/r/20240314232637.2538648-7-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/platform_info_test.c | 66 +++++++++++----------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index a6be98dbe654..cdca3579b3cd 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -24,40 +24,18 @@ static void guest_code(void) { uint64_t msr_platform_info; + uint8_t vector; - for (;;) { - msr_platform_info = rdmsr(MSR_PLATFORM_INFO); - GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, - MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - GUEST_SYNC(0); - asm volatile ("inc %r11"); - } -} - -static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu) -{ - struct ucall uc; + GUEST_SYNC(true); + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, true); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - break; - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - default: - TEST_FAIL("Unexpected ucall %lu", uc.cmd); - break; - } -} + GUEST_SYNC(false); + vector = rdmsr_safe(MSR_PLATFORM_INFO, &msr_platform_info); + GUEST_ASSERT_EQ(vector, GP_VECTOR); -static void test_msr_platform_info_disabled(struct kvm_vcpu *vcpu) -{ - vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, false); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -65,16 +43,38 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t msr_platform_info; + struct ucall uc; TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - test_msr_platform_info_enabled(vcpu); - test_msr_platform_info_disabled(vcpu); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, uc.args[1]); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + break; + } + } + +done: vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); -- cgit v1.2.3 From 61c3cffd4cbfe5c795d2eaf4a0341ec347fd0799 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:26 -0700 Subject: KVM: selftests: Explicitly clobber the IDT in the "delete memslot" testcase Explicitly clobber the guest IDT in the "delete memslot" test, which expects the deleted memslot to result in either a KVM emulation error, or a triple fault shutdown. A future change to the core selftests library will configuring the guest IDT and exception handlers by default, i.e. will install a guest #PF handler and put the guest into an infinite #NPF loop (the guest hits a !PRESENT SPTE when trying to vector a #PF, and KVM reinjects the #PF without fixing the #NPF, because there is no memslot). Note, it's not clear whether or not KVM's behavior is reasonable in this case, e.g. arguably KVM should try (and fail) to emulate in response to the #NPF. But barring a goofy/broken userspace, this scenario will likely never happen in practice. Punt the KVM investigation to the future. Link: https://lore.kernel.org/r/20240314232637.2538648-8-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/set_memory_region_test.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index c6e438c9d851..ae98cb00da05 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -220,8 +220,20 @@ static void test_move_memory_region(void) static void guest_code_delete_memory_region(void) { + struct desc_ptr idt; uint64_t val; + /* + * Clobber the IDT so that a #PF due to the memory region being deleted + * escalates to triple-fault shutdown. Because the memory region is + * deleted, there will be no valid mappings. As a result, KVM will + * repeatedly intercepts the state-2 page fault that occurs when trying + * to vector the guest's #PF. I.e. trying to actually handle the #PF + * in the guest will never succeed, and so isn't an option. + */ + memset(&idt, 0, sizeof(idt)); + __asm__ __volatile__("lidt %0" :: "m"(idt)); + GUEST_SYNC(0); /* Spin until the memory region is deleted. */ -- cgit v1.2.3 From b62c32c532cd8d96ff86d6340416f6846101eeb6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:27 -0700 Subject: KVM: selftests: Move x86's descriptor table helpers "up" in processor.c Move x86's various descriptor table helpers in processor.c up above kvm_arch_vm_post_create() and vcpu_setup() so that the helpers can be made static and invoked from the aforementioned functions. No functional change intended. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-9-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 191 ++++++++++----------- 1 file changed, 95 insertions(+), 96 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index a04c2e623d09..f86e2d79b130 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -541,6 +541,21 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, kvm_seg_fill_gdt_64bit(vm, segp); } +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) +{ + struct kvm_vm *vm = vcpu->vm; + struct kvm_sregs sregs; + + vcpu_sregs_get(vcpu, &sregs); + sregs.idt.base = vm->arch.idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->arch.gdt; + sregs.gdt.limit = getpagesize() - 1; + kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); + vcpu_sregs_set(vcpu, &sregs); + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; @@ -573,6 +588,86 @@ static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) vcpu_sregs_set(vcpu, &sregs); } +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + +static bool kvm_fixup_exception(struct ex_regs *regs) +{ + if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) + return false; + + if (regs->vector == DE_VECTOR) + return false; + + regs->rip = regs->r11; + regs->r9 = regs->vector; + regs->r10 = regs->error_code; + return true; +} + +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; + + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; + } + + if (kvm_fixup_exception(regs)) + return; + + ucall_assert(UCALL_UNHANDLED, + "Unhandled exception in guest", __FILE__, __LINE__, + "Unhandled exception '0x%lx' at guest RIP '0x%lx'", + regs->vector, regs->rip); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void *idt_handlers; + int i; + + vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, + DEFAULT_CODE_SELECTOR); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) + REPORT_GUEST_ASSERT(uc); +} + void kvm_arch_vm_post_create(struct kvm_vm *vm) { vm_create_irqchip(vm); @@ -1093,102 +1188,6 @@ void kvm_init_vm_address_properties(struct kvm_vm *vm) } } -static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, - int dpl, unsigned short selector) -{ - struct idt_entry *base = - (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); - struct idt_entry *e = &base[vector]; - - memset(e, 0, sizeof(*e)); - e->offset0 = addr; - e->selector = selector; - e->ist = 0; - e->type = 14; - e->dpl = dpl; - e->p = 1; - e->offset1 = addr >> 16; - e->offset2 = addr >> 32; -} - - -static bool kvm_fixup_exception(struct ex_regs *regs) -{ - if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) - return false; - - if (regs->vector == DE_VECTOR) - return false; - - regs->rip = regs->r11; - regs->r9 = regs->vector; - regs->r10 = regs->error_code; - return true; -} - -void route_exception(struct ex_regs *regs) -{ - typedef void(*handler)(struct ex_regs *); - handler *handlers = (handler *)exception_handlers; - - if (handlers && handlers[regs->vector]) { - handlers[regs->vector](regs); - return; - } - - if (kvm_fixup_exception(regs)) - return; - - ucall_assert(UCALL_UNHANDLED, - "Unhandled exception in guest", __FILE__, __LINE__, - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", - regs->vector, regs->rip); -} - -void vm_init_descriptor_tables(struct kvm_vm *vm) -{ - extern void *idt_handlers; - int i; - - vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - /* Handlers have the same address in both address spaces.*/ - for (i = 0; i < NUM_INTERRUPTS; i++) - set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, - DEFAULT_CODE_SELECTOR); -} - -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) -{ - struct kvm_vm *vm = vcpu->vm; - struct kvm_sregs sregs; - - vcpu_sregs_get(vcpu, &sregs); - sregs.idt.base = vm->arch.idt; - sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; - sregs.gdt.base = vm->arch.gdt; - sregs.gdt.limit = getpagesize() - 1; - kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); - vcpu_sregs_set(vcpu, &sregs); - *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; -} - -void vm_install_exception_handler(struct kvm_vm *vm, int vector, - void (*handler)(struct ex_regs *)) -{ - vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); - - handlers[vector] = (vm_vaddr_t)handler; -} - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) -{ - struct ucall uc; - - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) - REPORT_GUEST_ASSERT(uc); -} - const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, uint32_t function, uint32_t index) { -- cgit v1.2.3 From d8c63805e4e56cb775e2b02f4a7e2b536d97c8c5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:28 -0700 Subject: KVM: selftests: Rename x86's vcpu_setup() to vcpu_init_sregs() Rename vcpu_setup() to be more descriptive and precise, there is a whole lot of "setup" that is done for a vCPU that isn't in said helper. No functional change intended. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-10-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f86e2d79b130..88d6e6caa302 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -556,7 +556,7 @@ void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } -static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; @@ -719,7 +719,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) vcpu = __vm_vcpu_add(vm, vcpu_id); vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); - vcpu_setup(vm, vcpu); + vcpu_init_sregs(vm, vcpu); /* Setup guest general purpose registers */ vcpu_regs_get(vcpu, ®s); -- cgit v1.2.3 From c1b9793b45d521767efdb6ab26008cabd0473ea9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:29 -0700 Subject: KVM: selftests: Init IDT and exception handlers for all VMs/vCPUs on x86 Initialize the IDT and exception handlers for all non-barebones VMs and vCPUs on x86. Forcing tests to manually configure the IDT just to save 8KiB of memory is a terrible tradeoff, and also leads to weird tests (multiple tests have deliberately relied on shutdown to indicate success), and hard-to-debug failures, e.g. instead of a precise unexpected exception failure, tests see only shutdown. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-11-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 -- tools/testing/selftests/kvm/lib/x86_64/processor.c | 8 ++++++-- tools/testing/selftests/kvm/x86_64/amx_test.c | 2 -- tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c | 2 -- tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c | 2 -- tools/testing/selftests/kvm/x86_64/hyperv_features.c | 6 ------ tools/testing/selftests/kvm/x86_64/hyperv_ipi.c | 3 --- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 3 --- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 3 --- tools/testing/selftests/kvm/x86_64/platform_info_test.c | 3 --- tools/testing/selftests/kvm/x86_64/pmu_counters_test.c | 3 --- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 6 ------ .../selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c | 3 --- tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c | 3 --- tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c | 3 --- tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c | 3 --- tools/testing/selftests/kvm/x86_64/ucna_injection_test.c | 4 ---- tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c | 3 --- .../selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c | 3 --- tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c | 3 --- tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c | 2 -- tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c | 3 --- tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 2 -- 23 files changed, 6 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 2b654b65fe47..8eb57de0b587 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1134,8 +1134,6 @@ struct idt_entry { uint32_t offset2; uint32_t reserved; }; -void vm_init_descriptor_tables(struct kvm_vm *vm); -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 88d6e6caa302..f3d9ac7e8692 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -541,7 +541,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, kvm_seg_fill_gdt_64bit(vm, segp); } -void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) +static void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) { struct kvm_vm *vm = vcpu->vm; struct kvm_sregs sregs; @@ -586,6 +586,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr3 = vm->pgd; vcpu_sregs_set(vcpu, &sregs); + + vcpu_init_descriptor_tables(vcpu); } static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, @@ -639,7 +641,7 @@ void route_exception(struct ex_regs *regs) regs->vector, regs->rip); } -void vm_init_descriptor_tables(struct kvm_vm *vm) +static void vm_init_descriptor_tables(struct kvm_vm *vm) { extern void *idt_handlers; int i; @@ -671,6 +673,8 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) void kvm_arch_vm_post_create(struct kvm_vm *vm) { vm_create_irqchip(vm); + vm_init_descriptor_tables(vm); + sync_global_to_guest(vm, host_cpu_is_intel); sync_global_to_guest(vm, host_cpu_is_amd); sync_global_to_guest(vm, is_forced_emulation_enabled); diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 8e5713e36d4b..903940c54d2d 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -244,8 +244,6 @@ int main(int argc, char *argv[]) vcpu_regs_get(vcpu, ®s1); /* Register #NM handler */ - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); /* amx cfg for guest_code */ diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index f3c2239228b1..762628f7d4ba 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -110,8 +110,6 @@ static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk) { struct kvm_vm *vm = vcpu->vm; - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); if (disable_quirk) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c index 4f3f3a9b038b..e192720bfe14 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -257,8 +257,6 @@ int main(int argc, char *argv[]) vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index b923a285e96f..068e9c69710d 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -156,9 +156,6 @@ static void guest_test_msrs_access(void) vcpu_init_cpuid(vcpu, prev_cpuid); } - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - /* TODO: Make this entire test easier to maintain. */ if (stage >= 21) vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); @@ -532,9 +529,6 @@ static void guest_test_hcalls_access(void) while (true) { vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - /* Hypercall input/output */ hcall_page = vm_vaddr_alloc_pages(vm, 2); memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c index 8206f5ef42dd..22c0c124582f 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c @@ -254,16 +254,13 @@ int main(int argc, char *argv[]) hcall_page = vm_vaddr_alloc_pages(vm, 2); memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); - vm_init_descriptor_tables(vm); vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code); - vcpu_init_descriptor_tables(vcpu[1]); vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1); vcpu_set_hv_cpuid(vcpu[1]); vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code); - vcpu_init_descriptor_tables(vcpu[2]); vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2); vcpu_set_hv_cpuid(vcpu[2]); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 40cc59f4e650..78878b3a2725 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -183,9 +183,6 @@ int main(void) vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - enter_guest(vcpu); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 853802641e1e..9c8445379d76 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -80,9 +80,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - while (1) { vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index cdca3579b3cd..eda88080c186 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -49,9 +49,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index bad2ab1b9810..2556777c4674 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -28,9 +28,6 @@ static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vm *vm; vm = vm_create_with_one_vcpu(vcpu, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(*vcpu); - sync_global_to_guest(vm, kvm_pmu_version); /* diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 5ce53b8c46e0..26b3e7efe5dd 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -334,9 +334,6 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); vcpu = vm_vcpu_add(vm, 0, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - TEST_ASSERT(!sanity_check_pmu(vcpu), "Guest should not be able to use disabled PMU."); @@ -873,9 +870,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - TEST_REQUIRE(sanity_check_pmu(vcpu)); if (use_amd_pmu()) diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index 362be40fc00d..fabeeaddfb3a 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -57,9 +57,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled()); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR); rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c index 32bef39bec21..916e04248fbb 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -93,9 +93,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c index f4a1137e04ab..00135cbba35e 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c @@ -48,9 +48,6 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_alloc_svm(vm, &svm_gva); vcpu_args_set(vcpu, 2, svm_gva, vm->arch.idt); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 2478a9e50743..7b6481d6c0d3 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -152,9 +152,6 @@ static void run_test(bool is_nmi) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); vm_install_exception_handler(vm, INT_NR, guest_int_handler); diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index fb976d6a1969..57f157c06b39 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -282,10 +282,6 @@ int main(int argc, char *argv[]) cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code); cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(ucna_vcpu); - vcpu_init_descriptor_tables(cmcidis_vcpu); - vcpu_init_descriptor_tables(cmci_vcpu); vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index af9981a6642f..32b2794b78fe 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -525,9 +525,6 @@ KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); /* Process guest code userspace exits. */ diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c index fad3634fd9eb..3fd6eceab46f 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -115,9 +115,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); get_set_sigalrm_vcpu(vcpu); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); /* diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index 3b93f262b797..7c92536551cc 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -85,9 +85,6 @@ KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code) struct ucall uc; int r, i; - vm_init_descriptor_tables(vcpu->vm); - vcpu_init_descriptor_tables(vcpu); - vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); vcpu_args_set(vcpu, 1, host_cap.capabilities); diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index c78e5f755116..a76078a08ff8 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -408,8 +408,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(¶ms[0].vcpu, halter_guest_code); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(params[0].vcpu); vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c index 25a0b0db5c3c..95ce192d0753 100644 --- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c @@ -109,9 +109,6 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - while (1) { vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 1ba06551526b..aa88cc541fe2 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -554,8 +554,6 @@ int main(int argc, char *argv[]) }; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); if (do_runstate_tests) { -- cgit v1.2.3 From 44c93b27726928a7b28eca233f7a504b92bc8f88 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:30 -0700 Subject: KVM: selftests: Map x86's exception_handlers at VM creation, not vCPU setup Map x86's exception handlers at VM creation, not vCPU setup, as the mapping is per-VM, i.e. doesn't need to be (re)done for every vCPU. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-12-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f3d9ac7e8692..ff800b860913 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -553,7 +553,6 @@ static void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) sregs.gdt.limit = getpagesize() - 1; kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); vcpu_sregs_set(vcpu, &sregs); - *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) @@ -652,6 +651,8 @@ static void vm_init_descriptor_tables(struct kvm_vm *vm) for (i = 0; i < NUM_INTERRUPTS; i++) set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, DEFAULT_CODE_SELECTOR); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } void vm_install_exception_handler(struct kvm_vm *vm, int vector, -- cgit v1.2.3 From 2a511ca994933ba02309c65401d88cbf4f19630e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:31 -0700 Subject: KVM: selftests: Allocate x86's GDT during VM creation Allocate the GDT during creation of non-barebones VMs instead of waiting until the first vCPU is created, as the whole point of non-barebones VMs is to be able to run vCPUs, i.e. the GDT is going to get allocated no matter what. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-13-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index ff800b860913..7ac77dd970ca 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -519,9 +519,6 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) { - if (!vm->arch.gdt) - vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - dt->base = vm->arch.gdt; dt->limit = getpagesize() - 1; } @@ -645,6 +642,7 @@ static void vm_init_descriptor_tables(struct kvm_vm *vm) extern void *idt_handlers; int i; + vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); /* Handlers have the same address in both address spaces.*/ -- cgit v1.2.3 From 1051e29cb9156789e908ab9565f59e7aba470d60 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:32 -0700 Subject: KVM: selftests: Drop superfluous switch() on vm->mode in vcpu_init_sregs() Replace the switch statement on vm->mode in x86's vcpu_init_sregs()'s with a simple assert that the VM has a 48-bit virtual address space. A switch statement is both overkill and misleading, as the existing code incorrectly implies that VMs with LA57 would need different to configuration for the LDT, TSS, and flat segments. In all likelihood, the only difference that would be needed for selftests is CR4.LA57 itself. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-14-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 27 +++++++++------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 7ac77dd970ca..6128c9c52cc5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -556,6 +556,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; + TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + /* Set mode specific system register values. */ vcpu_sregs_get(vcpu, &sregs); @@ -563,22 +565,15 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_setup_gdt(vm, &sregs.gdt); - switch (vm->mode) { - case VM_MODE_PXXV48_4K: - sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; - sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; - sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); - - kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); - break; - - default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); - } + sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; + sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; + sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + + kvm_seg_set_unusable(&sregs.ldt); + kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); sregs.cr3 = vm->pgd; vcpu_sregs_set(vcpu, &sregs); -- cgit v1.2.3 From 23ef21f58cf8bbecbe479015ef79baa15a0da0b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:33 -0700 Subject: KVM: selftests: Fold x86's descriptor tables helpers into vcpu_init_sregs() Now that the per-VM, on-demand allocation logic in kvm_setup_gdt() and vcpu_init_descriptor_tables() is gone, fold them into vcpu_init_sregs(). Note, both kvm_setup_gdt() and vcpu_init_descriptor_tables() configured the GDT, which is why it looks like kvm_setup_gdt() disappears. Opportunistically delete the pointless zeroing of the IDT limit (it was being unconditionally overwritten by vcpu_init_descriptor_tables()). Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-15-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 30 ++++------------------ 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 6128c9c52cc5..72b65eac2df2 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -517,12 +517,6 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); } -static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) -{ - dt->base = vm->arch.gdt; - dt->limit = getpagesize() - 1; -} - static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, int selector) { @@ -538,20 +532,6 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, kvm_seg_fill_gdt_64bit(vm, segp); } -static void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) -{ - struct kvm_vm *vm = vcpu->vm; - struct kvm_sregs sregs; - - vcpu_sregs_get(vcpu, &sregs); - sregs.idt.base = vm->arch.idt; - sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; - sregs.gdt.base = vm->arch.gdt; - sregs.gdt.limit = getpagesize() - 1; - kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); - vcpu_sregs_set(vcpu, &sregs); -} - static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; @@ -561,9 +541,10 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) /* Set mode specific system register values. */ vcpu_sregs_get(vcpu, &sregs); - sregs.idt.limit = 0; - - kvm_setup_gdt(vm, &sregs.gdt); + sregs.idt.base = vm->arch.idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->arch.gdt; + sregs.gdt.limit = getpagesize() - 1; sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; @@ -573,12 +554,11 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); + kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); sregs.cr3 = vm->pgd; vcpu_sregs_set(vcpu, &sregs); - - vcpu_init_descriptor_tables(vcpu); } static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, -- cgit v1.2.3 From a2834e6e0b98bc89b874aef15c99b23db5f438df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:34 -0700 Subject: KVM: selftests: Allocate x86's TSS at VM creation Allocate x86's per-VM TSS at creation of a non-barebones VM. Like the GDT, the TSS is needed to actually run vCPUs, i.e. every non-barebones VM is all but guaranteed to allocate the TSS sooner or later. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-16-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 72b65eac2df2..9042f7f3330d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -520,9 +520,6 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, int selector) { - if (!vm->arch.tss) - vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); - memset(segp, 0, sizeof(*segp)); segp->base = vm->arch.tss; segp->limit = 0x67; @@ -620,6 +617,8 @@ static void vm_init_descriptor_tables(struct kvm_vm *vm) vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + /* Handlers have the same address in both address spaces.*/ for (i = 0; i < NUM_INTERRUPTS; i++) set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, -- cgit v1.2.3 From f18ef97fc60217f648a910835c895cf27ba75a03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:35 -0700 Subject: KVM: selftests: Add macro for TSS selector, rename up code/data macros Add a proper #define for the TSS selector instead of open coding 0x18 and hoping future developers don't use that selector for something else. Opportunistically rename the code and data selector macros to shorten the names, align the naming with the kernel's scheme, and capture that they are *kernel* segments. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-17-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 9042f7f3330d..9265842ed4cc 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -15,8 +15,9 @@ #define NUM_INTERRUPTS 256 #endif -#define DEFAULT_CODE_SELECTOR 0x8 -#define DEFAULT_DATA_SELECTOR 0x10 +#define KERNEL_CS 0x8 +#define KERNEL_DS 0x10 +#define KERNEL_TSS 0x18 #define MAX_NR_CPUID_ENTRIES 100 @@ -548,11 +549,11 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); - kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); + kvm_seg_set_kernel_code_64bit(vm, KERNEL_CS, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, KERNEL_DS, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, KERNEL_DS, &sregs.es); + kvm_seg_set_kernel_data_64bit(NULL, KERNEL_DS, &sregs.gs); + kvm_setup_tss_64bit(vm, &sregs.tr, KERNEL_TSS); sregs.cr3 = vm->pgd; vcpu_sregs_set(vcpu, &sregs); @@ -621,8 +622,7 @@ static void vm_init_descriptor_tables(struct kvm_vm *vm) /* Handlers have the same address in both address spaces.*/ for (i = 0; i < NUM_INTERRUPTS; i++) - set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, - DEFAULT_CODE_SELECTOR); + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS); *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } -- cgit v1.2.3 From 0f53a0245068e09b3b31e7f43ace0ab9edd066ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:36 -0700 Subject: KVM: selftests: Init x86's segments during VM creation Initialize x86's various segments in the GDT during creation of relevant VMs instead of waiting until vCPUs come along. Re-installing the segments for every vCPU is both wasteful and confusing, as is installing KERNEL_DS multiple times; NOT installing KERNEL_DS for GS is icing on the cake. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-18-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 68 +++++++--------------- 1 file changed, 20 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 9265842ed4cc..ee3fad41ee24 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -439,24 +439,7 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->base3 = segp->base >> 32; } - -/* - * Set Long Mode Flat Kernel Code Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a code segment - * with the selector value given by @selector. - */ -static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_code_64bit(uint16_t selector, struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); segp->selector = selector; @@ -468,27 +451,9 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, segp->g = true; segp->l = true; segp->present = 1; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } -/* - * Set Long Mode Flat Kernel Data Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a data segment - * with the selector value given by @selector. - */ -static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_data_64bit(uint16_t selector, struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); segp->selector = selector; @@ -499,8 +464,6 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, */ segp->g = true; segp->present = true; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) @@ -518,16 +481,15 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); } -static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, - int selector) +static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp, + int selector) { memset(segp, 0, sizeof(*segp)); - segp->base = vm->arch.tss; + segp->base = base; segp->limit = 0x67; segp->selector = selector; segp->type = 0xb; segp->present = 1; - kvm_seg_fill_gdt_64bit(vm, segp); } static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) @@ -549,11 +511,11 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, KERNEL_CS, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, KERNEL_DS, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, KERNEL_DS, &sregs.es); - kvm_seg_set_kernel_data_64bit(NULL, KERNEL_DS, &sregs.gs); - kvm_setup_tss_64bit(vm, &sregs.tr, KERNEL_TSS); + kvm_seg_set_kernel_code_64bit(KERNEL_CS, &sregs.cs); + kvm_seg_set_kernel_data_64bit(KERNEL_DS, &sregs.ds); + kvm_seg_set_kernel_data_64bit(KERNEL_DS, &sregs.es); + kvm_seg_set_kernel_data_64bit(KERNEL_DS, &sregs.gs); + kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr, KERNEL_TSS); sregs.cr3 = vm->pgd; vcpu_sregs_set(vcpu, &sregs); @@ -613,6 +575,7 @@ void route_exception(struct ex_regs *regs) static void vm_init_descriptor_tables(struct kvm_vm *vm) { extern void *idt_handlers; + struct kvm_segment seg; int i; vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); @@ -625,6 +588,15 @@ static void vm_init_descriptor_tables(struct kvm_vm *vm) set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS); *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; + + kvm_seg_set_kernel_code_64bit(KERNEL_CS, &seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + kvm_seg_set_kernel_data_64bit(KERNEL_DS, &seg); + kvm_seg_fill_gdt_64bit(vm, &seg); + + kvm_seg_set_tss_64bit(vm->arch.tss, &seg, KERNEL_TSS); + kvm_seg_fill_gdt_64bit(vm, &seg); } void vm_install_exception_handler(struct kvm_vm *vm, int vector, -- cgit v1.2.3 From b093f87fd1957cdfbe518d5bb2caa39ba80c1669 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 16:26:37 -0700 Subject: KVM: selftests: Drop @selector from segment helpers Drop the @selector from the kernel code, data, and TSS builders and instead hardcode the respective selector in the helper. Accepting a selector but not a base makes the selector useless, e.g. the data helper can't create per-vCPU for FS or GS, and so loading GS with KERNEL_DS is the only logical choice. And for code and TSS, there is no known reason to ever want multiple segments, e.g. there are zero plans to support 32-bit kernel code (and again, that would require more than just the selector). If KVM selftests ever do add support for per-vCPU segments, it'd arguably be more readable to add a dedicated helper for building/setting the per-vCPU segment, and move the common data segment code to an inner helper. Lastly, hardcoding the selector reduces the probability of setting the wrong selector in the vCPU versus what was created by the VM in the GDT. Reviewed-by: Ackerley Tng Link: https://lore.kernel.org/r/20240314232637.2538648-19-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 29 +++++++++++----------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index ee3fad41ee24..c664e446136b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -439,10 +439,10 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->base3 = segp->base >> 32; } -static void kvm_seg_set_kernel_code_64bit(uint16_t selector, struct kvm_segment *segp) +static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_CS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed @@ -453,10 +453,10 @@ static void kvm_seg_set_kernel_code_64bit(uint16_t selector, struct kvm_segment segp->present = 1; } -static void kvm_seg_set_kernel_data_64bit(uint16_t selector, struct kvm_segment *segp) +static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_DS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed @@ -481,13 +481,12 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); } -static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp, - int selector) +static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); segp->base = base; segp->limit = 0x67; - segp->selector = selector; + segp->selector = KERNEL_TSS; segp->type = 0xb; segp->present = 1; } @@ -511,11 +510,11 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(KERNEL_CS, &sregs.cs); - kvm_seg_set_kernel_data_64bit(KERNEL_DS, &sregs.ds); - kvm_seg_set_kernel_data_64bit(KERNEL_DS, &sregs.es); - kvm_seg_set_kernel_data_64bit(KERNEL_DS, &sregs.gs); - kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr, KERNEL_TSS); + kvm_seg_set_kernel_code_64bit(&sregs.cs); + kvm_seg_set_kernel_data_64bit(&sregs.ds); + kvm_seg_set_kernel_data_64bit(&sregs.es); + kvm_seg_set_kernel_data_64bit(&sregs.gs); + kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); sregs.cr3 = vm->pgd; vcpu_sregs_set(vcpu, &sregs); @@ -589,13 +588,13 @@ static void vm_init_descriptor_tables(struct kvm_vm *vm) *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; - kvm_seg_set_kernel_code_64bit(KERNEL_CS, &seg); + kvm_seg_set_kernel_code_64bit(&seg); kvm_seg_fill_gdt_64bit(vm, &seg); - kvm_seg_set_kernel_data_64bit(KERNEL_DS, &seg); + kvm_seg_set_kernel_data_64bit(&seg); kvm_seg_fill_gdt_64bit(vm, &seg); - kvm_seg_set_tss_64bit(vm->arch.tss, &seg, KERNEL_TSS); + kvm_seg_set_tss_64bit(vm->arch.tss, &seg); kvm_seg_fill_gdt_64bit(vm, &seg); } -- cgit v1.2.3 From 0db63c0b86e981a1e97d2596d64ceceba1a5470e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Apr 2024 17:25:44 -0700 Subject: bpf: Fix verifier assumptions about socket->sk The verifier assumes that 'sk' field in 'struct socket' is valid and non-NULL when 'socket' pointer itself is trusted and non-NULL. That may not be the case when socket was just created and passed to LSM socket_accept hook. Fix this verifier assumption and adjust tests. Reported-by: Liam Wisehart Acked-by: Kumar Kartikeya Dwivedi Fixes: 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.") Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20240427002544.68803-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 23 +++++++++++++++++----- .../bpf/progs/bench_local_storage_create.c | 5 +++-- tools/testing/selftests/bpf/progs/local_storage.c | 20 ++++++++++--------- tools/testing/selftests/bpf/progs/lsm_cgroup.c | 8 ++++++-- 4 files changed, 38 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87ff414899cf..5d42db05315e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2368,6 +2368,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -5400,8 +5402,6 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); - /* For mark_ptr_or_null_reg */ - val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -5719,7 +5719,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* Types listed in the reg2btf_ids are always trusted */ - if (reg2btf_ids[base_type(reg->type)]) + if (reg2btf_ids[base_type(reg->type)] && + !bpf_type_has_unsafe_modifiers(reg->type)) return true; /* If a register is not referenced, it is trusted if it has the @@ -6339,6 +6340,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) #define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) +#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null) /* * Allow list few fields as RCU trusted or full trusted. @@ -6402,7 +6404,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) { struct inode *d_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct socket) { +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; @@ -6437,11 +6439,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } +static bool type_is_trusted_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, + "__safe_trusted_or_null"); +} + static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, @@ -6550,6 +6561,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; + } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) { + flag |= PTR_TRUSTED | PTR_MAYBE_NULL; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ diff --git a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c index e4bfbba6c193..c8ec0d0368e4 100644 --- a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c +++ b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c @@ -61,14 +61,15 @@ SEC("lsm.s/socket_post_create") int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) { + struct sock *sk = sock->sk; struct storage *stg; __u32 pid; pid = bpf_get_current_pid_tgid() >> 32; - if (pid != bench_pid) + if (pid != bench_pid || !sk) return 0; - stg = bpf_sk_storage_get(&sk_storage_map, sock->sk, NULL, + stg = bpf_sk_storage_get(&sk_storage_map, sk, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE); if (stg) diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index e5e3a8b8dd07..637e75df2e14 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -140,11 +140,12 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, 0); + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0); if (!storage) return 0; @@ -155,24 +156,24 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, /* This tests that we can associate multiple elements * with the local storage. */ - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map2, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map2, sk)) return 0; - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map, sk)) return 0; /* Ensure that the sk_storage_map is disconnected from the storage. */ - if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap) + if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap) return 0; sk_storage_result = 0; @@ -185,11 +186,12 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index 02c11d16b692..d7598538aa2d 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -103,11 +103,15 @@ static __always_inline int real_bind(struct socket *sock, int addrlen) { struct sockaddr_ll sa = {}; + struct sock *sk = sock->sk; - if (sock->sk->__sk_common.skc_family != AF_PACKET) + if (!sk) + return 1; + + if (sk->__sk_common.skc_family != AF_PACKET) return 1; - if (sock->sk->sk_kern_sock) + if (sk->sk_kern_sock) return 1; bpf_probe_read_kernel(&sa, sizeof(sa), address); -- cgit v1.2.3 From 19468ed51488dae19254e8a67c75d583b05fa5e3 Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Mon, 29 Apr 2024 13:23:11 +0200 Subject: selftests/bpf: Run cgroup1_hierarchy test in own mount namespace The cgroup1_hierarchy test uses setup_classid_environment to setup cgroupv1 environment. The problem is that the environment is set in /sys/fs/cgroup and therefore, if not run under an own mount namespace, effectively deletes all system cgroups: $ ls /sys/fs/cgroup | wc -l 27 $ sudo ./test_progs -t cgroup1_hierarchy #41/1 cgroup1_hierarchy/test_cgroup1_hierarchy:OK #41/2 cgroup1_hierarchy/test_root_cgid:OK #41/3 cgroup1_hierarchy/test_invalid_level:OK #41/4 cgroup1_hierarchy/test_invalid_cgid:OK #41/5 cgroup1_hierarchy/test_invalid_hid:OK #41/6 cgroup1_hierarchy/test_invalid_cgrp_name:OK #41/7 cgroup1_hierarchy/test_invalid_cgrp_name2:OK #41/8 cgroup1_hierarchy/test_sleepable_prog:OK #41 cgroup1_hierarchy:OK Summary: 1/8 PASSED, 0 SKIPPED, 0 FAILED $ ls /sys/fs/cgroup | wc -l 1 To avoid this, run setup_cgroup_environment first which will create an own mount namespace. This only affects the cgroupv1_hierarchy test as all other cgroup1 test progs already run setup_cgroup_environment prior to running setup_classid_environment. Also add a comment to the header of setup_classid_environment to warn against this invalid usage in future. Fixes: 360769233cc9 ("selftests/bpf: Add selftests for cgroup1 hierarchy") Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240429112311.402497-1-vmalik@redhat.com --- tools/testing/selftests/bpf/cgroup_helpers.c | 3 +++ tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index f2952a65dcc2..23bb9a9e6a7d 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -508,6 +508,9 @@ int cgroup_setup_and_join(const char *path) { /** * setup_classid_environment() - Setup the cgroupv1 net_cls environment * + * This function should only be called in a custom mount namespace, e.g. + * created by running setup_cgroup_environment. + * * After calling this function, cleanup_classid_environment should be called * once testing is complete. * diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c index 74d6d7546f40..25332e596750 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c @@ -87,9 +87,12 @@ void test_cgroup1_hierarchy(void) goto destroy; /* Setup cgroup1 hierarchy */ + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + goto destroy; err = setup_classid_environment(); if (!ASSERT_OK(err, "setup_classid_environment")) - goto destroy; + goto cleanup_cgroup; err = join_classid(); if (!ASSERT_OK(err, "join_cgroup1")) @@ -153,6 +156,8 @@ void test_cgroup1_hierarchy(void) cleanup: cleanup_classid_environment(); +cleanup_cgroup: + cleanup_cgroup_environment(); destroy: test_cgroup1_hierarchy__destroy(skel); } -- cgit v1.2.3 From 237c522c1d5d19e8d3057a38ce690c753020c7d1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 29 Apr 2024 15:07:33 +0800 Subject: selftests/bpf: Free strdup memory in test_sockmap The strdup() function returns a pointer to a new string which is a duplicate of the string "ptr". Memory for the new string is obtained with malloc(), and need to be freed with free(). This patch adds these missing "free(ptr)" in check_whitelist() and check_blacklist() to avoid memory leaks in test_sockmap.c. Signed-off-by: Geliang Tang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/b76f2f4c550aebe4ab8ea73d23c4cbe4f06ea996.1714374022.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/test_sockmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 43612de44fbf..92752f5eeded 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1887,10 +1887,13 @@ static int check_whitelist(struct _test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } @@ -1907,10 +1910,13 @@ static int check_blacklist(struct _test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } -- cgit v1.2.3 From 25927d0a1bec5091d371693c9fdd9640478837de Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 29 Apr 2024 15:07:34 +0800 Subject: selftests/bpf: Free strdup memory in veristat The strdup() function returns a pointer to a new string which is a duplicate of the string "input". Memory for the new string is obtained with malloc(), and need to be freed with free(). This patch adds these missing "free(input)" in parse_stats() to avoid memory leak in veristat.c. Signed-off-by: Geliang Tang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/ded44f8865cd7f337f52fc5fb0a5fbed7d6bd641.1714374022.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/veristat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 244d4996e06e..b2854238d4a0 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -792,10 +792,13 @@ static int parse_stats(const char *stats_str, struct stat_specs *specs) while ((next = strtok_r(state ? NULL : input, ",", &state))) { err = parse_stat(next, specs); - if (err) + if (err) { + free(input); return err; + } } + free(input); return 0; } -- cgit v1.2.3 From f973fccd43d34b096077d5d21d051ef75b22a7ea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 27 Apr 2024 20:09:53 -0700 Subject: libbpf: handle nulled-out program in struct_ops correctly If struct_ops has one of program callbacks set declaratively and host kernel is old and doesn't support this callback, libbpf will allow to load such struct_ops as long as that callback was explicitly nulled-out (presumably through skeleton). This is all working correctly, except we won't reset corresponding program slot to NULL before bailing out, which will lead to libbpf not detecting that BPF program has to be not auto-loaded. Fix this by unconditionally resetting corresponding program slot to NULL. Fixes: c911fc61a7ce ("libbpf: Skip zeroed or null fields if not found in the kernel type.") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240428030954.3918764-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 97eb6e5dd7c8..898d5d34ecea 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1148,6 +1148,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) * presented in the kernel BTF. */ if (libbpf_is_mem_zeroed(mdata, msize)) { + st_ops->progs[i] = NULL; pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", map->name, mname); continue; -- cgit v1.2.3 From 1bba3b3d373dbafae891e7cb06b8c82c8d62aba1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 27 Apr 2024 20:09:54 -0700 Subject: selftests/bpf: validate nulled-out struct_ops program is handled properly Add a selftests validating that it's possible to have some struct_ops callback set declaratively, then disable it (by setting to NULL) programmatically. Libbpf should detect that such program should not be loaded. Otherwise, it will unnecessarily fail the loading when the host kernel does not have the type information. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240428030954.3918764-2-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/test_struct_ops_module.c | 18 ++++++++++++++++-- tools/testing/selftests/bpf/progs/struct_ops_module.c | 7 +++++++ 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index 7cf2b9ddd3e1..bd39586abd5a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -66,6 +66,7 @@ static void test_struct_ops_load(void) * auto-loading, or it will fail to load. */ bpf_program__set_autoload(skel->progs.test_2, false); + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); err = struct_ops_module__load(skel); if (!ASSERT_OK(err, "struct_ops_module_load")) @@ -103,6 +104,10 @@ static void test_struct_ops_not_zeroed(void) if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) return; + skel->struct_ops.testmod_zeroed->zeroed = 0; + /* zeroed_op prog should be not loaded automatically now */ + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; + err = struct_ops_module__load(skel); ASSERT_OK(err, "struct_ops_module_load"); @@ -118,6 +123,7 @@ static void test_struct_ops_not_zeroed(void) * value of "zeroed" is non-zero. */ skel->struct_ops.testmod_zeroed->zeroed = 0xdeadbeef; + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; err = struct_ops_module__load(skel); ASSERT_ERR(err, "struct_ops_module_load_not_zeroed"); @@ -148,15 +154,23 @@ static void test_struct_ops_incompatible(void) { struct struct_ops_module *skel; struct bpf_link *link; + int err; - skel = struct_ops_module__open_and_load(); - if (!ASSERT_OK_PTR(skel, "open_and_load")) + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) return; + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); + + err = struct_ops_module__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible); if (ASSERT_OK_PTR(link, "attach_struct_ops")) bpf_link__destroy(link); +cleanup: struct_ops_module__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 63b065dae002..40109be2b3ae 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -63,10 +63,17 @@ struct bpf_testmod_ops___zeroed { int zeroed; }; +SEC("?struct_ops/test_3") +int BPF_PROG(zeroed_op) +{ + return 1; +} + SEC(".struct_ops.link") struct bpf_testmod_ops___zeroed testmod_zeroed = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, + .zeroed_op = (void *)zeroed_op, }; struct bpf_testmod_ops___incompatible { -- cgit v1.2.3 From 0cfe71f45f420e412fda2395807a56c453a6e0b6 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 26 Apr 2024 11:39:27 +0800 Subject: netdev: add queue stats These stats are commonly. Support reporting those via netdev-genl queue stats. name: rx-hw-drops name: rx-hw-drop-overruns name: rx-csum-unnecessary name: rx-csum-none name: rx-csum-bad name: rx-hw-gro-packets name: rx-hw-gro-bytes name: rx-hw-gro-wire-packets name: rx-hw-gro-wire-bytes name: rx-hw-drop-ratelimits name: tx-hw-drops name: tx-hw-drop-errors name: tx-csum-none name: tx-needs-csum name: tx-hw-gso-packets name: tx-hw-gso-bytes name: tx-hw-gso-wire-packets name: tx-hw-gso-wire-bytes name: tx-hw-drop-ratelimits Signed-off-by: Xuan Zhuo Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/netdev.yaml | 104 ++++++++++++++++++++++++++++++++ include/net/netdev_queues.h | 27 +++++++++ include/uapi/linux/netdev.h | 19 ++++++ net/core/netdev-genl.c | 23 ++++++- tools/include/uapi/linux/netdev.h | 19 ++++++ 5 files changed, 190 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 679c4130707c..2be4b3714d17 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -335,6 +335,110 @@ attribute-sets: Allocation failure may, or may not result in a packet drop, depending on driver implementation and whether system recovers quickly. type: uint + - + name: rx-hw-drops + doc: | + Number of all packets which entered the device, but never left it, + including but not limited to: packets dropped due to lack of buffer + space, processing errors, explicit or implicit policies and packet + filters. + type: uint + - + name: rx-hw-drop-overruns + doc: | + Number of packets dropped due to transient lack of resources, such as + buffer space, host descriptors etc. + type: uint + - + name: rx-csum-unnecessary + doc: Number of packets that were marked as CHECKSUM_UNNECESSARY. + type: uint + - + name: rx-csum-none + doc: Number of packets that were not checksummed by device. + type: uint + - + name: rx-csum-bad + doc: | + Number of packets with bad checksum. The packets are not discarded, + but still delivered to the stack. + type: uint + - + name: rx-hw-gro-packets + doc: | + Number of packets that were coalesced from smaller packets by the device. + Counts only packets coalesced with the HW-GRO netdevice feature, + LRO-coalesced packets are not counted. + type: uint + - + name: rx-hw-gro-bytes + doc: See `rx-hw-gro-packets`. + type: uint + - + name: rx-hw-gro-wire-packets + doc: | + Number of packets that were coalesced to bigger packetss with the HW-GRO + netdevice feature. LRO-coalesced packets are not counted. + type: uint + - + name: rx-hw-gro-wire-bytes + doc: See `rx-hw-gro-wire-packets`. + type: uint + - + name: rx-hw-drop-ratelimits + doc: | + Number of the packets dropped by the device due to the received + packets bitrate exceeding the device rate limit. + type: uint + - + name: tx-hw-drops + doc: | + Number of packets that arrived at the device but never left it, + encompassing packets dropped for reasons such as processing errors, as + well as those affected by explicitly defined policies and packet + filtering criteria. + type: uint + - + name: tx-hw-drop-errors + doc: Number of packets dropped because they were invalid or malformed. + type: uint + - + name: tx-csum-none + doc: | + Number of packets that did not require the device to calculate the + checksum. + type: uint + - + name: tx-needs-csum + doc: | + Number of packets that required the device to calculate the checksum. + type: uint + - + name: tx-hw-gso-packets + doc: | + Number of packets that necessitated segmentation into smaller packets + by the device. + type: uint + - + name: tx-hw-gso-bytes + doc: See `tx-hw-gso-packets`. + type: uint + - + name: tx-hw-gso-wire-packets + doc: | + Number of wire-sized packets generated by processing + `tx-hw-gso-packets` + type: uint + - + name: tx-hw-gso-wire-bytes + doc: See `tx-hw-gso-wire-packets`. + type: uint + - + name: tx-hw-drop-ratelimits + doc: | + Number of the packets dropped by the device due to the transmit + packets bitrate exceeding the device rate limit. + type: uint operations: list: diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 1ec408585373..c7ac4539eafc 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -9,11 +9,38 @@ struct netdev_queue_stats_rx { u64 bytes; u64 packets; u64 alloc_fail; + + u64 hw_drops; + u64 hw_drop_overruns; + + u64 csum_unnecessary; + u64 csum_none; + u64 csum_bad; + + u64 hw_gro_packets; + u64 hw_gro_bytes; + u64 hw_gro_wire_packets; + u64 hw_gro_wire_bytes; + + u64 hw_drop_ratelimits; }; struct netdev_queue_stats_tx { u64 bytes; u64 packets; + + u64 hw_drops; + u64 hw_drop_errors; + + u64 csum_none; + u64 needs_csum; + + u64 hw_gso_packets; + u64 hw_gso_bytes; + u64 hw_gso_wire_packets; + u64 hw_gso_wire_bytes; + + u64 hw_drop_ratelimits; }; /** diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index bb65ee840cda..cf24f1d9adf8 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -146,6 +146,25 @@ enum { NETDEV_A_QSTATS_TX_PACKETS, NETDEV_A_QSTATS_TX_BYTES, NETDEV_A_QSTATS_RX_ALLOC_FAIL, + NETDEV_A_QSTATS_RX_HW_DROPS, + NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, + NETDEV_A_QSTATS_RX_CSUM_NONE, + NETDEV_A_QSTATS_RX_CSUM_BAD, + NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_BYTES, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, + NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_HW_DROPS, + NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, + NETDEV_A_QSTATS_TX_CSUM_NONE, + NETDEV_A_QSTATS_TX_NEEDS_CSUM, + NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_BYTES, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, + NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, __NETDEV_A_QSTATS_MAX, NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index dd6510f2c652..4b5054087309 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -489,7 +489,17 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || - netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail)) + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } @@ -498,7 +508,16 @@ static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || - netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes)) + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index bb65ee840cda..cf24f1d9adf8 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -146,6 +146,25 @@ enum { NETDEV_A_QSTATS_TX_PACKETS, NETDEV_A_QSTATS_TX_BYTES, NETDEV_A_QSTATS_RX_ALLOC_FAIL, + NETDEV_A_QSTATS_RX_HW_DROPS, + NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, + NETDEV_A_QSTATS_RX_CSUM_NONE, + NETDEV_A_QSTATS_RX_CSUM_BAD, + NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_BYTES, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, + NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_HW_DROPS, + NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, + NETDEV_A_QSTATS_TX_CSUM_NONE, + NETDEV_A_QSTATS_TX_NEEDS_CSUM, + NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_BYTES, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, + NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, __NETDEV_A_QSTATS_MAX, NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) -- cgit v1.2.3 From 05cbc217aafbc631a6c2fab4accf95850cb48358 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:26 +0200 Subject: selftests/bpf: Drop an unused local variable Some copy/paste leftover, this is never used. Fixes: e3d9eac99afd ("selftests/bpf: wq: add bpf_wq_init() checks") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-3-27afe7f3b17c@kernel.org --- tools/testing/selftests/bpf/prog_tests/wq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index c4bacd3160e1..99e438fe12ac 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -36,7 +36,5 @@ void serial_test_wq(void) void serial_test_failures_wq(void) { - LIBBPF_OPTS(bpf_test_run_opts, topts); - RUN_TESTS(wq_failures); } -- cgit v1.2.3 From f581bcf02f0e0b42516bfeb3e34860919b9e78d2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Apr 2024 12:57:28 +0200 Subject: selftests: netfilter: avoid test timeouts on debug kernels Jakub reports that some tests fail on netdev CI when executed in a debug kernel. Increase test timeout to 30m, this should hopefully be enough. Also reduce test duration where possible for "slow" machines. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240429105736.22677-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/br_netfilter.sh | 6 +++++- tools/testing/selftests/net/netfilter/nft_nat_zones.sh | 1 + tools/testing/selftests/net/netfilter/nft_zones_many.sh | 4 +++- tools/testing/selftests/net/netfilter/settings | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh index d7806753f5de..c28379a965d8 100755 --- a/tools/testing/selftests/net/netfilter/br_netfilter.sh +++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh @@ -40,7 +40,11 @@ bcast_ping() fromns="$1" dstip="$2" - for i in $(seq 1 500); do + local packets=500 + + [ "$KSFT_MACHINE_SLOW" = yes ] && packets=100 + + for i in $(seq 1 $packets); do if ! ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then echo "ERROR: ping -b from $fromns to $dstip" ip netns exec "$ns0" nft list ruleset diff --git a/tools/testing/selftests/net/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh index 549f264b41f3..3b81d88bdde3 100755 --- a/tools/testing/selftests/net/netfilter/nft_nat_zones.sh +++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh @@ -13,6 +13,7 @@ maxclients=100 have_socat=0 ret=0 +[ "$KSFT_MACHINE_SLOW" = yes ] && maxclients=40 # client1---. # veth1-. # | diff --git a/tools/testing/selftests/net/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh index 4ad75038f6ff..7db9982ba5a6 100755 --- a/tools/testing/selftests/net/netfilter/nft_zones_many.sh +++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh @@ -6,6 +6,8 @@ source lib.sh zones=2000 +[ "$KSFT_MACHINE_SLOW" = yes ] && zones=500 + have_ct_tool=0 ret=0 @@ -89,7 +91,7 @@ fi count=$(ip netns exec "$ns1" conntrack -C) duration=$((stop-outerstart)) - if [ "$count" -eq "$max_zones" ]; then + if [ "$count" -ge "$max_zones" ]; then echo "PASS: inserted $count entries from packet path in $duration ms total" else ip netns exec "$ns1" conntrack -S 1>&2 diff --git a/tools/testing/selftests/net/netfilter/settings b/tools/testing/selftests/net/netfilter/settings index 288bd9704773..abc5648b59ab 100644 --- a/tools/testing/selftests/net/netfilter/settings +++ b/tools/testing/selftests/net/netfilter/settings @@ -1 +1 @@ -timeout=500 +timeout=1800 -- cgit v1.2.3 From ff4b2bfa63bd07cca35f6e704dc5035650595950 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Apr 2024 07:44:22 -0700 Subject: selftests: drv-net-hw: support using Python from net hw tests We created a separate directory for HW-only tests, recently. Glue in the Python test library there, Python is a bit annoying when it comes to using library code located "lower" in the directory structure. Reuse the Env class, but let tests require non-nsim setup. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240429144426.743476-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/drivers/net/hw/Makefile | 1 + .../testing/selftests/drivers/net/hw/lib/py/__init__.py | 16 ++++++++++++++++ tools/testing/selftests/drivers/net/lib/py/env.py | 10 ++++++++-- 4 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/drivers/net/hw/lib/py/__init__.py (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2c940e9c4ced..9039f3709aff 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -119,7 +119,7 @@ TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug # Networking tests want the net/lib target, include it automatically -ifneq ($(filter net drivers/net,$(TARGETS)),) +ifneq ($(filter net drivers/net drivers/net/hw,$(TARGETS)),) ifeq ($(filter net/lib,$(TARGETS)),) INSTALL_DEP_TARGETS := net/lib endif diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 2259a39a70ed..95f32158b095 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -16,6 +16,7 @@ TEST_FILES := \ # TEST_INCLUDES := \ + $(wildcard lib/py/*.py ../lib/py/*.py) \ ../../../net/lib.sh \ ../../../net/forwarding/lib.sh \ ../../../net/forwarding/ipip_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py new file mode 100644 index 000000000000..b582885786f5 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +import sys +from pathlib import Path + +KSFT_DIR = (Path(__file__).parent / "../../../../..").resolve() + +try: + sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * + from drivers.net.lib.py import * +except ModuleNotFoundError as e: + ksft_pr("Failed importing `net` library from kernel sources") + ksft_pr(str(e)) + ktap_result(True, comment="SKIP") + sys.exit(4) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index e2ab637e56dc..5c8f695b2536 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -2,7 +2,7 @@ import os from pathlib import Path -from lib.py import KsftSkipEx +from lib.py import KsftSkipEx, KsftXfailEx from lib.py import cmd, ip from lib.py import NetNS, NetdevSimDev from .remote import Remote @@ -76,7 +76,7 @@ class NetDrvEpEnv: nsim_v4_pfx = "192.0.2." nsim_v6_pfx = "2001:db8::" - def __init__(self, src_path): + def __init__(self, src_path, nsim_test=None): self.env = _load_env_file(src_path) @@ -88,7 +88,10 @@ class NetDrvEpEnv: self._ns_peer = None if "NETIF" in self.env: + if nsim_test is True: + raise KsftXfailEx("Test only works on netdevsim") self._check_env() + self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] self.v4 = self.env.get("LOCAL_V4") @@ -98,6 +101,9 @@ class NetDrvEpEnv: kind = self.env["REMOTE_TYPE"] args = self.env["REMOTE_ARGS"] else: + if nsim_test is False: + raise KsftXfailEx("Test does not work on netdevsim") + self.create_local() self.dev = self._ns.nsims[0].dev -- cgit v1.2.3 From 32a4ca1361d7a51e5003d4af4dfbf570f1b5fd00 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Apr 2024 07:44:23 -0700 Subject: selftests: net: py: extract tool logic The main use of the ip() wrapper over cmd() is that it can parse JSON. cmd("ip -j link show") will return stdout as a string, and test has to call json.loads(). With ip("link show", json=True) the return value will be already parsed. More tools (ethtool, bpftool etc.) support the --json switch. To avoid having to wrap all of them individually create a tool() helper. Switch from -j to --json (for ethtool). While at it consume the netns attribute at the ip() level. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240429144426.743476-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index d3715e6c21f2..4930a90a64ea 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -56,10 +56,10 @@ class bkg(cmd): return self.process(terminate=self.terminate) -def ip(args, json=None, ns=None, host=None): - cmd_str = "ip " +def tool(name, args, json=None, ns=None, host=None): + cmd_str = name + ' ' if json: - cmd_str += '-j ' + cmd_str += '--json ' cmd_str += args cmd_obj = cmd(cmd_str, ns=ns, host=host) if json: @@ -67,6 +67,12 @@ def ip(args, json=None, ns=None, host=None): return cmd_obj +def ip(args, json=None, ns=None, host=None): + if ns: + args = f'-netns {ns} ' + args + return tool('ip', args, json=json, host=host) + + def rand_port(): """ Get unprivileged port, for now just random, one day we may decide to check if used. -- cgit v1.2.3 From ee2512d6bf4168998f3c218fb33ed50544e69e3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Apr 2024 07:44:24 -0700 Subject: selftests: net: py: avoid all ports < 10k When picking TCP ports to use, avoid all below 10k. This should lower the chance of collision or running afoul whatever random policies may be on the host. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240429144426.743476-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 4930a90a64ea..b57d467afd0f 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -77,7 +77,7 @@ def rand_port(): """ Get unprivileged port, for now just random, one day we may decide to check if used. """ - return random.randint(1024, 65535) + return random.randint(10000, 65535) def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5): -- cgit v1.2.3 From 0f0cdf312ecc06e63fbba95caf2844e2c405b076 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Apr 2024 07:44:25 -0700 Subject: selftests: drv-net: support generating iperf3 load While we are not very interested in testing performance it's useful to be able to generate a lot of traffic. iperf is the simplest way of getting relatively high PPS. Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240429144426.743476-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/lib/py/__init__.py | 1 + tools/testing/selftests/drivers/net/lib/py/load.py | 41 ++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/lib/py/load.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index 4789c1a4282d..401e70f7f136 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -15,4 +15,5 @@ except ModuleNotFoundError as e: sys.exit(4) from .env import * +from .load import * from .remote import Remote diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py new file mode 100644 index 000000000000..abdb677bdb1c --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 + +import time + +from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen + +class GenerateTraffic: + def __init__(self, env): + env.require_cmd("iperf3", remote=True) + + self.env = env + + port = rand_port() + self._iperf_server = cmd(f"iperf3 -s -p {port}", background=True) + wait_port_listen(port) + time.sleep(0.1) + self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {port} -t 86400", + background=True, host=env.remote) + + # Wait for traffic to ramp up + pkt = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"] + for _ in range(50): + time.sleep(0.1) + now = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"] + if now - pkt > 1000: + return + pkt = now + self.stop(verbose=True) + raise Exception("iperf3 traffic did not ramp up") + + def stop(self, verbose=None): + self._iperf_client.process(terminate=True) + if verbose: + ksft_pr(">> Client:") + ksft_pr(self._iperf_client.stdout) + ksft_pr(self._iperf_client.stderr) + self._iperf_server.process(terminate=True) + if verbose: + ksft_pr(">> Server:") + ksft_pr(self._iperf_server.stdout) + ksft_pr(self._iperf_server.stderr) -- cgit v1.2.3 From 9da271f825e42156058a2eb09360bc993853bbba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Apr 2024 07:44:26 -0700 Subject: selftests: drv-net-hw: add test for memory allocation failures with page pool Bugs in memory allocation failure paths are quite common. Add a test exercising those paths based on qstat and page pool failure hook. Running on bnxt: # ./drivers/net/hw/pp_alloc_fail.py KTAP version 1 1..1 # ethtool -G change retval: success ok 1 pp_alloc_fail.test_pp_alloc # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 I initially wrote this test to validate commit be43b7489a3c ("net/mlx5e: RX, Fix page_pool allocation failure recovery for striding rq") but mlx5 still doesn't have qstat. So I run it on bnxt, and while bnxt survives I found the problem fixed in commit 730117730709 ("eth: bnxt: fix counting packets discarded due to OOM and netpoll"). Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240429144426.743476-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/Makefile | 1 + .../selftests/drivers/net/hw/pp_alloc_fail.py | 129 +++++++++++++++++++++ tools/testing/selftests/net/lib/py/ksft.py | 4 + 3 files changed, 134 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 95f32158b095..1dd732855d76 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -9,6 +9,7 @@ TEST_PROGS = \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ loopback.sh \ + pp_alloc_fail.py \ # TEST_FILES := \ diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py new file mode 100755 index 000000000000..026d98976c35 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import time +import os +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import KsftSkipEx, KsftFailEx +from lib.py import NetdevFamily, NlError +from lib.py import NetDrvEpEnv +from lib.py import cmd, tool, GenerateTraffic + + +def _write_fail_config(config): + for key, value in config.items(): + with open("/sys/kernel/debug/fail_function/" + key, "w") as fp: + fp.write(str(value) + "\n") + + +def _enable_pp_allocation_fail(): + if not os.path.exists("/sys/kernel/debug/fail_function"): + raise KsftSkipEx("Kernel built without function error injection (or DebugFS)") + + if not os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"): + with open("/sys/kernel/debug/fail_function/inject", "w") as fp: + fp.write("page_pool_alloc_pages\n") + + _write_fail_config({ + "verbose": 0, + "interval": 511, + "probability": 100, + "times": -1, + }) + + +def _disable_pp_allocation_fail(): + if not os.path.exists("/sys/kernel/debug/fail_function"): + return + + if os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"): + with open("/sys/kernel/debug/fail_function/inject", "w") as fp: + fp.write("\n") + + _write_fail_config({ + "probability": 0, + "times": 0, + }) + + +def test_pp_alloc(cfg, netdevnl): + def get_stats(): + return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + + def check_traffic_flowing(): + stat1 = get_stats() + time.sleep(1) + stat2 = get_stats() + if stat2['rx-packets'] - stat1['rx-packets'] < 15000: + raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets']) + + + try: + stats = get_stats() + except NlError as e: + if e.nl_msg.error == -95: + stats = {} + else: + raise + if 'rx-alloc-fail' not in stats: + raise KsftSkipEx("Driver does not report 'rx-alloc-fail' via qstats") + + set_g = False + traffic = None + try: + traffic = GenerateTraffic(cfg) + + check_traffic_flowing() + + _enable_pp_allocation_fail() + + s1 = get_stats() + time.sleep(3) + s2 = get_stats() + + if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1: + raise KsftSkipEx("Allocation failures not increasing") + if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100: + raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'], + "packets:", s2['rx-packets'] - s1['rx-packets']) + + # Basic failures are fine, try to wobble some settings to catch extra failures + check_traffic_flowing() + g = tool("ethtool", "-g " + cfg.ifname, json=True)[0] + if 'rx' in g and g["rx"] * 2 <= g["rx-max"]: + new_g = g['rx'] * 2 + elif 'rx' in g: + new_g = g['rx'] // 2 + else: + new_g = None + + if new_g: + set_g = cmd(f"ethtool -G {cfg.ifname} rx {new_g}", fail=False).ret == 0 + if set_g: + ksft_pr("ethtool -G change retval: success") + else: + ksft_pr("ethtool -G change retval: did not succeed", new_g) + else: + ksft_pr("ethtool -G change retval: did not try") + + time.sleep(0.1) + check_traffic_flowing() + finally: + _disable_pp_allocation_fail() + if traffic: + traffic.stop() + time.sleep(0.1) + if set_g: + cmd(f"ethtool -G {cfg.ifname} rx {g['rx']}") + + +def main() -> None: + netdevnl = NetdevFamily() + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + + ksft_run([test_pp_alloc], args=(cfg, netdevnl, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index f84e9fdd0032..4769b4eb1ea1 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -11,6 +11,10 @@ KSFT_RESULT = None KSFT_RESULT_ALL = True +class KsftFailEx(Exception): + pass + + class KsftSkipEx(Exception): pass -- cgit v1.2.3 From 535a3692ba7245792e6f23654507865d4293c850 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:24 +0200 Subject: bpf: Add support for kprobe session attach Adding support to attach bpf program for entry and return probe of the same function. This is common use case which at the moment requires to create two kprobe multi links. Adding new BPF_TRACE_KPROBE_SESSION attach type that instructs kernel to attach single link program to both entry and exit probe. It's possible to control execution of the bpf program on return probe simply by returning zero or non zero from the entry bpf program execution to execute or not the bpf program on return probe respectively. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-2-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 7 ++++++- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++-------- tools/include/uapi/linux/bpf.h | 1 + 4 files changed, 28 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d94a72593ead..90706a47f6ff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f655adf42e39..13ad74ecf2cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4016,11 +4016,15 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && + attach_type != BPF_TRACE_KPROBE_SESSION) + return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; return 0; @@ -5281,7 +5285,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) ret = bpf_uprobe_multi_link_attach(attr, prog); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ba722b57af3..06a9671834b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1631,6 +1631,17 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static bool is_kprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + +static inline bool is_kprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1646,13 +1657,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_attach_cookie_proto_umulti; @@ -2834,10 +2845,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, void *data) { struct bpf_kprobe_multi_link *link; + int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); - return 0; + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + return is_kprobe_session(link->link.prog) ? err : 0; } static void @@ -2981,7 +2993,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + if (!is_kprobe_multi(prog)) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; @@ -3062,10 +3074,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error; - if (flags & BPF_F_KPROBE_MULTI_RETURN) - link->fp.exit_handler = kprobe_multi_link_exit_handler; - else + if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; + if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) + link->fp.exit_handler = kprobe_multi_link_exit_handler; link->addrs = addrs; link->cookies = cookies; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d94a72593ead..90706a47f6ff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 2ca178f02b2f4e523e970894def16282e4adbc39 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:27 +0200 Subject: libbpf: Add support for kprobe session attach Adding support to attach program in kprobe session mode with bpf_program__attach_kprobe_multi_opts function. Adding session bool to bpf_kprobe_multi_opts struct that allows to load and attach the bpf program via kprobe session. the attachment to create kprobe multi session. Also adding new program loader section that allows: SEC("kprobe.session/bpf_fentry_test*") and loads/attaches kprobe program as kprobe session. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-5-jolsa@kernel.org --- tools/lib/bpf/bpf.c | 1 + tools/lib/bpf/libbpf.c | 39 +++++++++++++++++++++++++++++++++++++-- tools/lib/bpf/libbpf.h | 4 +++- 3 files changed, 41 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index c9f4e04f38fe..466a29d80124 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -766,6 +766,7 @@ int bpf_link_create(int prog_fd, int target_fd, return libbpf_err(-EINVAL); break; case BPF_TRACE_KPROBE_MULTI: + case BPF_TRACE_KPROBE_SESSION: attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 898d5d34ecea..16dae279a900 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9273,6 +9273,7 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -9289,6 +9290,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kprobe.session+", KPROBE, BPF_TRACE_KPROBE_SESSION, SEC_NONE, attach_kprobe_session), SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), @@ -11381,13 +11383,14 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, struct kprobe_multi_resolve res = { .pattern = pattern, }; + enum bpf_attach_type attach_type; struct bpf_link *link = NULL; char errmsg[STRERR_BUFSIZE]; const unsigned long *addrs; int err, link_fd, prog_fd; + bool retprobe, session; const __u64 *cookies; const char **syms; - bool retprobe; size_t cnt; if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) @@ -11426,6 +11429,12 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } retprobe = OPTS_GET(opts, retprobe, false); + session = OPTS_GET(opts, session, false); + + if (retprobe && session) + return libbpf_err_ptr(-EINVAL); + + attach_type = session ? BPF_TRACE_KPROBE_SESSION : BPF_TRACE_KPROBE_MULTI; lopts.kprobe_multi.syms = syms; lopts.kprobe_multi.addrs = addrs; @@ -11440,7 +11449,7 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + link_fd = bpf_link_create(prog_fd, 0, attach_type, &lopts); if (link_fd < 0) { err = -errno; pr_warn("prog '%s': failed to attach: %s\n", @@ -11546,6 +11555,32 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru return libbpf_get_error(*link); } +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, + struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, .session = true); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.session") */ + if (strcmp(prog->sec_name, "kprobe.session") == 0) + return 0; + + spec = prog->sec_name + sizeof("kprobe.session/") - 1; + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe session pattern is invalid: %s\n", pattern); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return *link ? 0 : -errno; +} + static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) { char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1333ae20ebe6..c3f77d9260fe 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -539,10 +539,12 @@ struct bpf_kprobe_multi_opts { size_t cnt; /* create return kprobes */ bool retprobe; + /* create session kprobes */ + bool session; size_t :0; }; -#define bpf_kprobe_multi_opts__last_field retprobe +#define bpf_kprobe_multi_opts__last_field session LIBBPF_API struct bpf_link * bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, -- cgit v1.2.3 From 7b94965429f2fa32a83e1275c6bf6ed0add08603 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:28 +0200 Subject: libbpf: Add kprobe session attach type name to attach_type_name Adding kprobe session attach type name to attach_type_name, so libbpf_bpf_attach_type_str returns proper string name for BPF_TRACE_KPROBE_SESSION attach type. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-6-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 16dae279a900..7667671187e9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -132,6 +132,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_UPROBE_MULTI] = "trace_uprobe_multi", [BPF_NETKIT_PRIMARY] = "netkit_primary", [BPF_NETKIT_PEER] = "netkit_peer", + [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", }; static const char * const link_type_name[] = { -- cgit v1.2.3 From 0983b1697aefbf69f465f907b934b89bbce467ea Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:29 +0200 Subject: selftests/bpf: Add kprobe session test Adding kprobe session test and testing that the entry program return value controls execution of the return probe program. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-7-jolsa@kernel.org --- tools/testing/selftests/bpf/bpf_kfuncs.h | 2 + .../selftests/bpf/prog_tests/kprobe_multi_test.c | 39 +++++++++++ .../selftests/bpf/progs/kprobe_multi_session.c | 79 ++++++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/kprobe_multi_session.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 14ebe7d9e1a3..cb946d9fd63a 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -75,4 +75,6 @@ extern void bpf_key_put(struct bpf_key *key) __ksym; extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; + +extern bool bpf_session_is_return(void) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 51628455b6f5..42d6592f1e7f 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -4,6 +4,7 @@ #include "trace_helpers.h" #include "kprobe_multi_empty.skel.h" #include "kprobe_multi_override.skel.h" +#include "kprobe_multi_session.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -326,6 +327,42 @@ cleanup: kprobe_multi__destroy(skel); } +static void test_session_skel_api(void) +{ + struct kprobe_multi_session *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int i, err, prog_fd; + + skel = kprobe_multi_session__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_session__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_session__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + /* bpf_fentry_test1-4 trigger return probe, result is 2 */ + for (i = 0; i < 4; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 2, "kprobe_session_result"); + + /* bpf_fentry_test5-8 trigger only entry probe, result is 1 */ + for (i = 4; i < 8; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 1, "kprobe_session_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session__destroy(skel); +} + static size_t symbol_hash(long key, void *ctx __maybe_unused) { return str_hash((const char *) key); @@ -690,4 +727,6 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("session")) + test_session_skel_api(); } diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c new file mode 100644 index 000000000000..bbba9eb46551 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0])) + +char _license[] SEC("license") = "GPL"; + +extern const void bpf_fentry_test1 __ksym; +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_fentry_test5 __ksym; +extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; +extern const void bpf_fentry_test8 __ksym; + +int pid = 0; + +__u64 kprobe_session_result[8]; + +static int session_check(void *ctx) +{ + unsigned int i; + __u64 addr; + const void *kfuncs[] = { + &bpf_fentry_test1, + &bpf_fentry_test2, + &bpf_fentry_test3, + &bpf_fentry_test4, + &bpf_fentry_test5, + &bpf_fentry_test6, + &bpf_fentry_test7, + &bpf_fentry_test8, + }; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + addr = bpf_get_func_ip(ctx); + + for (i = 0; i < ARRAY_SIZE(kfuncs); i++) { + if (kfuncs[i] == (void *) addr) { + kprobe_session_result[i]++; + break; + } + } + + /* + * Force probes for function bpf_fentry_test[5-8] not to + * install and execute the return probe + */ + if (((const void *) addr == &bpf_fentry_test5) || + ((const void *) addr == &bpf_fentry_test6) || + ((const void *) addr == &bpf_fentry_test7) || + ((const void *) addr == &bpf_fentry_test8)) + return 1; + + return 0; +} + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +SEC("kprobe.session/bpf_fentry_test*") +int test_kprobe(struct pt_regs *ctx) +{ + return session_check(ctx); +} -- cgit v1.2.3 From a3a5113393ccfad2eb23ca091aa6e55b5bd67eb4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:30 +0200 Subject: selftests/bpf: Add kprobe session cookie test Adding kprobe session test that verifies the cookie value get properly propagated from entry to return program. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-8-jolsa@kernel.org --- tools/testing/selftests/bpf/bpf_kfuncs.h | 1 + .../selftests/bpf/prog_tests/kprobe_multi_test.c | 35 +++++++++++++ .../bpf/progs/kprobe_multi_session_cookie.c | 58 ++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index cb946d9fd63a..be91a6919315 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -77,4 +77,5 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_key *trusted_keyring) __ksym; extern bool bpf_session_is_return(void) __ksym __weak; +extern long *bpf_session_cookie(void) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 42d6592f1e7f..960c9323d1e0 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -5,6 +5,7 @@ #include "kprobe_multi_empty.skel.h" #include "kprobe_multi_override.skel.h" #include "kprobe_multi_session.skel.h" +#include "kprobe_multi_session_cookie.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -363,6 +364,38 @@ cleanup: kprobe_multi_session__destroy(skel); } +static void test_session_cookie_skel_api(void) +{ + struct kprobe_multi_session_cookie *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int err, prog_fd; + + skel = kprobe_multi_session_cookie__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fentry_raw_skel_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session_cookie__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_wrapper__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + ASSERT_EQ(skel->bss->test_kprobe_1_result, 1, "test_kprobe_1_result"); + ASSERT_EQ(skel->bss->test_kprobe_2_result, 2, "test_kprobe_2_result"); + ASSERT_EQ(skel->bss->test_kprobe_3_result, 3, "test_kprobe_3_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session_cookie__destroy(skel); +} + static size_t symbol_hash(long key, void *ctx __maybe_unused) { return str_hash((const char *) key); @@ -729,4 +762,6 @@ void test_kprobe_multi_test(void) test_attach_override(); if (test__start_subtest("session")) test_session_skel_api(); + if (test__start_subtest("session_cookie")) + test_session_cookie_skel_api(); } diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c new file mode 100644 index 000000000000..d49070803e22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +char _license[] SEC("license") = "GPL"; + +int pid = 0; + +__u64 test_kprobe_1_result = 0; +__u64 test_kprobe_2_result = 0; +__u64 test_kprobe_3_result = 0; + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +static int check_cookie(__u64 val, __u64 *result) +{ + long *cookie; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + cookie = bpf_session_cookie(); + + if (bpf_session_is_return()) + *result = *cookie == val ? val : 0; + else + *cookie = val; + return 0; +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_1(struct pt_regs *ctx) +{ + return check_cookie(1, &test_kprobe_1_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_2(struct pt_regs *ctx) +{ + return check_cookie(2, &test_kprobe_2_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_3(struct pt_regs *ctx) +{ + return check_cookie(3, &test_kprobe_3_result); +} -- cgit v1.2.3 From d70b2660e75b85bdaa9d75f9c4224c2f6f89cf23 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 26 Apr 2024 16:16:19 -0700 Subject: selftests/bpf: Extend sockopt tests to use BPF_LINK_CREATE Run all existing test cases with the attachment created via BPF_LINK_CREATE. Next commit will add extra test cases to verify link_create attach_type enforcement. Signed-off-by: Stanislav Fomichev Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240426231621.2716876-3-sdf@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockopt.c | 25 ++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c index 5a4491d4edfe..dea340996e97 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c @@ -1036,9 +1036,10 @@ static int call_getsockopt(bool use_io_uring, int fd, int level, int optname, return getsockopt(fd, level, optname, optval, optlen); } -static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring) +static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring, + bool use_link) { - int sock_fd, err, prog_fd; + int sock_fd, err, prog_fd, link_fd = -1; void *optval = NULL; int ret = 0; @@ -1051,7 +1052,12 @@ static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring) return -1; } - err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0); + if (use_link) { + err = bpf_link_create(prog_fd, cgroup_fd, test->attach_type, NULL); + link_fd = err; + } else { + err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0); + } if (err < 0) { if (test->error == DENY_ATTACH) goto close_prog_fd; @@ -1142,7 +1148,12 @@ free_optval: close_sock_fd: close(sock_fd); detach_prog: - bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type); + if (use_link) { + if (link_fd >= 0) + close(link_fd); + } else { + bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type); + } close_prog_fd: close(prog_fd); return ret; @@ -1160,10 +1171,12 @@ void test_sockopt(void) if (!test__start_subtest(tests[i].descr)) continue; - ASSERT_OK(run_test(cgroup_fd, &tests[i], false), + ASSERT_OK(run_test(cgroup_fd, &tests[i], false, false), + tests[i].descr); + ASSERT_OK(run_test(cgroup_fd, &tests[i], false, true), tests[i].descr); if (tests[i].io_uring_support) - ASSERT_OK(run_test(cgroup_fd, &tests[i], true), + ASSERT_OK(run_test(cgroup_fd, &tests[i], true, false), tests[i].descr); } -- cgit v1.2.3 From 095ddb501b39b7842e5da555915ad89e370b9888 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 26 Apr 2024 16:16:20 -0700 Subject: selftests/bpf: Add sockopt case to verify prog_type Make sure only sockopt programs can be attached to the setsockopt and getsockopt hooks. Signed-off-by: Stanislav Fomichev Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240426231621.2716876-4-sdf@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockopt.c | 40 ++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c index dea340996e97..eaac83a7f388 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c @@ -24,6 +24,7 @@ enum sockopt_test_error { static struct sockopt_test { const char *descr; const struct bpf_insn insns[64]; + enum bpf_prog_type prog_type; enum bpf_attach_type attach_type; enum bpf_attach_type expected_attach_type; @@ -928,9 +929,40 @@ static struct sockopt_test { .error = EPERM_SETSOCKOPT, }, + + /* ==================== prog_type ==================== */ + + { + .descr = "can attach only BPF_CGROUP_SETSOCKOP", + .insns = { + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_SETSOCKOPT, + .expected_attach_type = 0, + .error = DENY_ATTACH, + }, + + { + .descr = "can attach only BPF_CGROUP_GETSOCKOP", + .insns = { + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_GETSOCKOPT, + .expected_attach_type = 0, + .error = DENY_ATTACH, + }, }; static int load_prog(const struct bpf_insn *insns, + enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { LIBBPF_OPTS(bpf_prog_load_opts, opts, @@ -947,7 +979,7 @@ static int load_prog(const struct bpf_insn *insns, } insns_cnt++; - fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCKOPT, NULL, "GPL", insns, insns_cnt, &opts); + fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts); if (verbose && fd < 0) fprintf(stderr, "%s\n", bpf_log_buf); @@ -1039,11 +1071,15 @@ static int call_getsockopt(bool use_io_uring, int fd, int level, int optname, static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring, bool use_link) { + int prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT; int sock_fd, err, prog_fd, link_fd = -1; void *optval = NULL; int ret = 0; - prog_fd = load_prog(test->insns, test->expected_attach_type); + if (test->prog_type) + prog_type = test->prog_type; + + prog_fd = load_prog(test->insns, prog_type, test->expected_attach_type); if (prog_fd < 0) { if (test->error == DENY_LOAD) return 0; -- cgit v1.2.3 From 364ee9f3265e486772f445f437273a9c94b66d4b Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 1 Apr 2024 22:31:58 -0700 Subject: cxl/test: Enhance event testing An issue was found in the processing of event logs when the output buffer length was not reset.[1] This bug was not caught with cxl-test for 2 reasons. First, the test harness mbox_send command [mock_get_event()] does not set the output size based on the amount of data returned like the hardware command does. Second, the simplistic event log testing always returned the same number of elements per-get command. Enhance the simulation of the event log mailbox to better match the bug found with real hardware to cover potential regressions. NOTE: These changes will cause cxl-events.sh in ndctl to fail without the fix from Kwangjin. However, no changes to the user space test was required. Therefore ndctl itself will be compatible with old or new kernels once both patches land in the new kernel. [1] Link: https://lore.kernel.org/all/20240401091057.1044-1-kwangjin.ko@sk.com/ Cc: Kwangjin Ko Signed-off-by: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240401-enhance-event-test-v1-1-6669a524ed38@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/mem.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 35ee41e435ab..6584443144de 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -127,7 +127,7 @@ static struct { #define CXL_TEST_EVENT_CNT_MAX 15 /* Set a number of events to return at a time for simulation. */ -#define CXL_TEST_EVENT_CNT 3 +#define CXL_TEST_EVENT_RET_MAX 4 struct mock_event_log { u16 clear_idx; @@ -222,6 +222,12 @@ static void mes_add_event(struct mock_event_store *mes, log->nr_events++; } +/* + * Vary the number of events returned to simulate events occuring while the + * logs are being read. + */ +static int ret_limit = 0; + static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) { struct cxl_get_event_payload *pl; @@ -233,14 +239,18 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) if (cmd->size_in != sizeof(log_type)) return -EINVAL; - if (cmd->size_out < struct_size(pl, records, CXL_TEST_EVENT_CNT)) + ret_limit = (ret_limit + 1) % CXL_TEST_EVENT_RET_MAX; + if (!ret_limit) + ret_limit = 1; + + if (cmd->size_out < struct_size(pl, records, ret_limit)) return -EINVAL; log_type = *((u8 *)cmd->payload_in); if (log_type >= CXL_EVENT_TYPE_MAX) return -EINVAL; - memset(cmd->payload_out, 0, cmd->size_out); + memset(cmd->payload_out, 0, struct_size(pl, records, 0)); log = event_find_log(dev, log_type); if (!log || event_log_empty(log)) @@ -248,7 +258,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) pl = cmd->payload_out; - for (i = 0; i < CXL_TEST_EVENT_CNT && !event_log_empty(log); i++) { + for (i = 0; i < ret_limit && !event_log_empty(log); i++) { memcpy(&pl->records[i], event_get_current(log), sizeof(pl->records[i])); pl->records[i].event.generic.hdr.handle = @@ -256,6 +266,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) log->cur_idx++; } + cmd->size_out = struct_size(pl, records, i); pl->record_count = cpu_to_le16(i); if (!event_log_empty(log)) pl->flags |= CXL_GET_EVENT_FLAG_MORE_RECORDS; -- cgit v1.2.3 From 95b88500b97ca8bafc0b9c8e79e9716c2ddc40c6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 25 Apr 2024 11:23:41 +0800 Subject: selftests/bpf: Add opts argument for __start_server This patch adds network_helper_opts parameter for __start_server() instead of "int protocol" and "int timeout_ms". This not only reduces the number of parameters, but also makes it more flexible. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/127d2f0929980b41f757dcfebe1b667e6bfb43f1.1714014697.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index a9e4905a2115..05af3b87ea45 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -80,19 +80,19 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -static int __start_server(int type, int protocol, const struct sockaddr *addr, - socklen_t addrlen, int timeout_ms, bool reuseport) +static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, + bool reuseport, const struct network_helper_opts *opts) { int on = 1; int fd; - fd = socket(addr->sa_family, type, protocol); + fd = socket(addr->sa_family, type, opts->proto); if (fd < 0) { log_err("Failed to create server socket"); return -1; } - if (settimeo(fd, timeout_ms)) + if (settimeo(fd, opts->timeout_ms)) goto error_close; if (reuseport && @@ -123,14 +123,17 @@ error_close: static int start_server_proto(int family, int type, int protocol, const char *addr_str, __u16 port, int timeout_ms) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + .proto = protocol, + }; struct sockaddr_storage addr; socklen_t addrlen; if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, protocol, (struct sockaddr *)&addr, - addrlen, timeout_ms, false); + return __start_server(type, (struct sockaddr *)&addr, addrlen, false, &opts); } int start_server(int family, int type, const char *addr_str, __u16 port, @@ -149,6 +152,9 @@ int start_mptcp_server(int family, const char *addr_str, __u16 port, int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + }; struct sockaddr_storage addr; unsigned int nr_fds = 0; socklen_t addrlen; @@ -164,8 +170,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, 0, (struct sockaddr *)&addr, addrlen, - timeout_ms, true); + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -174,8 +179,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, 0, (struct sockaddr *)&addr, - addrlen, timeout_ms, true); + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -193,8 +197,7 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t l if (!opts) opts = &default_opts; - return __start_server(type, 0, (struct sockaddr *)addr, len, - opts->timeout_ms, 0); + return __start_server(type, (struct sockaddr *)addr, len, 0, opts); } void free_fds(int *fds, unsigned int nr_close_fds) -- cgit v1.2.3 From 044032ee6c4e786746058aaf5527be13e831cc5c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 25 Apr 2024 11:23:42 +0800 Subject: selftests/bpf: Make start_mptcp_server static start_mptcp_server() shouldn't be a public helper, it only be used in MPTCP tests. This patch moves it into prog_tests/mptcp.c, and implenments it using make_sockaddr() and start_server_addr() instead of using start_server_proto(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/50ec7049e280c60a2924937940851f8fee2b73b8.1714014697.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 7 ------- tools/testing/selftests/bpf/network_helpers.h | 2 -- tools/testing/selftests/bpf/prog_tests/mptcp.c | 16 ++++++++++++++++ 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 05af3b87ea45..aa23692029c4 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -142,13 +142,6 @@ int start_server(int family, int type, const char *addr_str, __u16 port, return start_server_proto(family, type, 0, addr_str, port, timeout_ms); } -int start_mptcp_server(int family, const char *addr_str, __u16 port, - int timeout_ms) -{ - return start_server_proto(family, SOCK_STREAM, IPPROTO_MPTCP, addr_str, - port, timeout_ms); -} - int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5a8c5cf4ec1a..c62b54daa914 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -49,8 +49,6 @@ extern struct ipv6_packet pkt_v6; int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); -int start_mptcp_server(int family, const char *addr, __u16 port, - int timeout_ms); int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens); diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 4e0f69295872..274d2e033e39 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -82,6 +82,22 @@ static void cleanup_netns(struct nstoken *nstoken) SYS_NOFAIL("ip netns del %s", NS_TEST); } +static int start_mptcp_server(int family, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + .proto = IPPROTO_MPTCP, + }; + struct sockaddr_storage addr; + socklen_t addrlen; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return start_server_addr(SOCK_STREAM, &addr, addrlen, &opts); +} + static int verify_tsk(int map_fd, int client_fd) { int err, cfd = client_fd; -- cgit v1.2.3 From 8405e6980f21e2b75f232e970edd76bc50cf1491 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 25 Apr 2024 11:23:43 +0800 Subject: selftests/bpf: Drop start_server_proto helper Protocol can be set by __start_server() helper directly now, this makes the heler start_server_proto() useless. This patch drops it, and implenments start_server() using make_sockaddr() and __start_server(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/55d8a04e0bb8240a5fda2da3e9bdffe6fc8547b2.1714014697.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index aa23692029c4..054d26e383e0 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -120,12 +120,11 @@ error_close: return -1; } -static int start_server_proto(int family, int type, int protocol, - const char *addr_str, __u16 port, int timeout_ms) +int start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) { struct network_helper_opts opts = { .timeout_ms = timeout_ms, - .proto = protocol, }; struct sockaddr_storage addr; socklen_t addrlen; @@ -136,12 +135,6 @@ static int start_server_proto(int family, int type, int protocol, return __start_server(type, (struct sockaddr *)&addr, addrlen, false, &opts); } -int start_server(int family, int type, const char *addr_str, __u16 port, - int timeout_ms) -{ - return start_server_proto(family, type, 0, addr_str, port, timeout_ms); -} - int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { -- cgit v1.2.3 From 196eca020600470ca44da94c65607e7a98aa9d3c Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 27 Mar 2024 22:35:03 +0800 Subject: tools/power turbostat: Enhance ARL/LNL support ARL/LNL don't have PC8, other than that, it behaves the same as CNL. Copy cnl_features for ARL/LNL, except that PC8 support is removed. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4b95fd90e16c..672936015b55 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -663,6 +663,23 @@ static const struct platform_features adl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features arl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -905,8 +922,8 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE, &cnl_features }, - { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, + { INTEL_FAM6_ARROWLAKE, &arl_features }, + { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, -- cgit v1.2.3 From f04fcc7ac8ceb87933244cca28759d0fac6103ce Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 27 Mar 2024 22:36:38 +0800 Subject: tools/power turbostat: Add ARL-H support Add turbostat support for ARL-H, which behaves the same as ARL. [lenb: also add ARL-U] Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 672936015b55..fadf96934f4e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -922,6 +922,8 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ARROWLAKE_H, &arl_features }, + { INTEL_FAM6_ARROWLAKE_U, &arl_features }, { INTEL_FAM6_ARROWLAKE, &arl_features }, { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, -- cgit v1.2.3 From d3e6f6253895f499b63bac261b81732f9efc4902 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 23 Apr 2024 11:59:49 +0200 Subject: tools/power turbostat: Replace _Static_assert with BUILD_BUG_ON So it compiles on GCC older than 9.0. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fadf96934f4e..bd6cb31b7099 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -38,6 +38,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -3467,7 +3468,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data } } - _Static_assert(NUM_RAPL_COUNTERS == 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); -- cgit v1.2.3 From 0e39702fbbcdb16ad349439065d24a3bb5e2f331 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 25 Apr 2024 17:54:18 +0200 Subject: tools/power turbostat: Enable non-privileged users to read sysfs counters A group of counters called "sysfs" displays software C-state request counts and resulting perceived C-state residency. They are not built-in counters that turbostat knows about ahead of time, rather they are discovered in sysfs when turbostat starts. Thus, they are added dynamically, using the same interface as user-added MSR counters. When turbostat enters "no-msr" mode, such as when running as a non-privileged user, it clears all added counters. Updating that to clear only actual MSR added counters allows regular users to see the sysfs counters. [lenb: commit message] Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 56 +++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bd6cb31b7099..f92b46cfda31 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1377,36 +1377,42 @@ struct sys_counters { struct msr_counter *pp; } sys; -void free_sys_counters(void) +static size_t free_msr_counters_(struct msr_counter **pp) { - struct msr_counter *p = sys.tp, *pnext = NULL; + struct msr_counter *p = NULL; + size_t num_freed = 0; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + while (*pp) { + p = *pp; - p = sys.cp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + if (p->msr_num != 0) { + *pp = p->next; - p = sys.pp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; + free(p); + ++num_freed; + + continue; + } + + pp = &p->next; } - sys.added_thread_counters = 0; - sys.added_core_counters = 0; - sys.added_package_counters = 0; - sys.tp = NULL; - sys.cp = NULL; - sys.pp = NULL; + return num_freed; +} + +/* + * Free all added counters accessed via msr. + */ +static void free_sys_msr_counters(void) +{ + /* Thread counters */ + sys.added_thread_counters -= free_msr_counters_(&sys.tp); + + /* Core counters */ + sys.added_core_counters -= free_msr_counters_(&sys.cp); + + /* Package counters */ + sys.added_package_counters -= free_msr_counters_(&sys.pp); } struct system_summary { @@ -1566,7 +1572,7 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; - free_sys_counters(); + free_sys_msr_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) -- cgit v1.2.3 From c01768b05e306d8c8d5997356ff427f5d53c7d20 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:07 +0800 Subject: selftests/ftrace: add kprobe test cases for VFS type "%pd" and "%pD" This patch adds test cases for new print format type "%pd/%pD".The test cases test the following items: 1. Test README if add "%pd/%pD" type; 2. Test "%pd" type for dput(); 3. Test "%pD" type for vfs_read(); This test case require enable CONFIG_HAVE_FUNCTION_ARG_ACCESS_API configuration. Link: https://lore.kernel.org/all/20240322064308.284457-5-yebin10@huawei.com/ Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- .../ftrace/test.d/kprobe/kprobe_args_vfs.tc | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc new file mode 100644 index 000000000000..21a54be6894c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name" +echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pd without name" +echo 'p:testprobe dput $arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD with name" +echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD without name" +echo 'p:testprobe vfs_read $arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace -- cgit v1.2.3 From ee97e5e135c6fc937c4c81e0341c6513e2e6d44c Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 22 Mar 2024 14:43:08 +0800 Subject: selftests/ftrace: add fprobe test cases for VFS type "%pd" and "%pD" This patch adds fprobe test cases for new print format type "%pd/%pD".The test cases test the following items: 1. Test "%pd" type for dput(); 2. Test "%pD" type for vfs_read(); This test case require enable CONFIG_HAVE_FUNCTION_ARG_ACCESS_API configuration. Link: https://lore.kernel.org/all/20240322064308.284457-6-yebin10@huawei.com/ Signed-off-by: Ye Bin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- .../ftrace/test.d/dynevent/fprobe_args_vfs.tc | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc new file mode 100644 index 000000000000..49a833bf334c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Fprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name for fprobe" +echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pd without name for fprobe" +echo 'f:testprobe dput $arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD with name for fprobe" +echo 'f:testprobe vfs_read name=$arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD without name for fprobe" +echo 'f:testprobe vfs_read $arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace -- cgit v1.2.3 From 8f8a024272f3e335854515b41638bdf89c6d3146 Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Tue, 30 Apr 2024 11:38:06 +0200 Subject: libbpf: support "module: Function" syntax for tracing programs In some situations, it is useful to explicitly specify a kernel module to search for a tracing program target (e.g. when a function of the same name exists in multiple modules or in vmlinux). This patch enables that by allowing the "module:function" syntax for the find_kernel_btf_id function. Thanks to this, the syntax can be used both from a SEC macro (i.e. `SEC(fentry/module:function)`) and via the bpf_program__set_attach_target API call. Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/9085a8cb9a552de98e554deb22ff7e977d025440.1714469650.git.vmalik@redhat.com --- tools/lib/bpf/libbpf.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7667671187e9..109b33fb68dc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9862,16 +9862,28 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, enum bpf_attach_type attach_type, int *btf_obj_fd, int *btf_type_id) { - int ret, i; + int ret, i, mod_len; + const char *fn_name, *mod_name = NULL; - ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); - if (ret > 0) { - *btf_obj_fd = 0; /* vmlinux BTF */ - *btf_type_id = ret; - return 0; + fn_name = strchr(attach_name, ':'); + if (fn_name) { + mod_name = attach_name; + mod_len = fn_name - mod_name; + fn_name++; + } + + if (!mod_name || strncmp(mod_name, "vmlinux", mod_len) == 0) { + ret = find_attach_btf_id(obj->btf_vmlinux, + mod_name ? fn_name : attach_name, + attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; } - if (ret != -ENOENT) - return ret; ret = load_module_btfs(obj); if (ret) @@ -9880,7 +9892,12 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, for (i = 0; i < obj->btf_module_cnt; i++) { const struct module_btf *mod = &obj->btf_modules[i]; - ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0) + continue; + + ret = find_attach_btf_id(mod->btf, + mod_name ? fn_name : attach_name, + attach_type); if (ret > 0) { *btf_obj_fd = mod->fd; *btf_type_id = ret; -- cgit v1.2.3 From 960635887c967338fd567def3e7905a294f5002b Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Tue, 30 Apr 2024 11:38:07 +0200 Subject: selftests/bpf: add tests for the "module: Function" syntax The previous patch added support for the "module:function" syntax for tracing programs. This adds tests for explicitly specifying the module name via the SEC macro and via the bpf_program__set_attach_target call. Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8a076168ed847f7c8a6c25715737b1fea84e38be.1714469650.git.vmalik@redhat.com --- .../selftests/bpf/prog_tests/module_attach.c | 6 ++++++ .../selftests/bpf/progs/test_module_attach.c | 23 ++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index f53d658ed080..6d391d95f96e 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -51,6 +51,10 @@ void test_module_attach(void) 0, "bpf_testmod_test_read"); ASSERT_OK(err, "set_attach_target"); + err = bpf_program__set_attach_target(skel->progs.handle_fentry_explicit_manual, + 0, "bpf_testmod:bpf_testmod_test_read"); + ASSERT_OK(err, "set_attach_target_explicit"); + err = test_module_attach__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton\n")) return; @@ -70,6 +74,8 @@ void test_module_attach(void) ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual"); + ASSERT_EQ(bss->fentry_explicit_read_sz, READ_SZ, "fentry_explicit"); + ASSERT_EQ(bss->fentry_explicit_manual_read_sz, READ_SZ, "fentry_explicit_manual"); ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit"); ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index 8a1b50f3a002..cc1a012d038f 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -73,6 +73,29 @@ int BPF_PROG(handle_fentry_manual, return 0; } +__u32 fentry_explicit_read_sz = 0; + +SEC("fentry/bpf_testmod:bpf_testmod_test_read") +int BPF_PROG(handle_fentry_explicit, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_read_sz = len; + return 0; +} + + +__u32 fentry_explicit_manual_read_sz = 0; + +SEC("fentry") +int BPF_PROG(handle_fentry_explicit_manual, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_manual_read_sz = len; + return 0; +} + __u32 fexit_read_sz = 0; int fexit_ret = 0; -- cgit v1.2.3 From 0737df6de94661ae55fd3343ce9abec32c687e62 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Apr 2024 21:17:06 -0700 Subject: libbpf: better fix for handling nulled-out struct_ops program Previous attempt to fix the handling of nulled-out (from skeleton) struct_ops program is working well only if struct_ops program is defined as non-autoloaded by default (i.e., has SEC("?struct_ops") annotation, with question mark). Unfortunately, that fix is incomplete due to how bpf_object_adjust_struct_ops_autoload() is marking referenced or non-referenced struct_ops program as autoloaded (or not). Because bpf_object_adjust_struct_ops_autoload() is run after bpf_map__init_kern_struct_ops() step, which sets program slot to NULL, such programs won't be considered "referenced", and so its autoload property won't be changed. This all sounds convoluted and it is, but the desire is to have as natural behavior (as far as struct_ops usage is concerned) as possible. This fix is redoing the original fix but makes it work for autoloaded-by-default struct_ops programs as well. We achieve this by forcing prog->autoload to false if prog was declaratively set for some struct_ops map, but then nulled-out from skeleton (programmatically). This achieves desired effect of not autoloading it. If such program is still referenced somewhere else (different struct_ops map or different callback field), it will get its autoload property adjusted by bpf_object_adjust_struct_ops_autoload() later. We also fix selftest, which accidentally used SEC("?struct_ops") annotation. It was meant to use autoload-by-default program from the very beginning. Fixes: f973fccd43d3 ("libbpf: handle nulled-out program in struct_ops correctly") Cc: Kui-Feng Lee Cc: Eduard Zingerman Cc: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240501041706.3712608-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 37 +++++++++++++++------- .../selftests/bpf/progs/struct_ops_module.c | 2 +- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 109b33fb68dc..4ffc8873d1ad 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1128,6 +1128,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const struct btf_type *mtype, *kern_mtype; __u32 mtype_id, kern_mtype_id; void *mdata, *kern_mdata; + struct bpf_program *prog; __s64 msize, kern_msize; __u32 moff, kern_moff; __u32 kern_member_idx; @@ -1145,19 +1146,35 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) kern_member = find_member_by_name(kern_btf, kern_type, mname); if (!kern_member) { - /* Skip all zeros or null fields if they are not - * presented in the kernel BTF. - */ - if (libbpf_is_mem_zeroed(mdata, msize)) { - st_ops->progs[i] = NULL; - pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", + if (!libbpf_is_mem_zeroed(mdata, msize)) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", map->name, mname); - continue; + return -ENOTSUP; } - pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + prog = st_ops->progs[i]; + if (prog) { + /* If we had declaratively set struct_ops callback, we need to + * first validate that it's actually a struct_ops program. + * And then force its autoload to false, because it doesn't have + * a chance of succeeding from POV of the current struct_ops map. + * If this program is still referenced somewhere else, though, + * then bpf_object_adjust_struct_ops_autoload() will update its + * autoload accordingly. + */ + if (!is_valid_st_ops_program(obj, prog)) { + pr_warn("struct_ops init_kern %s: member %s is declaratively assigned a non-struct_ops program\n", + map->name, mname); + return -EINVAL; + } + prog->autoload = false; + st_ops->progs[i] = NULL; + } + + /* Skip all-zero/NULL fields if they are not present in the kernel BTF */ + pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", map->name, mname); - return -ENOTSUP; + continue; } kern_member_idx = kern_member - btf_members(kern_type); @@ -1183,8 +1200,6 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) } if (btf_is_ptr(mtype)) { - struct bpf_program *prog; - /* Update the value from the shadow type */ prog = *(void **)mdata; st_ops->progs[i] = prog; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 40109be2b3ae..4c56d4a9d9f4 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -63,7 +63,7 @@ struct bpf_testmod_ops___zeroed { int zeroed; }; -SEC("?struct_ops/test_3") +SEC("struct_ops/test_3") int BPF_PROG(zeroed_op) { return 1; -- cgit v1.2.3 From 496bc5861c73f34e0486612f93634a9289de0dfb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Apr 2024 16:58:07 +0200 Subject: selftests: netfilter: nft_concat_range.sh: reduce debug kernel run time Even a 1h timeout isn't enough for nft_concat_range.sh to complete on debug kernels. Reduce test complexity and only match on single entry if KSFT_MACHINE_SLOW is set. To spot 'slow' tests, print the subtest duration (in seconds) in addition to the status. Add new nft_concat_range_perf.sh script, not executed via kselftest, to run the performance (pps match rate) tests. Those need about 25m to complete which seems too much to run this via 'make run_tests'. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240430145810.23447-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/Makefile | 2 ++ tools/testing/selftests/net/netfilter/config | 1 + .../selftests/net/netfilter/nft_concat_range.sh | 28 +++++++++++++++++----- .../net/netfilter/nft_concat_range_perf.sh | 9 +++++++ 4 files changed, 34 insertions(+), 6 deletions(-) create mode 100755 tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 72c6001964a6..e9a6c702b8c9 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -28,6 +28,8 @@ TEST_PROGS += nft_zones_many.sh TEST_PROGS += rpath.sh TEST_PROGS += xt_string.sh +TEST_PROGS_EXTENDED = nft_concat_range_perf.sh + TEST_GEN_PROGS = conntrack_dump_flush TEST_GEN_FILES = audit_logread diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 60b86c7f3ea1..5b5b764f6cd0 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -85,3 +85,4 @@ CONFIG_VETH=m CONFIG_VLAN_8021Q=m CONFIG_XFRM_USER=m CONFIG_XFRM_STATISTICS=y +CONFIG_NET_PKTGEN=m diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 2b6661519055..6d66240e149c 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -19,7 +19,7 @@ source lib.sh # - timeout: check that packets match entries until they expire # - performance: estimate matching rate, compare with rbtree and hash baselines TESTS="reported_issues correctness concurrency timeout" -[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance" +[ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}" # Set types, defined by TYPE_ variables below TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto @@ -31,7 +31,7 @@ BUGS="flush_remove_add reload" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" - ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh + ../../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh pktgen/pktgen_bench_xmit_mode_netif_receive.sh" # Definition of set types: @@ -951,6 +951,10 @@ cleanup() { killall iperf 2>/dev/null killall netperf 2>/dev/null killall netserver 2>/dev/null +} + +cleanup_exit() { + cleanup rm -f "$tmp" } @@ -1371,6 +1375,9 @@ test_timeout() { setup veth send_"${proto}" set || return ${ksft_skip} timeout=3 + + [ "$KSFT_MACHINE_SLOW" = "yes" ] && timeout=8 + range_size=1 for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) @@ -1386,7 +1393,7 @@ test_timeout() { range_size=$((range_size + 1)) start=$((end + range_size)) done - sleep 3 + sleep $timeout for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) @@ -1480,10 +1487,13 @@ test_performance() { } test_bug_flush_remove_add() { + rounds=100 + [ "$KSFT_MACHINE_SLOW" = "yes" ] && rounds=10 + set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' - for i in $(seq 1 100); do + for i in $(seq 1 $rounds); do nft add table t "$set_cmd" || return ${ksft_skip} nft add element t s "$elem1" 2>/dev/null || return 1 nft flush set t s 2>/dev/null || return 1 @@ -1552,7 +1562,7 @@ test_reported_issues() { # Run everything in a separate network namespace [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } tmp="$(mktemp)" -trap cleanup EXIT +trap cleanup_exit EXIT # Entry point for test runs passed=0 @@ -1584,10 +1594,16 @@ for name in ${TESTS}; do continue fi - printf " %-60s " "${display}" + [ "$KSFT_MACHINE_SLOW" = "yes" ] && count=1 + + printf " %-32s " "${display}" + tthen=$(date +%s) eval test_"${name}" ret=$? + tnow=$(date +%s) + printf "%5ds%-30s" $((tnow-tthen)) + if [ $ret -eq 0 ]; then printf "[ OK ]\n" info_flush diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh new file mode 100755 index 000000000000..5d276995a5c5 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# + +source lib.sh + +[ "$KSFT_MACHINE_SLOW" = yes ] && exit ${ksft_skip} + +NFT_CONCAT_RANGE_TESTS="performance" exec ./nft_concat_range.sh -- cgit v1.2.3 From d7b60803a790b716dc930bde2155c6ecb7537d51 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 1 May 2024 23:00:06 -0700 Subject: perf dwarf-aux: Add die_collect_global_vars() This function is to search all global variables in the CU. We want to have the list of global variables at once and match them later. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240502060011.1838090-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/dwarf-aux.h | 8 ++++++ 2 files changed, 70 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 40cfbdfe2d75..c0a492e65388 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1630,6 +1630,68 @@ void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types) die_find_child(sc_die, __die_collect_vars_cb, (void *)var_types, &die_mem); } + +static int __die_collect_global_vars_cb(Dwarf_Die *die_mem, void *arg) +{ + struct die_var_type **var_types = arg; + Dwarf_Die type_die; + int tag = dwarf_tag(die_mem); + Dwarf_Attribute attr; + Dwarf_Addr base, start, end; + Dwarf_Op *ops; + size_t nops; + struct die_var_type *vt; + + if (tag != DW_TAG_variable) + return DIE_FIND_CB_SIBLING; + + if (dwarf_attr(die_mem, DW_AT_location, &attr) == NULL) + return DIE_FIND_CB_SIBLING; + + /* Only collect the location with an absolute address. */ + if (dwarf_getlocations(&attr, 0, &base, &start, &end, &ops, &nops) <= 0) + return DIE_FIND_CB_SIBLING; + + if (ops->atom != DW_OP_addr) + return DIE_FIND_CB_SIBLING; + + if (!check_allowed_ops(ops, nops)) + return DIE_FIND_CB_SIBLING; + + if (die_get_real_type(die_mem, &type_die) == NULL) + return DIE_FIND_CB_SIBLING; + + vt = malloc(sizeof(*vt)); + if (vt == NULL) + return DIE_FIND_CB_END; + + vt->die_off = dwarf_dieoffset(&type_die); + vt->addr = ops->number; + vt->reg = -1; + vt->offset = 0; + vt->next = *var_types; + *var_types = vt; + + return DIE_FIND_CB_SIBLING; +} + +/** + * die_collect_global_vars - Save all global variables + * @cu_die: a CU DIE + * @var_types: a pointer to save the resulting list + * + * Save all global variables in the @cu_die and save them to @var_types. + * The @var_types is a singly-linked list containing type and location info. + * Actual type can be retrieved using dwarf_offdie() with 'die_off' later. + * + * Callers should free @var_types. + */ +void die_collect_global_vars(Dwarf_Die *cu_die, struct die_var_type **var_types) +{ + Dwarf_Die die_mem; + + die_find_child(cu_die, __die_collect_global_vars_cb, (void *)var_types, &die_mem); +} #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ #ifdef HAVE_DWARF_CFI_SUPPORT diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index b0f25fbf9668..24446412b869 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -171,6 +171,9 @@ Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr, /* Save all variables and parameters in this scope */ void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types); +/* Save all global variables in this CU */ +void die_collect_global_vars(Dwarf_Die *cu_die, struct die_var_type **var_types); + #else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, @@ -203,6 +206,11 @@ static inline void die_collect_vars(Dwarf_Die *sc_die __maybe_unused, { } +static inline void die_collect_global_vars(Dwarf_Die *cu_die __maybe_unused, + struct die_var_type **var_types __maybe_unused) +{ +} + #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ #ifdef HAVE_DWARF_CFI_SUPPORT -- cgit v1.2.3 From c1da8411e4be2a96a448979baede9a0e86c5baf8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 1 May 2024 23:00:07 -0700 Subject: perf annotate-data: Collect global variables in advance Currently it looks up global variables from the current CU using address and name. But it sometimes fails to find a variable as the variable can come from a different CU - but it's still strange it failed to find a declaration for some reason. Anyway, it can collect all global variables from all CU once and then lookup them later on. This slightly improves the success rate of my test data set. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240502060011.1838090-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 57 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 12d5faff3b7a..4dd0911904f2 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -28,6 +28,8 @@ /* register number of the stack pointer */ #define X86_REG_SP 7 +static void delete_var_types(struct die_var_type *var_types); + enum type_state_kind { TSR_KIND_INVALID = 0, TSR_KIND_TYPE, @@ -557,8 +559,8 @@ static bool global_var__add(struct data_loc_info *dloc, u64 addr, if (gvar == NULL) return false; - gvar->name = strdup(name); - if (gvar->name == NULL) { + gvar->name = name ? strdup(name) : NULL; + if (name && gvar->name == NULL) { free(gvar); return false; } @@ -612,6 +614,53 @@ static bool get_global_var_info(struct data_loc_info *dloc, u64 addr, return true; } +static void global_var__collect(struct data_loc_info *dloc) +{ + Dwarf *dwarf = dloc->di->dbg; + Dwarf_Off off, next_off; + Dwarf_Die cu_die, type_die; + size_t header_size; + + /* Iterate all CU and collect global variables that have no location in a register. */ + off = 0; + while (dwarf_nextcu(dwarf, off, &next_off, &header_size, + NULL, NULL, NULL) == 0) { + struct die_var_type *var_types = NULL; + struct die_var_type *pos; + + if (dwarf_offdie(dwarf, off + header_size, &cu_die) == NULL) { + off = next_off; + continue; + } + + die_collect_global_vars(&cu_die, &var_types); + + for (pos = var_types; pos; pos = pos->next) { + const char *var_name = NULL; + int var_offset = 0; + + if (pos->reg != -1) + continue; + + if (!dwarf_offdie(dwarf, pos->die_off, &type_die)) + continue; + + if (!get_global_var_info(dloc, pos->addr, &var_name, + &var_offset)) + continue; + + if (var_offset != 0) + continue; + + global_var__add(dloc, pos->addr, var_name, &type_die); + } + + delete_var_types(var_types); + + off = next_off; + } +} + static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, u64 ip, u64 var_addr, int *var_offset, Dwarf_Die *type_die) @@ -620,8 +669,12 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, int offset; const char *var_name = NULL; struct global_var_entry *gvar; + struct dso *dso = map__dso(dloc->ms->map); Dwarf_Die var_die; + if (RB_EMPTY_ROOT(&dso->global_vars)) + global_var__collect(dloc); + gvar = global_var__find(dloc, var_addr); if (gvar) { if (!dwarf_offdie(dloc->di->dbg, gvar->die_offset, type_die)) -- cgit v1.2.3 From 4449c9047dc6f9f68333a720958cd7b58225910d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 1 May 2024 23:00:08 -0700 Subject: perf annotate-data: Handle direct global variable access Like per-cpu base offset array, sometimes it accesses the global variable directly using the offset. Allow this type of instructions as long as it finds a global variable for the address. movslq %edi, %rcx mov -0x7dc94ae0(,%rcx,8), %rcx <<<--- here As %rcx has a valid type (i.e. array index) from the first instruction, it will be checked by the first case in check_matching_type(). But as it's not a pointer type, the match will fail. But in this case, it should check if it accesses the kernel global array variable. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240502060011.1838090-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 4dd0911904f2..f1e52a531563 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1256,14 +1256,19 @@ static int check_matching_type(struct type_state *state, if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { int tag = dwarf_tag(&state->regs[reg].type); - pr_debug_dtp("\n"); - /* * Normal registers should hold a pointer (or array) to * dereference a memory location. */ - if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) + if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) { + if (dloc->op->offset < 0 && reg != state->stack_reg) + goto check_kernel; + + pr_debug_dtp("\n"); return -1; + } + + pr_debug_dtp("\n"); /* Remove the pointer and get the target type */ if (die_get_real_type(&state->regs[reg].type, type_die) == NULL) @@ -1376,12 +1381,14 @@ static int check_matching_type(struct type_state *state, return -1; } - if (map__dso(dloc->ms->map)->kernel && arch__is(dloc->arch, "x86")) { +check_kernel: + if (map__dso(dloc->ms->map)->kernel) { u64 addr; int offset; /* Direct this-cpu access like "%gs:0x34740" */ - if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm) { + if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm && + arch__is(dloc->arch, "x86")) { pr_debug_dtp(" this-cpu var\n"); addr = dloc->op->offset; @@ -1394,17 +1401,13 @@ static int check_matching_type(struct type_state *state, return -1; } - /* Access to per-cpu base like "-0x7dcf0500(,%rdx,8)" */ + /* Access to global variable like "-0x7dcf0500(,%rdx,8)" */ if (dloc->op->offset < 0 && reg != state->stack_reg) { - const char *var_name = NULL; - addr = (s64) dloc->op->offset; - if (get_global_var_info(dloc, addr, &var_name, &offset) && - !strcmp(var_name, "__per_cpu_offset") && offset == 0 && - get_global_var_type(cu_die, dloc, dloc->ip, addr, + if (get_global_var_type(cu_die, dloc, dloc->ip, addr, &offset, type_die)) { - pr_debug_dtp(" percpu base\n"); + pr_debug_dtp(" global var\n"); dloc->type_offset = offset; return 1; -- cgit v1.2.3 From eba1f853edf794ec259ec7b5e5a6efee5ede989f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 1 May 2024 23:00:09 -0700 Subject: perf annotate-data: Check memory access with two registers The following instruction pattern is used to access a global variable. mov $0x231c0, %rax movsql %edi, %rcx mov -0x7dc94ae0(,%rcx,8), %rcx cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here The first instruction set the address of the per-cpu variable (here, it is 'runqueues' of type 'struct rq'). The second instruction seems like a cpu number of the per-cpu base. The third instruction get the base offset of per-cpu area for that cpu. The last instruction compares the value of the per-cpu variable at the offset of 0xa60. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240502060011.1838090-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 44 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index f1e52a531563..245e3ef3e2ff 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1031,22 +1031,37 @@ retry: else if (has_reg_type(state, sreg) && state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr = src->offset; int offset; + if (src->multi_regs) { + int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + /* * In kernel, %gs points to a per-cpu region for the * current CPU. Access with a constant offset should * be treated as a global variable access. */ - if (get_global_var_type(cu_die, dloc, ip, src->offset, + if (get_global_var_type(cu_die, dloc, ip, var_addr, &offset, &type_die) && die_get_member_type(&type_die, offset, &type_die)) { tsr->type = type_die; tsr->kind = TSR_KIND_TYPE; tsr->ok = true; - pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); + if (src->multi_regs) { + pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", + insn_offset, src->offset, src->reg1, + src->reg2, dst->reg1); + } else { + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + } pr_debug_type_name(&tsr->type, tsr->kind); } else { tsr->ok = false; @@ -1340,6 +1355,17 @@ static int check_matching_type(struct type_state *state, pr_debug_dtp(" percpu var\n"); + if (dloc->op->multi_regs) { + int reg2 = dloc->op->reg2; + + if (dloc->op->reg2 == reg) + reg2 = dloc->op->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, &var_offset, type_die)) { dloc->type_offset = var_offset; @@ -1527,8 +1553,16 @@ again: found = find_data_type_insn(dloc, reg, &basic_blocks, var_types, cu_die, type_die); if (found > 0) { - pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n", - dloc->op->offset, reg, dloc->type_offset); + char buf[64]; + + if (dloc->op->multi_regs) + snprintf(buf, sizeof(buf), "reg%d, reg%d", + dloc->op->reg1, dloc->op->reg2); + else + snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); + + pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", + dloc->op->offset, buf, dloc->type_offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; break; -- cgit v1.2.3 From af89e8f2bdb2ff9252317307a755f97dd02f6cd7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 1 May 2024 23:00:10 -0700 Subject: perf annotate-data: Handle multi regs in find_data_type_block() The instruction tracking should be the same for the both registers. Just do it once and compare the result with multi regs as with the previous patches. Then we don't need to call find_data_type_block() separately for each reg. Let's remove the 'reg' argument from the relevant functions. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240502060011.1838090-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 245e3ef3e2ff..68fe7999f033 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1258,11 +1258,12 @@ static void setup_stack_canary(struct data_loc_info *dloc) * are similar to global variables and no additional info is needed. */ static int check_matching_type(struct type_state *state, - struct data_loc_info *dloc, int reg, + struct data_loc_info *dloc, Dwarf_Die *cu_die, Dwarf_Die *type_die) { Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; + int reg = dloc->op->reg1; pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d", insn_offset, reg, dloc->op->offset, @@ -1448,7 +1449,7 @@ check_kernel: } /* Iterate instructions in basic blocks and update type table */ -static int find_data_type_insn(struct data_loc_info *dloc, int reg, +static int find_data_type_insn(struct data_loc_info *dloc, struct list_head *basic_blocks, struct die_var_type *var_types, Dwarf_Die *cu_die, Dwarf_Die *type_die) @@ -1481,7 +1482,7 @@ static int find_data_type_insn(struct data_loc_info *dloc, int reg, update_var_state(&state, dloc, addr, dl->al.offset, var_types); if (this_ip == dloc->ip) { - ret = check_matching_type(&state, dloc, reg, + ret = check_matching_type(&state, dloc, cu_die, type_die); goto out; } @@ -1502,7 +1503,7 @@ out: * Construct a list of basic blocks for each scope with variables and try to find * the data type by updating a type state table through instructions. */ -static int find_data_type_block(struct data_loc_info *dloc, int reg, +static int find_data_type_block(struct data_loc_info *dloc, Dwarf_Die *cu_die, Dwarf_Die *scopes, int nr_scopes, Dwarf_Die *type_die) { @@ -1550,7 +1551,7 @@ again: fixup_var_address(var_types, start); /* Find from start of this scope to the target instruction */ - found = find_data_type_insn(dloc, reg, &basic_blocks, var_types, + found = find_data_type_insn(dloc, &basic_blocks, var_types, cu_die, type_die); if (found > 0) { char buf[64]; @@ -1716,8 +1717,13 @@ retry: goto out; } + if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) { + reg = loc->reg2; + goto retry; + } + if (reg != DWARF_REG_PC) { - ret = find_data_type_block(dloc, reg, &cu_die, scopes, + ret = find_data_type_block(dloc, &cu_die, scopes, nr_scopes, type_die); if (ret == 0) { ann_data_stat.insn_track++; @@ -1725,11 +1731,6 @@ retry: } } - if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) { - reg = loc->reg2; - goto retry; - } - if (ret < 0) { pr_debug_dtp("no variable found\n"); ann_data_stat.no_var++; -- cgit v1.2.3 From b7d4aacfc894ca2d86b11ef738f94e6c8cf2536b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 1 May 2024 23:00:11 -0700 Subject: perf annotate-data: Check kind of stack variables I sometimes see ("unknown type") in the result and it was because it didn't check the type of stack variables properly during the instruction tracking. The stack can carry constant values (without type info) and if the target instruction is accessing the stack location, it resulted in the "unknown type". Maybe we could pick one of integer types for the constant, but it doesn't really mean anything useful. Let's just drop the stack slot if it doesn't have a valid type info. Here's an example how it got the unknown type. Note that 0xffffff48 = -0xb8. ----------------------------------------------------------- find data type for 0xffffff48(reg6) at ... CU for ... frame base: cfa=0 fbreg=6 scope: [2/2] (die:11cb97f) bb: [37 - 3a] var [37] reg15 type='int' size=0x4 (die:0x1180633) bb: [40 - 4b] mov [40] imm=0x1 -> reg13 var [45] reg8 type='sigset_t*' size=0x8 (die:0x11a39ee) mov [45] imm=0x1 -> reg2 <--- here reg2 has a constant bb: [215 - 237] mov [218] reg2 -> -0xb8(stack) constant <--- and save it to the stack mov [225] reg13 -> -0xc4(stack) constant call [22f] find_task_by_vgpid call [22f] return -> reg0 type='struct task_struct*' size=0x8 (die:0x11881e8) bb: [5c8 - 5cf] bb: [2fb - 302] mov [2fb] -0xc4(stack) -> reg13 constant bb: [13b - 14d] mov [143] 0xd50(reg3) -> reg5 type='struct task_struct*' size=0x8 (die:0xa31f3c) bb: [153 - 153] chk [153] reg6 offset=0xffffff48 ok=0 kind=0 fbreg <--- access here found by insn track: 0xffffff48(reg6) type-offset=0 type='G^KU' size=0 (die:0xffffffffffffffff) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240502060011.1838090-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 68fe7999f033..2c98813f95cd 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1314,6 +1314,9 @@ static int check_matching_type(struct type_state *state, return -1; } + if (stack->kind != TSR_KIND_TYPE) + return 0; + *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= stack->offset; @@ -1343,6 +1346,9 @@ static int check_matching_type(struct type_state *state, return -1; } + if (stack->kind != TSR_KIND_TYPE) + return 0; + *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= fboff + stack->offset; -- cgit v1.2.3 From cbaf2c4f932e08c9e3df2903cf1559a32defbc41 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 1 May 2024 14:57:51 +0100 Subject: perf cs-etm: Use struct perf_cpu as much as possible The perf_cpu struct makes some iterators simpler and avoids some mistakes with interchanging CPU IDs with indexes etc. At the moment in this file the conversion to an integer is done somewhere in the middle of the call tree. Change it to delay the conversion to an int until the leaf functions. Some of the usage patterns are duplicated, so instead of changing them all, make cs_etm_get_ro() more reusable and use that everywhere. cs_etm_get_ro() didn't return an error before, but return one now so that it can also be used where an error is needed. Continue to ignore the error where it was already ignored. Use cs_etm_pmu_path_exists() instead of cs_etm_get_ro() in cs_etm_is_etmv4() because cs_etm_get_ro() prints a warning, but path exists is sufficient for this use case. Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240501135753.508022-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 204 ++++++++++++++++---------------------- 1 file changed, 88 insertions(+), 116 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 07be32d99805..a1fa711dc41a 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -66,18 +66,19 @@ static const char * const metadata_ete_ro[] = { [CS_ETE_TS_SOURCE] = "ts_source", }; -static bool cs_etm_is_etmv4(struct auxtrace_record *itr, int cpu); -static bool cs_etm_is_ete(struct auxtrace_record *itr, int cpu); +static bool cs_etm_is_etmv4(struct auxtrace_record *itr, struct perf_cpu cpu); +static bool cs_etm_is_ete(struct auxtrace_record *itr, struct perf_cpu cpu); +static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val); +static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path); -static int cs_etm_validate_context_id(struct auxtrace_record *itr, - struct evsel *evsel, int cpu) +static int cs_etm_validate_context_id(struct auxtrace_record *itr, struct evsel *evsel, + struct perf_cpu cpu) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - char path[PATH_MAX]; int err; - u32 val; + __u64 val; u64 contextid = evsel->core.attr.config & (perf_pmu__format_bits(cs_etm_pmu, "contextid") | perf_pmu__format_bits(cs_etm_pmu, "contextid1") | @@ -94,16 +95,9 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, } /* Get a handle on TRCIDR2 */ - snprintf(path, PATH_MAX, "cpu%d/%s", - cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2]); - err = perf_pmu__scan_file(cs_etm_pmu, path, "%x", &val); - - /* There was a problem reading the file, bailing out */ - if (err != 1) { - pr_err("%s: can't read file %s\n", CORESIGHT_ETM_PMU_NAME, - path); + err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &val); + if (err) return err; - } if (contextid & perf_pmu__format_bits(cs_etm_pmu, "contextid1")) { @@ -140,15 +134,14 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, return 0; } -static int cs_etm_validate_timestamp(struct auxtrace_record *itr, - struct evsel *evsel, int cpu) +static int cs_etm_validate_timestamp(struct auxtrace_record *itr, struct evsel *evsel, + struct perf_cpu cpu) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - char path[PATH_MAX]; int err; - u32 val; + __u64 val; if (!(evsel->core.attr.config & perf_pmu__format_bits(cs_etm_pmu, "timestamp"))) @@ -161,16 +154,9 @@ static int cs_etm_validate_timestamp(struct auxtrace_record *itr, } /* Get a handle on TRCIRD0 */ - snprintf(path, PATH_MAX, "cpu%d/%s", - cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); - err = perf_pmu__scan_file(cs_etm_pmu, path, "%x", &val); - - /* There was a problem reading the file, bailing out */ - if (err != 1) { - pr_err("%s: can't read file %s\n", - CORESIGHT_ETM_PMU_NAME, path); + err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &val); + if (err) return err; - } /* * TRCIDR0.TSSIZE, bit [28-24], indicates whether global timestamping @@ -218,11 +204,11 @@ static int cs_etm_validate_config(struct auxtrace_record *itr, } perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { - err = cs_etm_validate_context_id(itr, evsel, cpu.cpu); + err = cs_etm_validate_context_id(itr, evsel, cpu); if (err) break; - err = cs_etm_validate_timestamp(itr, evsel, cpu.cpu); + err = cs_etm_validate_timestamp(itr, evsel, cpu); if (err) break; } @@ -549,9 +535,9 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, intersect_cpus = perf_cpu_map__new_online_cpus(); } perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { - if (cs_etm_is_ete(itr, cpu.cpu)) + if (cs_etm_is_ete(itr, cpu)) ete++; - else if (cs_etm_is_etmv4(itr, cpu.cpu)) + else if (cs_etm_is_etmv4(itr, cpu)) etmv4++; else etmv3++; @@ -564,66 +550,59 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, (etmv3 * CS_ETMV3_PRIV_SIZE)); } -static bool cs_etm_is_etmv4(struct auxtrace_record *itr, int cpu) +static bool cs_etm_is_etmv4(struct auxtrace_record *itr, struct perf_cpu cpu) { - bool ret = false; - char path[PATH_MAX]; - int scan; - unsigned int val; struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; /* Take any of the RO files for ETMv4 and see if it present */ - snprintf(path, PATH_MAX, "cpu%d/%s", - cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); - scan = perf_pmu__scan_file(cs_etm_pmu, path, "%x", &val); - - /* The file was read successfully, we have a winner */ - if (scan == 1) - ret = true; - - return ret; + return cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); } -static int cs_etm_get_ro(struct perf_pmu *pmu, int cpu, const char *path) +static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val) { char pmu_path[PATH_MAX]; int scan; - unsigned int val = 0; /* Get RO metadata from sysfs */ - snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu, path); + snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu.cpu, path); - scan = perf_pmu__scan_file(pmu, pmu_path, "%x", &val); - if (scan != 1) + scan = perf_pmu__scan_file(pmu, pmu_path, "%llx", val); + if (scan != 1) { pr_err("%s: error reading: %s\n", __func__, pmu_path); + return -EINVAL; + } - return val; + return 0; } -static int cs_etm_get_ro_signed(struct perf_pmu *pmu, int cpu, const char *path) +static int cs_etm_get_ro_signed(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, + __u64 *out_val) { char pmu_path[PATH_MAX]; int scan; int val = 0; /* Get RO metadata from sysfs */ - snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu, path); + snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu.cpu, path); scan = perf_pmu__scan_file(pmu, pmu_path, "%d", &val); - if (scan != 1) + if (scan != 1) { pr_err("%s: error reading: %s\n", __func__, pmu_path); + return -EINVAL; + } - return val; + *out_val = (__u64) val; + return 0; } -static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, int cpu, const char *path) +static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path) { char pmu_path[PATH_MAX]; /* Get RO metadata from sysfs */ - snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu, path); + snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu.cpu, path); return perf_pmu__file_exists(pmu, pmu_path); } @@ -636,16 +615,16 @@ static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, int cpu, const char *pa #define TRCDEVARCH_ARCHVER_MASK GENMASK(15, 12) #define TRCDEVARCH_ARCHVER(x) (((x) & TRCDEVARCH_ARCHVER_MASK) >> TRCDEVARCH_ARCHVER_SHIFT) -static bool cs_etm_is_ete(struct auxtrace_record *itr, int cpu) +static bool cs_etm_is_ete(struct auxtrace_record *itr, struct perf_cpu cpu) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - int trcdevarch; + __u64 trcdevarch; if (!cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH])) return false; - trcdevarch = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH], &trcdevarch); /* * ETE if ARCHVER is 5 (ARCHVER is 4 for ETM) and ARCHPART is 0xA13. * See ETM_DEVARCH_ETE_ARCH in coresight-etm4x.h @@ -653,7 +632,12 @@ static bool cs_etm_is_ete(struct auxtrace_record *itr, int cpu) return TRCDEVARCH_ARCHVER(trcdevarch) == 5 && TRCDEVARCH_ARCHPART(trcdevarch) == 0xA13; } -static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, int cpu) +static __u64 cs_etm_get_legacy_trace_id(struct perf_cpu cpu) +{ + return CORESIGHT_LEGACY_CPU_TRACE_ID(cpu.cpu); +} + +static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, struct perf_cpu cpu) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; @@ -661,33 +645,32 @@ static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, /* Get trace configuration register */ data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETMV4_TRCTRACEIDR] = - CORESIGHT_LEGACY_CPU_TRACE_ID(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | + CORESIGHT_TRACE_ID_UNUSED_FLAG; /* Get read-only information from sysFS */ - data[CS_ETMV4_TRCIDR0] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); - data[CS_ETMV4_TRCIDR1] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR1]); - data[CS_ETMV4_TRCIDR2] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR2]); - data[CS_ETMV4_TRCIDR8] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCIDR8]); - data[CS_ETMV4_TRCAUTHSTATUS] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TRCAUTHSTATUS]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], + &data[CS_ETMV4_TRCIDR0]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR1], + &data[CS_ETMV4_TRCIDR1]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], + &data[CS_ETMV4_TRCIDR2]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR8], + &data[CS_ETMV4_TRCIDR8]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCAUTHSTATUS], + &data[CS_ETMV4_TRCAUTHSTATUS]); /* Kernels older than 5.19 may not expose ts_source */ - if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TS_SOURCE])) - data[CS_ETMV4_TS_SOURCE] = (__u64) cs_etm_get_ro_signed(cs_etm_pmu, cpu, - metadata_etmv4_ro[CS_ETMV4_TS_SOURCE]); - else { + if (!cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TS_SOURCE]) || + cs_etm_get_ro_signed(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TS_SOURCE], + &data[CS_ETMV4_TS_SOURCE])) { pr_debug3("[%03d] pmu file 'ts_source' not found. Fallback to safe value (-1)\n", - cpu); + cpu.cpu); data[CS_ETMV4_TS_SOURCE] = (__u64) -1; } } -static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, int cpu) +static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, struct perf_cpu cpu) { struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; @@ -695,36 +678,30 @@ static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, in /* Get trace configuration register */ data[CS_ETE_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETE_TRCTRACEIDR] = - CORESIGHT_LEGACY_CPU_TRACE_ID(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; /* Get read-only information from sysFS */ - data[CS_ETE_TRCIDR0] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCIDR0]); - data[CS_ETE_TRCIDR1] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCIDR1]); - data[CS_ETE_TRCIDR2] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCIDR2]); - data[CS_ETE_TRCIDR8] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCIDR8]); - data[CS_ETE_TRCAUTHSTATUS] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCAUTHSTATUS]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR0], &data[CS_ETE_TRCIDR0]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR1], &data[CS_ETE_TRCIDR1]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR2], &data[CS_ETE_TRCIDR2]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR8], &data[CS_ETE_TRCIDR8]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCAUTHSTATUS], + &data[CS_ETE_TRCAUTHSTATUS]); /* ETE uses the same registers as ETMv4 plus TRCDEVARCH */ - data[CS_ETE_TRCDEVARCH] = cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TRCDEVARCH]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH], + &data[CS_ETE_TRCDEVARCH]); /* Kernels older than 5.19 may not expose ts_source */ - if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TS_SOURCE])) - data[CS_ETE_TS_SOURCE] = (__u64) cs_etm_get_ro_signed(cs_etm_pmu, cpu, - metadata_ete_ro[CS_ETE_TS_SOURCE]); - else { + if (!cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TS_SOURCE]) || + cs_etm_get_ro_signed(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TS_SOURCE], + &data[CS_ETE_TS_SOURCE])) { pr_debug3("[%03d] pmu file 'ts_source' not found. Fallback to safe value (-1)\n", - cpu); + cpu.cpu); data[CS_ETE_TS_SOURCE] = (__u64) -1; } } -static void cs_etm_get_metadata(int cpu, u32 *offset, +static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, struct auxtrace_record *itr, struct perf_record_auxtrace_info *info) { @@ -754,15 +731,13 @@ static void cs_etm_get_metadata(int cpu, u32 *offset, /* Get configuration register */ info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr); /* traceID set to legacy value in case new perf running on old system */ - info->priv[*offset + CS_ETM_ETMTRACEIDR] = - CORESIGHT_LEGACY_CPU_TRACE_ID(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | + CORESIGHT_TRACE_ID_UNUSED_FLAG; /* Get read-only information from sysFS */ - info->priv[*offset + CS_ETM_ETMCCER] = - cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv3_ro[CS_ETM_ETMCCER]); - info->priv[*offset + CS_ETM_ETMIDR] = - cs_etm_get_ro(cs_etm_pmu, cpu, - metadata_etmv3_ro[CS_ETM_ETMIDR]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMCCER], + &info->priv[*offset + CS_ETM_ETMCCER]); + cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMIDR], + &info->priv[*offset + CS_ETM_ETMIDR]); /* How much space was used */ increment = CS_ETM_PRIV_MAX; @@ -771,7 +746,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset, /* Build generic header portion */ info->priv[*offset + CS_ETM_MAGIC] = magic; - info->priv[*offset + CS_ETM_CPU] = cpu; + info->priv[*offset + CS_ETM_CPU] = cpu.cpu; info->priv[*offset + CS_ETM_NR_TRC_PARAMS] = nr_trc_params; /* Where the next CPU entry should start from */ *offset += increment; @@ -791,6 +766,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; + struct perf_cpu cpu; if (priv_size != cs_etm_info_priv_size(itr, session->evlist)) return -EINVAL; @@ -803,8 +779,6 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, cpu_map = online_cpus; } else { /* Make sure all specified CPUs are online */ - struct perf_cpu cpu; - perf_cpu_map__for_each_cpu(cpu, i, event_cpus) { if (!perf_cpu_map__has(online_cpus, cpu)) return -EINVAL; @@ -826,11 +800,9 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, offset = CS_ETM_SNAPSHOT + 1; - for (i = 0; i < cpu__max_cpu().cpu && offset < priv_size; i++) { - struct perf_cpu cpu = { .cpu = i, }; - - if (perf_cpu_map__has(cpu_map, cpu)) - cs_etm_get_metadata(i, &offset, itr, info); + perf_cpu_map__for_each_cpu(cpu, i, cpu_map) { + assert(offset < priv_size); + cs_etm_get_metadata(cpu, &offset, itr, info); } perf_cpu_map__put(online_cpus); -- cgit v1.2.3 From bc5e0e1b93565e3760ffb407db836c9f50d4ffae Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 1 May 2024 14:57:52 +0100 Subject: perf cs-etm: Remove repeated fetches of the ETM PMU Most functions already have cs_etm_pmu, so it's a bit neater to pass it through rather than itr only to convert it again. Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240501135753.508022-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 60 ++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index a1fa711dc41a..2fc4b41daea1 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -66,17 +66,14 @@ static const char * const metadata_ete_ro[] = { [CS_ETE_TS_SOURCE] = "ts_source", }; -static bool cs_etm_is_etmv4(struct auxtrace_record *itr, struct perf_cpu cpu); -static bool cs_etm_is_ete(struct auxtrace_record *itr, struct perf_cpu cpu); +static bool cs_etm_is_etmv4(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu); +static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu); static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val); static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path); -static int cs_etm_validate_context_id(struct auxtrace_record *itr, struct evsel *evsel, +static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel *evsel, struct perf_cpu cpu) { - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; int err; __u64 val; u64 contextid = evsel->core.attr.config & @@ -88,7 +85,7 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, struct evsel return 0; /* Not supported in etmv3 */ - if (!cs_etm_is_etmv4(itr, cpu)) { + if (!cs_etm_is_etmv4(cs_etm_pmu, cpu)) { pr_err("%s: contextid not supported in ETMv3, disable with %s/contextid=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; @@ -134,12 +131,9 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, struct evsel return 0; } -static int cs_etm_validate_timestamp(struct auxtrace_record *itr, struct evsel *evsel, +static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *evsel, struct perf_cpu cpu) { - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; int err; __u64 val; @@ -147,7 +141,7 @@ static int cs_etm_validate_timestamp(struct auxtrace_record *itr, struct evsel * perf_pmu__format_bits(cs_etm_pmu, "timestamp"))) return 0; - if (!cs_etm_is_etmv4(itr, cpu)) { + if (!cs_etm_is_etmv4(cs_etm_pmu, cpu)) { pr_err("%s: timestamp not supported in ETMv3, disable with %s/timestamp=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; @@ -173,6 +167,13 @@ static int cs_etm_validate_timestamp(struct auxtrace_record *itr, struct evsel * return 0; } +static struct perf_pmu *cs_etm_get_pmu(struct auxtrace_record *itr) +{ + struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); + + return ptr->cs_etm_pmu; +} + /* * Check whether the requested timestamp and contextid options should be * available on all requested CPUs and if not, tell the user how to override. @@ -180,7 +181,7 @@ static int cs_etm_validate_timestamp(struct auxtrace_record *itr, struct evsel * * first is better. In theory the kernel could still disable the option for * some other reason so this is best effort only. */ -static int cs_etm_validate_config(struct auxtrace_record *itr, +static int cs_etm_validate_config(struct perf_pmu *cs_etm_pmu, struct evsel *evsel) { int idx, err = 0; @@ -204,11 +205,11 @@ static int cs_etm_validate_config(struct auxtrace_record *itr, } perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { - err = cs_etm_validate_context_id(itr, evsel, cpu); + err = cs_etm_validate_context_id(cs_etm_pmu, evsel, cpu); if (err) break; - err = cs_etm_validate_timestamp(itr, evsel, cpu); + err = cs_etm_validate_timestamp(cs_etm_pmu, evsel, cpu); if (err) break; } @@ -449,7 +450,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(evsel, TIME); - err = cs_etm_validate_config(itr, cs_etm_evsel); + err = cs_etm_validate_config(cs_etm_pmu, cs_etm_evsel); out: return err; } @@ -515,14 +516,15 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr) } static size_t -cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, - struct evlist *evlist __maybe_unused) +cs_etm_info_priv_size(struct auxtrace_record *itr, + struct evlist *evlist) { int idx; int etmv3 = 0, etmv4 = 0, ete = 0; struct perf_cpu_map *event_cpus = evlist->core.user_requested_cpus; struct perf_cpu_map *intersect_cpus; struct perf_cpu cpu; + struct perf_pmu *cs_etm_pmu = cs_etm_get_pmu(itr); if (!perf_cpu_map__has_any_cpu(event_cpus)) { /* cpu map is not "any" CPU , we have specific CPUs to work with */ @@ -535,9 +537,9 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, intersect_cpus = perf_cpu_map__new_online_cpus(); } perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { - if (cs_etm_is_ete(itr, cpu)) + if (cs_etm_is_ete(cs_etm_pmu, cpu)) ete++; - else if (cs_etm_is_etmv4(itr, cpu)) + else if (cs_etm_is_etmv4(cs_etm_pmu, cpu)) etmv4++; else etmv3++; @@ -550,12 +552,8 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, (etmv3 * CS_ETMV3_PRIV_SIZE)); } -static bool cs_etm_is_etmv4(struct auxtrace_record *itr, struct perf_cpu cpu) +static bool cs_etm_is_etmv4(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu) { - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; - /* Take any of the RO files for ETMv4 and see if it present */ return cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); } @@ -615,10 +613,8 @@ static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, co #define TRCDEVARCH_ARCHVER_MASK GENMASK(15, 12) #define TRCDEVARCH_ARCHVER(x) (((x) & TRCDEVARCH_ARCHVER_MASK) >> TRCDEVARCH_ARCHVER_SHIFT) -static bool cs_etm_is_ete(struct auxtrace_record *itr, struct perf_cpu cpu) +static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu) { - struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; __u64 trcdevarch; if (!cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCDEVARCH])) @@ -707,19 +703,17 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, { u32 increment, nr_trc_params; u64 magic; - struct cs_etm_recording *ptr = - container_of(itr, struct cs_etm_recording, itr); - struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; + struct perf_pmu *cs_etm_pmu = cs_etm_get_pmu(itr); /* first see what kind of tracer this cpu is affined to */ - if (cs_etm_is_ete(itr, cpu)) { + if (cs_etm_is_ete(cs_etm_pmu, cpu)) { magic = __perf_cs_ete_magic; cs_etm_save_ete_header(&info->priv[*offset], itr, cpu); /* How much space was used */ increment = CS_ETE_PRIV_MAX; nr_trc_params = CS_ETE_PRIV_MAX - CS_ETM_COMMON_BLK_MAX_V1; - } else if (cs_etm_is_etmv4(itr, cpu)) { + } else if (cs_etm_is_etmv4(cs_etm_pmu, cpu)) { magic = __perf_cs_etmv4_magic; cs_etm_save_etmv4_header(&info->priv[*offset], itr, cpu); -- cgit v1.2.3 From e3123079b906dc2edb80466de85b2c333a56fbf2 Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 1 May 2024 14:57:53 +0100 Subject: perf cs-etm: Improve version detection and error reporting When the config validation functions are warning about ETMv3, they do it based on "not ETMv4". If the drivers aren't all loaded or the hardware doesn't support Coresight it will appear as "not ETMv4" and then Perf will print the error message "... not supported in ETMv3 ..." which is wrong and confusing. cs_etm_is_etmv4() is also misnamed because it also returns true for ETE because ETE has a superset of the ETMv4 metadata files. Although this was always done in the correct order so it wasn't a bug. Improve all this by making a single get version function which also handles not present as a separate case. Change the ETMv3 error message to only print when ETMv3 is detected, and add a new error message for the not present case. Reviewed-by: Ian Rogers Reviewed-by: Leo Yan Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240501135753.508022-4-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 61 +++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 2fc4b41daea1..da6231367993 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -66,11 +66,25 @@ static const char * const metadata_ete_ro[] = { [CS_ETE_TS_SOURCE] = "ts_source", }; -static bool cs_etm_is_etmv4(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu); +enum cs_etm_version { CS_NOT_PRESENT, CS_ETMV3, CS_ETMV4, CS_ETE }; + static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu); static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val); static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path); +static enum cs_etm_version cs_etm_get_version(struct perf_pmu *cs_etm_pmu, + struct perf_cpu cpu) +{ + if (cs_etm_is_ete(cs_etm_pmu, cpu)) + return CS_ETE; + else if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0])) + return CS_ETMV4; + else if (cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMCCER])) + return CS_ETMV3; + + return CS_NOT_PRESENT; +} + static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel *evsel, struct perf_cpu cpu) { @@ -85,7 +99,7 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel return 0; /* Not supported in etmv3 */ - if (!cs_etm_is_etmv4(cs_etm_pmu, cpu)) { + if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_ETMV3) { pr_err("%s: contextid not supported in ETMv3, disable with %s/contextid=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; @@ -141,7 +155,7 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel * perf_pmu__format_bits(cs_etm_pmu, "timestamp"))) return 0; - if (!cs_etm_is_etmv4(cs_etm_pmu, cpu)) { + if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_ETMV3) { pr_err("%s: timestamp not supported in ETMv3, disable with %s/timestamp=0/\n", CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); return -EINVAL; @@ -205,6 +219,11 @@ static int cs_etm_validate_config(struct perf_pmu *cs_etm_pmu, } perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { + if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_NOT_PRESENT) { + pr_err("%s: Not found on CPU %d. Check hardware and firmware support and that all Coresight drivers are loaded\n", + CORESIGHT_ETM_PMU_NAME, cpu.cpu); + return -EINVAL; + } err = cs_etm_validate_context_id(cs_etm_pmu, evsel, cpu); if (err) break; @@ -536,13 +555,13 @@ cs_etm_info_priv_size(struct auxtrace_record *itr, /* Event can be "any" CPU so count all online CPUs. */ intersect_cpus = perf_cpu_map__new_online_cpus(); } + /* Count number of each type of ETM. Don't count if that CPU has CS_NOT_PRESENT. */ perf_cpu_map__for_each_cpu_skip_any(cpu, idx, intersect_cpus) { - if (cs_etm_is_ete(cs_etm_pmu, cpu)) - ete++; - else if (cs_etm_is_etmv4(cs_etm_pmu, cpu)) - etmv4++; - else - etmv3++; + enum cs_etm_version v = cs_etm_get_version(cs_etm_pmu, cpu); + + ete += v == CS_ETE; + etmv4 += v == CS_ETMV4; + etmv3 += v == CS_ETMV3; } perf_cpu_map__put(intersect_cpus); @@ -552,12 +571,6 @@ cs_etm_info_priv_size(struct auxtrace_record *itr, (etmv3 * CS_ETMV3_PRIV_SIZE)); } -static bool cs_etm_is_etmv4(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu) -{ - /* Take any of the RO files for ETMv4 and see if it present */ - return cs_etm_pmu_path_exists(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0]); -} - static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val) { char pmu_path[PATH_MAX]; @@ -706,21 +719,26 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, struct perf_pmu *cs_etm_pmu = cs_etm_get_pmu(itr); /* first see what kind of tracer this cpu is affined to */ - if (cs_etm_is_ete(cs_etm_pmu, cpu)) { + switch (cs_etm_get_version(cs_etm_pmu, cpu)) { + case CS_ETE: magic = __perf_cs_ete_magic; cs_etm_save_ete_header(&info->priv[*offset], itr, cpu); /* How much space was used */ increment = CS_ETE_PRIV_MAX; nr_trc_params = CS_ETE_PRIV_MAX - CS_ETM_COMMON_BLK_MAX_V1; - } else if (cs_etm_is_etmv4(cs_etm_pmu, cpu)) { + break; + + case CS_ETMV4: magic = __perf_cs_etmv4_magic; cs_etm_save_etmv4_header(&info->priv[*offset], itr, cpu); /* How much space was used */ increment = CS_ETMV4_PRIV_MAX; nr_trc_params = CS_ETMV4_PRIV_MAX - CS_ETMV4_TRCCONFIGR; - } else { + break; + + case CS_ETMV3: magic = __perf_cs_etmv3_magic; /* Get configuration register */ info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr); @@ -736,6 +754,13 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, /* How much space was used */ increment = CS_ETM_PRIV_MAX; nr_trc_params = CS_ETM_PRIV_MAX - CS_ETM_ETMCR; + break; + + default: + case CS_NOT_PRESENT: + /* Unreachable, CPUs already validated in cs_etm_validate_config() */ + assert(true); + return; } /* Build generic header portion */ -- cgit v1.2.3 From 5a3941f84b8f91bb1e111499d803b32188d33e5d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 2 May 2024 09:55:40 +0200 Subject: libbpf: Fix error message in attach_kprobe_session We just failed to retrieve pattern, so we need to print spec instead. Fixes: 2ca178f02b2f ("libbpf: Add support for kprobe session attach") Reported-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502075541.1425761-1-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4ffc8873d1ad..68c87aed8335 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11605,7 +11605,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, spec = prog->sec_name + sizeof("kprobe.session/") - 1; n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); if (n < 1) { - pr_warn("kprobe session pattern is invalid: %s\n", pattern); + pr_warn("kprobe session pattern is invalid: %s\n", spec); return -EINVAL; } -- cgit v1.2.3 From 7c13ef16e87ac2e44d16c0468b1191bceb06f95c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 2 May 2024 09:55:41 +0200 Subject: libbpf: Fix error message in attach_kprobe_multi We just failed to retrieve pattern, so we need to print spec instead. Fixes: ddc6b04989eb ("libbpf: Add bpf_program__attach_kprobe_multi_opts function") Reported-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502075541.1425761-2-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 68c87aed8335..a3566a456bc8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11579,7 +11579,7 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); if (n < 1) { - pr_warn("kprobe multi pattern is invalid: %s\n", pattern); + pr_warn("kprobe multi pattern is invalid: %s\n", spec); return -EINVAL; } -- cgit v1.2.3 From 08e90da6872a9f9f63ca2911bbce6883b6fc1a19 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Thu, 2 May 2024 16:08:31 +0200 Subject: bpf: Missing trailing slash in tools/testing/selftests/bpf/Makefile tools/lib/bpf/Makefile assumes that the patch in OUTPUT is a directory and that it includes a trailing slash. This seems to be a common expectation for OUTPUT among all the Makefiles. In the rule for runqslower in tools/testing/selftests/bpf/Makefile the variable BPFTOOL_OUTPUT is set to a directory name that lacks a trailing slash. This results in a malformed BPF_HELPER_DEFS being defined in lib/bpf/Makefile. This problem becomes evident when a file like tools/lib/bpf/bpf_tracing.h gets updated. This patch fixes the problem by adding the missing slash in the value for BPFTOOL_OUTPUT in the $(OUTPUT)/runqslower rule. Regtested by running selftests in bpf-next master and building samples/bpf programs. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502140831.23915-1-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 82247aeef857..ba28d42b74db 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -262,7 +262,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \ BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ - BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf \ + BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \ BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \ EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS)' \ EXTRA_LDFLAGS='$(SAN_LDFLAGS)' && \ -- cgit v1.2.3 From 8e667a065daa6f4c01eadc20f3815f7bf13255bc Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:18 -0500 Subject: selftests/bpf: Fix bind program for big endian systems Without this fix, the bind4 and bind6 programs will reject bind attempts on big endian systems. This patch ensures that CI tests pass for the s390x architecture. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-2-jrife@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/bind4_prog.c | 18 ++++++++++-------- tools/testing/selftests/bpf/progs/bind6_prog.c | 18 ++++++++++-------- tools/testing/selftests/bpf/progs/bind_prog.h | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 16 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/bind_prog.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index a487f60b73ac..66005c1a5b36 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -12,6 +12,8 @@ #include #include +#include "bind_prog.h" + #define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ #define SERV4_PORT 4040 #define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ @@ -118,23 +120,23 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[2] << 16; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24; + user_ip4 |= load_byte(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 1, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 2, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 3, sizeof(user_ip4)); if (ctx->user_ip4 != user_ip4) return 0; user_port = 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16; + user_ip4 |= load_word(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_word(ctx->user_ip4, 1, sizeof(user_ip4)); if (ctx->user_ip4 != user_ip4) return 0; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index d62cd9e9cf0e..9c86c712348c 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -12,6 +12,8 @@ #include #include +#include "bind_prog.h" + #define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ #define SERV6_IP_1 0x12345678 #define SERV6_IP_2 0x00000000 @@ -129,25 +131,25 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24; + user_ip6 |= load_byte(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 1, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 2, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 3, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } user_port = 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16; + user_ip6 |= load_word(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_word(ctx->user_ip6[i], 1, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } diff --git a/tools/testing/selftests/bpf/progs/bind_prog.h b/tools/testing/selftests/bpf/progs/bind_prog.h new file mode 100644 index 000000000000..e830caa940c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind_prog.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BIND_PROG_H__ +#define __BIND_PROG_H__ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[b] << 8 * b) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * w) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[(b) + (sizeof(src) - (s))] << 8 * ((s) - (b) - 1)) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * (((s) / 2) - (w) - 1)) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" +#endif + +#endif -- cgit v1.2.3 From bbb1cfdd02249dc8cf878e86a523b28814ed36c0 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:19 -0500 Subject: selftests/bpf: Implement socket kfuncs for bpf_testmod This patch adds a set of kfuncs to bpf_testmod that can be used to manipulate a socket from kernel space. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-3-jrife@google.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 255 +++++++++++++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 27 +++ 2 files changed, 282 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index eb2b78552ca2..e93013fc7bf4 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -10,18 +10,30 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include "bpf_testmod.h" #include "bpf_testmod_kfunc.h" #define CREATE_TRACE_POINTS #include "bpf_testmod-events.h" +#define CONNECT_TIMEOUT_SEC 1 + typedef int (*func_proto_typedef)(long); typedef int (*func_proto_typedef_nested1)(func_proto_typedef); typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1); DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; long bpf_testmod_test_struct_arg_result; +static DEFINE_MUTEX(sock_lock); +static struct socket *sock; struct bpf_testmod_struct_arg_1 { int a; @@ -498,6 +510,237 @@ __bpf_kfunc void bpf_kfunc_call_test_sleepable(void) { } +__bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args) +{ + int proto; + int err; + + mutex_lock(&sock_lock); + + if (sock) { + pr_err("%s called without releasing old sock", __func__); + err = -EPERM; + goto out; + } + + switch (args->af) { + case AF_INET: + case AF_INET6: + proto = args->type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; + break; + case AF_UNIX: + proto = PF_UNIX; + break; + default: + pr_err("invalid address family %d\n", args->af); + err = -EINVAL; + goto out; + } + + err = sock_create_kern(current->nsproxy->net_ns, args->af, args->type, + proto, &sock); + + if (!err) + /* Set timeout for call to kernel_connect() to prevent it from hanging, + * and consider the connection attempt failed if it returns + * -EINPROGRESS. + */ + sock->sk->sk_sndtimeo = CONNECT_TIMEOUT_SEC * HZ; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc void bpf_kfunc_close_sock(void) +{ + mutex_lock(&sock_lock); + + if (sock) { + sock_release(sock); + sock = NULL; + } + + mutex_unlock(&sock_lock); +} + +__bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_connect(sock, (struct sockaddr *)&args->addr, + args->addrlen, 0); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_listen(void) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_listen(sock, 128); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_sendmsg(sock, &msg, &iov, 1, args->msglen); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, args->msglen); + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = sock_sendmsg(sock, &msg); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getsockname(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getpeername(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -525,6 +768,15 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_bind, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_listen, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -655,6 +907,8 @@ static int bpf_testmod_init(void) return ret; if (bpf_fentry_test1(0) < 0) return -EINVAL; + sock = NULL; + mutex_init(&sock_lock); return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } @@ -668,6 +922,7 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + bpf_kfunc_close_sock(); sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index ce5cd763561c..b0d586a6751f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -64,6 +64,22 @@ struct prog_test_fail3 { char arr2[]; }; +struct init_sock_args { + int af; + int type; +}; + +struct addr_args { + char addr[sizeof(struct __kernel_sockaddr_storage)]; + int addrlen; +}; + +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; + struct prog_test_ref_kfunc * bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) __ksym; void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; @@ -107,4 +123,15 @@ void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p); void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len); void bpf_kfunc_common_test(void) __ksym; + +int bpf_kfunc_init_sock(struct init_sock_args *args) __ksym; +void bpf_kfunc_close_sock(void) __ksym; +int bpf_kfunc_call_kernel_connect(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_bind(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_listen(void) __ksym; +int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 15b6671efa508ff9c1fb995452913f8de85db73b Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:20 -0500 Subject: selftests/bpf: Implement BPF programs for kernel socket operations This patch lays out a set of SYSCALL programs that can be used to invoke the socket operation kfuncs in bpf_testmod, allowing a test program to manipulate kernel socket operations from userspace. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-4-jrife@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/sock_addr_kern.c | 65 ++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/sock_addr_kern.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/sock_addr_kern.c b/tools/testing/selftests/bpf/progs/sock_addr_kern.c new file mode 100644 index 000000000000..8386bb15ccdc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sock_addr_kern.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ +#include +#include +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +SEC("syscall") +int init_sock(struct init_sock_args *args) +{ + bpf_kfunc_init_sock(args); + + return 0; +} + +SEC("syscall") +int close_sock(void *ctx) +{ + bpf_kfunc_close_sock(); + + return 0; +} + +SEC("syscall") +int kernel_connect(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_connect(args); +} + +SEC("syscall") +int kernel_bind(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_bind(args); +} + +SEC("syscall") +int kernel_listen(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_listen(); +} + +SEC("syscall") +int kernel_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_kernel_sendmsg(args); +} + +SEC("syscall") +int sock_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_sock_sendmsg(args); +} + +SEC("syscall") +int kernel_getsockname(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getsockname(args); +} + +SEC("syscall") +int kernel_getpeername(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getpeername(args); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 3cdd98b42d212160d7aae746a97960a4595cbfc2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 29 Apr 2024 15:57:38 -0700 Subject: perf maps: Remove check_invariants() from maps__lock() I found that the debug build was a slowed down a lot by the maps lock code since it checks the invariants whenever it gets the pointer to the lock. This means it checks twice the invariants before and after the access. Instead, let's move the checking code within the lock area but after any modification and remove it from the read paths. This would remove (more than) half of the maps lock overhead. The time for perf report with a huge data file (200k+ of MMAP2 events). Non-debug Before After --------- -------- -------- 2m 43s 6m 45s 4m 21s Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240429225738.1491791-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index ce13145a9f8e..ac9fb880ddc7 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -211,11 +211,6 @@ void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libun static struct rw_semaphore *maps__lock(struct maps *maps) { - /* - * When the lock is acquired or released the maps invariants should - * hold. - */ - check_invariants(maps); return &RC_CHK_ACCESS(maps)->lock; } @@ -358,6 +353,7 @@ static int map__strcmp(const void *a, const void *b) static int maps__sort_by_name(struct maps *maps) { int err = 0; + down_write(maps__lock(maps)); if (!maps__maps_by_name_sorted(maps)) { struct map **maps_by_name = maps__maps_by_name(maps); @@ -384,6 +380,7 @@ static int maps__sort_by_name(struct maps *maps) maps__set_maps_by_name_sorted(maps, true); } } + check_invariants(maps); up_write(maps__lock(maps)); return err; } @@ -502,6 +499,7 @@ int maps__insert(struct maps *maps, struct map *map) down_write(maps__lock(maps)); ret = __maps__insert(maps, map); + check_invariants(maps); up_write(maps__lock(maps)); return ret; } @@ -536,6 +534,7 @@ void maps__remove(struct maps *maps, struct map *map) { down_write(maps__lock(maps)); __maps__remove(maps, map); + check_invariants(maps); up_write(maps__lock(maps)); } @@ -602,6 +601,7 @@ void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data else i++; } + check_invariants(maps); up_write(maps__lock(maps)); } @@ -942,6 +942,8 @@ int maps__copy_from(struct maps *dest, struct maps *parent) map__put(new); } } + check_invariants(dest); + up_read(maps__lock(parent)); up_write(maps__lock(dest)); return err; @@ -1097,6 +1099,7 @@ void maps__fixup_end(struct maps *maps) map__set_end(maps_by_address[n - 1], ~0ULL); RC_CHK_ACCESS(maps)->ends_broken = false; + check_invariants(maps); up_write(maps__lock(maps)); } @@ -1147,6 +1150,8 @@ int maps__merge_in(struct maps *kmaps, struct map *new_map) map__start(kmaps_maps_by_address[first_after_]) >= map__end(new_map)) { /* No overlap so regular insert suffices. */ int ret = __maps__insert(kmaps, new_map); + + check_invariants(kmaps); up_write(maps__lock(kmaps)); return ret; } @@ -1184,6 +1189,7 @@ int maps__merge_in(struct maps *kmaps, struct map *new_map) map__zput(kmaps_maps_by_address[i]); free(kmaps_maps_by_address); + check_invariants(kmaps); up_write(maps__lock(kmaps)); return 0; } -- cgit v1.2.3 From 8a9d22b8aeb2182cfe83991f11a88b3351084d3e Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:21 -0500 Subject: selftests/bpf: Move IPv4 and IPv6 sockaddr test cases This patch lays the groundwork for testing IPv4 and IPv6 sockaddr hooks and their interaction with both socket syscalls and kernel functions (e.g. kernel_connect, kernel_bind, etc.). It moves some of the test cases from the old-style bpf/test_sock_addr.c self test into the sock_addr prog_test in a step towards fully retiring bpf/test_sock_addr.c. We will expand the test dimensions in the sock_addr prog_test in a later patch series in order to migrate the remaining test cases. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-5-jrife@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 390 ++++++++++++++------- tools/testing/selftests/bpf/test_sock_addr.c | 192 ---------- 2 files changed, 268 insertions(+), 314 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 61668e0f11b0..bf1ad18262d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -3,16 +3,44 @@ #include "test_progs.h" +#include "bind4_prog.skel.h" +#include "bind6_prog.skel.h" #include "connect_unix_prog.skel.h" +#include "connect4_prog.skel.h" +#include "connect6_prog.skel.h" +#include "sendmsg4_prog.skel.h" +#include "sendmsg6_prog.skel.h" +#include "recvmsg4_prog.skel.h" +#include "recvmsg6_prog.skel.h" #include "sendmsg_unix_prog.skel.h" #include "recvmsg_unix_prog.skel.h" #include "getsockname_unix_prog.skel.h" #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" +#define TEST_NS "sock_addr" +#define TEST_IF_PREFIX "test_sock_addr" +#define TEST_IPV4 "127.0.0.4" +#define TEST_IPV6 "::6" + +#define SERV4_IP "192.168.1.254" +#define SERV4_REWRITE_IP "127.0.0.1" +#define SRC4_IP "172.16.0.1" +#define SRC4_REWRITE_IP TEST_IPV4 +#define SERV4_PORT 4040 +#define SERV4_REWRITE_PORT 4444 + +#define SERV6_IP "face:b00c:1234:5678::abcd" +#define SERV6_REWRITE_IP "::1" +#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" +#define SRC6_IP "::1" +#define SRC6_REWRITE_IP TEST_IPV6 +#define SERV6_PORT 6060 +#define SERV6_REWRITE_PORT 6666 + #define SERVUN_ADDRESS "bpf_cgroup_unix_test" #define SERVUN_REWRITE_ADDRESS "bpf_cgroup_unix_test_rewrite" -#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" +#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" enum sock_addr_test_type { SOCK_ADDR_TEST_BIND, @@ -43,130 +71,148 @@ struct sock_addr_test { const char *expected_src_addr; }; -static void *connect_unix_prog_load(int cgroup_fd) -{ - struct connect_unix_prog *skel; - - skel = connect_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.connect_unix_prog = bpf_program__attach_cgroup( - skel->progs.connect_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.connect_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - connect_unix_prog__destroy(skel); - return NULL; -} - -static void connect_unix_prog_destroy(void *skel) -{ - connect_unix_prog__destroy(skel); -} - -static void *sendmsg_unix_prog_load(int cgroup_fd) -{ - struct sendmsg_unix_prog *skel; - - skel = sendmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.sendmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.sendmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.sendmsg_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - sendmsg_unix_prog__destroy(skel); - return NULL; -} - -static void sendmsg_unix_prog_destroy(void *skel) -{ - sendmsg_unix_prog__destroy(skel); -} - -static void *recvmsg_unix_prog_load(int cgroup_fd) -{ - struct recvmsg_unix_prog *skel; - - skel = recvmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.recvmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.recvmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.recvmsg_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - recvmsg_unix_prog__destroy(skel); - return NULL; -} - -static void recvmsg_unix_prog_destroy(void *skel) -{ - recvmsg_unix_prog__destroy(skel); -} - -static void *getsockname_unix_prog_load(int cgroup_fd) -{ - struct getsockname_unix_prog *skel; - - skel = getsockname_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.getsockname_unix_prog = bpf_program__attach_cgroup( - skel->progs.getsockname_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getsockname_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - getsockname_unix_prog__destroy(skel); - return NULL; +#define BPF_SKEL_FUNCS(skel_name, prog_name) \ +static void *skel_name##_load(int cgroup_fd) \ +{ \ + struct skel_name *skel; \ + skel = skel_name##__open_and_load(); \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + skel->links.prog_name = bpf_program__attach_cgroup( \ + skel->progs.prog_name, cgroup_fd); \ + if (!ASSERT_OK_PTR(skel->links.prog_name, "prog_attach")) \ + goto cleanup; \ + return skel; \ +cleanup: \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void skel_name##_destroy(void *skel) \ +{ \ + skel_name##__destroy(skel); \ } -static void getsockname_unix_prog_destroy(void *skel) -{ - getsockname_unix_prog__destroy(skel); -} - -static void *getpeername_unix_prog_load(int cgroup_fd) -{ - struct getpeername_unix_prog *skel; - - skel = getpeername_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.getpeername_unix_prog = bpf_program__attach_cgroup( - skel->progs.getpeername_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getpeername_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - getpeername_unix_prog__destroy(skel); - return NULL; -} - -static void getpeername_unix_prog_destroy(void *skel) -{ - getpeername_unix_prog__destroy(skel); -} +BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); static struct sock_addr_test tests[] = { + /* bind - system calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (stream)", + bind4_prog_load, + bind4_prog_destroy, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (dgram)", + bind4_prog_load, + bind4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (stream)", + bind6_prog_load, + bind6_prog_destroy, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (dgram)", + bind6_prog_load, + bind6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + + /* connect - system calls */ + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect (stream)", + connect4_prog_load, + connect4_prog_destroy, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect (dgram)", + connect4_prog_load, + connect4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect (stream)", + connect6_prog_load, + connect6_prog_destroy, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect (dgram)", + connect6_prog_load, + connect6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, { SOCK_ADDR_TEST_CONNECT, - "connect_unix", + "connect_unix: connect (stream)", connect_unix_prog_load, connect_unix_prog_destroy, AF_UNIX, @@ -177,9 +223,37 @@ static struct sock_addr_test tests[] = { 0, NULL, }, + + /* sendmsg - system calls */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sendmsg (dgram)", + sendmsg4_prog_load, + sendmsg4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg (dgram)", + sendmsg6_prog_load, + sendmsg6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, { SOCK_ADDR_TEST_SENDMSG, - "sendmsg_unix", + "sendmsg_unix: sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, AF_UNIX, @@ -190,9 +264,37 @@ static struct sock_addr_test tests[] = { 0, NULL, }, + + /* recvmsg - system calls */ + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg4: recvfrom (dgram)", + recvmsg4_prog_load, + recvmsg4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + }, { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-dgram", + "recvmsg6: recvfrom (dgram)", + recvmsg6_prog_load, + recvmsg6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, AF_UNIX, @@ -205,7 +307,7 @@ static struct sock_addr_test tests[] = { }, { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-stream", + "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, AF_UNIX, @@ -216,6 +318,8 @@ static struct sock_addr_test tests[] = { 0, SERVUN_ADDRESS, }, + + /* getsockname - system calls */ { SOCK_ADDR_TEST_GETSOCKNAME, "getsockname_unix", @@ -229,6 +333,8 @@ static struct sock_addr_test tests[] = { 0, NULL, }, + + /* getpeername - system calls */ { SOCK_ADDR_TEST_GETPEERNAME, "getpeername_unix", @@ -558,8 +664,44 @@ cleanup: close(serv); } +static int setup_test_env(struct nstoken **tok) +{ + int err; + + SYS_NOFAIL("ip netns delete %s", TEST_NS); + SYS(fail, "ip netns add %s", TEST_NS); + *tok = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(*tok, "netns token")) + goto fail; + + SYS(fail, "ip link add dev %s1 type veth peer name %s2", TEST_IF_PREFIX, + TEST_IF_PREFIX); + SYS(fail, "ip link set lo up"); + SYS(fail, "ip link set %s1 up", TEST_IF_PREFIX); + SYS(fail, "ip link set %s2 up", TEST_IF_PREFIX); + SYS(fail, "ip -4 addr add %s/8 dev %s1", TEST_IPV4, TEST_IF_PREFIX); + SYS(fail, "ip -6 addr add %s/128 nodad dev %s1", TEST_IPV6, TEST_IF_PREFIX); + + err = 0; + goto out; +fail: + err = -1; + close_netns(*tok); + *tok = NULL; + SYS_NOFAIL("ip netns delete %s", TEST_NS); +out: + return err; +} + +static void cleanup_test_env(struct nstoken *tok) +{ + close_netns(tok); + SYS_NOFAIL("ip netns delete %s", TEST_NS); +} + void test_sock_addr(void) { + struct nstoken *tok = NULL; int cgroup_fd = -1; void *skel; @@ -567,6 +709,9 @@ void test_sock_addr(void) if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) goto cleanup; + if (!ASSERT_OK(setup_test_env(&tok), "setup_test_env")) + goto cleanup; + for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; @@ -607,6 +752,7 @@ void test_sock_addr(void) } cleanup: + cleanup_test_env(tok); if (cgroup_fd >= 0) close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index c412de84b88f..aa2198a0f24d 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -97,11 +97,7 @@ static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int recvmsg_allow_prog_load(const struct sock_addr_test *test); static int recvmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); @@ -135,34 +131,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "bind4: rewrite IP & TCP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind4: rewrite IP & UDP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, { "bind6: load prog with wrong expected attach type", bind6_prog_load, @@ -191,34 +159,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "bind6: rewrite IP & TCP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind6: rewrite IP & UDP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, /* connect */ { @@ -249,34 +189,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "connect4: rewrite IP & TCP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "connect4: rewrite IP & UDP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, { "connect6: load prog with wrong expected attach type", connect6_prog_load, @@ -305,34 +217,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "connect6: rewrite IP & TCP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "connect6: rewrite IP & UDP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, /* sendmsg */ { @@ -377,20 +261,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg4: rewrite IP & port (C)", - sendmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, { "sendmsg4: deny call", sendmsg_deny_prog_load, @@ -447,20 +317,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: rewrite IP & port (C)", - sendmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, { "sendmsg6: IPv4-mapped IPv6", sendmsg6_rw_v4mapped_prog_load, @@ -575,34 +431,6 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, - { - "recvmsg4: rewrite IP & port (C)", - recvmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_IP, - SUCCESS, - }, - { - "recvmsg6: rewrite IP & port (C)", - recvmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_IP, - SUCCESS, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -761,16 +589,6 @@ static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) return load_insns(test, insns, ARRAY_SIZE(insns)); } -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG4_PROG_PATH); -} - -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG4_PROG_PATH); -} - static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, const char *rw_dst_ip) { @@ -829,11 +647,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); } -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG6_PROG_PATH); -} - static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) { return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP); @@ -844,11 +657,6 @@ static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); } -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG6_PROG_PATH); -} - static int cmp_addr(const struct sockaddr_storage *addr1, const struct sockaddr_storage *addr2, int cmp_port) { -- cgit v1.2.3 From 524e05ac4e14bb50b4f442e1fe88540abc9b72fd Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:22 -0500 Subject: selftests/bpf: Make sock configurable for each test case In order to reuse the same test code for both socket system calls (e.g. connect(), bind(), etc.) and kernel socket functions (e.g. kernel_connect(), kernel_bind(), etc.), this patch introduces the "ops" field to sock_addr_test. This field allows each test cases to configure the set of functions used in the test case to create, manipulate, and tear down a socket. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-6-jrife@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 140 ++++++++++++++------- 1 file changed, 98 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index bf1ad18262d8..8f678fa413f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -54,12 +54,64 @@ enum sock_addr_test_type { typedef void *(*load_fn)(int cgroup_fd); typedef void (*destroy_fn)(void *skel); +struct sock_ops { + int (*connect_to_addr)(int type, const struct sockaddr_storage *addr, + socklen_t addrlen, + const struct network_helper_opts *opts); + int (*start_server)(int family, int type, const char *addr_str, + __u16 port, int timeout_ms); + int (*socket)(int famil, int type, int protocol); + int (*bind)(int fd, struct sockaddr *addr, socklen_t addrlen); + int (*getsockname)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*getpeername)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*sendmsg)(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen); + int (*close)(int fd); +}; + +static int user_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) +{ + struct msghdr hdr; + struct iovec iov; + + memset(&iov, 0, sizeof(iov)); + iov.iov_base = msg; + iov.iov_len = msglen; + + memset(&hdr, 0, sizeof(hdr)); + hdr.msg_name = (void *)addr; + hdr.msg_namelen = addrlen; + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + + return sendmsg(fd, &hdr, 0); +} + +static int user_bind(int fd, struct sockaddr *addr, socklen_t addrlen) +{ + return bind(fd, (const struct sockaddr *)addr, addrlen); +} + +struct sock_ops user_ops = { + .connect_to_addr = connect_to_addr, + .start_server = start_server, + .socket = socket, + .bind = user_bind, + .getsockname = getsockname, + .getpeername = getpeername, + .sendmsg = user_sendmsg, + .close = close, +}; + struct sock_addr_test { enum sock_addr_test_type type; const char *name; /* BPF prog properties */ load_fn loadfn; destroy_fn destroyfn; + /* Socket operations */ + struct sock_ops *ops; /* Socket properties */ int socket_family; int socket_type; @@ -113,6 +165,7 @@ static struct sock_addr_test tests[] = { "bind4: bind (stream)", bind4_prog_load, bind4_prog_destroy, + &user_ops, AF_INET, SOCK_STREAM, SERV4_IP, @@ -125,6 +178,7 @@ static struct sock_addr_test tests[] = { "bind4: bind (dgram)", bind4_prog_load, bind4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_IP, @@ -137,6 +191,7 @@ static struct sock_addr_test tests[] = { "bind6: bind (stream)", bind6_prog_load, bind6_prog_destroy, + &user_ops, AF_INET6, SOCK_STREAM, SERV6_IP, @@ -149,6 +204,7 @@ static struct sock_addr_test tests[] = { "bind6: bind (dgram)", bind6_prog_load, bind6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_IP, @@ -163,6 +219,7 @@ static struct sock_addr_test tests[] = { "connect4: connect (stream)", connect4_prog_load, connect4_prog_destroy, + &user_ops, AF_INET, SOCK_STREAM, SERV4_IP, @@ -176,6 +233,7 @@ static struct sock_addr_test tests[] = { "connect4: connect (dgram)", connect4_prog_load, connect4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_IP, @@ -189,6 +247,7 @@ static struct sock_addr_test tests[] = { "connect6: connect (stream)", connect6_prog_load, connect6_prog_destroy, + &user_ops, AF_INET6, SOCK_STREAM, SERV6_IP, @@ -202,6 +261,7 @@ static struct sock_addr_test tests[] = { "connect6: connect (dgram)", connect6_prog_load, connect6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_IP, @@ -215,6 +275,7 @@ static struct sock_addr_test tests[] = { "connect_unix: connect (stream)", connect_unix_prog_load, connect_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -230,6 +291,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: sendmsg (dgram)", sendmsg4_prog_load, sendmsg4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_IP, @@ -243,6 +305,7 @@ static struct sock_addr_test tests[] = { "sendmsg6: sendmsg (dgram)", sendmsg6_prog_load, sendmsg6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_IP, @@ -256,6 +319,7 @@ static struct sock_addr_test tests[] = { "sendmsg_unix: sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_DGRAM, SERVUN_ADDRESS, @@ -271,6 +335,7 @@ static struct sock_addr_test tests[] = { "recvmsg4: recvfrom (dgram)", recvmsg4_prog_load, recvmsg4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_REWRITE_IP, @@ -284,6 +349,7 @@ static struct sock_addr_test tests[] = { "recvmsg6: recvfrom (dgram)", recvmsg6_prog_load, recvmsg6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_REWRITE_IP, @@ -297,6 +363,7 @@ static struct sock_addr_test tests[] = { "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_DGRAM, SERVUN_REWRITE_ADDRESS, @@ -310,6 +377,7 @@ static struct sock_addr_test tests[] = { "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_REWRITE_ADDRESS, @@ -325,6 +393,7 @@ static struct sock_addr_test tests[] = { "getsockname_unix", getsockname_unix_prog_load, getsockname_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -340,6 +409,7 @@ static struct sock_addr_test tests[] = { "getpeername_unix", getpeername_unix_prog_load, getpeername_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -400,26 +470,15 @@ static int cmp_sock_addr(info_fn fn, int sock1, return cmp_addr(&addr1, len1, addr2, addr2_len, cmp_port); } -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) -{ - return cmp_sock_addr(getsockname, sock1, addr2, addr2_len, cmp_port); -} - -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) -{ - return cmp_sock_addr(getpeername, sock1, addr2, addr2_len, cmp_port); -} - static void test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, client = -1, err; - serv = start_server(test->socket_family, test->socket_type, - test->requested_addr, test->requested_port, 0); + serv = test->ops->start_server(test->socket_family, test->socket_type, + test->requested_addr, + test->requested_port, 0); if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; @@ -429,7 +488,8 @@ static void test_bind(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; @@ -442,7 +502,7 @@ cleanup: if (client != -1) close(client); if (serv != -1) - close(serv); + test->ops->close(serv); } static void test_connect(struct sock_addr_test *test) @@ -463,7 +523,8 @@ static void test_connect(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -479,18 +540,21 @@ static void test_connect(struct sock_addr_test *test) goto cleanup; } - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; if (test->expected_src_addr) { - err = cmp_local_addr(client, &expected_src_addr, expected_src_addr_len, false); + err = cmp_sock_addr(test->ops->getsockname, client, + &expected_src_addr, expected_src_addr_len, + false); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; } cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); } @@ -500,8 +564,6 @@ static void test_xmsg(struct sock_addr_test *test) struct sockaddr_storage addr, src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), src_addr_len = sizeof(struct sockaddr_storage); - struct msghdr hdr; - struct iovec iov; char data = 'a'; int serv = -1, client = -1, err; @@ -514,7 +576,7 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; - client = socket(test->socket_family, test->socket_type, 0); + client = test->ops->socket(test->socket_family, test->socket_type, 0); if (!ASSERT_GE(client, 0, "socket")) goto cleanup; @@ -524,7 +586,8 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = bind(client, (const struct sockaddr *) &src_addr, src_addr_len); + err = test->ops->bind(client, (struct sockaddr *)&src_addr, + src_addr_len); if (!ASSERT_OK(err, "bind")) goto cleanup; } @@ -535,17 +598,8 @@ static void test_xmsg(struct sock_addr_test *test) goto cleanup; if (test->socket_type == SOCK_DGRAM) { - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)&addr; - hdr.msg_namelen = addr_len; - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - err = sendmsg(client, &hdr, 0); + err = test->ops->sendmsg(client, (struct sockaddr *)&addr, + addr_len, &data, sizeof(data)); if (!ASSERT_EQ(err, sizeof(data), "sendmsg")) goto cleanup; } else { @@ -596,7 +650,7 @@ static void test_xmsg(struct sock_addr_test *test) cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); } @@ -607,7 +661,7 @@ static void test_getsockname(struct sock_addr_test *test) socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, err; - serv = start_server(test->socket_family, test->socket_type, + serv = test->ops->start_server(test->socket_family, test->socket_type, test->requested_addr, test->requested_port, 0); if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; @@ -618,13 +672,13 @@ static void test_getsockname(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; cleanup: if (serv != -1) - close(serv); + test->ops->close(serv); } static void test_getpeername(struct sock_addr_test *test) @@ -644,7 +698,8 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -653,13 +708,14 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); } -- cgit v1.2.3 From e0c8a7e7526ff1526d4dcc7d71aad4e7fe2ec767 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:23 -0500 Subject: selftests/bpf: Add kernel socket operation tests This patch creates two sets of sock_ops that call out to the SYSCALL hooks in the sock_addr_kern BPF program and uses them to construct test cases for the range of supported operations (kernel_connect(), kernel_bind(), kernel_sendms(), sock_sendmsg(), kernel_getsockname(), kenel_getpeername()). This ensures that these interact with BPF sockaddr hooks as intended. Beyond this it also ensures that these operations do not modify their address parameter, providing regression coverage for the issues addressed by this set of patches: - commit 0bdf399342c5("net: Avoid address overwrite in kernel_connect") - commit 86a7e0b69bd5("net: prevent rewrite of msg_name in sock_sendmsg()") - commit c889a99a21bf("net: prevent address rewrite in kernel_bind()") - commit 01b2885d9415("net: Save and restore msg_namelen in sock_sendmsg") Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-7-jrife@google.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 472 +++++++++++++++++++++ 1 file changed, 472 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 8f678fa413f5..9c709c33f889 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -3,6 +3,7 @@ #include "test_progs.h" +#include "sock_addr_kern.skel.h" #include "bind4_prog.skel.h" #include "bind6_prog.skel.h" #include "connect_unix_prog.skel.h" @@ -54,6 +55,216 @@ enum sock_addr_test_type { typedef void *(*load_fn)(int cgroup_fd); typedef void (*destroy_fn)(void *skel); +static int cmp_addr(const struct sockaddr_storage *addr1, socklen_t addr1_len, + const struct sockaddr_storage *addr2, socklen_t addr2_len, + bool cmp_port); + +struct init_sock_args { + int af; + int type; +}; + +struct addr_args { + char addr[sizeof(struct sockaddr_storage)]; + int addrlen; +}; + +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; + +static struct sock_addr_kern *skel; + +static int run_bpf_prog(const char *prog_name, void *ctx, int ctx_size) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_program *prog; + int prog_fd, err; + + topts.ctx_in = ctx; + topts.ctx_size_in = ctx_size; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto err; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, prog_name)) + goto err; + + err = topts.retval; + goto out; +err: + err = -1; +out: + return err; +} + +static int kernel_init_sock(int af, int type, int protocol) +{ + struct init_sock_args args = { + .af = af, + .type = type, + }; + + return run_bpf_prog("init_sock", &args, sizeof(args)); +} + +static int kernel_close_sock(int fd) +{ + return run_bpf_prog("close_sock", NULL, 0); +} + +static int sock_addr_op(const char *name, struct sockaddr *addr, + socklen_t *addrlen, bool expect_change) +{ + struct addr_args args; + int err; + + if (addrlen) + args.addrlen = *addrlen; + + if (addr) + memcpy(&args.addr, addr, *addrlen); + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!expect_change && addr) + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + *addrlen, + (struct sockaddr_storage *)&args.addr, + args.addrlen, 1), + 0, "address_param_modified")) + return -1; + + if (addrlen) + *addrlen = args.addrlen; + + if (addr) + memcpy(addr, &args.addr, *addrlen); + + return err; +} + +static int send_msg_op(const char *name, struct sockaddr *addr, + socklen_t addrlen, const char *msg, int msglen) +{ + struct sendmsg_args args; + int err; + + memset(&args, 0, sizeof(args)); + memcpy(&args.addr.addr, addr, addrlen); + args.addr.addrlen = addrlen; + memcpy(args.msg, msg, msglen); + args.msglen = msglen; + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + addrlen, + (struct sockaddr_storage *)&args.addr.addr, + args.addr.addrlen, 1), + 0, "address_param_modified")) + return -1; + + return err; +} + +static int kernel_connect(struct sockaddr *addr, socklen_t addrlen) +{ + return sock_addr_op("kernel_connect", addr, &addrlen, false); +} + +static int kernel_bind(int fd, struct sockaddr *addr, socklen_t addrlen) +{ + return sock_addr_op("kernel_bind", addr, &addrlen, false); +} + +static int kernel_listen(void) +{ + return sock_addr_op("kernel_listen", NULL, NULL, false); +} + +static int kernel_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) +{ + return send_msg_op("kernel_sendmsg", addr, addrlen, msg, msglen); +} + +static int sock_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) +{ + return send_msg_op("sock_sendmsg", addr, addrlen, msg, msglen); +} + +static int kernel_getsockname(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getsockname", addr, addrlen, true); +} + +static int kernel_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getpeername", addr, addrlen, true); +} + +int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) +{ + int err; + + if (!ASSERT_OK(kernel_init_sock(addr->ss_family, type, 0), + "kernel_init_sock")) + goto err; + + if (!ASSERT_OK(kernel_connect((struct sockaddr *)addr, addrlen), + "kernel_connect")) + goto err; + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); +out: + return err; +} + +int kernel_start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct sockaddr_storage addr; + socklen_t addrlen; + int err; + + if (!ASSERT_OK(kernel_init_sock(family, type, 0), "kernel_init_sock")) + goto err; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + goto err; + + if (!ASSERT_OK(kernel_bind(0, (struct sockaddr *)&addr, addrlen), + "kernel_bind")) + goto err; + + if (type == SOCK_STREAM) { + if (!ASSERT_OK(kernel_listen(), "kernel_listen")) + goto err; + } + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); +out: + return err; +} + struct sock_ops { int (*connect_to_addr)(int type, const struct sockaddr_storage *addr, socklen_t addrlen, @@ -104,6 +315,28 @@ struct sock_ops user_ops = { .close = close, }; +struct sock_ops kern_ops_sock_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = sock_sendmsg, + .close = kernel_close_sock, +}; + +struct sock_ops kern_ops_kernel_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = kernel_sendmsg, + .close = kernel_close_sock, +}; + struct sock_addr_test { enum sock_addr_test_type type; const char *name; @@ -213,6 +446,60 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_PORT, }, + /* bind - kernel calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (stream)", + bind4_prog_load, + bind4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (dgram)", + bind4_prog_load, + bind4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (stream)", + bind6_prog_load, + bind6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (dgram)", + bind6_prog_load, + bind6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + /* connect - system calls */ { SOCK_ADDR_TEST_CONNECT, @@ -285,6 +572,78 @@ static struct sock_addr_test tests[] = { NULL, }, + /* connect - kernel calls */ + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (stream)", + connect4_prog_load, + connect4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (dgram)", + connect4_prog_load, + connect4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (stream)", + connect6_prog_load, + connect6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (dgram)", + connect6_prog_load, + connect6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect (dgram)", + connect_unix_prog_load, + connect_unix_prog_destroy, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + }, + /* sendmsg - system calls */ { SOCK_ADDR_TEST_SENDMSG, @@ -329,6 +688,94 @@ static struct sock_addr_test tests[] = { NULL, }, + /* sendmsg - kernel calls (sock_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg (dgram)", + sendmsg4_prog_load, + sendmsg4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg (dgram)", + sendmsg6_prog_load, + sendmsg6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + }, + + /* sendmsg - kernel calls (kernel_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg (dgram)", + sendmsg4_prog_load, + sendmsg4_prog_destroy, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg (dgram)", + sendmsg6_prog_load, + sendmsg6_prog_destroy, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + }, + /* recvmsg - system calls */ { SOCK_ADDR_TEST_RECVMSG, @@ -470,6 +917,27 @@ static int cmp_sock_addr(info_fn fn, int sock1, return cmp_addr(&addr1, len1, addr2, addr2_len, cmp_port); } +static int load_sock_addr_kern(void) +{ + int err; + + skel = sock_addr_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto err; + + err = 0; + goto out; +err: + err = -1; +out: + return err; +} + +static void unload_sock_addr_kern(void) +{ + sock_addr_kern__destroy(skel); +} + static void test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; @@ -768,6 +1236,9 @@ void test_sock_addr(void) if (!ASSERT_OK(setup_test_env(&tok), "setup_test_env")) goto cleanup; + if (!ASSERT_OK(load_sock_addr_kern(), "load_sock_addr_kern")) + goto cleanup; + for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; @@ -808,6 +1279,7 @@ void test_sock_addr(void) } cleanup: + unload_sock_addr_kern(); cleanup_test_env(tok); if (cgroup_fd >= 0) close(cgroup_fd); -- cgit v1.2.3 From 20ecf595b513b4ee69220794c3317380c4f051b1 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Thu, 2 May 2024 14:39:36 -0700 Subject: KVM: selftests: Allow skipping the KVM_RUN sanity check in rseq_test The rseq test's migration worker delays 1-10 us, assuming that one KVM_RUN iteration only takes a few microseconds. But if the CPU low power wakeup latency is large enough, for example, hundreds or even thousands of microseconds for deep C-state exit latencies on x86 server CPUs, it may happen that the target CPU is unable to wakeup and run the vCPU before the migration worker starts to migrate the vCPU thread to the _next_ CPU. If the system workload is light, most CPUs could be at a certain low power state, which may result in less successful migrations and fail the migration/KVM_RUN ratio sanity check. But this is not supposed to be deemed a test failure. Add a command line option to skip the sanity check, along with a comment and a verbose assert message to try to help the user resolve the potential source of failures without having to resort to disabling the check. Co-developed-by: Dongsheng Zhang Signed-off-by: Dongsheng Zhang Signed-off-by: Zide Chen Link: https://lore.kernel.org/r/20240502213936.27619-1-zide.chen@intel.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/rseq_test.c | 35 +++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 28f97fb52044..ff365b9f47fd 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -186,12 +186,35 @@ static void calc_min_max_cpu(void) "Only one usable CPU, task migration not possible"); } +static void help(const char *name) +{ + puts(""); + printf("usage: %s [-h] [-u]\n", name); + printf(" -u: Don't sanity check the number of successful KVM_RUNs\n"); + puts(""); + exit(0); +} + int main(int argc, char *argv[]) { + bool skip_sanity_check = false; int r, i, snapshot; struct kvm_vm *vm; struct kvm_vcpu *vcpu; u32 cpu, rseq_cpu; + int opt; + + while ((opt = getopt(argc, argv, "hu")) != -1) { + switch (opt) { + case 'u': + skip_sanity_check = true; + break; + case 'h': + default: + help(argv[0]); + break; + } + } r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, @@ -254,9 +277,17 @@ int main(int argc, char *argv[]) * getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a fairly * conservative ratio on x86-64, which can do _more_ KVM_RUNs than * migrations given the 1us+ delay in the migration task. + * + * Another reason why it may have small migration:KVM_RUN ratio is that, + * on systems with large low power mode wakeup latency, it may happen + * quite often that the scheduler is not able to wake up the target CPU + * before the vCPU thread is scheduled to another CPU. */ - TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2), - "Only performed %d KVM_RUNs, task stalled too much?", i); + TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2), + "Only performed %d KVM_RUNs, task stalled too much?\n\n" + " Try disabling deep sleep states to reduce CPU wakeup latency,\n" + " e.g. via cpuidle.off=1 or setting /dev/cpu_dma_latency to '0',\n" + " or run with -u to disable this sanity check.", i); pthread_join(migration_thread, NULL); -- cgit v1.2.3 From 8a53e13021330a25775a31ced44fbec2225a9443 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Apr 2024 09:21:33 -0700 Subject: KVM: selftests: Require KVM_CAP_USER_MEMORY2 for tests that create memslots Explicitly require KVM_CAP_USER_MEMORY2 for selftests that create memslots, i.e. skip selftests that need memslots instead of letting them fail on KVM_SET_USER_MEMORY_REGION2. While it's ok to take a dependency on new kernel features, selftests should skip gracefully instead of failing hard when run on older kernels. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/69ae0694-8ca3-402c-b864-99b500b24f5d@moroto.mountain Suggested-by: Shuah Khan Link: https://lore.kernel.org/r/20240430162133.337541-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b2262b5fad9e..a24bba9e0390 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -930,6 +930,10 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, errno, strerror(errno)); } +#define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \ + __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \ + "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)") + int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva, uint32_t guest_memfd, uint64_t guest_memfd_offset) @@ -944,6 +948,8 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag .guest_memfd_offset = guest_memfd_offset, }; + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, ®ion); } @@ -970,6 +976,8 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, size_t mem_size = npages * vm->page_size; size_t alignment; + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); -- cgit v1.2.3 From 57bfc7605ca5b102ba336779ae9adbc5bbba1d96 Mon Sep 17 00:00:00 2001 From: Miao Xu Date: Wed, 1 May 2024 21:23:16 -0700 Subject: tcp: Add new args for cong_control in tcp_congestion_ops This patch adds two new arguments for cong_control of struct tcp_congestion_ops: - ack - flag These two arguments are inherited from the caller tcp_cong_control in tcp_intput.c. One use case of them is to update cwnd and pacing rate inside cong_control based on the info they provide. For example, the flag can be used to decide if it is the right time to raise or reduce a sender's cwnd. Reviewed-by: Eric Dumazet Signed-off-by: Miao Xu Link: https://lore.kernel.org/r/20240502042318.801932-2-miaxu@meta.com Signed-off-by: Martin KaFai Lau --- include/net/tcp.h | 2 +- net/ipv4/bpf_tcp_ca.c | 3 ++- net/ipv4/tcp_bbr.c | 2 +- net/ipv4/tcp_input.c | 2 +- tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 6 +++--- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/include/net/tcp.h b/include/net/tcp.h index fe98fb01879b..7294da8fb780 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1172,7 +1172,7 @@ struct tcp_congestion_ops { /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 7f518ea5f4ac..6bd7f8db189a 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -307,7 +307,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) return 0; } -static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) { } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 7e52ab24e40a..760941e55153 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_update_gains(sk); } -__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { struct bbr *bbr = inet_csk_ca(sk); u32 bw; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53e1150f706f..23ccfc7b1d3c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3541,7 +3541,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cong_control) { - icsk->icsk_ca_ops->cong_control(sk, rs); + icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs); return; } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c index fcfbfe0336b4..52b610357309 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -5,7 +5,7 @@ #include extern void bbr_init(struct sock *sk) __ksym; -extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym; +extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym; extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; @@ -42,9 +42,9 @@ void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) } SEC("struct_ops/cong_control") -void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs) +void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { - bbr_main(sk, rs); + bbr_main(sk, ack, flag, rs); } SEC("struct_ops/cong_avoid") -- cgit v1.2.3 From 96c3490d6423b7f24d356e24a61c24de69f3de77 Mon Sep 17 00:00:00 2001 From: Miao Xu Date: Wed, 1 May 2024 21:23:18 -0700 Subject: selftests/bpf: Add test for the use of new args in cong_control This patch adds a selftest to show the usage of the new arguments in cong_control. For simplicity's sake, the testing example reuses cubic's kernel functions. Signed-off-by: Miao Xu Link: https://lore.kernel.org/r/20240502042318.801932-4-miaxu@meta.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 24 +++ tools/testing/selftests/bpf/progs/bpf_cc_cubic.c | 199 +++++++++++++++++++++ .../testing/selftests/bpf/progs/bpf_tracing_net.h | 10 ++ 3 files changed, 233 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_cc_cubic.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 907bac46c774..0aca02532794 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -14,6 +14,7 @@ #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" #include "tcp_ca_kfunc.skel.h" +#include "bpf_cc_cubic.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -452,6 +453,27 @@ static void test_tcp_ca_kfunc(void) tcp_ca_kfunc__destroy(skel); } +static void test_cc_cubic(void) +{ + struct bpf_cc_cubic *cc_cubic_skel; + struct bpf_link *link; + + cc_cubic_skel = bpf_cc_cubic__open_and_load(); + if (!ASSERT_OK_PTR(cc_cubic_skel, "bpf_cc_cubic__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(cc_cubic_skel->maps.cc_cubic); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_cc_cubic__destroy(cc_cubic_skel); + return; + } + + do_test("bpf_cc_cubic", NULL); + + bpf_link__destroy(link); + bpf_cc_cubic__destroy(cc_cubic_skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -482,4 +504,6 @@ void test_bpf_tcp_ca(void) test_link_replace(); if (test__start_subtest("tcp_ca_kfunc")) test_tcp_ca_kfunc(); + if (test__start_subtest("cc_cubic")) + test_cc_cubic(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c new file mode 100644 index 000000000000..5c7697c70e2a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Highlights: + * 1. The major difference between this bpf program and tcp_cubic.c + * is that this bpf program relies on `cong_control` rather than + * `cong_avoid` in the struct tcp_congestion_ops. + * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and + * tcp_update_pacing_rate is bypassed when `cong_control` is + * defined, so moving these logic to `cong_control`. + * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c. + * The main purpose is to show use cases of the arguments in + * `cong_control`. For simplicity's sake, it reuses tcp cubic's + * kernel functions. + */ + +#include "vmlinux.h" + +#include +#include +#include "bpf_tracing_net.h" + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, args) + +#define USEC_PER_SEC 1000000UL +#define TCP_PACING_SS_RATIO (200) +#define TCP_PACING_CA_RATIO (120) +#define TCP_REORDERING (12) + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define after(seq2, seq1) before(seq1, seq2) + +extern void cubictcp_init(struct sock *sk) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym; +extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; + +static struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} + +static __u64 div64_u64(__u64 dividend, __u64 divisor) +{ + return dividend / divisor; +} + +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. + */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= TCP_PACING_SS_RATIO; + else + rate *= TCP_PACING_CA_RATIO; + + rate *= max(tp->snd_cwnd, tp->packets_out); + + if (tp->srtt_us) + rate = div64_u64(rate, (__u64)tp->srtt_us); + + sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); +} + +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int newly_lost, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + __u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out; + int delta = tp->snd_ssthresh - pkts_in_flight; + + if (newly_acked_sacked <= 0 || !tp->prior_cwnd) + return; + + __u32 prr_delivered = tp->prr_delivered + newly_acked_sacked; + + if (delta < 0) { + __u64 dividend = + (__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1; + sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked); + if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) + sndcnt++; + sndcnt = min(delta, sndcnt); + } + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); + tp->snd_cwnd = pkts_in_flight + sndcnt; +} + +/* Decide wheather to run the increase function of congestion control. */ +static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) +{ + if (tcp_sk(sk)->reordering > TCP_REORDERING) + return flag & FLAG_FORWARD_PROGRESS; + + return flag & FLAG_DATA_ACKED; +} + +void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk) +{ + cubictcp_init(sk); +} + +void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + cubictcp_cwnd_event(sk, event); +} + +void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (((1<icsk_ca_state)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag); + + if (!before(tp->snd_una, tp->high_seq)) { + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_jiffies32; + } + } + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + cubictcp_cong_avoid(sk, ack, rs->acked_sacked); + tp->snd_cwnd_stamp = tcp_jiffies32; + } + + tcp_update_pacing_rate(sk); +} + +__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +{ + return cubictcp_recalc_ssthresh(sk); +} + +void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +{ + cubictcp_state(sk, new_state); +} + +void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, + const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +{ + return tcp_reno_undo_cwnd(sk); +} + +SEC(".struct_ops") +struct tcp_congestion_ops cc_cubic = { + .init = (void *)bpf_cubic_init, + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_control = (void *)bpf_cubic_cong_control, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, + .name = "bpf_cc_cubic", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 7001965d1cc3..f9ec630dfcd5 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -80,6 +80,14 @@ #define TCP_INFINITE_SSTHRESH 0x7fffffff #define TCP_PINGPONG_THRESH 3 +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. */ +#define FLAG_SND_UNA_ADVANCED \ + 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED) + #define fib_nh_dev nh_common.nhc_dev #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw6 nh_common.nhc_gw.ipv6 @@ -119,4 +127,6 @@ #define tw_v6_daddr __tw_common.skc_v6_daddr #define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr +#define tcp_jiffies32 ((__u32)bpf_jiffies64()) + #endif -- cgit v1.2.3 From 00f0e08f23fc007f4a5a71cd7e37fcdb15af0c1b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Apr 2024 13:19:51 -0700 Subject: libbpf: fix potential overflow in ring__consume_n() ringbuf_process_ring() return int64_t, while ring__consume_n() assigns it to int. It's highly unlikely, but possible for ringbuf_process_ring() to return value larger than INT_MAX, so use int64_t. ring__consume_n() does check INT_MAX before returning int result to the user. Fixes: 4d22ea94ea33 ("libbpf: Add ring__consume_n / ring_buffer__consume_n") Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240430201952.888293-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 99e44cf02321..37c5a2d86a78 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -405,7 +405,7 @@ int ring__map_fd(const struct ring *r) int ring__consume_n(struct ring *r, size_t n) { - int res; + int64_t res; res = ringbuf_process_ring(r, n); if (res < 0) -- cgit v1.2.3 From 087d757fb4736ecbd3e42eebf9b39d5225d4a2ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Apr 2024 13:19:52 -0700 Subject: libbpf: fix ring_buffer__consume_n() return result logic Add INT_MAX check to ring_buffer__consume_n(). We do the similar check to handle int return result of all these ring buffer APIs in other APIs and ring_buffer__consume_n() is missing one. This patch fixes this omission. Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240430201952.888293-2-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 37c5a2d86a78..bfd8dac4c0cc 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -301,7 +301,7 @@ int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) if (n == 0) break; } - return res; + return res > INT_MAX ? INT_MAX : res; } /* Consume available ring buffer(s) data without event polling. -- cgit v1.2.3 From e1bb5e65de8355ee76f51c6bfee2328ac5b2be15 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 May 2024 19:53:25 -0700 Subject: selftests: net: py: check process exit code in bkg() and background cmd() We're a bit too loose with error checking for background processes. cmd() completely ignores the fail argument passed to the constructor if background is True. Default to checking for errors if process is not terminated explicitly. Caller can override with True / False. For bkg() the processing step is called magically by __exit__ so record the value passed in the constructor. Reported-by: Willem de Bruijn Tested-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240502025325.1924923-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index b57d467afd0f..ec8b086b4fcb 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -26,6 +26,9 @@ class cmd: self.process(terminate=False, fail=fail) def process(self, terminate=True, fail=None): + if fail is None: + fail = not terminate + if terminate: self.proc.terminate() stdout, stderr = self.proc.communicate(timeout=5) @@ -43,17 +46,18 @@ class cmd: class bkg(cmd): - def __init__(self, comm, shell=True, fail=True, ns=None, host=None, + def __init__(self, comm, shell=True, fail=None, ns=None, host=None, exit_wait=False): super().__init__(comm, background=True, shell=shell, fail=fail, ns=ns, host=host) self.terminate = not exit_wait + self.check_fail = fail def __enter__(self): return self def __exit__(self, ex_type, ex_value, ex_tb): - return self.process(terminate=self.terminate) + return self.process(terminate=self.terminate, fail=self.check_fail) def tool(name, args, json=None, ns=None, host=None): -- cgit v1.2.3 From ec6f25bc8aba2539a95be74b2a38f6a9cc13245f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 May 2024 15:30:22 -0400 Subject: selftests/net: skip partial checksum packets in csum test Detect packets with ip_summed CHECKSUM_PARTIAL and skip these. These should not exist, as the test sends individual packets between two hosts. But if (HW) GRO is on, with randomized content sometimes subsequent packets can be coalesced. In this case the GSO packet checksum is converted to a pseudo checksum in anticipation of sending out as TSO/USO. So the field will not match the expected value. Do not count these as test errors. Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240501193156.3627344-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/csum.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/csum.c b/tools/testing/selftests/net/csum.c index 90eb06fefa59..b9f3fc3c3426 100644 --- a/tools/testing/selftests/net/csum.c +++ b/tools/testing/selftests/net/csum.c @@ -682,7 +682,7 @@ static int recv_verify_packet_ipv6(void *nh, int len) } /* return whether auxdata includes TP_STATUS_CSUM_VALID */ -static bool recv_verify_packet_csum(struct msghdr *msg) +static uint32_t recv_get_packet_csum_status(struct msghdr *msg) { struct tpacket_auxdata *aux = NULL; struct cmsghdr *cm; @@ -706,7 +706,7 @@ static bool recv_verify_packet_csum(struct msghdr *msg) if (!aux) error(1, 0, "cmsg: no auxdata"); - return aux->tp_status & TP_STATUS_CSUM_VALID; + return aux->tp_status; } static int recv_packet(int fd) @@ -716,6 +716,7 @@ static int recv_packet(int fd) char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; struct pkt *buf = (void *)_buf; struct msghdr msg = {0}; + uint32_t tp_status; struct iovec iov; int len, ret; @@ -737,6 +738,17 @@ static int recv_packet(int fd) if (len == -1) error(1, errno, "recv p"); + tp_status = recv_get_packet_csum_status(&msg); + + /* GRO might coalesce randomized packets. Such GSO packets are + * then reinitialized for csum offload (CHECKSUM_PARTIAL), with + * a pseudo csum. Do not try to validate these checksums. + */ + if (tp_status & TP_STATUS_CSUMNOTREADY) { + fprintf(stderr, "cmsg: GSO packet has partial csum: skip\n"); + continue; + } + if (cfg_family == PF_INET6) ret = recv_verify_packet_ipv6(buf, len); else @@ -753,7 +765,7 @@ static int recv_packet(int fd) * Do not fail if kernel does not validate a good csum: * Absence of validation does not imply invalid. */ - if (recv_verify_packet_csum(&msg) && cfg_bad_csum) { + if (tp_status & TP_STATUS_CSUM_VALID && cfg_bad_csum) { fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n"); bad_validations++; } -- cgit v1.2.3 From cf9bea94f6b2934d409511c05337010b137316a3 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sun, 28 Apr 2024 13:25:59 +0200 Subject: libbpf: Fix bpf_ksym_exists() in GCC The macro bpf_ksym_exists is defined in bpf_helpers.h as: #define bpf_ksym_exists(sym) ({ \ _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ !!sym; \ }) The purpose of the macro is to determine whether a given symbol has been defined, given the address of the object associated with the symbol. It also has a compile-time check to make sure the object whose address is passed to the macro has been declared as weak, which makes the check on `sym' meaningful. As it happens, the check for weak doesn't work in GCC in all cases, because __builtin_constant_p not always folds at parse time when optimizing. This is because optimizations that happen later in the compilation process, like inlining, may make a previously non-constant expression a constant. This results in errors like the following when building the selftests with GCC: bpf_helpers.h:190:24: error: expression in static assertion is not constant 190 | _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fortunately recent versions of GCC support a __builtin_has_attribute that can be used to directly check for the __weak__ attribute. This patch changes bpf_helpers.h to use that builtin when building with a recent enough GCC, and to omit the check if GCC is too old to support the builtin. The macro used for GCC becomes: #define bpf_ksym_exists(sym) ({ \ _Static_assert(__builtin_has_attribute (*sym, __weak__), #sym " should be marked as __weak"); \ !!sym; \ }) Note that since bpf_ksym_exists is designed to get the address of the object associated with symbol SYM, we pass *sym to __builtin_has_attribute instead of sym. When an expression is passed to __builtin_has_attribute then it is the type of the passed expression that is checked for the specified attribute. The expression itself is not evaluated. This accommodates well with the existing usages of the macro: - For function objects: struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym __weak; [...] bpf_ksym_exists(bpf_task_acquire) - For variable objects: extern const struct rq runqueues __ksym __weak; /* typed */ [...] bpf_ksym_exists(&runqueues) Note also that BPF support was added in GCC 10 and support for __builtin_has_attribute in GCC 9. Locally tested in bpf-next master branch. No regressions. Signed-of-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240428112559.10518-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_helpers.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 62e1c0cc4a59..305c62817dd3 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -186,10 +186,21 @@ enum libbpf_tristate { #define __kptr __attribute__((btf_type_tag("kptr"))) #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) -#define bpf_ksym_exists(sym) ({ \ - _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ - !!sym; \ +#if defined (__clang__) +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), \ + #sym " should be marked as __weak"); \ + !!sym; \ }) +#elif __GNUC__ > 8 +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(__builtin_has_attribute (*sym, __weak__), \ + #sym " should be marked as __weak"); \ + !!sym; \ +}) +#else +#define bpf_ksym_exists(sym) !!sym +#endif #define __arg_ctx __attribute__((btf_decl_tag("arg:ctx"))) #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) -- cgit v1.2.3 From a9e7715ce8b3a62a2133e47e87107632a26ad1e2 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Thu, 2 May 2024 19:09:25 +0200 Subject: libbpf: Avoid casts from pointers to enums in bpf_tracing.h [Differences from V1: - Do not introduce a global typedef, as this is a public header. - Keep the void* casts in BPF_KPROBE_READ_RET_IP and BPF_KRETPROBE_READ_RET_IP, as these are necessary for converting to a const void* argument of bpf_probe_read_kernel.] The BPF_PROG, BPF_KPROBE and BPF_KSYSCALL macros defined in tools/lib/bpf/bpf_tracing.h use a clever hack in order to provide a convenient way to define entry points for BPF programs as if they were normal C functions that get typed actual arguments, instead of as elements in a single "context" array argument. For example, PPF_PROGS allows writing: SEC("struct_ops/cwnd_event") void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) { bbr_cwnd_event(sk, event); dctcp_cwnd_event(sk, event); cubictcp_cwnd_event(sk, event); } That expands into a pair of functions: void ____cwnd_event (unsigned long long *ctx, struct sock *sk, enum tcp_ca_event event) { bbr_cwnd_event(sk, event); dctcp_cwnd_event(sk, event); cubictcp_cwnd_event(sk, event); } void cwnd_event (unsigned long long *ctx) { _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") return ____cwnd_event(ctx, (void*)ctx[0], (void*)ctx[1]); _Pragma("GCC diagnostic pop") } Note how the 64-bit unsigned integers in the incoming CTX get casted to a void pointer, and then implicitly converted to whatever type of the actual argument in the wrapped function. In this case: Arg1: unsigned long long -> void * -> struct sock * Arg2: unsigned long long -> void * -> enum tcp_ca_event The behavior of GCC and clang when facing such conversions differ: pointer -> pointer Allowed by the C standard. GCC: no warning nor error. clang: no warning nor error. pointer -> integer type [C standard says the result of this conversion is implementation defined, and it may lead to unaligned pointer etc.] GCC: error: integer from pointer without a cast [-Wint-conversion] clang: error: incompatible pointer to integer conversion [-Wint-conversion] pointer -> enumerated type GCC: error: incompatible types in assigment (*) clang: error: incompatible pointer to integer conversion [-Wint-conversion] These macros work because converting pointers to pointers is allowed, and converting pointers to integers also works provided a suitable integer type even if it is implementation defined, much like casting a pointer to uintptr_t is guaranteed to work by the C standard. The conversion errors emitted by both compilers by default are silenced by the pragmas. However, the GCC error marked with (*) above when assigning a pointer to an enumerated value is not associated with the -Wint-conversion warning, and it is not possible to turn it off. This is preventing building the BPF kernel selftests with GCC. This patch fixes this by avoiding intermediate casts to void*, replaced with casts to `unsigned long long', which is an integer type capable of safely store a BPF pointer, much like the standard uintptr_t. Testing performed in bpf-next master: - vmtest.sh -- ./test_verifier - vmtest.sh -- ./test_progs - make M=samples/bpf No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502170925.3194-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_tracing.h | 70 ++++++++++++++++++++++----------------------- tools/lib/bpf/usdt.bpf.h | 24 ++++++++-------- 2 files changed, 47 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 1c13f8e88833..9314fa95f04e 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -633,18 +633,18 @@ struct pt_regs; #endif #define ___bpf_ctx_cast0() ctx -#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] -#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] -#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] -#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] -#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] -#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] -#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] -#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] -#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] -#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] -#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] -#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), ctx[11] #define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) /* @@ -786,14 +786,14 @@ ____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) struct pt_regs; #define ___bpf_kprobe_args0() ctx -#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) -#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) -#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) -#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) -#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) -#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx) -#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx) -#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx) +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (unsigned long long)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (unsigned long long)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (unsigned long long)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (unsigned long long)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (unsigned long long)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (unsigned long long)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (unsigned long long)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (unsigned long long)PT_REGS_PARM8(ctx) #define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) /* @@ -821,7 +821,7 @@ static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #define ___bpf_kretprobe_args0() ctx -#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (unsigned long long)PT_REGS_RC(ctx) #define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) /* @@ -845,24 +845,24 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) /* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ #define ___bpf_syscall_args0() ctx -#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) -#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) -#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) -#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) -#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) -#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs) -#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs) +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (unsigned long long)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (unsigned long long)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (unsigned long long)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (unsigned long long)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (unsigned long long)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (unsigned long long)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (unsigned long long)PT_REGS_PARM7_SYSCALL(regs) #define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) /* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ #define ___bpf_syswrap_args0() ctx -#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (unsigned long long)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (unsigned long long)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (unsigned long long)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (unsigned long long)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (unsigned long long)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (unsigned long long)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (unsigned long long)PT_REGS_PARM7_CORE_SYSCALL(regs) #define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) /* diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index f6763300b26a..76359bcdc94a 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -214,18 +214,18 @@ long bpf_usdt_cookie(struct pt_regs *ctx) /* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ #define ___bpf_usdt_args0() ctx -#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) -#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) -#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) -#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) -#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) -#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) -#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) -#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) -#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) -#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) -#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) -#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; }) +#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; }) #define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) /* -- cgit v1.2.3 From d7228a58d9438d6f219dc7f33eab0d1980b3bd2f Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:17 +1000 Subject: selftests/powerpc/dexcr: Add -no-pie to hashchk tests The hashchk tests want to verify that the hash key is changed over exec. It does so by calculating hashes at the same address across an exec. This is made simpler by disabling PIE functionality, so we can re-execute ourselves and be using the same addresses in the child. While -fno-pie is already added, -no-pie is also required. Fixes: bdb07f35a52f ("selftests/powerpc/dexcr: Add hashst/hashchk test") Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-2-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dexcr/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 523947a38d17..1b775959f981 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -4,7 +4,7 @@ TEST_GEN_FILES := lsdexcr include ../../lib.mk include ../flags.mk -$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) +$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie -no-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c $(TEST_GEN_FILES): ../utils.c ./dexcr.c -- cgit v1.2.3 From 9c313ccdfc079f71f16c9b3d3a6c6d60996b9367 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 21 Apr 2024 00:38:37 +0200 Subject: bitops: Change function return types from long to int Change the return types of bitops functions (ffs, fls, and fns) from long to int. The expected return values are in the range [0, 64], for which int is sufficient. Additionally, int aligns well with the return types of the corresponding __builtin_* functions, potentially reducing overall type conversions. Many of the existing bitops functions already return an int and don't need to be changed. The bitops functions in arch/ should be considered separately. Adjust some return variables to match the function return types. With GCC 13 and defconfig, these changes reduced the size of a test kernel image by 5,432 bytes on arm64 and by 248 bytes on riscv; there were no changes in size on x86_64, powerpc, or m68k. Signed-off-by: Thorsten Blum Signed-off-by: Arnd Bergmann --- include/asm-generic/bitops/__ffs.h | 4 ++-- include/asm-generic/bitops/__fls.h | 4 ++-- include/asm-generic/bitops/builtin-__ffs.h | 2 +- include/asm-generic/bitops/builtin-__fls.h | 2 +- include/linux/bitops.h | 6 +++--- tools/include/asm-generic/bitops/__ffs.h | 4 ++-- tools/include/asm-generic/bitops/__fls.h | 4 ++-- tools/include/linux/bitops.h | 2 +- 8 files changed, 14 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h index 446fea6dda78..2d08c750c8a7 100644 --- a/include/asm-generic/bitops/__ffs.h +++ b/include/asm-generic/bitops/__ffs.h @@ -10,9 +10,9 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long generic___ffs(unsigned long word) +static __always_inline unsigned int generic___ffs(unsigned long word) { - int num = 0; + unsigned int num = 0; #if BITS_PER_LONG == 64 if ((word & 0xffffffff) == 0) { diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h index 54ccccf96e21..e974ec932ec1 100644 --- a/include/asm-generic/bitops/__fls.h +++ b/include/asm-generic/bitops/__fls.h @@ -10,9 +10,9 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long generic___fls(unsigned long word) +static __always_inline unsigned int generic___fls(unsigned long word) { - int num = BITS_PER_LONG - 1; + unsigned int num = BITS_PER_LONG - 1; #if BITS_PER_LONG == 64 if (!(word & (~0ul << 32))) { diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h index 87024da44d10..cf4b3d33bf96 100644 --- a/include/asm-generic/bitops/builtin-__ffs.h +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -8,7 +8,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned int __ffs(unsigned long word) { return __builtin_ctzl(word); } diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h index 43a5aa9afbdb..6d72fc8a5259 100644 --- a/include/asm-generic/bitops/builtin-__fls.h +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -8,7 +8,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline unsigned int __fls(unsigned long word) { return (sizeof(word) * 8) - 1 - __builtin_clzl(word); } diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 2ba557e067fe..f60220f119e2 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -200,7 +200,7 @@ static __always_inline __s64 sign_extend64(__u64 value, int index) return (__s64)(value << shift) >> shift; } -static inline unsigned fls_long(unsigned long l) +static inline unsigned int fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); @@ -236,7 +236,7 @@ static inline int get_count_order_long(unsigned long l) * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ -static inline unsigned long __ffs64(u64 word) +static inline unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) @@ -252,7 +252,7 @@ static inline unsigned long __ffs64(u64 word) * @word: The word to search * @n: Bit to find */ -static inline unsigned long fns(unsigned long word, unsigned int n) +static inline unsigned int fns(unsigned long word, unsigned int n) { unsigned int bit; diff --git a/tools/include/asm-generic/bitops/__ffs.h b/tools/include/asm-generic/bitops/__ffs.h index 9d1310519497..2d94c1e9b2f3 100644 --- a/tools/include/asm-generic/bitops/__ffs.h +++ b/tools/include/asm-generic/bitops/__ffs.h @@ -11,9 +11,9 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned int __ffs(unsigned long word) { - int num = 0; + unsigned int num = 0; #if __BITS_PER_LONG == 64 if ((word & 0xffffffff) == 0) { diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h index 54ccccf96e21..e974ec932ec1 100644 --- a/tools/include/asm-generic/bitops/__fls.h +++ b/tools/include/asm-generic/bitops/__fls.h @@ -10,9 +10,9 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long generic___fls(unsigned long word) +static __always_inline unsigned int generic___fls(unsigned long word) { - int num = BITS_PER_LONG - 1; + unsigned int num = BITS_PER_LONG - 1; #if BITS_PER_LONG == 64 if (!(word & (~0ul << 32))) { diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index 7319f6ced108..8e073a111d2a 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -70,7 +70,7 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? hweight32(w) : hweight64(w); } -static inline unsigned fls_long(unsigned long l) +static inline unsigned int fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); -- cgit v1.2.3 From 1da2363228d68da266443e7a85fa91edc2be3dac Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 May 2024 20:51:02 -0700 Subject: selftests/cgroup: fix clang build failures for abs() calls First of all, in order to build with clang at all, one must first apply Valentin Obst's build fix for LLVM [1]. Once that is done, then when building with clang, via: make LLVM=1 -C tools/testing/selftests ...clang is pickier than gcc, about which version of abs(3) to call, depending on the argument type: int abs(int j); long labs(long j); long long llabs(long long j); ...and this is causing both build failures and warnings, when running: make LLVM=1 -C tools/testing/selftests Fix this by calling labs() in value_close(), because the arguments are unambiguously "long" type. [1] https://lore.kernel.org/all/20240329-selftests-libmk-llvm-rfc-v1-1-2f9ed7d1c49f@valentinobst.de/ Signed-off-by: John Hubbard Reviewed-by: Roman Gushchin Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/cgroup_util.h | 2 +- tools/testing/selftests/cgroup/test_kmem.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 89e8519fb271..e8d04ac9e3d2 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -18,7 +18,7 @@ */ static inline int values_close(long a, long b, int err) { - return abs(a - b) <= (a + b) / 100 * err; + return labs(a - b) <= (a + b) / 100 * err; } extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate); diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 137506db0312..96693d8772be 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -192,7 +192,7 @@ static int test_kmem_memcg_deletion(const char *root) goto cleanup; sum = anon + file + kernel + sock; - if (abs(sum - current) < MAX_VMSTAT_ERROR) { + if (labs(sum - current) < MAX_VMSTAT_ERROR) { ret = KSFT_PASS; } else { printf("memory.current = %ld\n", current); @@ -380,7 +380,7 @@ static int test_percpu_basic(const char *root) current = cg_read_long(parent, "memory.current"); percpu = cg_read_key_long(parent, "memory.stat", "percpu "); - if (current > 0 && percpu > 0 && abs(current - percpu) < + if (current > 0 && percpu > 0 && labs(current - percpu) < MAX_VMSTAT_ERROR) ret = KSFT_PASS; else -- cgit v1.2.3 From 0515089418d064000b7f375257c10107d3ad0c7f Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 May 2024 20:51:03 -0700 Subject: selftests/cgroup: fix clang warnings: uninitialized fd variable First of all, in order to build with clang at all, one must first apply Valentin Obst's build fix for LLVM [1]. Once that is done, then when building with clang, via: make LLVM=1 -C tools/testing/selftests ...clang warns about fd being used uninitialized, in test_memcg_reclaim()'s error handling path. Fix this by initializing fd to -1. [1] https://lore.kernel.org/all/20240329-selftests-libmk-llvm-rfc-v1-1-2f9ed7d1c49f@valentinobst.de/ Signed-off-by: John Hubbard Reviewed-by: Roman Gushchin Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index b462416b3806..41ae8047b889 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -716,7 +716,9 @@ static bool reclaim_until(const char *memcg, long goal) */ static int test_memcg_reclaim(const char *root) { - int ret = KSFT_FAIL, fd, retries; + int ret = KSFT_FAIL; + int fd = -1; + int retries; char *memcg; long current, expected_usage; -- cgit v1.2.3 From 3309ca6f47f11b5d817ce1e5d8b2f1637b93243e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 May 2024 20:51:04 -0700 Subject: selftests/cgroup: cpu_hogger init: use {} instead of {NULL} First of all, in order to build with clang at all, one must first apply Valentin Obst's build fix for LLVM [1]. Once that is done, then when building with clang, via: make LLVM=1 -C tools/testing/selftests ...clang generates warning here, because struct cpu_hogger has multiple fields, and the code is initializing an array of these structs, and it is incorrect to specify a single NULL value as the initializer. Fix this by initializing with {}, so that the compiler knows to use default initializer values for all fields in each array entry. [1] https://lore.kernel.org/all/20240329-selftests-libmk-llvm-rfc-v1-1-2f9ed7d1c49f@valentinobst.de/ Signed-off-by: John Hubbard Reviewed-by: Roman Gushchin Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index 186bf96f6a28..dad2ed82f3ef 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -237,7 +237,7 @@ run_cpucg_weight_test( { int ret = KSFT_FAIL, i; char *parent = NULL; - struct cpu_hogger children[3] = {NULL}; + struct cpu_hogger children[3] = {}; parent = cg_name(root, "cpucg_test_0"); if (!parent) @@ -408,7 +408,7 @@ run_cpucg_nested_weight_test(const char *root, bool overprovisioned) { int ret = KSFT_FAIL, i; char *parent = NULL, *child = NULL; - struct cpu_hogger leaf[3] = {NULL}; + struct cpu_hogger leaf[3] = {}; long nested_leaf_usage, child_usage; int nprocs = get_nprocs(); -- cgit v1.2.3 From 8f6d24a5db2acac3f52a34e6df347ec131d231ab Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 May 2024 20:51:05 -0700 Subject: selftests/cgroup: fix uninitialized variables in test_zswap.c First of all, in order to build with clang at all, one must first apply Valentin Obst's build fix for LLVM [1]. Once that is done, then when building with clang, via: make LLVM=1 -C tools/testing/selftests ...clang finds and warning about some uninitialized variables. Fix these by initializing them. [1] https://lore.kernel.org/all/20240329-selftests-libmk-llvm-rfc-v1-1-2f9ed7d1c49f@valentinobst.de/ Signed-off-by: John Hubbard Reviewed-by: Roman Gushchin Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index ef7f39545317..fa571bfd2432 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -257,7 +257,7 @@ static int test_no_invasive_cgroup_shrink(const char *root) { int ret = KSFT_FAIL; size_t control_allocation_size = MB(10); - char *control_allocation, *wb_group = NULL, *control_group = NULL; + char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); if (!wb_group) @@ -342,7 +342,7 @@ static int test_no_kmem_bypass(const char *root) struct sysinfo sys_info; int ret = KSFT_FAIL; int child_status; - char *test_group; + char *test_group = NULL; pid_t child_pid; /* Read sys info and compute test values accordingly */ -- cgit v1.2.3 From 97c48ea8ff1cd70f471aa9a225a0fa9dca95d223 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 May 2024 14:35:02 -0700 Subject: perf test pmu-events: Make it clearer that pmu-events tests JSON events Add JSON to the test name. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Thomas Richter Link: https://lore.kernel.org/r/20240502213507.2339733-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index 47a7c3277540..f39aadacd7a6 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -1105,6 +1105,6 @@ static struct test_case pmu_events_tests[] = { }; struct test_suite suite__pmu_events = { - .desc = "PMU events", + .desc = "PMU JSON event tests", .test_cases = pmu_events_tests, }; -- cgit v1.2.3 From aa1551f299ba414c07edb08ebb4f46eb46006345 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 May 2024 14:35:04 -0700 Subject: perf test pmu: Refactor format test and exposed test APIs In tests/pmu.c, make a common utility that creates a PMU in a mkdtemp directory and uses regular PMU parsing logic to load that PMU. Formats must still be eagerly loaded as by default the PMU code assumes devices are going to be in sysfs. In util/pmu.[ch], hide perf_pmu__format_parse but add the eager argument to perf_pmu__lookup called by perf_pmus__add_test_pmu. Later patches will eagerly load other non-sysfs files when eager loading is enabled. In tests/pmu.c, rather than manually constructing a list of term arguments, just use the term parsing code from a string. Add more comments and debug logging. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Thomas Richter Link: https://lore.kernel.org/r/20240502213507.2339733-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 319 +++++++++++++++++++---------------------- tools/perf/util/parse-events.c | 2 +- tools/perf/util/parse-events.h | 2 +- tools/perf/util/pmu.c | 11 +- tools/perf/util/pmu.h | 4 +- tools/perf/util/pmus.c | 16 ++- tools/perf/util/pmus.h | 2 + 7 files changed, 177 insertions(+), 179 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 8f18127d876a..424ebdb0f09d 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -2,203 +2,186 @@ #include "parse-events.h" #include "pmu.h" #include "tests.h" +#include "debug.h" #include #include #include -#include -#include -#include - -/* Simulated format definitions. */ -static struct test_format { - const char *name; - const char *value; -} test_formats[] = { - { "krava01", "config:0-1,62-63\n", }, - { "krava02", "config:10-17\n", }, - { "krava03", "config:5\n", }, - { "krava11", "config1:0,2,4,6,8,20-28\n", }, - { "krava12", "config1:63\n", }, - { "krava13", "config1:45-47\n", }, - { "krava21", "config2:0-3,10-13,20-23,30-33,40-43,50-53,60-63\n", }, - { "krava22", "config2:8,18,48,58\n", }, - { "krava23", "config2:28-29,38\n", }, -}; - -/* Simulated users input. */ -static struct parse_events_term test_terms[] = { - { - .config = "krava01", - .val.num = 15, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava02", - .val.num = 170, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava03", - .val.num = 1, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava11", - .val.num = 27, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava12", - .val.num = 1, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava13", - .val.num = 2, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava21", - .val.num = 119, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava22", - .val.num = 11, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, - { - .config = "krava23", - .val.num = 2, - .type_val = PARSE_EVENTS__TERM_TYPE_NUM, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - }, -}; - -/* - * Prepare format directory data, exported by kernel - * at /sys/bus/event_source/devices//format. - */ -static char *test_format_dir_get(char *dir, size_t sz) -{ - unsigned int i; - - snprintf(dir, sz, "/tmp/perf-pmu-test-format-XXXXXX"); - if (!mkdtemp(dir)) - return NULL; - - for (i = 0; i < ARRAY_SIZE(test_formats); i++) { - char name[PATH_MAX]; - struct test_format *format = &test_formats[i]; - FILE *file; - - scnprintf(name, PATH_MAX, "%s/%s", dir, format->name); - - file = fopen(name, "w"); - if (!file) - return NULL; - - if (1 != fwrite(format->value, strlen(format->value), 1, file)) - break; +#include +#include +#include - fclose(file); - } - - return dir; -} +/* Fake PMUs created in temp directory. */ +static LIST_HEAD(test_pmus); -/* Cleanup format directory. */ -static int test_format_dir_put(char *dir) +/* Cleanup test PMU directory. */ +static int test_pmu_put(const char *dir, struct perf_pmu *pmu) { char buf[PATH_MAX + 20]; + int ret; - snprintf(buf, sizeof(buf), "rm -f %s/*\n", dir); - if (system(buf)) - return -1; + if (scnprintf(buf, sizeof(buf), "rm -fr %s", dir) < 0) { + pr_err("Failure to set up buffer for \"%s\"\n", dir); + return -EINVAL; + } + ret = system(buf); + if (ret) + pr_err("Failure to \"%s\"\n", buf); - snprintf(buf, sizeof(buf), "rmdir %s\n", dir); - return system(buf); + list_del(&pmu->list); + perf_pmu__delete(pmu); + return ret; } -static void add_test_terms(struct parse_events_terms *terms) +/* + * Prepare test PMU directory data, normally exported by kernel at + * /sys/bus/event_source/devices//. Give as input a buffer to hold the file + * path, the result is PMU loaded using that directory. + */ +static struct perf_pmu *test_pmu_get(char *dir, size_t sz) { - unsigned int i; + /* Simulated format definitions. */ + const struct test_format { + const char *name; + const char *value; + } test_formats[] = { + { "krava01", "config:0-1,62-63\n", }, + { "krava02", "config:10-17\n", }, + { "krava03", "config:5\n", }, + { "krava11", "config1:0,2,4,6,8,20-28\n", }, + { "krava12", "config1:63\n", }, + { "krava13", "config1:45-47\n", }, + { "krava21", "config2:0-3,10-13,20-23,30-33,40-43,50-53,60-63\n", }, + { "krava22", "config2:8,18,48,58\n", }, + { "krava23", "config2:28-29,38\n", }, + }; + char name[PATH_MAX]; + int dirfd, file; + struct perf_pmu *pmu = NULL; + ssize_t len; + + /* Create equivalent of sysfs mount point. */ + scnprintf(dir, sz, "/tmp/perf-pmu-test-XXXXXX"); + if (!mkdtemp(dir)) { + pr_err("mkdtemp failed\n"); + dir[0] = '\0'; + return NULL; + } + dirfd = open(dir, O_DIRECTORY); + if (dirfd < 0) { + pr_err("Failed to open test directory \"%s\"\n", dir); + goto err_out; + } - for (i = 0; i < ARRAY_SIZE(test_terms); i++) { - struct parse_events_term *clone; + /* Create the test PMU directory and give it a perf_event_attr type number. */ + if (mkdirat(dirfd, "perf-pmu-test", 0755) < 0) { + pr_err("Failed to mkdir PMU directory\n"); + goto err_out; + } + file = openat(dirfd, "perf-pmu-test/type", O_WRONLY | O_CREAT, 0600); + if (!file) { + pr_err("Failed to open for writing file \"type\"\n"); + goto err_out; + } + len = strlen("9999"); + if (write(file, "9999\n", len) < len) { + close(file); + pr_err("Failed to write to 'type' file\n"); + goto err_out; + } + close(file); - parse_events_term__clone(&clone, &test_terms[i]); - list_add_tail(&clone->list, &terms->terms); + /* Create format directory and files. */ + if (mkdirat(dirfd, "perf-pmu-test/format", 0755) < 0) { + pr_err("Failed to mkdir PMU format directory\n)"); + goto err_out; } + for (size_t i = 0; i < ARRAY_SIZE(test_formats); i++) { + const struct test_format *format = &test_formats[i]; + + if (scnprintf(name, PATH_MAX, "perf-pmu-test/format/%s", format->name) < 0) { + pr_err("Failure to set up path for \"%s\"\n", format->name); + goto err_out; + } + file = openat(dirfd, name, O_WRONLY | O_CREAT, 0600); + if (!file) { + pr_err("Failed to open for writing file \"%s\"\n", name); + goto err_out; + } + + if (write(file, format->value, strlen(format->value)) < 0) { + pr_err("Failed to write to file \"%s\"\n", name); + close(file); + goto err_out; + } + close(file); + } + + /* Make the PMU reading the files created above. */ + pmu = perf_pmus__add_test_pmu(dirfd, "perf-pmu-test"); + if (!pmu) + pr_err("Test PMU creation failed\n"); + +err_out: + if (!pmu) + test_pmu_put(dir, pmu); + if (dirfd >= 0) + close(dirfd); + return pmu; } -static int test__pmu(struct test_suite *test __maybe_unused, int subtest __maybe_unused) +static int test__pmu_format(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { char dir[PATH_MAX]; - char *format; - struct parse_events_terms terms; struct perf_event_attr attr; - struct perf_pmu *pmu; - int fd; - int ret; + struct parse_events_terms terms; + int ret = TEST_FAIL; + struct perf_pmu *pmu = test_pmu_get(dir, sizeof(dir)); - parse_events_terms__init(&terms); - add_test_terms(&terms); - pmu = zalloc(sizeof(*pmu)); - if (!pmu) { - parse_events_terms__exit(&terms); - return -ENOMEM; - } + if (!pmu) + return TEST_FAIL; - INIT_LIST_HEAD(&pmu->format); - INIT_LIST_HEAD(&pmu->aliases); - INIT_LIST_HEAD(&pmu->caps); - format = test_format_dir_get(dir, sizeof(dir)); - if (!format) { - free(pmu); - parse_events_terms__exit(&terms); - return -EINVAL; + parse_events_terms__init(&terms); + if (parse_events_terms(&terms, + "krava01=15,krava02=170,krava03=1,krava11=27,krava12=1," + "krava13=2,krava21=119,krava22=11,krava23=2", + NULL)) { + pr_err("Term parsing failed\n"); + goto err_out; } memset(&attr, 0, sizeof(attr)); - - fd = open(format, O_DIRECTORY); - if (fd < 0) { - ret = fd; - goto out; + ret = perf_pmu__config_terms(pmu, &attr, &terms, /*zero=*/false, /*err=*/NULL); + if (ret) { + pr_err("perf_pmu__config_terms failed"); + goto err_out; } - pmu->name = strdup("perf-pmu-test"); - ret = perf_pmu__format_parse(pmu, fd, /*eager_load=*/true); - if (ret) - goto out; + if (attr.config != 0xc00000000002a823) { + pr_err("Unexpected config value %llx\n", attr.config); + goto err_out; + } + if (attr.config1 != 0x8000400000000145) { + pr_err("Unexpected config1 value %llx\n", attr.config1); + goto err_out; + } + if (attr.config2 != 0x0400000020041d07) { + pr_err("Unexpected config2 value %llx\n", attr.config2); + goto err_out; + } - ret = perf_pmu__config_terms(pmu, &attr, &terms, /*zero=*/false, /*err=*/NULL); - if (ret) - goto out; - - ret = -EINVAL; - if (attr.config != 0xc00000000002a823) - goto out; - if (attr.config1 != 0x8000400000000145) - goto out; - if (attr.config2 != 0x0400000020041d07) - goto out; - - ret = 0; -out: - test_format_dir_put(format); - perf_pmu__delete(pmu); + ret = TEST_OK; +err_out: parse_events_terms__exit(&terms); + test_pmu_put(dir, pmu); return ret; } -DEFINE_SUITE("Parse perf pmu format", pmu); +static struct test_case tests__pmu[] = { + TEST_CASE("Parsing with PMU format directory", pmu_format), + { .name = NULL, } +}; + +struct test_suite suite__pmu = { + .desc = "Sysfs PMU tests", + .test_cases = tests__pmu, +}; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 0f308b4db2b9..2b9ede311c31 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2585,7 +2585,7 @@ int parse_events_term__term(struct parse_events_term **term, } int parse_events_term__clone(struct parse_events_term **new, - struct parse_events_term *term) + const struct parse_events_term *term) { char *str; struct parse_events_term temp = *term; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 5695308efab9..e7ac1f13376d 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -178,7 +178,7 @@ int parse_events_term__term(struct parse_events_term **term, enum parse_events__term_type term_rhs, void *loc_term, void *loc_val); int parse_events_term__clone(struct parse_events_term **new, - struct parse_events_term *term); + const struct parse_events_term *term); void parse_events_term__delete(struct parse_events_term *term); void parse_events_terms__delete(struct parse_events_terms *terms); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 74dd5bd49d9a..fbbc535ed93f 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -182,7 +182,7 @@ static void perf_pmu_format__load(const struct perf_pmu *pmu, struct perf_pmu_fo * Parse & process all the sysfs attributes located under * the directory specified in 'dir' parameter. */ -int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load) +static int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load) { struct dirent *evt_ent; DIR *format_dir; @@ -232,7 +232,7 @@ int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load) * located at: * /sys/bus/event_source/devices//format as sysfs group attributes. */ -static int pmu_format(struct perf_pmu *pmu, int dirfd, const char *name) +static int pmu_format(struct perf_pmu *pmu, int dirfd, const char *name, bool eager_load) { int fd; @@ -241,7 +241,7 @@ static int pmu_format(struct perf_pmu *pmu, int dirfd, const char *name) return 0; /* it'll close the fd */ - if (perf_pmu__format_parse(pmu, fd, /*eager_load=*/false)) + if (perf_pmu__format_parse(pmu, fd, eager_load)) return -1; return 0; @@ -994,7 +994,8 @@ perf_pmu__arch_init(struct perf_pmu *pmu) pmu->mem_events = perf_mem_events; } -struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *name) +struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *name, + bool eager_load) { struct perf_pmu *pmu; __u32 type; @@ -1023,7 +1024,7 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char * type value and format definitions. Load both right * now. */ - if (pmu_format(pmu, dirfd, name)) + if (pmu_format(pmu, dirfd, name, eager_load)) goto err; pmu->is_core = is_pmu_core(name); diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 93d03bd3ecbe..561716aa2b25 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -214,7 +214,6 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_ struct parse_events_error *err); int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb); -int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load); void perf_pmu_format__set_value(void *format, int config, unsigned long *bits); bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name); int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb); @@ -272,7 +271,8 @@ int perf_pmu__pathname_scnprintf(char *buf, size_t size, int perf_pmu__event_source_devices_fd(void); int perf_pmu__pathname_fd(int dirfd, const char *pmu_name, const char *filename, int flags); -struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *lookup_name); +struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *lookup_name, + bool eager_load); struct perf_pmu *perf_pmu__create_placeholder_core_pmu(struct list_head *core_pmus); void perf_pmu__delete(struct perf_pmu *pmu); struct perf_pmu *perf_pmus__find_core_pmu(void); diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 2fd369e45832..b9b4c5eb5002 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -124,7 +124,8 @@ struct perf_pmu *perf_pmus__find(const char *name) return NULL; dirfd = perf_pmu__event_source_devices_fd(); - pmu = perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name); + pmu = perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name, + /*eager_load=*/false); close(dirfd); if (!pmu) { @@ -159,7 +160,8 @@ static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) if (core_pmu && read_sysfs_core_pmus) return NULL; - return perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name); + return perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name, + /*eager_load=*/false); } static int pmus_cmp(void *priv __maybe_unused, @@ -696,3 +698,13 @@ struct perf_pmu *perf_pmus__find_core_pmu(void) { return perf_pmus__scan_core(NULL); } + +struct perf_pmu *perf_pmus__add_test_pmu(int test_sysfs_dirfd, const char *name) +{ + /* + * Some PMU functions read from the sysfs mount point, so care is + * needed, hence passing the eager_load flag to load things like the + * format files. + */ + return perf_pmu__lookup(&other_pmus, test_sysfs_dirfd, name, /*eager_load=*/true); +} diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index eec599d8aebd..9d4ded80b8e9 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -24,4 +24,6 @@ int perf_pmus__num_core_pmus(void); bool perf_pmus__supports_extended_type(void); char *perf_pmus__default_pmu_name(void); +struct perf_pmu *perf_pmus__add_test_pmu(int test_sysfs_dirfd, const char *name); + #endif /* __PMUS_H */ -- cgit v1.2.3 From 18eb2ca8c18f0612c15aa12375e7cba29e97ab1a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 May 2024 14:35:05 -0700 Subject: perf test pmu: Add an eagerly loaded event test Allow events/aliases to be eagerly loaded for a PMU. Factor out the pmu_aliases_parse to allow this. Parse a test event and check it configures the attribute as expected. There is overlap with the parse-events tests, but this test is done with a PMU created in a temp directory and doesn't rely on PMUs in sysfs. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Thomas Richter Link: https://lore.kernel.org/r/20240502213507.2339733-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/pmu.c | 69 +++++++++++++++++++++++++++++++-------------- 2 files changed, 124 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 424ebdb0f09d..071cfc51b52c 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "evlist.h" +#include "evsel.h" #include "parse-events.h" #include "pmu.h" #include "tests.h" @@ -54,6 +56,9 @@ static struct perf_pmu *test_pmu_get(char *dir, size_t sz) { "krava22", "config2:8,18,48,58\n", }, { "krava23", "config2:28-29,38\n", }, }; + const char *test_event = "krava01=15,krava02=170,krava03=1,krava11=27,krava12=1," + "krava13=2,krava21=119,krava22=11,krava23=2\n"; + char name[PATH_MAX]; int dirfd, file; struct perf_pmu *pmu = NULL; @@ -116,6 +121,24 @@ static struct perf_pmu *test_pmu_get(char *dir, size_t sz) close(file); } + /* Create test event. */ + if (mkdirat(dirfd, "perf-pmu-test/events", 0755) < 0) { + pr_err("Failed to mkdir PMU events directory\n"); + goto err_out; + } + file = openat(dirfd, "perf-pmu-test/events/test-event", O_WRONLY | O_CREAT, 0600); + if (!file) { + pr_err("Failed to open for writing file \"type\"\n"); + goto err_out; + } + len = strlen(test_event); + if (write(file, test_event, len) < len) { + close(file); + pr_err("Failed to write to 'test-event' file\n"); + goto err_out; + } + close(file); + /* Make the PMU reading the files created above. */ pmu = perf_pmus__add_test_pmu(dirfd, "perf-pmu-test"); if (!pmu) @@ -176,8 +199,61 @@ err_out: return ret; } +static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused) +{ + char dir[PATH_MAX]; + struct parse_events_error err; + struct evlist *evlist; + struct evsel *evsel; + struct perf_event_attr *attr; + int ret = TEST_FAIL; + struct perf_pmu *pmu = test_pmu_get(dir, sizeof(dir)); + const char *event = "perf-pmu-test/test-event/"; + + + if (!pmu) + return TEST_FAIL; + + evlist = evlist__new(); + if (evlist == NULL) { + pr_err("Failed allocation"); + goto err_out; + } + parse_events_error__init(&err); + ret = parse_events(evlist, event, &err); + if (ret) { + pr_debug("failed to parse event '%s', err %d\n", event, ret); + parse_events_error__print(&err, event); + if (parse_events_error__contains(&err, "can't access trace events")) + ret = TEST_SKIP; + goto err_out; + } + evsel = evlist__first(evlist); + attr = &evsel->core.attr; + if (attr->config != 0xc00000000002a823) { + pr_err("Unexpected config value %llx\n", attr->config); + goto err_out; + } + if (attr->config1 != 0x8000400000000145) { + pr_err("Unexpected config1 value %llx\n", attr->config1); + goto err_out; + } + if (attr->config2 != 0x0400000020041d07) { + pr_err("Unexpected config2 value %llx\n", attr->config2); + goto err_out; + } + + ret = TEST_OK; +err_out: + parse_events_error__exit(&err); + evlist__delete(evlist); + test_pmu_put(dir, pmu); + return ret; +} + static struct test_case tests__pmu[] = { TEST_CASE("Parsing with PMU format directory", pmu_format), + TEST_CASE("Parsing with PMU event", pmu_events), { .name = NULL, } }; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index fbbc535ed93f..7849be4bfea1 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -597,33 +597,18 @@ static inline bool pmu_alias_info_file(const char *name) * Reading the pmu event aliases definition, which should be located at: * /sys/bus/event_source/devices//events as sysfs group attributes. */ -static int pmu_aliases_parse(struct perf_pmu *pmu) +static int __pmu_aliases_parse(struct perf_pmu *pmu, int events_dir_fd) { - char path[PATH_MAX]; struct dirent *evt_ent; DIR *event_dir; - size_t len; - int fd, dir_fd; - len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); - if (!len) - return 0; - scnprintf(path + len, sizeof(path) - len, "%s/events", pmu->name); - - dir_fd = open(path, O_DIRECTORY); - if (dir_fd == -1) { - pmu->sysfs_aliases_loaded = true; - return 0; - } - - event_dir = fdopendir(dir_fd); - if (!event_dir){ - close (dir_fd); + event_dir = fdopendir(events_dir_fd); + if (!event_dir) return -EINVAL; - } while ((evt_ent = readdir(event_dir))) { char *name = evt_ent->d_name; + int fd; FILE *file; if (!strcmp(name, ".") || !strcmp(name, "..")) @@ -635,7 +620,7 @@ static int pmu_aliases_parse(struct perf_pmu *pmu) if (pmu_alias_info_file(name)) continue; - fd = openat(dir_fd, name, O_RDONLY); + fd = openat(events_dir_fd, name, O_RDONLY); if (fd == -1) { pr_debug("Cannot open %s\n", name); continue; @@ -653,11 +638,50 @@ static int pmu_aliases_parse(struct perf_pmu *pmu) } closedir(event_dir); - close (dir_fd); pmu->sysfs_aliases_loaded = true; return 0; } +static int pmu_aliases_parse(struct perf_pmu *pmu) +{ + char path[PATH_MAX]; + size_t len; + int events_dir_fd, ret; + + if (pmu->sysfs_aliases_loaded) + return 0; + + len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); + if (!len) + return 0; + scnprintf(path + len, sizeof(path) - len, "%s/events", pmu->name); + + events_dir_fd = open(path, O_DIRECTORY); + if (events_dir_fd == -1) { + pmu->sysfs_aliases_loaded = true; + return 0; + } + ret = __pmu_aliases_parse(pmu, events_dir_fd); + close(events_dir_fd); + return ret; +} + +static int pmu_aliases_parse_eager(struct perf_pmu *pmu, int sysfs_fd) +{ + char path[FILENAME_MAX + 7]; + int ret, events_dir_fd; + + scnprintf(path, sizeof(path), "%s/events", pmu->name); + events_dir_fd = openat(sysfs_fd, path, O_DIRECTORY, 0); + if (events_dir_fd == -1) { + pmu->sysfs_aliases_loaded = true; + return 0; + } + ret = __pmu_aliases_parse(pmu, events_dir_fd); + close(events_dir_fd); + return ret; +} + static int pmu_alias_terms(struct perf_pmu_alias *alias, int err_loc, struct list_head *terms) { struct parse_events_term *term, *cloned; @@ -1042,6 +1066,9 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char perf_pmu__arch_init(pmu); + if (eager_load) + pmu_aliases_parse_eager(pmu, dirfd); + return pmu; err: zfree(&pmu->name); -- cgit v1.2.3 From 6debc5aa326fa2eefe2988aaa4c46a1aa6b16e11 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 May 2024 14:35:06 -0700 Subject: perf test pmu: Test all sysfs PMU event names are the same case Being either lower or upper case means event name probes can avoid scanning the directory doing case insensitive comparisons, just the lower or upper case version of the name can be checked for existence. For the majority of PMUs event names are all lower case, upper case names are present on S390. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Thomas Richter Link: https://lore.kernel.org/r/20240502213507.2339733-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 071cfc51b52c..06cc0e46cb28 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -5,12 +5,17 @@ #include "pmu.h" #include "tests.h" #include "debug.h" +#include "fncache.h" +#include +#include +#include #include #include #include #include #include #include +#include /* Fake PMUs created in temp directory. */ static LIST_HEAD(test_pmus); @@ -251,9 +256,94 @@ err_out: return ret; } +static bool permitted_event_name(const char *name) +{ + bool has_lower = false, has_upper = false; + + for (size_t i = 0; i < strlen(name); i++) { + char c = name[i]; + + if (islower(c)) { + if (has_upper) + return false; + has_lower = true; + continue; + } + if (isupper(c)) { + if (has_lower) + return false; + has_upper = true; + continue; + } + if (!isdigit(c) && c != '.' && c != '_' && c != '-') + return false; + } + return true; +} + +static int test__pmu_event_names(struct test_suite *test __maybe_unused, + int subtest __maybe_unused) +{ + char path[PATH_MAX]; + DIR *pmu_dir, *event_dir; + struct dirent *pmu_dent, *event_dent; + const char *sysfs = sysfs__mountpoint(); + int ret = TEST_OK; + + if (!sysfs) { + pr_err("Sysfs not mounted\n"); + return TEST_FAIL; + } + + snprintf(path, sizeof(path), "%s/bus/event_source/devices/", sysfs); + pmu_dir = opendir(path); + if (!pmu_dir) { + pr_err("Error opening \"%s\"\n", path); + return TEST_FAIL; + } + while ((pmu_dent = readdir(pmu_dir))) { + if (!strcmp(pmu_dent->d_name, ".") || + !strcmp(pmu_dent->d_name, "..")) + continue; + + snprintf(path, sizeof(path), "%s/bus/event_source/devices/%s/type", + sysfs, pmu_dent->d_name); + + /* Does it look like a PMU? */ + if (!file_available(path)) + continue; + + /* Process events. */ + snprintf(path, sizeof(path), "%s/bus/event_source/devices/%s/events", + sysfs, pmu_dent->d_name); + + event_dir = opendir(path); + if (!event_dir) { + pr_debug("Skipping as no event directory \"%s\"\n", path); + continue; + } + while ((event_dent = readdir(event_dir))) { + const char *event_name = event_dent->d_name; + + if (!strcmp(event_name, ".") || !strcmp(event_name, "..")) + continue; + + if (!permitted_event_name(event_name)) { + pr_err("Invalid sysfs event name: %s/%s\n", + pmu_dent->d_name, event_name); + ret = TEST_FAIL; + } + } + closedir(event_dir); + } + closedir(pmu_dir); + return ret; +} + static struct test_case tests__pmu[] = { TEST_CASE("Parsing with PMU format directory", pmu_format), TEST_CASE("Parsing with PMU event", pmu_events), + TEST_CASE("PMU event names", pmu_event_names), { .name = NULL, } }; -- cgit v1.2.3 From 7b6dd7a923281a7ccb980a0f768d6926721eb3cc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 May 2024 14:35:07 -0700 Subject: perf pmu: Assume sysfs events are always the same case Perf event names aren't case sensitive. For sysfs events the entire directory of events is read then iterated comparing names in a case insensitive way, most often to see if an event is present. Consider: $ perf stat -e inst_retired.any true The event inst_retired.any may be present in any PMU, so every PMU's sysfs events are loaded and then searched with strcasecmp to see if any match. This event is only present on the cpu PMU as a JSON event so a lot of events were loaded from sysfs unnecessarily just to prove an event didn't exist there. This change avoids loading all the events by assuming sysfs event names are always either lower or uppercase. It uses file exists and only loads the events when the desired event is present. For the example above, the number of openat calls measured by 'perf trace' on a tigerlake laptop goes from 325 down to 255. The reduction will be larger for machines with many PMUs, particularly replicated uncore PMUs. Ensure pmu_aliases_parse() is called before all uses of the aliases list, but remove some "pmu->sysfs_aliases_loaded" tests as they are now part of the function. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Thomas Richter Link: https://lore.kernel.org/r/20240502213507.2339733-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 7849be4bfea1..b3b072feef02 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -425,9 +425,30 @@ static struct perf_pmu_alias *perf_pmu__find_alias(struct perf_pmu *pmu, { struct perf_pmu_alias *alias; - if (load && !pmu->sysfs_aliases_loaded) - pmu_aliases_parse(pmu); + if (load && !pmu->sysfs_aliases_loaded) { + bool has_sysfs_event; + char event_file_name[FILENAME_MAX + 8]; + /* + * Test if alias/event 'name' exists in the PMU's sysfs/events + * directory. If not skip parsing the sysfs aliases. Sysfs event + * name must be all lower or all upper case. + */ + scnprintf(event_file_name, sizeof(event_file_name), "events/%s", name); + for (size_t i = 7, n = 7 + strlen(name); i < n; i++) + event_file_name[i] = tolower(event_file_name[i]); + + has_sysfs_event = perf_pmu__file_exists(pmu, event_file_name); + if (!has_sysfs_event) { + for (size_t i = 7, n = 7 + strlen(name); i < n; i++) + event_file_name[i] = toupper(event_file_name[i]); + + has_sysfs_event = perf_pmu__file_exists(pmu, event_file_name); + } + if (has_sysfs_event) + pmu_aliases_parse(pmu); + + } list_for_each_entry(alias, &pmu->aliases, list) { if (!strcasecmp(alias->name, name)) return alias; @@ -1717,9 +1738,7 @@ size_t perf_pmu__num_events(struct perf_pmu *pmu) { size_t nr; - if (!pmu->sysfs_aliases_loaded) - pmu_aliases_parse(pmu); - + pmu_aliases_parse(pmu); nr = pmu->sysfs_aliases; if (pmu->cpu_aliases_added) @@ -1778,6 +1797,7 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus, struct strbuf sb; strbuf_init(&sb, /*hint=*/ 0); + pmu_aliases_parse(pmu); pmu_add_cpu_aliases(pmu); list_for_each_entry(event, &pmu->aliases, list) { size_t buf_used; @@ -2193,6 +2213,7 @@ const char *perf_pmu__name_from_config(struct perf_pmu *pmu, u64 config) if (!pmu) return NULL; + pmu_aliases_parse(pmu); pmu_add_cpu_aliases(pmu); list_for_each_entry(event, &pmu->aliases, list) { struct perf_event_attr attr = {.config = 0,}; -- cgit v1.2.3 From 3e51f2cbbc5dc854f89ca37d95d295bfcabb5b43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 May 2024 09:40:43 -0700 Subject: tools: ynl: add --list-ops and --list-msgs to CLI I often forget the exact naming of ops and have to look at the spec to find it. Add support for listing the operations: $ ./cli.py --spec .../netdev.yaml --list-ops dev-get [ do, dump ] page-pool-get [ do, dump ] page-pool-stats-get [ do, dump ] queue-get [ do, dump ] napi-get [ do, dump ] qstats-get [ dump ] For completeness also support listing all ops (including notifications: # ./cli.py --spec .../netdev.yaml --list-msgs dev-get [ dump, do ] dev-add-ntf [ notify ] dev-del-ntf [ notify ] dev-change-ntf [ notify ] page-pool-get [ dump, do ] page-pool-add-ntf [ notify ] page-pool-del-ntf [ notify ] page-pool-change-ntf [ notify ] page-pool-stats-get [ dump, do ] queue-get [ dump, do ] napi-get [ dump, do ] qstats-get [ dump ] Use double space after the name for slightly easier to read output. Reviewed-by: Jiri Pirko Reviewed-by: Donald Hunter Link: https://lore.kernel.org/r/20240502164043.2130184-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/cli.py | 9 +++++++++ tools/net/ynl/lib/nlspec.py | 2 ++ 2 files changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py index 058926d69ef0..b8481f401376 100755 --- a/tools/net/ynl/cli.py +++ b/tools/net/ynl/cli.py @@ -40,6 +40,8 @@ def main(): group.add_argument('--multi', dest='multi', nargs=2, action='append', metavar=('DO-OPERATION', 'JSON_TEXT'), type=str) group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) + group.add_argument('--list-ops', action='store_true') + group.add_argument('--list-msgs', action='store_true') parser.add_argument('--sleep', dest='sleep', type=int) parser.add_argument('--subscribe', dest='ntf', type=str) @@ -81,6 +83,13 @@ def main(): if args.sleep: time.sleep(args.sleep) + if args.list_ops: + for op_name, op in ynl.ops.items(): + print(op_name, " [", ", ".join(op.modes), "]") + if args.list_msgs: + for op_name, op in ynl.msgs.items(): + print(op_name, " [", ", ".join(op.modes), "]") + try: if args.do: reply = ynl.do(args.do, attrs, args.flags) diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index 6d08ab9e213f..b6d6f8aef423 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -335,6 +335,7 @@ class SpecOperation(SpecElement): req_value numerical ID when serialized, user -> kernel rsp_value numerical ID when serialized, user <- kernel + modes supported operation modes (do, dump, event etc.) is_call bool, whether the operation is a call is_async bool, whether the operation is a notification is_resv bool, whether the operation does not exist (it's just a reserved ID) @@ -350,6 +351,7 @@ class SpecOperation(SpecElement): self.req_value = req_value self.rsp_value = rsp_value + self.modes = yaml.keys() & {'do', 'dump', 'event', 'notify'} self.is_call = 'do' in yaml or 'dump' in yaml self.is_async = 'notify' in yaml or 'event' in yaml self.is_resv = not self.is_async and not self.is_call -- cgit v1.2.3 From 8f283fb7b8092a5cf56e87e4e1918be26f126598 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Feb 2024 09:23:57 -0800 Subject: perf trace: Disable syscall augmentation with record Syscall augmentation is causing samples not to be written to the perf.data file with "perf trace record". Disabling augmentation is sub-optimal, but it beats having a totally broken perf trace record. Closes: https://lore.kernel.org/lkml/CAP-5=fV9Gd1Teak+EOcUSxe13KqSyfZyPNagK97GbLiOQRgGaw@mail.gmail.com/ Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240216172357.65037-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index e5fef39c34bf..3818d3a62779 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4883,6 +4883,11 @@ int cmd_trace(int argc, const char **argv) if (!trace.trace_syscalls) goto skip_augmentation; + if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) { + pr_debug("Syscall augmentation fails with record, disabling augmentation"); + goto skip_augmentation; + } + trace.skel = augmented_raw_syscalls_bpf__open(); if (!trace.skel) { pr_debug("Failed to open augmented syscalls BPF skeleton"); -- cgit v1.2.3 From 45c072f2537ab07b38553e4d8f9e9fcf9be5fbd1 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 3 May 2024 12:46:19 +0530 Subject: perf vendor events amd: Add Zen 5 core events Add core events taken from Section 1.4 "Core Performance Monitor Counters" of the Performance Monitor Counters for AMD Family 1Ah Model 00h-0Fh Processors document available at the link below. This constitutes events which capture information on op dispatch, execution and retirement, branch prediction, L1 and L2 cache activity, TLB activity, etc. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://bugzilla.kernel.org/attachment.cgi?id=305974 Link: https://lore.kernel.org/r/668d194241bf0d42dc37f1c5af8131069a0bd82c.1714717230.git.sandipan.das@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/amdzen5/branch-prediction.json | 93 +++ tools/perf/pmu-events/arch/x86/amdzen5/decode.json | 115 +++ .../pmu-events/arch/x86/amdzen5/execution.json | 174 +++++ .../arch/x86/amdzen5/floating-point.json | 812 +++++++++++++++++++++ .../pmu-events/arch/x86/amdzen5/inst-cache.json | 72 ++ .../perf/pmu-events/arch/x86/amdzen5/l2-cache.json | 266 +++++++ .../pmu-events/arch/x86/amdzen5/load-store.json | 451 ++++++++++++ 7 files changed, 1983 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/decode.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/execution.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/load-store.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json b/tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json new file mode 100644 index 000000000000..2d8d18cb85c1 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/branch-prediction.json @@ -0,0 +1,93 @@ +[ + { + "EventName": "bp_l1_tlb_miss_l2_tlb_hit", + "EventCode": "0x84", + "BriefDescription": "Instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB." + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for 2M pages.", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for 1G pages.", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for coalesced pages. A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x08" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.all", + "EventCode": "0x85", + "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks are requested) for all page sizes.", + "UMask": "0x0f" + }, + { + "EventName": "bp_l2_btb_correct", + "EventCode": "0x8b", + "BriefDescription": "L2 branch prediction overrides existing prediction (speculative)." + }, + { + "EventName": "bp_dyn_ind_pred", + "EventCode": "0x8e", + "BriefDescription": "Dynamic indirect predictions (branch used the indirect predictor to make a prediction)." + }, + { + "EventName": "bp_de_redirect", + "EventCode": "0x91", + "BriefDescription": "Number of times an early redirect is sent to branch predictor. This happens when either the decoder or dispatch logic is able to detect that the branch predictor needs to be redirected." + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if4k", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 4k or coalesced pages. A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if2m", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 2M pages.", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if1g", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 1G pages.", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.all", + "EventCode": "0x94", + "BriefDescription": "Instruction fetches that hit in the L1 ITLB for all page sizes.", + "UMask": "0x07" + }, + { + "EventName": "bp_redirects.resync", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the branch predictor caused by resyncs.", + "UMask": "0x01" + }, + { + "EventName": "bp_redirects.ex_redir", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the branch predictor caused by mispredicts.", + "UMask": "0x02" + }, + { + "EventName": "bp_redirects.all", + "EventCode": "0x9f", + "BriefDescription": "Redirects of the branch predictor." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/decode.json b/tools/perf/pmu-events/arch/x86/amdzen5/decode.json new file mode 100644 index 000000000000..d0eff7f2a3ea --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/decode.json @@ -0,0 +1,115 @@ +[ + { + "EventName": "de_op_queue_empty", + "EventCode": "0xa9", + "BriefDescription": "Cycles where the op queue is empty. Such cycles indicate that the front-end is not delivering instructions fast enough." + }, + { + "EventName": "de_src_op_disp.x86_decoder", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from x86 decoder.", + "UMask": "0x01" + }, + { + "EventName": "de_src_op_disp.op_cache", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from op cache.", + "UMask": "0x02" + }, + { + "EventName": "de_src_op_disp.all", + "EventCode": "0xaa", + "BriefDescription": "Ops dispatched from any source.", + "UMask": "0x07" + }, + { + "EventName": "de_dis_ops_from_decoder.any_fp_dispatch", + "EventCode": "0xab", + "BriefDescription": "Number of ops dispatched to the floating-point unit.", + "UMask": "0x04" + }, + { + "EventName": "de_dis_ops_from_decoder.any_integer_dispatch", + "EventCode": "0xab", + "BriefDescription": "Number of ops dispatched to the integer execution unit.", + "UMask": "0x08" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.int_phy_reg_file_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to an integer physical register file resource stall.", + "UMask": "0x01" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.load_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a lack of load queue tokens.", + "UMask": "0x02" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.store_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a lack of store queue tokens.", + "UMask": "0x04" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.taken_brnch_buffer_rsrc", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a taken branch buffer resource stall.", + "UMask": "0x10" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.fp_sch_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a floating-point non-schedulable queue token stall.", + "UMask": "0x40" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.al_tokens", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of ALU tokens.", + "UMask": "0x01" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.ag_tokens", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of agen tokens.", + "UMask": "0x02" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.ex_flush_recovery", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a pending integer execution flush recovery.", + "UMask": "0x04" + }, + { + "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.retq", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of retire queue tokens.", + "UMask": "0x20" + }, + { + "EventName": "de_no_dispatch_per_slot.no_ops_from_frontend", + "EventCode": "0x1a0", + "BriefDescription": "In each cycle counts dispatch slots left empty because the front-end did not supply ops.", + "UMask": "0x01" + }, + { + "EventName": "de_no_dispatch_per_slot.backend_stalls", + "EventCode": "0x1a0", + "BriefDescription": "In each cycle counts ops unable to dispatch because of back-end stalls.", + "UMask": "0x1e" + }, + { + "EventName": "de_no_dispatch_per_slot.smt_contention", + "EventCode": "0x1a0", + "BriefDescription": "In each cycle counts ops unable to dispatch because the dispatch cycle was granted to the other SMT thread.", + "UMask": "0x60" + }, + { + "EventName": "de_additional_resource_stalls.dispatch_stalls", + "EventCode": "0x1a2", + "BriefDescription": "Counts additional cycles where dispatch is stalled due to a lack of dispatch resources.", + "UMask": "0x30" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/execution.json b/tools/perf/pmu-events/arch/x86/amdzen5/execution.json new file mode 100644 index 000000000000..5a46d3db74e7 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/execution.json @@ -0,0 +1,174 @@ +[ + { + "EventName": "ex_ret_instr", + "EventCode": "0xc0", + "BriefDescription": "Retired instructions." + }, + { + "EventName": "ex_ret_ops", + "EventCode": "0xc1", + "BriefDescription": "Retired macro-ops." + }, + { + "EventName": "ex_ret_brn", + "EventCode": "0xc2", + "BriefDescription": "Retired branch instructions (all types of architectural control flow changes, including exceptions and interrupts)." + }, + { + "EventName": "ex_ret_brn_misp", + "EventCode": "0xc3", + "BriefDescription": "Retired branch instructions mispredicted." + }, + { + "EventName": "ex_ret_brn_tkn", + "EventCode": "0xc4", + "BriefDescription": "Retired taken branch instructions (all types of architectural control flow changes, including exceptions and interrupts)." + }, + { + "EventName": "ex_ret_brn_tkn_misp", + "EventCode": "0xc5", + "BriefDescription": "Retired taken branch instructions mispredicted." + }, + { + "EventName": "ex_ret_brn_far", + "EventCode": "0xc6", + "BriefDescription": "Retired far control transfers (far call/jump/return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts). Far control transfers are not subject to branch prediction." + }, + { + "EventName": "ex_ret_near_ret", + "EventCode": "0xc8", + "BriefDescription": "Retired near returns (RET or RET Iw)." + }, + { + "EventName": "ex_ret_near_ret_mispred", + "EventCode": "0xc9", + "BriefDescription": "Retired near returns mispredicted. Each misprediction incurs the same penalty as a mispredicted conditional branch instruction." + }, + { + "EventName": "ex_ret_brn_ind_misp", + "EventCode": "0xca", + "BriefDescription": "Retired indirect branch instructions mispredicted (only EX mispredicts). Each misprediction incurs the same penalty as a mispredicted conditional branch instruction." + }, + { + "EventName": "ex_ret_mmx_fp_instr.x87", + "EventCode": "0xcb", + "BriefDescription": "Retired x87 instructions.", + "UMask": "0x01" + }, + { + "EventName": "ex_ret_mmx_fp_instr.mmx", + "EventCode": "0xcb", + "BriefDescription": "Retired MMX instructions.", + "UMask": "0x02" + }, + { + "EventName": "ex_ret_mmx_fp_instr.sse", + "EventCode": "0xcb", + "BriefDescription": "Retired SSE instructions (includes SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42 and AVX).", + "UMask": "0x04" + }, + { + "EventName": "ex_ret_ind_brch_instr", + "EventCode": "0xcc", + "BriefDescription": "Retired indirect branch instructions." + }, + { + "EventName": "ex_ret_cond", + "EventCode": "0xd1", + "BriefDescription": "Retired conditional branch instructions." + }, + { + "EventName": "ex_div_busy", + "EventCode": "0xd3", + "BriefDescription": "Number of cycles the divider is busy." + }, + { + "EventName": "ex_div_count", + "EventCode": "0xd4", + "BriefDescription": "Divide ops executed." + }, + { + "EventName": "ex_no_retire.empty", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire due to the lack of valid ops in the retire queue (may be caused by front-end bottlenecks or pipeline redirects).", + "UMask": "0x01" + }, + { + "EventName": "ex_no_retire.not_complete", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire while the oldest op is waiting to be executed.", + "UMask": "0x02" + }, + { + "EventName": "ex_no_retire.other", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire caused by other reasons (retire breaks, traps, faults, etc.).", + "UMask": "0x08" + }, + { + "EventName": "ex_no_retire.thread_not_selected", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire because thread arbitration did not select the thread.", + "UMask": "0x10" + }, + { + "EventName": "ex_no_retire.load_not_complete", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire while the oldest op is waiting for load data.", + "UMask": "0xa2" + }, + { + "EventName": "ex_no_retire.all", + "EventCode": "0xd6", + "BriefDescription": "Cycles with no retire for any reason.", + "UMask": "0x1b" + }, + { + "EventName": "ex_ret_ucode_instr", + "EventCode": "0x1c1", + "BriefDescription": "Retired microcoded instructions." + }, + { + "EventName": "ex_ret_ucode_ops", + "EventCode": "0x1c2", + "BriefDescription": "Retired microcode ops." + }, + { + "EventName": "ex_ret_msprd_brnch_instr_dir_msmtch", + "EventCode": "0x1c7", + "BriefDescription": "Retired branch instructions mispredicted due to direction mismatch." + }, + { + "EventName": "ex_ret_uncond_brnch_instr_mispred", + "EventCode": "0x1c8", + "BriefDescription": "Retired unconditional indirect branch instructions mispredicted." + }, + { + "EventName": "ex_ret_uncond_brnch_instr", + "EventCode": "0x1c9", + "BriefDescription": "Retired unconditional branch instructions." + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops", + "EventCode": "0x1cf", + "BriefDescription": "Ops tagged by IBS.", + "UMask": "0x01" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret", + "EventCode": "0x1cf", + "BriefDescription": "Ops tagged by IBS that retired.", + "UMask": "0x02" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_count_rollover", + "EventCode": "0x1cf", + "BriefDescription": "Ops not tagged by IBS due to a previous tagged op that has not yet signaled interrupt.", + "UMask": "0x04" + }, + { + "EventName": "ex_ret_fused_instr", + "EventCode": "0x1d0", + "BriefDescription": "Retired fused instructions." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json new file mode 100644 index 000000000000..9204bfb1d69e --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/floating-point.json @@ -0,0 +1,812 @@ +[ + { + "EventName": "fp_ret_x87_fp_ops.add_sub_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point add and subtract ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ret_x87_fp_ops.mul_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point multiply ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ret_x87_fp_ops.div_sqrt_ops", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point divide and square root ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ret_x87_fp_ops.all", + "EventCode": "0x02", + "BriefDescription": "Retired x87 floating-point ops of all types.", + "UMask": "0x07" + }, + { + "EventName": "fp_ret_sse_avx_ops.add_sub_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point add and subtract ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ret_sse_avx_ops.mult_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point multiply ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ret_sse_avx_ops.div_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point divide and square root ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ret_sse_avx_ops.mac_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point multiply-accumulate ops (each operation is counted as 2 ops).", + "UMask": "0x08" + }, + { + "EventName": "fp_ret_sse_avx_ops.bfloat16_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point bfloat16 ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ret_sse_avx_ops.scalar_single_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point scalar single-precision ops.", + "UMask": "0x40" + }, + { + "EventName": "fp_ret_sse_avx_ops.packed_single_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point packed single-precision ops.", + "UMask": "0x60" + }, + { + "EventName": "fp_ret_sse_avx_ops.scalar_double_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point scalar double-precision ops.", + "UMask": "0x80" + }, + { + "EventName": "fp_ret_sse_avx_ops.packed_double_flops", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point packed double-precision ops.", + "UMask": "0xa0" + }, + { + "EventName": "fp_ret_sse_avx_ops.all", + "EventCode": "0x03", + "BriefDescription": "Retired SSE and AVX floating-point ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_ops_retired_by_width.x87_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired x87 floating-point ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ops_retired_by_width.mmx_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired MMX floating-point ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ops_retired_by_width.scalar_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired scalar floating-point ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ops_retired_by_width.pack_128_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired packed 128-bit floating-point ops.", + "UMask": "0x08" + }, + { + "EventName": "fp_ops_retired_by_width.pack_256_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired packed 256-bit floating-point ops.", + "UMask": "0x10" + }, + { + "EventName": "fp_ops_retired_by_width.pack_512_uops_retired", + "EventCode": "0x08", + "BriefDescription": "Retired packed 512-bit floating-point ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ops_retired_by_width.all", + "EventCode": "0x08", + "BriefDescription": "Retired floating-point ops of all widths.", + "UMask": "0x3f" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_add", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point add ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_sub", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_mul", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_mac", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_div", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point divide ops.", + "UMask": "0x05" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_sqrt", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point square root ops.", + "UMask": "0x06" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_cmp", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point compare ops.", + "UMask": "0x07" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_cvt", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point convert ops.", + "UMask": "0x08" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_blend", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point blend ops.", + "UMask": "0x09" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_other", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_ops_retired_by_type.scalar_all", + "EventCode": "0x0a", + "BriefDescription": "Retired scalar floating-point ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_ops_retired_by_type.vector_add", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point add ops.", + "UMask": "0x10" + }, + { + "EventName": "fp_ops_retired_by_type.vector_sub", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_ops_retired_by_type.vector_mul", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point multiply ops.", + "UMask": "0x30" + }, + { + "EventName": "fp_ops_retired_by_type.vector_mac", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "fp_ops_retired_by_type.vector_div", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point divide ops.", + "UMask": "0x50" + }, + { + "EventName": "fp_ops_retired_by_type.vector_sqrt", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point square root ops.", + "UMask": "0x60" + }, + { + "EventName": "fp_ops_retired_by_type.vector_cmp", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point compare ops.", + "UMask": "0x70" + }, + { + "EventName": "fp_ops_retired_by_type.vector_cvt", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point convert ops.", + "UMask": "0x80" + }, + { + "EventName": "fp_ops_retired_by_type.vector_blend", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point blend ops.", + "UMask": "0x90" + }, + { + "EventName": "fp_ops_retired_by_type.vector_shuffle", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_ops_retired_by_type.vector_logical", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_ops_retired_by_type.vector_other", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_ops_retired_by_type.vector_all", + "EventCode": "0x0a", + "BriefDescription": "Retired vector floating-point ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_ops_retired_by_type.all", + "EventCode": "0x0a", + "BriefDescription": "Retired floating-point ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "sse_avx_ops_retired.mmx_add", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer add.", + "UMask": "0x01" + }, + { + "EventName": "sse_avx_ops_retired.mmx_sub", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "sse_avx_ops_retired.mmx_mul", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "sse_avx_ops_retired.mmx_mac", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "sse_avx_ops_retired.mmx_cmp", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer compare ops.", + "UMask": "0x07" + }, + { + "EventName": "sse_avx_ops_retired.mmx_shift", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer shift ops.", + "UMask": "0x09" + }, + { + "EventName": "sse_avx_ops_retired.mmx_mov", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer MOV ops.", + "UMask": "0x0a" + }, + { + "EventName": "sse_avx_ops_retired.mmx_shuffle", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "sse_avx_ops_retired.mmx_pack", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer pack ops.", + "UMask": "0x0c" + }, + { + "EventName": "sse_avx_ops_retired.mmx_logical", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer logical ops.", + "UMask": "0x0d" + }, + { + "EventName": "sse_avx_ops_retired.mmx_other", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer multiply ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "sse_avx_ops_retired.mmx_all", + "EventCode": "0x0b", + "BriefDescription": "Retired MMX integer ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_add", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer add ops.", + "UMask": "0x10" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_sub", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_mul", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer multiply ops.", + "UMask": "0x30" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_mac", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_aes", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer AES ops.", + "UMask": "0x50" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_sha", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer SHA ops.", + "UMask": "0x60" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_cmp", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer compare ops.", + "UMask": "0x70" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_clm", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer CLM ops.", + "UMask": "0x80" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_shift", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer shift ops.", + "UMask": "0x90" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_mov", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer MOV ops.", + "UMask": "0xa0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_shuffle", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_pack", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer pack ops.", + "UMask": "0xc0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_logical", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_other", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "sse_avx_ops_retired.sse_avx_all", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE and AVX integer ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "sse_avx_ops_retired.all", + "EventCode": "0x0b", + "BriefDescription": "Retired SSE, AVX and MMX integer ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_pack_ops_retired.fp128_add", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point add ops.", + "UMask": "0x01" + }, + { + "EventName": "fp_pack_ops_retired.fp128_sub", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "fp_pack_ops_retired.fp128_mul", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "fp_pack_ops_retired.fp128_mac", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "fp_pack_ops_retired.fp128_div", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point divide ops.", + "UMask": "0x05" + }, + { + "EventName": "fp_pack_ops_retired.fp128_sqrt", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point square root ops.", + "UMask": "0x06" + }, + { + "EventName": "fp_pack_ops_retired.fp128_cmp", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point compare ops.", + "UMask": "0x07" + }, + { + "EventName": "fp_pack_ops_retired.fp128_cvt", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point convert ops.", + "UMask": "0x08" + }, + { + "EventName": "fp_pack_ops_retired.fp128_blend", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point blend ops.", + "UMask": "0x09" + }, + { + "EventName": "fp_pack_ops_retired.fp128_shuffle", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "fp_pack_ops_retired.fp128_logical", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point logical ops.", + "UMask": "0x0d" + }, + { + "EventName": "fp_pack_ops_retired.fp128_other", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "fp_pack_ops_retired.fp128_all", + "EventCode": "0x0c", + "BriefDescription": "Retired 128-bit packed floating-point ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "fp_pack_ops_retired.fp256_add", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point add ops.", + "UMask": "0x10" + }, + { + "EventName": "fp_pack_ops_retired.fp256_sub", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "fp_pack_ops_retired.fp256_mul", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point multiply ops.", + "UMask": "0x30" + }, + { + "EventName": "fp_pack_ops_retired.fp256_mac", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "fp_pack_ops_retired.fp256_div", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point divide ops.", + "UMask": "0x50" + }, + { + "EventName": "fp_pack_ops_retired.fp256_sqrt", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point square root ops.", + "UMask": "0x60" + }, + { + "EventName": "fp_pack_ops_retired.fp256_cmp", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point compare ops.", + "UMask": "0x70" + }, + { + "EventName": "fp_pack_ops_retired.fp256_cvt", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point convert ops.", + "UMask": "0x80" + }, + { + "EventName": "fp_pack_ops_retired.fp256_blend", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point blend ops.", + "UMask": "0x90" + }, + { + "EventName": "fp_pack_ops_retired.fp256_shuffle", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "fp_pack_ops_retired.fp256_logical", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "fp_pack_ops_retired.fp256_other", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "fp_pack_ops_retired.fp256_all", + "EventCode": "0x0c", + "BriefDescription": "Retired 256-bit packed floating-point ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "fp_pack_ops_retired.all", + "EventCode": "0x0c", + "BriefDescription": "Retired packed floating-point ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "packed_int_op_type.int128_add", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer add ops.", + "UMask": "0x01" + }, + { + "EventName": "packed_int_op_type.int128_sub", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer subtract ops.", + "UMask": "0x02" + }, + { + "EventName": "packed_int_op_type.int128_mul", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer multiply ops.", + "UMask": "0x03" + }, + { + "EventName": "packed_int_op_type.int128_mac", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer multiply-accumulate ops.", + "UMask": "0x04" + }, + { + "EventName": "packed_int_op_type.int128_aes", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer AES ops.", + "UMask": "0x05" + }, + { + "EventName": "packed_int_op_type.int128_sha", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer SHA ops.", + "UMask": "0x06" + }, + { + "EventName": "packed_int_op_type.int128_cmp", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer compare ops.", + "UMask": "0x07" + }, + { + "EventName": "packed_int_op_type.int128_clm", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer CLM ops.", + "UMask": "0x08" + }, + { + "EventName": "packed_int_op_type.int128_shift", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer shift ops.", + "UMask": "0x09" + }, + { + "EventName": "packed_int_op_type.int128_mov", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer MOV ops.", + "UMask": "0x0a" + }, + { + "EventName": "packed_int_op_type.int128_shuffle", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0x0b" + }, + { + "EventName": "packed_int_op_type.int128_pack", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer pack ops.", + "UMask": "0x0c" + }, + { + "EventName": "packed_int_op_type.int128_logical", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer logical ops.", + "UMask": "0x0d" + }, + { + "EventName": "packed_int_op_type.int128_other", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer ops of other types.", + "UMask": "0x0e" + }, + { + "EventName": "packed_int_op_type.int128_all", + "EventCode": "0x0d", + "BriefDescription": "Retired 128-bit packed integer ops of all types.", + "UMask": "0x0f" + }, + { + "EventName": "packed_int_op_type.int256_add", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer add ops.", + "UMask": "0x10" + }, + { + "EventName": "packed_int_op_type.int256_sub", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer subtract ops.", + "UMask": "0x20" + }, + { + "EventName": "packed_int_op_type.int256_mul", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer multiply ops.", + "UMask": "0x30" + }, + { + "EventName": "packed_int_op_type.int256_mac", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer multiply-accumulate ops.", + "UMask": "0x40" + }, + { + "EventName": "packed_int_op_type.int256_cmp", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer compare ops.", + "UMask": "0x70" + }, + { + "EventName": "packed_int_op_type.int256_shift", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer shift ops.", + "UMask": "0x90" + }, + { + "EventName": "packed_int_op_type.int256_mov", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer MOV ops.", + "UMask": "0xa0" + }, + { + "EventName": "packed_int_op_type.int256_shuffle", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer shuffle ops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).", + "UMask": "0xb0" + }, + { + "EventName": "packed_int_op_type.int256_pack", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer pack ops.", + "UMask": "0xc0" + }, + { + "EventName": "packed_int_op_type.int256_logical", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer logical ops.", + "UMask": "0xd0" + }, + { + "EventName": "packed_int_op_type.int256_other", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer ops of other types.", + "UMask": "0xe0" + }, + { + "EventName": "packed_int_op_type.int256_all", + "EventCode": "0x0d", + "BriefDescription": "Retired 256-bit packed integer ops of all types.", + "UMask": "0xf0" + }, + { + "EventName": "packed_int_op_type.all", + "EventCode": "0x0d", + "BriefDescription": "Retired packed integer ops of all types.", + "UMask": "0xff" + }, + { + "EventName": "fp_disp_faults.x87_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for x87 fills.", + "UMask": "0x01" + }, + { + "EventName": "fp_disp_faults.xmm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for XMM fills.", + "UMask": "0x02" + }, + { + "EventName": "fp_disp_faults.ymm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for YMM fills.", + "UMask": "0x04" + }, + { + "EventName": "fp_disp_faults.ymm_spill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults for YMM spills.", + "UMask": "0x08" + }, + { + "EventName": "fp_disp_faults.sse_avx_all", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults of all types for SSE and AVX ops.", + "UMask": "0x0e" + }, + { + "EventName": "fp_disp_faults.all", + "EventCode": "0x0e", + "BriefDescription": "Floating-point dispatch faults of all types.", + "UMask": "0x0f" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json b/tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json new file mode 100644 index 000000000000..ad75e5bf9513 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/inst-cache.json @@ -0,0 +1,72 @@ +[ + { + "EventName": "ic_cache_fill_l2", + "EventCode": "0x82", + "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from the L2 cache." + }, + { + "EventName": "ic_cache_fill_sys", + "EventCode": "0x83", + "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from system memory or another cache." + }, + { + "EventName": "ic_fetch_ibs_events.fetch_tagged", + "EventCode": "0x188", + "BriefDescription": "Fetches tagged by Fetch IBS. Not all tagged fetches result in a valid sample and an IBS interrupt.", + "UMask": "0x02" + }, + { + "EventName": "ic_fetch_ibs_events.sample_discarded", + "EventCode": "0x188", + "BriefDescription": "Fetches discarded after being tagged by Fetch IBS due to reasons other than IBS filtering.", + "UMask": "0x04" + }, + { + "EventName": "ic_fetch_ibs_events.sample_filtered", + "EventCode": "0x188", + "BriefDescription": "Fetches discarded after being tagged by Fetch IBS due to IBS filtering.", + "UMask": "0x08" + }, + { + "EventName": "ic_fetch_ibs_events.sample_valid", + "EventCode": "0x188", + "BriefDescription": "Fetches tagged by Fetch IBS that result in a valid sample and an IBS interrupt.", + "UMask": "0x10" + }, + { + "EventName": "ic_tag_hit_miss.instruction_cache_hit", + "EventCode": "0x18e", + "BriefDescription": "Instruction cache hits.", + "UMask": "0x07" + }, + { + "EventName": "ic_tag_hit_miss.instruction_cache_miss", + "EventCode": "0x18e", + "BriefDescription": "Instruction cache misses.", + "UMask": "0x18" + }, + { + "EventName": "ic_tag_hit_miss.all_instruction_cache_accesses", + "EventCode": "0x18e", + "BriefDescription": "Instruction cache accesses of all types.", + "UMask": "0x1f" + }, + { + "EventName": "op_cache_hit_miss.op_cache_hit", + "EventCode": "0x28f", + "BriefDescription": "Op cache hits.", + "UMask": "0x03" + }, + { + "EventName": "op_cache_hit_miss.op_cache_miss", + "EventCode": "0x28f", + "BriefDescription": "Op cache misses.", + "UMask": "0x04" + }, + { + "EventName": "op_cache_hit_miss.all_op_cache_accesses", + "EventCode": "0x28f", + "BriefDescription": "Op cache accesses of all types.", + "UMask": "0x07" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json b/tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json new file mode 100644 index 000000000000..d1de51a02922 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/l2-cache.json @@ -0,0 +1,266 @@ +[ + { + "EventName": "l2_request_g1.group2", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of non-cacheable type (non-cached data and instructions reads, self-modifying code checks).", + "UMask": "0x01" + }, + { + "EventName": "l2_request_g1.l2_hw_pf", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: from hardware prefetchers to prefetch directly into L2 (hit or miss).", + "UMask": "0x02" + }, + { + "EventName": "l2_request_g1.prefetch_l2_cmd", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: prefetch directly into L2.", + "UMask": "0x04" + }, + { + "EventName": "l2_request_g1.cacheable_ic_read", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: instruction cache reads.", + "UMask": "0x10" + }, + { + "EventName": "l2_request_g1.ls_rd_blk_c_s", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: data cache shared reads.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g1.rd_blk_x", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: data cache stores.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g1.rd_blk_l", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests: data cache reads including hardware and software prefetch.", + "UMask": "0x80" + }, + { + "EventName": "l2_request_g1.all_dc", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of common types from L1 data cache (including prefetches).", + "UMask": "0xe0" + }, + { + "EventName": "l2_request_g1.all_no_prefetch", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of common types not including prefetches.", + "UMask": "0xf1" + }, + { + "EventName": "l2_request_g1.all", + "EventCode": "0x60", + "BriefDescription": "L2 cache requests of all types.", + "UMask": "0xf7" + }, + { + "EventName": "l2_request_g2.ls_rd_sized_nc", + "EventCode": "0x61", + "BriefDescription": "L2 cache requests: non-coherent, non-cacheable LS sized reads.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g2.ls_rd_sized", + "EventCode": "0x61", + "BriefDescription": "L2 cache requests: coherent, non-cacheable LS sized reads.", + "UMask": "0x40" + }, + { + "EventName": "l2_wcb_req.wcb_close", + "EventCode": "0x63", + "BriefDescription": "Write Combining Buffer (WCB) closures.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_miss", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: instruction cache request miss in L2.", + "UMask": "0x01" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: instruction cache hit non-modifiable line in L2.", + "UMask": "0x02" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: instruction cache hit modifiable line in L2.", + "UMask": "0x04" + }, + { + "EventName": "l2_cache_req_stat.ic_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for instruction cache hits.", + "UMask": "0x06" + }, + { + "EventName": "l2_cache_req_stat.ic_access_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for instruction cache access.", + "UMask": "0x07" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_c", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache request miss in L2.", + "UMask": "0x08" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data and instruction cache misses.", + "UMask": "0x09" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache store or state change hit in L2.", + "UMask": "0x10" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache read hit non-modifiable line in L2.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache read hit modifiable line in L2.", + "UMask": "0x40" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_cs", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) with status: data cache shared read hit in L2.", + "UMask": "0x80" + }, + { + "EventName": "l2_cache_req_stat.dc_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data cache hits.", + "UMask": "0xf0" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data and instruction cache hits.", + "UMask": "0xf6" + }, + { + "EventName": "l2_cache_req_stat.dc_access_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data cache access.", + "UMask": "0xf8" + }, + { + "EventName": "l2_cache_req_stat.all", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) for data and instruction cache access.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_hit_l2.l2_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_hit_l2.l1_dc_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_hit_l2.l1_dc_l2_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l2_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf", + "EventCode": "0x71", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_l3.l2_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L2 hardware prefetchers.", + "UMask": "0x1f" + }, + { + "EventName": "l2_pf_miss_l2_l3.l1_dc_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data hardware prefetchers.", + "UMask": "0xe0" + }, + { + "EventName": "l2_pf_miss_l2_l3.l1_dc_l2_hwpf", + "EventCode": "0x72", + "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data and L2 hardware prefetchers.", + "UMask": "0xff" + }, + { + "EventName": "l2_fill_rsp_src.local_ccx", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "l2_fill_rsp_src.near_cache", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "l2_fill_rsp_src.dram_io_near", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "l2_fill_rsp_src.far_cache", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "l2_fill_rsp_src.dram_io_far", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "l2_fill_rsp_src.alternate_memories", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "l2_fill_rsp_src.all", + "EventCode": "0x165", + "BriefDescription": "L2 cache fills from all types of data sources.", + "UMask": "0xde" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json new file mode 100644 index 000000000000..af2fdf1f55d6 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json @@ -0,0 +1,451 @@ +[ + { + "EventName": "ls_bad_status2.stli_other", + "EventCode": "0x24", + "BriefDescription": "Store-to-load conflicts (load unable to complete due to a non-forwardable conflict with an older store).", + "UMask": "0x02" + }, + { + "EventName": "ls_locks.bus_lock", + "EventCode": "0x25", + "BriefDescription": "Retired Lock instructions which caused a bus lock.", + "UMask": "0x01" + }, + { + "EventName": "ls_ret_cl_flush", + "EventCode": "0x26", + "BriefDescription": "Retired CLFLUSH instructions." + }, + { + "EventName": "ls_ret_cpuid", + "EventCode": "0x27", + "BriefDescription": "Retired CPUID instructions." + }, + { + "EventName": "ls_dispatch.ld_dispatch", + "EventCode": "0x29", + "BriefDescription": "Number of memory load operations dispatched to the load-store unit.", + "UMask": "0x01" + }, + { + "EventName": "ls_dispatch.store_dispatch", + "EventCode": "0x29", + "BriefDescription": "Number of memory store operations dispatched to the load-store unit.", + "UMask": "0x02" + }, + { + "EventName": "ls_dispatch.ld_st_dispatch", + "EventCode": "0x29", + "BriefDescription": "Number of memory load-store operations dispatched to the load-store unit.", + "UMask": "0x04" + }, + { + "EventName": "ls_dispatch.all", + "EventCode": "0x29", + "BriefDescription": "Number of memory operations dispatched to the load-store unit.", + "UMask": "0x07" + }, + { + "EventName": "ls_smi_rx", + "EventCode": "0x2b", + "BriefDescription": "SMIs received." + }, + { + "EventName": "ls_int_taken", + "EventCode": "0x2c", + "BriefDescription": "Interrupts taken." + }, + { + "EventName": "ls_stlf", + "EventCode": "0x35", + "BriefDescription": "Store-to-load-forward (STLF) hits." + }, + { + "EventName": "ls_st_commit_cancel2.st_commit_cancel_wcb_full", + "EventCode": "0x37", + "BriefDescription": "Non-cacheable store commits cancelled due to the non-cacheable commit buffer being full.", + "UMask": "0x01" + }, + { + "EventName": "ls_mab_alloc.load_store_allocations", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.", + "UMask": "0x3f" + }, + { + "EventName": "ls_mab_alloc.hardware_prefetcher_allocations", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.", + "UMask": "0x40" + }, + { + "EventName": "ls_mab_alloc.all_allocations", + "EventCode": "0x41", + "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.", + "UMask": "0x7f" + }, + { + "EventName": "ls_dmnd_fills_from_sys.local_l2", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_dmnd_fills_from_sys.local_ccx", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_dmnd_fills_from_sys.near_cache", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_dmnd_fills_from_sys.dram_io_near", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_dmnd_fills_from_sys.far_cache", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_dmnd_fills_from_sys.dram_io_far", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_dmnd_fills_from_sys.alternate_memories", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_dmnd_fills_from_sys.all", + "EventCode": "0x43", + "BriefDescription": "Demand data cache fills from all types of data sources.", + "UMask": "0xff" + }, + { + "EventName": "ls_any_fills_from_sys.local_l2", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_any_fills_from_sys.local_ccx", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_any_fills_from_sys.local_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from local L2 cache or L3 cache or different L2 cache in the same CCX.", + "UMask": "0x03" + }, + { + "EventName": "ls_any_fills_from_sys.near_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_near", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_any_fills_from_sys.far_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_any_fills_from_sys.remote_cache", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from cache of another CCX when the address was in the same or a different NUMA node.", + "UMask": "0x14" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_far", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_any_fills_from_sys.dram_io_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in any NUMA node (same or different socket).", + "UMask": "0x48" + }, + { + "EventName": "ls_any_fills_from_sys.far_all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either cache of another CCX, DRAM or MMIO when the address was in a different NUMA node (same or different socket).", + "UMask": "0x50" + }, + { + "EventName": "ls_any_fills_from_sys.all_dram_io", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from either DRAM or MMIO in any NUMA node (same or different socket).", + "UMask": "0x48" + }, + { + "EventName": "ls_any_fills_from_sys.alternate_memories", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_any_fills_from_sys.all", + "EventCode": "0x44", + "BriefDescription": "Any data cache fills from all types of data sources.", + "UMask": "0xff" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 4k pages.", + "UMask": "0x01" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for coalesced pages. A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x02" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 2M pages.", + "UMask": "0x04" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 1G pages.", + "UMask": "0x08" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for 4k pages.", + "UMask": "0x10" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for coalesced pages. A coalesced page is a 16k page created from four adjacent 4k pages.", + "UMask": "0x20" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for 2M pages.", + "UMask": "0x40" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for 1G pages.", + "UMask": "0x80" + }, + { + "EventName": "ls_l1_d_tlb_miss.all_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks are requested) for all page sizes.", + "UMask": "0xf0" + }, + { + "EventName": "ls_l1_d_tlb_miss.all", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB misses for all page sizes.", + "UMask": "0xff" + }, + { + "EventName": "ls_misal_loads.ma64", + "EventCode": "0x47", + "BriefDescription": "64B misaligned (cacheline crossing) loads.", + "UMask": "0x01" + }, + { + "EventName": "ls_misal_loads.ma4k", + "EventCode": "0x47", + "BriefDescription": "4kB misaligned (page crossing) loads.", + "UMask": "0x02" + }, + { + "EventName": "ls_pref_instr_disp.prefetch", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchT0 (move data to all cache levels), T1 (move data to all cache levels except L1) and T2 (move data to all cache levels except L1 and L2).", + "UMask": "0x01" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_w", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchW (move data to L1 cache and mark it modifiable).", + "UMask": "0x02" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_nta", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchNTA (move data with minimum cache pollution i.e. non-temporal access).", + "UMask": "0x04" + }, + { + "EventName": "ls_pref_instr_disp.all", + "EventCode": "0x4b", + "BriefDescription": "Software prefetch instructions dispatched (speculative) of all types.", + "UMask": "0x07" + }, + { + "EventName": "wcb_close.full_line_64b", + "EventCode": "0x50", + "BriefDescription": "Number of events that caused a Write Combining Buffer (WCB) entry to close because all 64 bytes of the entry have been written to.", + "UMask": "0x01" + }, + { + "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit", + "EventCode": "0x52", + "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a data cache hit.", + "UMask": "0x01" + }, + { + "EventName": "ls_inef_sw_pref.mab_mch_cnt", + "EventCode": "0x52", + "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a match on an already allocated Miss Address Buffer (MAB).", + "UMask": "0x02" + }, + { + "EventName": "ls_inef_sw_pref.all", + "EventCode": "0x52", + "BriefDescript6ion": "Software prefetches that did not fetch data outside of the processor core for any reason.", + "UMask": "0x03" + }, + { + "EventName": "ls_sw_pf_dc_fills.local_l2", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_sw_pf_dc_fills.local_ccx", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_sw_pf_dc_fills.near_cache", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from cache of another CCX in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_sw_pf_dc_fills.dram_io_near", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_sw_pf_dc_fills.far_cache", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from cache of another CCX in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_sw_pf_dc_fills.dram_io_far", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_sw_pf_dc_fills.alternate_memories", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_sw_pf_dc_fills.all", + "EventCode": "0x59", + "BriefDescription": "Software prefetch data cache fills from all types of data sources.", + "UMask": "0xdf" + }, + { + "EventName": "ls_hw_pf_dc_fills.local_l2", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from local L2 cache.", + "UMask": "0x01" + }, + { + "EventName": "ls_hw_pf_dc_fills.local_ccx", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from L3 cache or different L2 cache in the same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_hw_pf_dc_fills.near_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from cache of another CCX when the address was in the same NUMA node.", + "UMask": "0x04" + }, + { + "EventName": "ls_hw_pf_dc_fills.dram_io_near", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from either DRAM or MMIO in the same NUMA node.", + "UMask": "0x08" + }, + { + "EventName": "ls_hw_pf_dc_fills.far_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from cache of another CCX when the address was in a different NUMA node.", + "UMask": "0x10" + }, + { + "EventName": "ls_hw_pf_dc_fills.dram_io_far", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from either DRAM or MMIO in a different NUMA node (same or different socket).", + "UMask": "0x40" + }, + { + "EventName": "ls_hw_pf_dc_fills.alternate_memories", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from extension memory.", + "UMask": "0x80" + }, + { + "EventName": "ls_hw_pf_dc_fills.all", + "EventCode": "0x5a", + "BriefDescription": "Hardware prefetch data cache fills from all types of data sources.", + "UMask": "0xdf" + }, + { + "EventName": "ls_alloc_mab_count", + "EventCode": "0x5f", + "BriefDescription": "In-flight L1 data cache misses i.e. Miss Address Buffer (MAB) allocations each cycle." + }, + { + "EventName": "ls_not_halted_cyc", + "EventCode": "0x76", + "BriefDescription": "Core cycles not in halt." + }, + { + "EventName": "ls_tlb_flush.all", + "EventCode": "0x78", + "BriefDescription": "All TLB Flushes.", + "UMask": "0xff" + }, + { + "EventName": "ls_not_halted_p0_cyc.p0_freq_cyc", + "EventCode": "0x120", + "BriefDescription": "Reference cycles (P0 frequency) not in halt .", + "UMask": "0x1" + } +] -- cgit v1.2.3 From dc082ae61858dc262fd3a482846fb0fe4b947f33 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 3 May 2024 12:46:20 +0530 Subject: perf vendor events amd: Add Zen 5 uncore events Add uncore events taken from Section 1.5 "L3 Cache Performance Monitor Counters" and Section 2 "UMC Performance Monitors" of the Performance Monitor Counters for AMD Family 1Ah Model 00h-0Fh Processors document available at the link below. This constitutes events which capture L3 cache and UMC command activity. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://bugzilla.kernel.org/attachment.cgi?id=305974 Link: https://lore.kernel.org/r/e11e8d9d1af34a0fb565fc9d1c4a05f569c39ddc.1714717230.git.sandipan.das@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/x86/amdzen5/l3-cache.json | 177 +++++++++++++++++++++ .../arch/x86/amdzen5/memory-controller.json | 101 ++++++++++++ 2 files changed, 278 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json b/tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json new file mode 100644 index 000000000000..b50fe14d4520 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/l3-cache.json @@ -0,0 +1,177 @@ +[ + { + "EventName": "l3_lookup_state.l3_miss", + "EventCode": "0x04", + "BriefDescription": "L3 cache misses.", + "UMask": "0x01", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.l3_hit", + "EventCode": "0x04", + "BriefDescription": "L3 cache hits.", + "UMask": "0xfe", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.all_coherent_accesses_to_l3", + "EventCode": "0x04", + "BriefDescription": "L3 cache requests for all coherent accesses.", + "UMask": "0xff", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.dram_near", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from DRAM in the same NUMA node.", + "UMask": "0x01", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.dram_far", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from DRAM in a different NUMA node.", + "UMask": "0x02", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.near_cache", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from another CCX's cache when the address was in the same NUMA node.", + "UMask": "0x04", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.far_cache", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from another CCX's cache when the address was in a different NUMA node.", + "UMask": "0x08", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.ext_near", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from extension memory (CXL) in the same NUMA node.", + "UMask": "0x10", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.ext_far", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency when data is sourced from extension memory (CXL) in a different NUMA node.", + "UMask": "0x20", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency.all", + "EventCode": "0xac", + "BriefDescription": "Average sampled latency from all data sources.", + "UMask": "0x3f", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.dram_near", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from DRAM in the same NUMA node.", + "UMask": "0x01", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.dram_far", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from DRAM in a different NUMA node.", + "UMask": "0x02", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.near_cache", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from another CCX's cache when the address was in the same NUMA node.", + "UMask": "0x04", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.far_cache", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from another CCX's cache when the address was in a different NUMA node.", + "UMask": "0x08", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.ext_near", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from extension memory (CXL) in the same NUMA node.", + "UMask": "0x10", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.ext_far", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from extension memory (CXL) in a different NUMA node.", + "UMask": "0x20", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + }, + { + "EventName": "l3_xi_sampled_latency_requests.all", + "EventCode": "0xad", + "BriefDescription": "L3 cache fill requests sourced from all data sources.", + "UMask": "0x3f", + "EnAllCores": "0x1", + "EnAllSlices": "0x1", + "SliceId": "0x3", + "ThreadMask": "0x3", + "Unit": "L3PMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json b/tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json new file mode 100644 index 000000000000..1a629fc9474a --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/memory-controller.json @@ -0,0 +1,101 @@ +[ + { + "EventName": "umc_mem_clk", + "PublicDescription": "Number of memory clock (MEMCLK) cycles.", + "EventCode": "0x00", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.all", + "PublicDescription": "Number of ACTIVATE commands sent.", + "EventCode": "0x05", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.rd", + "PublicDescription": "Number of ACTIVATE commands sent for reads.", + "EventCode": "0x05", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.wr", + "PublicDescription": "Number of ACTIVATE commands sent for writes.", + "EventCode": "0x05", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.all", + "PublicDescription": "Number of PRECHARGE commands sent.", + "EventCode": "0x06", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.rd", + "PublicDescription": "Number of PRECHARGE commands sent for reads.", + "EventCode": "0x06", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.wr", + "PublicDescription": "Number of PRECHARGE commands sent for writes.", + "EventCode": "0x06", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.all", + "PublicDescription": "Number of CAS commands sent.", + "EventCode": "0x0a", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.rd", + "PublicDescription": "Number of CAS commands sent for reads.", + "EventCode": "0x0a", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.wr", + "PublicDescription": "Number of CAS commands sent for writes.", + "EventCode": "0x0a", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.all", + "PublicDescription": "Number of clock cycles used by the data bus.", + "EventCode": "0x14", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.rd", + "PublicDescription": "Number of clock cycles used by the data bus for reads.", + "EventCode": "0x14", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.wr", + "PublicDescription": "Number of clock cycles used by the data bus for writes.", + "EventCode": "0x14", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + } +] -- cgit v1.2.3 From a9fe4ac7a3a25f0242406db7aa237850692fb29c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 3 May 2024 12:46:21 +0530 Subject: perf vendor events amd: Add Zen 5 metrics Add metrics taken from Section 1.2 "Performance Measurement" of the Performance Monitor Counters for AMD Family 1Ah Model 00h-0Fh Processors document available at the link below. The recommended metrics are sourced from Table 1 "Guidance for Common Performance Statistics with Complex Event Selects". The pipeline utilization metrics are sourced from Table 2 "Guidance for Pipeline Utilization Analysis Statistics". These are useful for finding performance bottlenecks by analyzing activity at different stages of the pipeline. There are metric groups available for Level 1 and Level 2 analysis. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://bugzilla.kernel.org/attachment.cgi?id=305974 Link: https://lore.kernel.org/r/ee21ff77d89efa99997d3c2ebeeae22ddb6e7e12.1714717230.git.sandipan.das@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/x86/amdzen5/pipeline.json | 99 ++++++ .../pmu-events/arch/x86/amdzen5/recommended.json | 345 +++++++++++++++++++++ 2 files changed, 444 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen5/recommended.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json b/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json new file mode 100644 index 000000000000..d860bf599cf2 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/pipeline.json @@ -0,0 +1,99 @@ +[ + { + "MetricName": "total_dispatch_slots", + "BriefDescription": "Total dispatch slots (up to 8 instructions can be dispatched in each cycle).", + "MetricExpr": "8 * ls_not_halted_cyc", + "ScaleUnit": "1slots" + }, + { + "MetricName": "frontend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because the frontend did not supply enough instructions/ops.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation", + "BriefDescription": "Percentage of dispatched ops that did not retire.", + "MetricExpr": "d_ratio(de_src_op_disp.all - ex_ret_ops, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound", + "BriefDescription": "Percentage of dispatch slots that remained unused because of backend stalls.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.backend_stalls, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "smt_contention", + "BriefDescription": "Percentage of dispatch slots that remained unused because the other thread was selected.", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.smt_contention, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring", + "BriefDescription": "Percentage of dispatch slots used by ops that retired.", + "MetricExpr": "d_ratio(ex_ret_ops, total_dispatch_slots)", + "MetricGroup": "PipelineL1", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_latency", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a latency bottleneck in the frontend (such as instruction cache or TLB misses).", + "MetricExpr": "d_ratio((8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "frontend_bound_by_bandwidth", + "BriefDescription": "Percentage of dispatch slots that remained unused because of a bandwidth bottleneck in the frontend (such as decode or op cache fetch bandwidth).", + "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend - (8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)", + "MetricGroup": "PipelineL2;frontend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "bad_speculation_from_mispredicts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to branch mispredicts.", + "MetricExpr": "d_ratio(bad_speculation * ex_ret_brn_misp, ex_ret_brn_misp + bp_redirects.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "bad_speculation_from_pipeline_restarts", + "BriefDescription": "Percentage of dispatched ops that were flushed due to pipeline restarts (resyncs).", + "MetricExpr": "d_ratio(bad_speculation * bp_redirects.resync, ex_ret_brn_misp + bp_redirects.resync)", + "MetricGroup": "PipelineL2;bad_speculation_group", + "ScaleUnit": "100%ops" + }, + { + "MetricName": "backend_bound_by_memory", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls due to the memory subsystem.", + "MetricExpr": "backend_bound * d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete)", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "backend_bound_by_cpu", + "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls not related to the memory subsystem.", + "MetricExpr": "backend_bound * (1 - d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete))", + "MetricGroup": "PipelineL2;backend_bound_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_fastpath", + "BriefDescription": "Percentage of dispatch slots used by fastpath ops that retired.", + "MetricExpr": "retiring * (1 - d_ratio(ex_ret_ucode_ops, ex_ret_ops))", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + }, + { + "MetricName": "retiring_from_microcode", + "BriefDescription": "Percentage of dispatch slots used by microcode ops that retired.", + "MetricExpr": "retiring * d_ratio(ex_ret_ucode_ops, ex_ret_ops)", + "MetricGroup": "PipelineL2;retiring_group", + "ScaleUnit": "100%slots" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json new file mode 100644 index 000000000000..c97874039c1e --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen5/recommended.json @@ -0,0 +1,345 @@ +[ + { + "MetricName": "branch_misprediction_rate", + "BriefDescription": "Execution-time branch misprediction rate (non-speculative).", + "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)", + "MetricGroup": "branch_prediction", + "ScaleUnit": "1per_branch" + }, + { + "MetricName": "all_data_cache_accesses_pti", + "BriefDescription": "All data cache accesses per thousand instructions.", + "MetricExpr": "ls_dispatch.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_accesses_pti", + "BriefDescription": "All L2 cache accesses per thousand instructions.", + "MetricExpr": "(l2_request_g1.all_no_prefetch + l2_pf_hit_l2.l2_hwpf + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_ic_misses_pti", + "BriefDescription": "L2 cache accesses from L1 instruction cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.cacheable_ic_read / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l1_dc_misses_pti", + "BriefDescription": "L2 cache accesses from L1 data cache misses (including prefetch) per thousand instructions.", + "MetricExpr": "l2_request_g1.all_dc / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_accesses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache accesses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_hit_l2.l1_dc_l2_hwpf + l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_misses_pti", + "BriefDescription": "All L2 cache misses per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache misses from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_fill_miss / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache misses from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ls_rd_blk_c / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_misses_from_l2_hwpf_pti", + "BriefDescription": "L2 cache misses from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "(l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l2_cache_hits_pti", + "BriefDescription": "All L2 cache hits per thousand instructions.", + "MetricExpr": "(l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2.l2_hwpf) / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_ic_miss_pti", + "BriefDescription": "L2 cache hits from L1 instruction cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.ic_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l1_dc_miss_pti", + "BriefDescription": "L2 cache hits from L1 data cache misses per thousand instructions.", + "MetricExpr": "l2_cache_req_stat.dc_hit_in_l2 / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_cache_hits_from_l2_hwpf_pti", + "BriefDescription": "L2 cache hits from L2 cache hardware prefetcher per thousand instructions.", + "MetricExpr": "l2_pf_hit_l2.l1_dc_l2_hwpf / instructions", + "MetricGroup": "l2_cache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l3_cache_accesses", + "BriefDescription": "L3 cache accesses.", + "MetricExpr": "l3_lookup_state.all_coherent_accesses_to_l3", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_misses", + "BriefDescription": "L3 misses (including cacheline state change requests).", + "MetricExpr": "l3_lookup_state.l3_miss", + "MetricGroup": "l3_cache" + }, + { + "MetricName": "l3_read_miss_latency", + "BriefDescription": "Average L3 read miss latency (in core clocks).", + "MetricExpr": "(l3_xi_sampled_latency.all * 10) / l3_xi_sampled_latency_requests.all", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_local_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for local DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_near * 10) / l3_xi_sampled_latency_requests.dram_near", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "l3_read_miss_latency_for_remote_dram", + "BriefDescription": "Average L3 read miss latency (in core clocks) for remote DRAM.", + "MetricExpr": "(l3_xi_sampled_latency.dram_far * 10) / l3_xi_sampled_latency_requests.dram_far", + "MetricGroup": "l3_cache", + "ScaleUnit": "1ns" + }, + { + "MetricName": "op_cache_fetch_miss_ratio", + "BriefDescription": "Op cache miss ratio for all fetches.", + "MetricExpr": "d_ratio(op_cache_hit_miss.op_cache_miss, op_cache_hit_miss.all_op_cache_accesses)", + "ScaleUnit": "100%" + }, + { + "MetricName": "ic_fetch_miss_ratio", + "BriefDescription": "Instruction cache miss ratio for all fetches. An instruction cache miss will not be counted by this metric if it is an OC hit.", + "MetricExpr": "d_ratio(ic_tag_hit_miss.instruction_cache_miss, ic_tag_hit_miss.all_instruction_cache_accesses)", + "ScaleUnit": "100%" + }, + { + "MetricName": "l1_data_cache_fills_from_memory_pti", + "BriefDescription": "L1 data cache fills from DRAM or MMIO in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.dram_io_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_remote_node_pti", + "BriefDescription": "L1 data cache fills from a different NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.far_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.local_all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_data_cache_fills_from_different_ccx_pti", + "BriefDescription": "L1 data cache fills from another CCX cache in any NUMA node per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.remote_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_l1_data_cache_fills_pti", + "BriefDescription": "All L1 data cache fills per thousand instructions.", + "MetricExpr": "ls_any_fills_from_sys.all / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_local_l2_pti", + "BriefDescription": "L1 demand data cache fills from local L2 cache per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_l2 / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_same_ccx_pti", + "BriefDescription": "L1 demand data cache fills from within the same CCX per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.local_ccx / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.near_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_near_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in the same NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_near / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_cache_pti", + "BriefDescription": "L1 demand data cache fills from another CCX cache in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.far_cache / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_demand_data_cache_fills_from_far_memory_pti", + "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in a different NUMA node per thousand instructions.", + "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_far / instructions", + "MetricGroup": "l1_dcache", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_itlb_misses_pti", + "BriefDescription": "L1 instruction TLB misses per thousand instructions.", + "MetricExpr": "(bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss.all) / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_itlb_misses_pti", + "BriefDescription": "L2 instruction TLB misses and instruction page walks per thousand instructions.", + "MetricExpr": "bp_l1_tlb_miss_l2_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l1_dtlb_misses_pti", + "BriefDescription": "L1 data TLB misses per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "l2_dtlb_misses_pti", + "BriefDescription": "L2 data TLB misses and data page walks per thousand instructions.", + "MetricExpr": "ls_l1_d_tlb_miss.all_l2_miss / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "all_tlbs_flushed_pti", + "BriefDescription": "All TLBs flushed per thousand instructions.", + "MetricExpr": "ls_tlb_flush.all / instructions", + "MetricGroup": "tlb", + "ScaleUnit": "1e3per_1k_instr" + }, + { + "MetricName": "macro_ops_dispatched", + "BriefDescription": "Macro-ops dispatched.", + "MetricExpr": "de_src_op_disp.all", + "MetricGroup": "decoder" + }, + { + "MetricName": "sse_avx_stalls", + "BriefDescription": "Mixed SSE/AVX stalls.", + "MetricExpr": "fp_disp_faults.sse_avx_all" + }, + { + "MetricName": "macro_ops_retired", + "BriefDescription": "Macro-ops retired.", + "MetricExpr": "ex_ret_ops" + }, + { + "MetricName": "umc_data_bus_utilization", + "BriefDescription": "Memory controller data bus utilization.", + "MetricExpr": "d_ratio(umc_data_slot_clks.all / 2, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_rate", + "BriefDescription": "Memory controller CAS command rate.", + "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_cas_cmd_read_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for reads.", + "MetricExpr": "d_ratio(umc_cas_cmd.rd, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_write_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for writes.", + "MetricExpr": "d_ratio(umc_cas_cmd.wr, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_mem_read_bandwidth", + "BriefDescription": "Estimated memory read bandwidth.", + "MetricExpr": "(umc_cas_cmd.rd * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_write_bandwidth", + "BriefDescription": "Estimated memory write bandwidth.", + "MetricExpr": "(umc_cas_cmd.wr * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_bandwidth", + "BriefDescription": "Estimated combined memory bandwidth.", + "MetricExpr": "(umc_cas_cmd.all * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_activate_cmd_rate", + "BriefDescription": "Memory controller ACTIVATE command rate.", + "MetricExpr": "d_ratio(umc_act_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + }, + { + "MetricName": "umc_precharge_cmd_rate", + "BriefDescription": "Memory controller PRECHARGE command rate.", + "MetricExpr": "d_ratio(umc_pchg_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1per_memclk" + } +] -- cgit v1.2.3 From 77a70f80751da3a673232e7bff3f71d8c3995eff Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 3 May 2024 12:46:22 +0530 Subject: perf vendor events amd: Add Zen 5 mapping Add a regular expression in the map file so that appropriate JSON event files are used for AMD Zen 5 processors belonging to Family 1Ah. Reviewed-by: Ian Rogers Signed-off-by: Sandipan Das Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://lore.kernel.org/r/862a6b683755601725f9081897a850127d085ace.1714717230.git.sandipan.das@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index c372e3594a69..c9891630be10 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -40,3 +40,4 @@ AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core AuthenticAMD-25-([245][[:xdigit:]]|[[:xdigit:]]),v1,amdzen3,core AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen4,core +AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen5,core -- cgit v1.2.3 From a7575bc541b8dc34078ca55a02237acef3103762 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 23 Apr 2024 12:22:20 -0700 Subject: tools: fix userspace compilation with new test_xarray changes Patch series "test_xarray: couple of fixes for v6-9-rc6", v2. Here are a couple of fixes which should be merged into the queue for v6.9-rc6. The first one was reported by Liam, after fixing that I noticed an issue with a test, and a fix for that is in the second patch. This patch (of 2): Liam reported that compiling the test_xarray on userspace was broken. I was not even aware that was possible but you can via and you can run these tests in userspace with: make -C tools/testing/radix-tree ./tools/testing/radix-tree/xarray Add the two helpers we need to fix compilation. We don't need a userspace schedule() so just make it do nothing. Link: https://lkml.kernel.org/r/20240423192221.301095-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20240423192221.301095-2-mcgrof@kernel.org Fixes: a60cc288a1a2 ("test_xarray: add tests for advanced multi-index use") Signed-off-by: Luis Chamberlain Reported-by: "Liam R. Howlett" Cc: Daniel Gomez Cc: Darrick J. Wong Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index c5c9d05f29da..c0a2bb785b92 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -18,6 +18,8 @@ #define pr_info printk #define pr_debug printk #define pr_cont printk +#define schedule() +#define PAGE_SHIFT 12 #define __acquires(x) #define __releases(x) -- cgit v1.2.3 From dc8dc573aae7e1482425804b672905013a7191cc Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Sun, 28 Apr 2024 11:05:29 +0800 Subject: selftests/vDSO: fix building errors on LoongArch Patch series "selftests/vDSO: Fix errors on LoongArch", v4. This patch (of 2): There exist the following errors when build vDSO selftests on LoongArch: # make headers && cd tools/testing/selftests/vDSO && make ... error: 'VDSO_VERSION' undeclared (first use in this function) ... error: 'VDSO_NAMES' undeclared (first use in this function) We can see the following code in arch/loongarch/vdso/vdso.lds.S: VERSION { LINUX_5.10 { global: __vdso_getcpu; __vdso_clock_getres; __vdso_clock_gettime; __vdso_gettimeofday; __vdso_rt_sigreturn; local: *; }; } so VDSO_VERSION should be 6 and VDSO_NAMES should be 1 for LoongArch, add them to fix the building errors on LoongArch. Link: https://lkml.kernel.org/r/20240428030530.24399-1-yangtiezhu@loongson.cn Link: https://lkml.kernel.org/r/20240428030530.24399-2-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Reviewed-by: Muhammad Usama Anjum Cc: Andy Lutomirski Cc: Kees Cook Cc: Mark Brown Cc: Shuah Khan Cc: Thomas Gleixner Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- tools/testing/selftests/vDSO/vdso_config.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index cdfed403ba13..7b543e7f04d7 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -53,15 +53,19 @@ #if __riscv_xlen == 32 #define VDSO_32BIT 1 #endif +#elif defined(__loongarch__) +#define VDSO_VERSION 6 +#define VDSO_NAMES 1 #endif -static const char *versions[6] = { +static const char *versions[7] = { "LINUX_2.6", "LINUX_2.6.15", "LINUX_2.6.29", "LINUX_2.6.39", "LINUX_4", "LINUX_4.15", + "LINUX_5.10" }; static const char *names[2][6] = { -- cgit v1.2.3 From 48f044a784d6783f862b035ce5c5cca81b0ca117 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Sun, 28 Apr 2024 11:05:30 +0800 Subject: selftests/vDSO: fix runtime errors on LoongArch It could not find __vdso_getcpu and __vdso_gettimeofday when test getcpu and gettimeofday on LoongArch. # make headers && cd tools/testing/selftests/vDSO && make # ./vdso_test_getcpu Could not find __vdso_getcpu # ./vdso_test_gettimeofday Could not find __vdso_gettimeofday One simple way is to add LoongArch case to define version and name, just like commit d942f231afc0 ("selftests/vDSO: Add riscv getcpu & gettimeofday test"), but it is not the best way. Since each architecture has already defined names and versions in vdso_config.h, it is proper to include vdso_config.h to get version and name for all archs. Link: https://lkml.kernel.org/r/20240428030530.24399-3-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Reviewed-by: Muhammad Usama Anjum Tested-by: Muhammad Usama Anjum Cc: Andy Lutomirski Cc: Kees Cook Cc: Mark Brown Cc: Shuah Khan Cc: Thomas Gleixner Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- tools/testing/selftests/vDSO/vdso_test_getcpu.c | 16 +++++-------- .../selftests/vDSO/vdso_test_gettimeofday.c | 26 ++++++---------------- 2 files changed, 13 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index 1df5d057d79f..b758f68c6c9c 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -13,13 +13,7 @@ #include "../kselftest.h" #include "parse_vdso.h" - -#if defined(__riscv) -const char *version = "LINUX_4.15"; -#else -const char *version = "LINUX_2.6"; -#endif -const char *name = "__vdso_getcpu"; +#include "vdso_config.h" struct getcpu_cache; typedef long (*getcpu_t)(unsigned int *, unsigned int *, @@ -27,6 +21,8 @@ typedef long (*getcpu_t)(unsigned int *, unsigned int *, int main(int argc, char **argv) { + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; unsigned long sysinfo_ehdr; unsigned int cpu, node; getcpu_t get_cpu; @@ -40,9 +36,9 @@ int main(int argc, char **argv) vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); - get_cpu = (getcpu_t)vdso_sym(version, name); + get_cpu = (getcpu_t)vdso_sym(version, name[4]); if (!get_cpu) { - printf("Could not find %s\n", name); + printf("Could not find %s\n", name[4]); return KSFT_SKIP; } @@ -50,7 +46,7 @@ int main(int argc, char **argv) if (ret == 0) { printf("Running on CPU %u node %u\n", cpu, node); } else { - printf("%s failed\n", name); + printf("%s failed\n", name[4]); return KSFT_FAIL; } diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index e411f287a426..ee4f1ca56a71 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -18,25 +18,13 @@ #include "../kselftest.h" #include "parse_vdso.h" - -/* - * ARM64's vDSO exports its gettimeofday() implementation with a different - * name and version from other architectures, so we need to handle it as - * a special case. - */ -#if defined(__aarch64__) -const char *version = "LINUX_2.6.39"; -const char *name = "__kernel_gettimeofday"; -#elif defined(__riscv) -const char *version = "LINUX_4.15"; -const char *name = "__vdso_gettimeofday"; -#else -const char *version = "LINUX_2.6"; -const char *name = "__vdso_gettimeofday"; -#endif +#include "vdso_config.h" int main(int argc, char **argv) { + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); @@ -47,10 +35,10 @@ int main(int argc, char **argv) /* Find gettimeofday. */ typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); - gtod_t gtod = (gtod_t)vdso_sym(version, name); + gtod_t gtod = (gtod_t)vdso_sym(version, name[0]); if (!gtod) { - printf("Could not find %s\n", name); + printf("Could not find %s\n", name[0]); return KSFT_SKIP; } @@ -61,7 +49,7 @@ int main(int argc, char **argv) printf("The time is %lld.%06lld\n", (long long)tv.tv_sec, (long long)tv.tv_usec); } else { - printf("%s failed\n", name); + printf("%s failed\n", name[0]); return KSFT_FAIL; } -- cgit v1.2.3 From 4673ad3bdca2678a8970c0076aa07f5302d914fb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 24 Apr 2024 11:53:01 +0100 Subject: selftests/mm: soft-dirty should fail if a testcase fails Previously soft-dirty was unconditionally exiting with success, even if one of its testcases failed. Let's fix that so that failure can be reported to automated systems properly. Link: https://lkml.kernel.org/r/20240424105301.3157695-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Muhammad Usama Anjum Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/soft-dirty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 7dbfa53d93a0..bdfa5d085f00 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -209,5 +209,5 @@ int main(int argc, char **argv) close(pagemap_fd); - return ksft_exit_pass(); + ksft_finished(); } -- cgit v1.2.3 From 881f1bb5e25c8982ed963b2d319fc0fc732e55db Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 23 Apr 2024 11:46:42 +0800 Subject: writeback: add wb_monitor.py script to monitor writeback info on bdi Add wb_monitor.py script to monitor writeback information on backing dev which makes it easier and more convenient to observe writeback behaviors of running system. The wb_monitor.py script is written based on wq_monitor.py. Following domain hierarchy is tested: global domain (320G) / \ cgroup domain1(10G) cgroup domain2(10G) | | bdi wb1 wb2 The wb_monitor.py script output is as following: ./wb_monitor.py 252:16 -c writeback reclaimable dirtied written avg_bw 252:16_1 0 0 0 0 102400 252:16_4284 672 820064 9230368 8410304 685612 252:16_4325 896 819840 10491264 9671648 652348 252:16 1568 1639904 19721632 18081952 1440360 writeback reclaimable dirtied written avg_bw 252:16_1 0 0 0 0 102400 252:16_4284 672 820064 9230368 8410304 685612 252:16_4325 896 819840 10491264 9671648 652348 252:16 1568 1639904 19721632 18081952 1440360 ... Link: https://lkml.kernel.org/r/20240423034643.141219-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Suggested-by: Tejun Heo Cc: Brian Foster Cc: David Howells Cc: David Sterba Cc: Jan Kara Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- tools/writeback/wb_monitor.py | 172 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tools/writeback/wb_monitor.py (limited to 'tools') diff --git a/tools/writeback/wb_monitor.py b/tools/writeback/wb_monitor.py new file mode 100644 index 000000000000..5e3591f1f9a9 --- /dev/null +++ b/tools/writeback/wb_monitor.py @@ -0,0 +1,172 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Kemeng Shi +# Copyright (C) 2024 Huawei Inc + +desc = """ +This is a drgn script based on wq_monitor.py to monitor writeback info on +backing dev. For more info on drgn, visit https://github.com/osandov/drgn. + + writeback(kB) Amount of dirty pages are currently being written back to + disk. + + reclaimable(kB) Amount of pages are currently reclaimable. + + dirtied(kB) Amount of pages have been dirtied. + + wrttien(kB) Amount of dirty pages have been written back to disk. + + avg_wb(kBps) Smoothly estimated write bandwidth of writing dirty pages + back to disk. +""" + +import signal +import re +import time +import json + +import drgn +from drgn.helpers.linux.list import list_for_each_entry + +import argparse +parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument('bdi', metavar='REGEX', nargs='*', + help='Target backing device name patterns (all if empty)') +parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1, + help='Monitoring interval (0 to print once and exit)') +parser.add_argument('-j', '--json', action='store_true', + help='Output in json') +parser.add_argument('-c', '--cgroup', action='store_true', + help='show writeback of bdi in cgroup') +args = parser.parse_args() + +bdi_list = prog['bdi_list'] + +WB_RECLAIMABLE = prog['WB_RECLAIMABLE'] +WB_WRITEBACK = prog['WB_WRITEBACK'] +WB_DIRTIED = prog['WB_DIRTIED'] +WB_WRITTEN = prog['WB_WRITTEN'] +NR_WB_STAT_ITEMS = prog['NR_WB_STAT_ITEMS'] + +PAGE_SHIFT = prog['PAGE_SHIFT'] + +def K(x): + return x << (PAGE_SHIFT - 10) + +class Stats: + def dict(self, now): + return { 'timestamp' : now, + 'name' : self.name, + 'writeback' : self.stats[WB_WRITEBACK], + 'reclaimable' : self.stats[WB_RECLAIMABLE], + 'dirtied' : self.stats[WB_DIRTIED], + 'written' : self.stats[WB_WRITTEN], + 'avg_wb' : self.avg_bw, } + + def table_header_str(): + return f'{"":>16} {"writeback":>10} {"reclaimable":>12} ' \ + f'{"dirtied":>9} {"written":>9} {"avg_bw":>9}' + + def table_row_str(self): + out = f'{self.name[-16:]:16} ' \ + f'{self.stats[WB_WRITEBACK]:10} ' \ + f'{self.stats[WB_RECLAIMABLE]:12} ' \ + f'{self.stats[WB_DIRTIED]:9} ' \ + f'{self.stats[WB_WRITTEN]:9} ' \ + f'{self.avg_bw:9} ' + return out + + def show_header(): + if Stats.table_fmt: + print() + print(Stats.table_header_str()) + + def show_stats(self): + if Stats.table_fmt: + print(self.table_row_str()) + else: + print(self.dict(Stats.now)) + +class WbStats(Stats): + def __init__(self, wb): + bdi_name = wb.bdi.dev_name.string_().decode() + # avoid to use bdi.wb.memcg_css which is only defined when + # CONFIG_CGROUP_WRITEBACK is enabled + if wb == wb.bdi.wb.address_of_(): + ino = "1" + else: + ino = str(wb.memcg_css.cgroup.kn.id.value_()) + self.name = bdi_name + '_' + ino + + self.stats = [0] * NR_WB_STAT_ITEMS + for i in range(NR_WB_STAT_ITEMS): + if wb.stat[i].count >= 0: + self.stats[i] = int(K(wb.stat[i].count)) + else: + self.stats[i] = 0 + + self.avg_bw = int(K(wb.avg_write_bandwidth)) + +class BdiStats(Stats): + def __init__(self, bdi): + self.name = bdi.dev_name.string_().decode() + self.stats = [0] * NR_WB_STAT_ITEMS + self.avg_bw = 0 + + def collectStats(self, wb_stats): + for i in range(NR_WB_STAT_ITEMS): + self.stats[i] += wb_stats.stats[i] + + self.avg_bw += wb_stats.avg_bw + +exit_req = False + +def sigint_handler(signr, frame): + global exit_req + exit_req = True + +def main(): + # handle args + Stats.table_fmt = not args.json + interval = args.interval + cgroup = args.cgroup + + re_str = None + if args.bdi: + for r in args.bdi: + if re_str is None: + re_str = r + else: + re_str += '|' + r + + filter_re = re.compile(re_str) if re_str else None + + # monitoring loop + signal.signal(signal.SIGINT, sigint_handler) + + while not exit_req: + Stats.now = time.time() + + Stats.show_header() + for bdi in list_for_each_entry('struct backing_dev_info', bdi_list.address_of_(), 'bdi_list'): + bdi_stats = BdiStats(bdi) + if filter_re and not filter_re.search(bdi_stats.name): + continue + + for wb in list_for_each_entry('struct bdi_writeback', bdi.wb_list.address_of_(), 'bdi_node'): + wb_stats = WbStats(wb) + bdi_stats.collectStats(wb_stats) + if cgroup: + wb_stats.show_stats() + + bdi_stats.show_stats() + if cgroup and Stats.table_fmt: + print() + + if interval == 0: + break + time.sleep(interval) + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 5bfa66bf86d792bbcc76bc09cf99a2ae9d6e0eec Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:21 +1000 Subject: selftests/powerpc/dexcr: Add DEXCR prctl interface test Some basic tests of the prctl interface of the DEXCR. Signed-off-by: Benjamin Gray [mpe: Add missing SPDX tag] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-6-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dexcr/.gitignore | 1 + tools/testing/selftests/powerpc/dexcr/Makefile | 4 +- tools/testing/selftests/powerpc/dexcr/dexcr.c | 40 ++++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 10 + tools/testing/selftests/powerpc/dexcr/dexcr_test.c | 215 +++++++++++++++++++++ 5 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index b82f45dd46b9..5d526613cd26 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1,2 +1,3 @@ +dexcr_test hashchk_test lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 1b775959f981..3b685b28f029 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,9 +1,11 @@ -TEST_GEN_PROGS := hashchk_test +TEST_GEN_PROGS := dexcr_test hashchk_test TEST_GEN_FILES := lsdexcr include ../../lib.mk include ../flags.mk +CFLAGS += $(KHDR_INCLUDES) + $(OUTPUT)/hashchk_test: CFLAGS += -fno-pie -no-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.c b/tools/testing/selftests/powerpc/dexcr/dexcr.c index 65ec5347de98..468fd0dc9912 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,45 @@ out: return exists; } +unsigned int pr_which_to_aspect(unsigned long which) +{ + switch (which) { + case PR_PPC_DEXCR_SBHE: + return DEXCR_PR_SBHE; + case PR_PPC_DEXCR_IBRTPD: + return DEXCR_PR_IBRTPD; + case PR_PPC_DEXCR_SRAPD: + return DEXCR_PR_SRAPD; + case PR_PPC_DEXCR_NPHIE: + return DEXCR_PR_NPHIE; + default: + FAIL_IF_EXIT_MSG(true, "unknown PR aspect"); + } +} + +int pr_get_dexcr(unsigned long which) +{ + return prctl(PR_PPC_GET_DEXCR, which, 0UL, 0UL, 0UL); +} + +int pr_set_dexcr(unsigned long which, unsigned long ctrl) +{ + return prctl(PR_PPC_SET_DEXCR, which, ctrl, 0UL, 0UL); +} + +bool pr_dexcr_aspect_supported(unsigned long which) +{ + if (pr_get_dexcr(which) == -1) + return errno == ENODEV; + + return true; +} + +bool pr_dexcr_aspect_editable(unsigned long which) +{ + return pr_get_dexcr(which) & PR_PPC_DEXCR_CTRL_EDITABLE; +} + /* * Just test if a bad hashchk triggers a signal, without checking * for support or if the NPHIE aspect is enabled. diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.h b/tools/testing/selftests/powerpc/dexcr/dexcr.h index f55cbbc8643b..a6aa7eac11da 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.h +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.h @@ -28,6 +28,16 @@ bool dexcr_exists(void); +bool pr_dexcr_aspect_supported(unsigned long which); + +bool pr_dexcr_aspect_editable(unsigned long which); + +int pr_get_dexcr(unsigned long pr_aspect); + +int pr_set_dexcr(unsigned long pr_aspect, unsigned long ctrl); + +unsigned int pr_which_to_aspect(unsigned long which); + bool hashchk_triggers(void); enum dexcr_source { diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr_test.c b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c new file mode 100644 index 000000000000..7a8657164908 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +/* + * Helper function for testing the behaviour of a newly exec-ed process + */ +static int dexcr_prctl_onexec_test_child(unsigned long which, const char *status) +{ + unsigned long dexcr = mfspr(SPRN_DEXCR_RO); + unsigned long aspect = pr_which_to_aspect(which); + int ctrl = pr_get_dexcr(which); + + if (!strcmp(status, "set")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), + "setting aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(!(aspect & dexcr), "setting aspect across exec did not take effect"); + } else if (!strcmp(status, "clear")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), + "clearing aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(aspect & dexcr, "clearing aspect across exec did not take effect"); + } else { + FAIL_IF_EXIT_MSG(true, "unknown expected status"); + } + + return 0; +} + +/* + * Test that the given prctl value can be manipulated freely + */ +static int dexcr_prctl_aspect_test(unsigned long which) +{ + unsigned long aspect = pr_which_to_aspect(which); + pid_t pid; + int ctrl; + int err; + int errno_save; + + SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_supported(which), "DEXCR aspect not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_editable(which), "DEXCR aspect not editable with prctl"); + + /* We reject invalid combinations of arguments */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear should be rejected with EINVAL"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear on exec should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear on exec should be rejected with EINVAL"); + + /* We set the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR, "config value unexpected clear flag"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "setting aspect did not take effect"); + + /* We clear the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET, "config value unexpected set flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "clearing aspect did not take effect"); + + /* We make it set on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC, "config value unexpected clear on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "scheduling aspect to set on exec should not change it now"); + + /* We make it clear on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect config should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC, "config value unexpected set on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should still be cleared"); + + /* We allow setting the current and on-exec value in a single call */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "process aspect should be set"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should be clear"); + + /* Verify the onexec value is applied across exec */ + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "set", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(mfspr(SPRN_DEXCR_RO) & aspect, + "setting aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "clear", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(!(mfspr(SPRN_DEXCR_RO) & aspect), + "clearing aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + return 0; +} + +static int dexcr_prctl_ibrtpd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_IBRTPD); +} + +static int dexcr_prctl_srapd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_SRAPD); +} + +static int dexcr_prctl_nphie_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_NPHIE); +} + +int main(int argc, char *argv[]) +{ + int err = 0; + + /* + * Some tests require checking what happens across exec, so we may be + * invoked as the child of a particular test + */ + if (argc > 1) { + if (argc == 3 && !strcmp(argv[0], "dexcr_prctl_onexec_test_child")) { + unsigned long which; + + err = parse_ulong(argv[1], strlen(argv[1]), &which, 10); + FAIL_IF_MSG(err, "failed to parse which value for child"); + + return dexcr_prctl_onexec_test_child(which, argv[2]); + } + + FAIL_IF_MSG(true, "unknown test case"); + } + + /* + * Otherwise we are the main test invocation and run the full suite + */ + err |= test_harness(dexcr_prctl_ibrtpd_test, "dexcr_prctl_ibrtpd"); + err |= test_harness(dexcr_prctl_srapd_test, "dexcr_prctl_srapd"); + err |= test_harness(dexcr_prctl_nphie_test, "dexcr_prctl_nphie"); + + return err; +} -- cgit v1.2.3 From 9930fba02a1c587849aea1e6c5688168013c065f Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:22 +1000 Subject: selftests/powerpc/dexcr: Attempt to enable NPHIE in hashchk selftest Now that a process can control its DEXCR to some extent, make the hashchk tests more reliable by explicitly setting the local and onexec NPHIE aspect. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-7-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dexcr/hashchk_test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c index 7d5658c9ebe4..645224bdc142 100644 --- a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c +++ b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c @@ -21,8 +21,14 @@ static int require_nphie(void) { SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + + pr_set_dexcr(PR_PPC_DEXCR_NPHIE, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + + if (get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE) + return 0; + SKIP_IF_MSG(!(get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE), - "DEXCR[NPHIE] not enabled"); + "Failed to enable DEXCR[NPHIE]"); return 0; } -- cgit v1.2.3 From 9c4866b209ad31cae7c832d45c6137ce6a993ca0 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:23 +1000 Subject: selftests/powerpc/dexcr: Add DEXCR config details to lsdexcr Now that the DEXCR can be configured with prctl, add a section in lsdexcr that explains why each aspect is set the way it is. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-8-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dexcr/lsdexcr.c | 113 +++++++++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c index 94abbfcc389e..a63db47b6610 100644 --- a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0+ -#include #include #include #include +#include #include "dexcr.h" #include "utils.h" @@ -16,6 +16,8 @@ struct dexcr_aspect { const char *name; const char *desc; unsigned int index; + unsigned long prctl; + const char *sysctl; }; static const struct dexcr_aspect aspects[] = { @@ -23,26 +25,36 @@ static const struct dexcr_aspect aspects[] = { .name = "SBHE", .desc = "Speculative branch hint enable", .index = 0, + .prctl = PR_PPC_DEXCR_SBHE, + .sysctl = "speculative_branch_hint_enable", }, { .name = "IBRTPD", .desc = "Indirect branch recurrent target prediction disable", .index = 3, + .prctl = PR_PPC_DEXCR_IBRTPD, + .sysctl = "indirect_branch_recurrent_target_prediction_disable", }, { .name = "SRAPD", .desc = "Subroutine return address prediction disable", .index = 4, + .prctl = PR_PPC_DEXCR_SRAPD, + .sysctl = "subroutine_return_address_prediction_disable", }, { .name = "NPHIE", .desc = "Non-privileged hash instruction enable", .index = 5, + .prctl = PR_PPC_DEXCR_NPHIE, + .sysctl = "nonprivileged_hash_instruction_enable", }, { .name = "PHIE", .desc = "Privileged hash instruction enable", .index = 6, + .prctl = -1, + .sysctl = NULL, }, }; @@ -60,7 +72,7 @@ static void print_dexcr(char *name, unsigned int bits) const char *enabled_aspects[ARRAY_SIZE(aspects) + 1] = {NULL}; size_t j = 0; - printf("%s: %08x", name, bits); + printf("%s: 0x%08x", name, bits); if (bits == 0) { printf("\n"); @@ -103,6 +115,95 @@ static void print_aspect(const struct dexcr_aspect *aspect) printf(" \t(%s)\n", aspect->desc); } +static void print_aspect_config(const struct dexcr_aspect *aspect) +{ + char sysctl_path[128] = "/proc/sys/kernel/dexcr/"; + const char *reason = "unknown"; + const char *reason_hyp = NULL; + const char *reason_sysctl = "no sysctl"; + const char *reason_prctl = "no prctl"; + bool actual = effective & DEXCR_PR_BIT(aspect->index); + bool expected = false; + + long sysctl_ctrl = 0; + int prctl_ctrl = 0; + int err; + + if (aspect->prctl >= 0) { + prctl_ctrl = pr_get_dexcr(aspect->prctl); + if (prctl_ctrl < 0) + reason_prctl = "(failed to read prctl)"; + else { + if (prctl_ctrl & PR_PPC_DEXCR_CTRL_SET) { + reason_prctl = "set by prctl"; + expected = true; + } else if (prctl_ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { + reason_prctl = "cleared by prctl"; + expected = false; + } else + reason_prctl = "unknown prctl"; + + reason = reason_prctl; + } + } + + if (aspect->sysctl) { + strcat(sysctl_path, aspect->sysctl); + err = read_long(sysctl_path, &sysctl_ctrl, 10); + if (err) + reason_sysctl = "(failed to read sysctl)"; + else { + switch (sysctl_ctrl) { + case 0: + reason_sysctl = "cleared by sysctl"; + reason = reason_sysctl; + expected = false; + break; + case 1: + reason_sysctl = "set by sysctl"; + reason = reason_sysctl; + expected = true; + break; + case 2: + reason_sysctl = "not modified by sysctl"; + break; + case 3: + reason_sysctl = "cleared by sysctl (permanent)"; + reason = reason_sysctl; + expected = false; + break; + case 4: + reason_sysctl = "set by sysctl (permanent)"; + reason = reason_sysctl; + expected = true; + break; + default: + reason_sysctl = "unknown sysctl"; + break; + } + } + } + + + if (hdexcr & DEXCR_PR_BIT(aspect->index)) { + reason_hyp = "set by hypervisor"; + reason = reason_hyp; + expected = true; + } else + reason_hyp = "not modified by hypervisor"; + + printf("%12s (%d): %-28s (%s, %s, %s)\n", + aspect->name, + aspect->index, + reason, + reason_hyp, + reason_sysctl, + reason_prctl); + + if (actual != expected) + printf(" : ! actual %s does not match config\n", aspect->name); +} + int main(int argc, char *argv[]) { if (!dexcr_exists()) { @@ -114,6 +215,8 @@ int main(int argc, char *argv[]) hdexcr = get_dexcr(HDEXCR); effective = dexcr | hdexcr; + printf("current status:\n"); + print_dexcr(" DEXCR", dexcr); print_dexcr(" HDEXCR", hdexcr); print_dexcr("Effective", effective); @@ -136,6 +239,12 @@ int main(int argc, char *argv[]) else printf("ignored\n"); } + printf("\n"); + + printf("configuration:\n"); + for (size_t i = 0; i < ARRAY_SIZE(aspects); i++) + print_aspect_config(&aspects[i]); + printf("\n"); return 0; } -- cgit v1.2.3 From f88723a609787254f7645eb6ac261b8363e8a5bc Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:24 +1000 Subject: selftests/powerpc/dexcr: Add chdexcr utility Adds a utility to exercise the prctl DEXCR inheritance in the shell. Supports setting and clearing each aspect. Signed-off-by: Benjamin Gray [mpe: Use correct SPDX license, use execvp() for usability, print errors] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-9-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dexcr/.gitignore | 1 + tools/testing/selftests/powerpc/dexcr/Makefile | 2 +- tools/testing/selftests/powerpc/dexcr/chdexcr.c | 112 +++++++++++++++++++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 47 ++++++++ tools/testing/selftests/powerpc/dexcr/lsdexcr.c | 130 +++++------------------ 5 files changed, 187 insertions(+), 105 deletions(-) create mode 100644 tools/testing/selftests/powerpc/dexcr/chdexcr.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index 5d526613cd26..11eefb4b9fa4 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1,3 +1,4 @@ dexcr_test hashchk_test +chdexcr lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 3b685b28f029..58cf9f722905 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,5 +1,5 @@ TEST_GEN_PROGS := dexcr_test hashchk_test -TEST_GEN_FILES := lsdexcr +TEST_GEN_FILES := lsdexcr chdexcr include ../../lib.mk include ../flags.mk diff --git a/tools/testing/selftests/powerpc/dexcr/chdexcr.c b/tools/testing/selftests/powerpc/dexcr/chdexcr.c new file mode 100644 index 000000000000..bda44630cada --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/chdexcr.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +static void die(const char *msg) +{ + printf("%s\n", msg); + exit(1); +} + +static void help(void) +{ + printf("Invoke a provided program with a custom DEXCR on-exec reset value\n" + "\n" + "usage: chdexcr [CHDEXCR OPTIONS] -- PROGRAM [ARGS...]\n" + "\n" + "Each configurable DEXCR aspect is exposed as an option.\n" + "\n" + "The normal option sets the aspect in the DEXCR. The --no- variant\n" + "clears that aspect. For example, --ibrtpd sets the IBRTPD aspect bit,\n" + "so indirect branch predicition will be disabled in the provided program.\n" + "Conversely, --no-ibrtpd clears the aspect bit, so indirect branch\n" + "prediction may occur.\n" + "\n" + "CHDEXCR OPTIONS:\n"); + + for (int i = 0; i < ARRAY_SIZE(aspects); i++) { + const struct dexcr_aspect *aspect = &aspects[i]; + + if (aspect->prctl == -1) + continue; + + printf(" --%-6s / --no-%-6s : %s\n", aspect->opt, aspect->opt, aspect->desc); + } +} + +static const struct dexcr_aspect *opt_to_aspect(const char *opt) +{ + for (int i = 0; i < ARRAY_SIZE(aspects); i++) + if (aspects[i].prctl != -1 && !strcmp(aspects[i].opt, opt)) + return &aspects[i]; + + return NULL; +} + +static int apply_option(const char *option) +{ + const struct dexcr_aspect *aspect; + const char *opt = NULL; + const char *set_prefix = "--"; + const char *clear_prefix = "--no-"; + unsigned long ctrl = 0; + int err; + + if (!strcmp(option, "-h") || !strcmp(option, "--help")) { + help(); + exit(0); + } + + /* Strip out --(no-) prefix and determine ctrl value */ + if (!strncmp(option, clear_prefix, strlen(clear_prefix))) { + opt = &option[strlen(clear_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC; + } else if (!strncmp(option, set_prefix, strlen(set_prefix))) { + opt = &option[strlen(set_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_SET_ONEXEC; + } + + if (!opt || !*opt) + return 1; + + aspect = opt_to_aspect(opt); + if (!aspect) + die("unknown aspect"); + + err = pr_set_dexcr(aspect->prctl, ctrl); + if (err) + die("failed to apply option"); + + return 0; +} + +int main(int argc, char *const argv[]) +{ + int i; + + if (!dexcr_exists()) + die("DEXCR not detected on this hardware"); + + for (i = 1; i < argc; i++) + if (apply_option(argv[i])) + break; + + if (i < argc && !strcmp(argv[i], "--")) + i++; + + if (i >= argc) + die("missing command"); + + execvp(argv[i], &argv[i]); + perror("execve"); + + return errno; +} diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.h b/tools/testing/selftests/powerpc/dexcr/dexcr.h index a6aa7eac11da..51e9ba3b0997 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.h +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.h @@ -9,6 +9,7 @@ #define _SELFTESTS_POWERPC_DEXCR_DEXCR_H #include +#include #include #include "reg.h" @@ -26,6 +27,52 @@ #define PPC_RAW_HASHCHK(b, i, a) \ str(.long (0x7C0005E4 | PPC_RAW_HASH_ARGS(b, i, a));) +struct dexcr_aspect { + const char *name; /* Short display name */ + const char *opt; /* Option name for chdexcr */ + const char *desc; /* Expanded aspect meaning */ + unsigned int index; /* Aspect bit index in DEXCR */ + unsigned long prctl; /* 'which' value for get/set prctl */ +}; + +static const struct dexcr_aspect aspects[] = { + { + .name = "SBHE", + .opt = "sbhe", + .desc = "Speculative branch hint enable", + .index = 0, + .prctl = PR_PPC_DEXCR_SBHE, + }, + { + .name = "IBRTPD", + .opt = "ibrtpd", + .desc = "Indirect branch recurrent target prediction disable", + .index = 3, + .prctl = PR_PPC_DEXCR_IBRTPD, + }, + { + .name = "SRAPD", + .opt = "srapd", + .desc = "Subroutine return address prediction disable", + .index = 4, + .prctl = PR_PPC_DEXCR_SRAPD, + }, + { + .name = "NPHIE", + .opt = "nphie", + .desc = "Non-privileged hash instruction enable", + .index = 5, + .prctl = PR_PPC_DEXCR_NPHIE, + }, + { + .name = "PHIE", + .opt = "phie", + .desc = "Privileged hash instruction enable", + .index = 6, + .prctl = -1, + }, +}; + bool dexcr_exists(void); bool pr_dexcr_aspect_supported(unsigned long which); diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c index a63db47b6610..7588929180ab 100644 --- a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ -12,52 +12,6 @@ static unsigned int dexcr; static unsigned int hdexcr; static unsigned int effective; -struct dexcr_aspect { - const char *name; - const char *desc; - unsigned int index; - unsigned long prctl; - const char *sysctl; -}; - -static const struct dexcr_aspect aspects[] = { - { - .name = "SBHE", - .desc = "Speculative branch hint enable", - .index = 0, - .prctl = PR_PPC_DEXCR_SBHE, - .sysctl = "speculative_branch_hint_enable", - }, - { - .name = "IBRTPD", - .desc = "Indirect branch recurrent target prediction disable", - .index = 3, - .prctl = PR_PPC_DEXCR_IBRTPD, - .sysctl = "indirect_branch_recurrent_target_prediction_disable", - }, - { - .name = "SRAPD", - .desc = "Subroutine return address prediction disable", - .index = 4, - .prctl = PR_PPC_DEXCR_SRAPD, - .sysctl = "subroutine_return_address_prediction_disable", - }, - { - .name = "NPHIE", - .desc = "Non-privileged hash instruction enable", - .index = 5, - .prctl = PR_PPC_DEXCR_NPHIE, - .sysctl = "nonprivileged_hash_instruction_enable", - }, - { - .name = "PHIE", - .desc = "Privileged hash instruction enable", - .index = 6, - .prctl = -1, - .sysctl = NULL, - }, -}; - static void print_list(const char *list[], size_t len) { for (size_t i = 0; i < len; i++) { @@ -117,89 +71,57 @@ static void print_aspect(const struct dexcr_aspect *aspect) static void print_aspect_config(const struct dexcr_aspect *aspect) { - char sysctl_path[128] = "/proc/sys/kernel/dexcr/"; - const char *reason = "unknown"; + const char *reason = NULL; const char *reason_hyp = NULL; - const char *reason_sysctl = "no sysctl"; const char *reason_prctl = "no prctl"; bool actual = effective & DEXCR_PR_BIT(aspect->index); - bool expected = false; - - long sysctl_ctrl = 0; - int prctl_ctrl = 0; - int err; - - if (aspect->prctl >= 0) { - prctl_ctrl = pr_get_dexcr(aspect->prctl); - if (prctl_ctrl < 0) - reason_prctl = "(failed to read prctl)"; - else { - if (prctl_ctrl & PR_PPC_DEXCR_CTRL_SET) { + bool expected = actual; /* Assume it's fine if we don't expect a specific set/clear value */ + + if (actual) + reason = "set by unknown"; + else + reason = "cleared by unknown"; + + if (aspect->prctl != -1) { + int ctrl = pr_get_dexcr(aspect->prctl); + + if (ctrl < 0) { + reason_prctl = "failed to read prctl"; + } else { + if (ctrl & PR_PPC_DEXCR_CTRL_SET) { reason_prctl = "set by prctl"; expected = true; - } else if (prctl_ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { + } else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { reason_prctl = "cleared by prctl"; expected = false; - } else + } else { reason_prctl = "unknown prctl"; + } reason = reason_prctl; } } - if (aspect->sysctl) { - strcat(sysctl_path, aspect->sysctl); - err = read_long(sysctl_path, &sysctl_ctrl, 10); - if (err) - reason_sysctl = "(failed to read sysctl)"; - else { - switch (sysctl_ctrl) { - case 0: - reason_sysctl = "cleared by sysctl"; - reason = reason_sysctl; - expected = false; - break; - case 1: - reason_sysctl = "set by sysctl"; - reason = reason_sysctl; - expected = true; - break; - case 2: - reason_sysctl = "not modified by sysctl"; - break; - case 3: - reason_sysctl = "cleared by sysctl (permanent)"; - reason = reason_sysctl; - expected = false; - break; - case 4: - reason_sysctl = "set by sysctl (permanent)"; - reason = reason_sysctl; - expected = true; - break; - default: - reason_sysctl = "unknown sysctl"; - break; - } - } - } - - if (hdexcr & DEXCR_PR_BIT(aspect->index)) { reason_hyp = "set by hypervisor"; reason = reason_hyp; expected = true; - } else + } else { reason_hyp = "not modified by hypervisor"; + } - printf("%12s (%d): %-28s (%s, %s, %s)\n", + printf("%12s (%d): %-28s (%s, %s)\n", aspect->name, aspect->index, reason, reason_hyp, - reason_sysctl, reason_prctl); + /* + * The checks are not atomic, so this can technically trigger if the + * hypervisor makes a change while we are checking each source. It's + * far more likely to be a bug if we see this though. + */ if (actual != expected) printf(" : ! actual %s does not match config\n", aspect->name); } -- cgit v1.2.3 From 3f4ac23a990853ab5012037767281dfd4beb4b15 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 4 May 2024 14:37:57 -0700 Subject: perf dsos: Switch backing storage to array from rbtree/list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DSOs were held on a list for fast iteration and in an rbtree for fast finds. Switch to using a lazily sorted array where iteration is just iterating through the array and binary searches are the same complexity as searching the rbtree. The find may need to sort the array first which does increase the complexity, but add operations have lower complexity and overall the complexity should remain about the same. The set name operations on the dso just records that the array is no longer sorted, avoiding complexity in rebalancing the rbtree. Tighter locking discipline is enforced to avoid the array being resorted while long and short names or ids are changed. The array is smaller in size, replacing 6 pointers with 2, and so even with extra allocated space in the array, the array may be 50% unoccupied, the memory saving should be at least 2x. Committer testing: On a previous version of this patchset we were getting a lot of warnings about deleting a DSO still on a list, now it is ok: root@x1:~# perf probe -l root@x1:~# perf probe finish_task_switch Added new event: probe:finish_task_switch (on finish_task_switch) You can now use it in all perf tools, such as: perf record -e probe:finish_task_switch -aR sleep 1 root@x1:~# perf probe -l probe:finish_task_switch (on finish_task_switch@kernel/sched/core.c) root@x1:~# perf trace -e probe:finish_task_switch/max-stack=8/ --max-events=1 0.000 migration/0/19 probe:finish_task_switch(__probe_ip: -1894408688) finish_task_switch.isra.0 ([kernel.kallsyms]) __schedule ([kernel.kallsyms]) schedule ([kernel.kallsyms]) smpboot_thread_fn ([kernel.kallsyms]) kthread ([kernel.kallsyms]) ret_from_fork ([kernel.kallsyms]) ret_from_fork_asm ([kernel.kallsyms]) root@x1:~# root@x1:~# perf probe -d probe:* Removed event: probe:finish_task_switch root@x1:~# perf probe -l root@x1:~# I also ran the full 'perf test' suite after applying this one, no regressions. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Dima Kogan Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Tiezhu Yang Cc: Yanteng Si Cc: zhaimingbing Link: https://lore.kernel.org/r/20240504213803.218974-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 67 +++++++++++------- tools/perf/util/dso.h | 10 ++- tools/perf/util/dsos.c | 188 +++++++++++++++++++++++++++++++------------------ tools/perf/util/dsos.h | 21 +++--- 4 files changed, 177 insertions(+), 109 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index ad562743d769..3caca60a6ce3 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1241,35 +1241,35 @@ struct dso *machine__findnew_kernel(struct machine *machine, const char *name, return dso; } -static void dso__set_long_name_id(struct dso *dso, const char *name, struct dso_id *id, bool name_allocated) +static void dso__set_long_name_id(struct dso *dso, const char *name, bool name_allocated) { - struct rb_root *root = dso->root; + struct dsos *dsos = dso->dsos; if (name == NULL) return; - if (dso->long_name_allocated) - free((char *)dso->long_name); - - if (root) { - rb_erase(&dso->rb_node, root); + if (dsos) { /* - * __dsos__findnew_link_by_longname_id() isn't guaranteed to - * add it back, so a clean removal is required here. + * Need to avoid re-sorting the dsos breaking by non-atomically + * renaming the dso. */ - RB_CLEAR_NODE(&dso->rb_node); - dso->root = NULL; + down_write(&dsos->lock); } + if (dso->long_name_allocated) + free((char *)dso->long_name); + dso->long_name = name; dso->long_name_len = strlen(name); dso->long_name_allocated = name_allocated; - if (root) - __dsos__findnew_link_by_longname_id(root, dso, NULL, id); + if (dsos) { + dsos->sorted = false; + up_write(&dsos->lock); + } } -static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) +static int __dso_id__cmp(const struct dso_id *a, const struct dso_id *b) { if (a->maj > b->maj) return -1; if (a->maj < b->maj) return 1; @@ -1297,7 +1297,7 @@ static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) return 0; } -bool dso_id__empty(struct dso_id *id) +bool dso_id__empty(const struct dso_id *id) { if (!id) return true; @@ -1305,15 +1305,22 @@ bool dso_id__empty(struct dso_id *id) return !id->maj && !id->min && !id->ino && !id->ino_generation; } -void dso__inject_id(struct dso *dso, struct dso_id *id) +void __dso__inject_id(struct dso *dso, struct dso_id *id) { + struct dsos *dsos = dso->dsos; + + /* dsos write lock held by caller. */ + dso->id.maj = id->maj; dso->id.min = id->min; dso->id.ino = id->ino; dso->id.ino_generation = id->ino_generation; + + if (dsos) + dsos->sorted = false; } -int dso_id__cmp(struct dso_id *a, struct dso_id *b) +int dso_id__cmp(const struct dso_id *a, const struct dso_id *b) { /* * The second is always dso->id, so zeroes if not set, assume passing @@ -1332,20 +1339,34 @@ int dso__cmp_id(struct dso *a, struct dso *b) void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) { - dso__set_long_name_id(dso, name, NULL, name_allocated); + dso__set_long_name_id(dso, name, name_allocated); } void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated) { + struct dsos *dsos = dso->dsos; + if (name == NULL) return; + if (dsos) { + /* + * Need to avoid re-sorting the dsos breaking by non-atomically + * renaming the dso. + */ + down_write(&dsos->lock); + } if (dso->short_name_allocated) free((char *)dso->short_name); dso->short_name = name; dso->short_name_len = strlen(name); dso->short_name_allocated = name_allocated; + + if (dsos) { + dsos->sorted = false; + up_write(&dsos->lock); + } } int dso__name_len(const struct dso *dso) @@ -1381,7 +1402,7 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) strcpy(dso->name, name); if (id) dso->id = *id; - dso__set_long_name_id(dso, dso->name, id, false); + dso__set_long_name_id(dso, dso->name, false); dso__set_short_name(dso, dso->name, false); dso->symbols = RB_ROOT_CACHED; dso->symbol_names = NULL; @@ -1406,9 +1427,6 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) dso->is_kmod = 0; dso->needs_swap = DSO_SWAP__UNSET; dso->comp = COMP_ID__NONE; - RB_CLEAR_NODE(&dso->rb_node); - dso->root = NULL; - INIT_LIST_HEAD(&dso->node); INIT_LIST_HEAD(&dso->data.open_entry); mutex_init(&dso->lock); refcount_set(&dso->refcnt, 1); @@ -1424,9 +1442,8 @@ struct dso *dso__new(const char *name) void dso__delete(struct dso *dso) { - if (!RB_EMPTY_NODE(&dso->rb_node)) - pr_err("DSO %s is still in rbtree when being deleted!\n", - dso->long_name); + if (dso->dsos) + pr_err("DSO %s is still in rbtree when being deleted!\n", dso->long_name); /* free inlines first, as they reference symbols */ inlines__tree_delete(&dso->inlined_nodes); diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 2c295438226d..b22dec8b3f3a 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -146,9 +146,7 @@ struct auxtrace_cache; struct dso { struct mutex lock; - struct list_head node; - struct rb_node rb_node; /* rbtree node sorted by long name */ - struct rb_root *root; /* root of rbtree that rb_node is in */ + struct dsos *dsos; struct rb_root_cached symbols; struct symbol **symbol_names; size_t symbol_names_len; @@ -238,8 +236,8 @@ static inline void dso__set_loaded(struct dso *dso) dso->loaded = true; } -int dso_id__cmp(struct dso_id *a, struct dso_id *b); -bool dso_id__empty(struct dso_id *id); +int dso_id__cmp(const struct dso_id *a, const struct dso_id *b); +bool dso_id__empty(const struct dso_id *id); struct dso *dso__new_id(const char *name, struct dso_id *id); struct dso *dso__new(const char *name); @@ -248,7 +246,7 @@ void dso__delete(struct dso *dso); int dso__cmp_id(struct dso *a, struct dso *b); void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated); void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated); -void dso__inject_id(struct dso *dso, struct dso_id *id); +void __dso__inject_id(struct dso *dso, struct dso_id *id); int dso__name_len(const struct dso *dso); diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index b7fbfb877ae3..c01f0b538e65 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -14,24 +14,30 @@ void dsos__init(struct dsos *dsos) { - INIT_LIST_HEAD(&dsos->head); - dsos->root = RB_ROOT; init_rwsem(&dsos->lock); + + dsos->cnt = 0; + dsos->allocated = 0; + dsos->dsos = NULL; + dsos->sorted = true; } static void dsos__purge(struct dsos *dsos) { - struct dso *pos, *n; - down_write(&dsos->lock); - list_for_each_entry_safe(pos, n, &dsos->head, node) { - RB_CLEAR_NODE(&pos->rb_node); - pos->root = NULL; - list_del_init(&pos->node); - dso__put(pos); + for (unsigned int i = 0; i < dsos->cnt; i++) { + struct dso *dso = dsos->dsos[i]; + + dso->dsos = NULL; + dso__put(dso); } + zfree(&dsos->dsos); + dsos->cnt = 0; + dsos->allocated = 0; + dsos->sorted = true; + up_write(&dsos->lock); } @@ -46,9 +52,8 @@ static int __dsos__for_each_dso(struct dsos *dsos, int (*cb)(struct dso *dso, void *data), void *data) { - struct dso *dso; - - list_for_each_entry(dso, &dsos->head, node) { + for (unsigned int i = 0; i < dsos->cnt; i++) { + struct dso *dso = dsos->dsos[i]; int err; err = cb(dso, data); @@ -119,16 +124,47 @@ static int dso__cmp_short_name(struct dso *a, struct dso *b) return __dso__cmp_short_name(a->short_name, &a->id, b); } +static int dsos__cmp_long_name_id_short_name(const void *va, const void *vb) +{ + const struct dso *a = *((const struct dso **)va); + const struct dso *b = *((const struct dso **)vb); + int rc = strcmp(a->long_name, b->long_name); + + if (!rc) { + rc = dso_id__cmp(&a->id, &b->id); + if (!rc) + rc = strcmp(a->short_name, b->short_name); + } + return rc; +} + /* * Find a matching entry and/or link current entry to RB tree. * Either one of the dso or name parameter must be non-NULL or the * function will not work. */ -struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, - const char *name, struct dso_id *id) +struct dso *__dsos__findnew_link_by_longname_id(struct dsos *dsos, + struct dso *dso, + const char *name, + struct dso_id *id, + bool write_locked) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; + int low = 0, high = dsos->cnt - 1; + + if (!dsos->sorted) { + if (!write_locked) { + up_read(&dsos->lock); + down_write(&dsos->lock); + dso = __dsos__findnew_link_by_longname_id(dsos, dso, name, id, + /*write_locked=*/true); + up_write(&dsos->lock); + down_read(&dsos->lock); + return dso; + } + qsort(dsos->dsos, dsos->cnt, sizeof(struct dso *), + dsos__cmp_long_name_id_short_name); + dsos->sorted = true; + } if (!name) name = dso->long_name; @@ -136,11 +172,11 @@ struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso /* * Find node with the matching name */ - while (*p) { - struct dso *this = rb_entry(*p, struct dso, rb_node); + while (low <= high) { + int mid = (low + high) / 2; + struct dso *this = dsos->dsos[mid]; int rc = __dso__cmp_long_name(name, id, this); - parent = *p; if (rc == 0) { /* * In case the new DSO is a duplicate of an existing @@ -161,56 +197,53 @@ struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso } } if (rc < 0) - p = &parent->rb_left; + high = mid - 1; else - p = &parent->rb_right; - } - if (dso) { - /* Add new node and rebalance tree */ - rb_link_node(&dso->rb_node, parent, p); - rb_insert_color(&dso->rb_node, root); - dso->root = root; + low = mid + 1; } + if (dso) + __dsos__add(dsos, dso); return NULL; } -void __dsos__add(struct dsos *dsos, struct dso *dso) +int __dsos__add(struct dsos *dsos, struct dso *dso) { - list_add_tail(&dso->node, &dsos->head); - __dsos__findnew_link_by_longname_id(&dsos->root, dso, NULL, &dso->id); - /* - * It is now in the linked list, grab a reference, then garbage collect - * this when needing memory, by looking at LRU dso instances in the - * list with atomic_read(&dso->refcnt) == 1, i.e. no references - * anywhere besides the one for the list, do, under a lock for the - * list: remove it from the list, then a dso__put(), that probably will - * be the last and will then call dso__delete(), end of life. - * - * That, or at the end of the 'struct machine' lifetime, when all - * 'struct dso' instances will be removed from the list, in - * dsos__exit(), if they have no other reference from some other data - * structure. - * - * E.g.: after processing a 'perf.data' file and storing references - * to objects instantiated while processing events, we will have - * references to the 'thread', 'map', 'dso' structs all from 'struct - * hist_entry' instances, but we may not need anything not referenced, - * so we might as well call machines__exit()/machines__delete() and - * garbage collect it. - */ - dso__get(dso); + if (dsos->cnt == dsos->allocated) { + unsigned int to_allocate = 2; + struct dso **temp; + + if (dsos->allocated > 0) + to_allocate = dsos->allocated * 2; + temp = realloc(dsos->dsos, sizeof(struct dso *) * to_allocate); + if (!temp) + return -ENOMEM; + dsos->dsos = temp; + dsos->allocated = to_allocate; + } + dsos->dsos[dsos->cnt++] = dso__get(dso); + if (dsos->cnt >= 2 && dsos->sorted) { + dsos->sorted = dsos__cmp_long_name_id_short_name(&dsos->dsos[dsos->cnt - 2], + &dsos->dsos[dsos->cnt - 1]) + <= 0; + } + dso->dsos = dsos; + return 0; } -void dsos__add(struct dsos *dsos, struct dso *dso) +int dsos__add(struct dsos *dsos, struct dso *dso) { + int ret; + down_write(&dsos->lock); - __dsos__add(dsos, dso); + ret = __dsos__add(dsos, dso); up_write(&dsos->lock); + return ret; } -static struct dso *__dsos__findnew_by_longname_id(struct rb_root *root, const char *name, struct dso_id *id) +static struct dso *__dsos__findnew_by_longname_id(struct dsos *dsos, const char *name, + struct dso_id *id, bool write_locked) { - return __dsos__findnew_link_by_longname_id(root, NULL, name, id); + return __dsos__findnew_link_by_longname_id(dsos, NULL, name, id, write_locked); } struct dsos__find_id_cb_args { @@ -231,7 +264,8 @@ static int dsos__find_id_cb(struct dso *dso, void *data) } -static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, bool cmp_short) +static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, + bool cmp_short, bool write_locked) { struct dso *res; @@ -245,7 +279,7 @@ static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct d __dsos__for_each_dso(dsos, dsos__find_id_cb, &args); return args.res; } - res = __dsos__findnew_by_longname_id(&dsos->root, name, id); + res = __dsos__findnew_by_longname_id(dsos, name, id, write_locked); return res; } @@ -254,7 +288,7 @@ struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short) struct dso *res; down_read(&dsos->lock); - res = __dsos__find_id(dsos, name, NULL, cmp_short); + res = __dsos__find_id(dsos, name, NULL, cmp_short, /*write_locked=*/false); up_read(&dsos->lock); return res; } @@ -296,8 +330,13 @@ static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct struct dso *dso = dso__new_id(name, id); if (dso != NULL) { - __dsos__add(dsos, dso); + /* + * The dsos lock is held on entry, so rename the dso before + * adding it to avoid needing to take the dsos lock again to say + * the array isn't sorted. + */ dso__set_basename(dso); + __dsos__add(dsos, dso); } return dso; } @@ -309,10 +348,10 @@ struct dso *__dsos__addnew(struct dsos *dsos, const char *name) static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) { - struct dso *dso = __dsos__find_id(dsos, name, id, false); + struct dso *dso = __dsos__find_id(dsos, name, id, false, /*write_locked=*/true); if (dso && dso_id__empty(&dso->id) && !dso_id__empty(id)) - dso__inject_id(dso, id); + __dso__inject_id(dso, id); return dso ? dso : __dsos__addnew_id(dsos, name, id); } @@ -403,18 +442,27 @@ struct dso *dsos__findnew_module_dso(struct dsos *dsos, down_write(&dsos->lock); - dso = __dsos__find_id(dsos, m->name, NULL, /*cmp_short=*/true); + dso = __dsos__find_id(dsos, m->name, NULL, /*cmp_short=*/true, /*write_locked=*/true); + if (dso) { + up_write(&dsos->lock); + return dso; + } + /* + * Failed to find the dso so create it. Change the name before adding it + * to the array, to avoid unnecessary sorts and potential locking + * issues. + */ + dso = dso__new_id(m->name, /*id=*/NULL); if (!dso) { - dso = __dsos__addnew(dsos, m->name); - if (dso == NULL) - goto out_unlock; - - dso__set_module_info(dso, m, machine); - dso__set_long_name(dso, strdup(filename), true); - dso->kernel = DSO_SPACE__KERNEL; + up_write(&dsos->lock); + return NULL; } + dso__set_basename(dso); + dso__set_module_info(dso, m, machine); + dso__set_long_name(dso, strdup(filename), true); + dso->kernel = DSO_SPACE__KERNEL; + __dsos__add(dsos, dso); -out_unlock: up_write(&dsos->lock); return dso; } diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 50bd51523475..c1b3979ad4bd 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -14,20 +14,22 @@ struct kmod_path; struct machine; /* - * DSOs are put into both a list for fast iteration and rbtree for fast - * long name lookup. + * Collection of DSOs as an array for iteration speed, but sorted for O(n) + * lookup. */ struct dsos { - struct list_head head; - struct rb_root root; /* rbtree root sorted by long name */ struct rw_semaphore lock; + struct dso **dsos; + unsigned int cnt; + unsigned int allocated; + bool sorted; }; void dsos__init(struct dsos *dsos); void dsos__exit(struct dsos *dsos); -void __dsos__add(struct dsos *dsos, struct dso *dso); -void dsos__add(struct dsos *dsos, struct dso *dso); +int __dsos__add(struct dsos *dsos, struct dso *dso); +int dsos__add(struct dsos *dsos, struct dso *dso); struct dso *__dsos__addnew(struct dsos *dsos, const char *name); struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); @@ -35,8 +37,11 @@ struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id bool dsos__read_build_ids(struct dsos *dsos, bool with_hits); -struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, - const char *name, struct dso_id *id); +struct dso *__dsos__findnew_link_by_longname_id(struct dsos *dsos, + struct dso *dso, + const char *name, + struct dso_id *id, + bool write_locked); size_t dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm); -- cgit v1.2.3 From dfd48165bbf752e41dd51e77b92db3f848a4a99a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 4 May 2024 14:37:58 -0700 Subject: perf dsos: Remove __dsos__addnew() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function no longer used so remove. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Dima Kogan Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Tiezhu Yang Cc: Yanteng Si Cc: zhaimingbing Link: https://lore.kernel.org/r/20240504213803.218974-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dsos.c | 5 ----- tools/perf/util/dsos.h | 1 - 2 files changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index c01f0b538e65..89f87a197a44 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -341,11 +341,6 @@ static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct return dso; } -struct dso *__dsos__addnew(struct dsos *dsos, const char *name) -{ - return __dsos__addnew_id(dsos, name, NULL); -} - static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) { struct dso *dso = __dsos__find_id(dsos, name, id, false, /*write_locked=*/true); diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index c1b3979ad4bd..d1497b11d64c 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -30,7 +30,6 @@ void dsos__exit(struct dsos *dsos); int __dsos__add(struct dsos *dsos, struct dso *dso); int dsos__add(struct dsos *dsos, struct dso *dso); -struct dso *__dsos__addnew(struct dsos *dsos, const char *name); struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); -- cgit v1.2.3 From 7410d6008d28d65bead6aa85909e6915f2c8fc61 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 4 May 2024 14:37:59 -0700 Subject: perf dsos: Remove __dsos__findnew_link_by_longname_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function was only called in dsos.c with the dso parameter as NULL. Remove the function and specialize for the dso being NULL case removing other unused functions along the way. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Dima Kogan Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Tiezhu Yang Cc: Yanteng Si Cc: zhaimingbing Link: https://lore.kernel.org/r/20240504213803.218974-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dsos.c | 51 ++++++++++---------------------------------------- tools/perf/util/dsos.h | 6 ------ 2 files changed, 10 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index 89f87a197a44..e2ccceac7619 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -119,11 +119,6 @@ static int __dso__cmp_short_name(const char *short_name, struct dso_id *id, stru return rc ?: dso_id__cmp(id, &b->id); } -static int dso__cmp_short_name(struct dso *a, struct dso *b) -{ - return __dso__cmp_short_name(a->short_name, &a->id, b); -} - static int dsos__cmp_long_name_id_short_name(const void *va, const void *vb) { const struct dso *a = *((const struct dso **)va); @@ -143,20 +138,21 @@ static int dsos__cmp_long_name_id_short_name(const void *va, const void *vb) * Either one of the dso or name parameter must be non-NULL or the * function will not work. */ -struct dso *__dsos__findnew_link_by_longname_id(struct dsos *dsos, - struct dso *dso, - const char *name, - struct dso_id *id, - bool write_locked) +static struct dso *__dsos__find_by_longname_id(struct dsos *dsos, + const char *name, + struct dso_id *id, + bool write_locked) { int low = 0, high = dsos->cnt - 1; if (!dsos->sorted) { if (!write_locked) { + struct dso *dso; + up_read(&dsos->lock); down_write(&dsos->lock); - dso = __dsos__findnew_link_by_longname_id(dsos, dso, name, id, - /*write_locked=*/true); + dso = __dsos__find_by_longname_id(dsos, name, id, + /*write_locked=*/true); up_write(&dsos->lock); down_read(&dsos->lock); return dso; @@ -166,9 +162,6 @@ struct dso *__dsos__findnew_link_by_longname_id(struct dsos *dsos, dsos->sorted = true; } - if (!name) - name = dso->long_name; - /* * Find node with the matching name */ @@ -178,31 +171,13 @@ struct dso *__dsos__findnew_link_by_longname_id(struct dsos *dsos, int rc = __dso__cmp_long_name(name, id, this); if (rc == 0) { - /* - * In case the new DSO is a duplicate of an existing - * one, print a one-time warning & put the new entry - * at the end of the list of duplicates. - */ - if (!dso || (dso == this)) - return dso__get(this); /* Find matching dso */ - /* - * The core kernel DSOs may have duplicated long name. - * In this case, the short name should be different. - * Comparing the short names to differentiate the DSOs. - */ - rc = dso__cmp_short_name(dso, this); - if (rc == 0) { - pr_err("Duplicated dso name: %s\n", name); - return NULL; - } + return dso__get(this); /* Find matching dso */ } if (rc < 0) high = mid - 1; else low = mid + 1; } - if (dso) - __dsos__add(dsos, dso); return NULL; } @@ -240,12 +215,6 @@ int dsos__add(struct dsos *dsos, struct dso *dso) return ret; } -static struct dso *__dsos__findnew_by_longname_id(struct dsos *dsos, const char *name, - struct dso_id *id, bool write_locked) -{ - return __dsos__findnew_link_by_longname_id(dsos, NULL, name, id, write_locked); -} - struct dsos__find_id_cb_args { const char *name; struct dso_id *id; @@ -279,7 +248,7 @@ static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct d __dsos__for_each_dso(dsos, dsos__find_id_cb, &args); return args.res; } - res = __dsos__findnew_by_longname_id(dsos, name, id, write_locked); + res = __dsos__find_by_longname_id(dsos, name, id, write_locked); return res; } diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index d1497b11d64c..6c13b65648bc 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -36,12 +36,6 @@ struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id bool dsos__read_build_ids(struct dsos *dsos, bool with_hits); -struct dso *__dsos__findnew_link_by_longname_id(struct dsos *dsos, - struct dso *dso, - const char *name, - struct dso_id *id, - bool write_locked); - size_t dsos__fprintf_buildid(struct dsos *dsos, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm); size_t dsos__fprintf(struct dsos *dsos, FILE *fp); -- cgit v1.2.3 From 7a9418cf7f05a74cbc9d4c750ee1bfddaa11f121 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 4 May 2024 14:38:00 -0700 Subject: perf dsos: Switch hand crafted code to bsearch() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch to using the bsearch library function rather than having a hand written binary search. Const-ify some static functions to avoid compiler warnings. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Dima Kogan Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Tiezhu Yang Cc: Yanteng Si Cc: zhaimingbing Link: https://lore.kernel.org/r/20240504213803.218974-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dsos.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index e2ccceac7619..2e4e86dc6c17 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -107,13 +107,15 @@ bool dsos__read_build_ids(struct dsos *dsos, bool with_hits) return args.have_build_id; } -static int __dso__cmp_long_name(const char *long_name, struct dso_id *id, struct dso *b) +static int __dso__cmp_long_name(const char *long_name, const struct dso_id *id, + const struct dso *b) { int rc = strcmp(long_name, b->long_name); return rc ?: dso_id__cmp(id, &b->id); } -static int __dso__cmp_short_name(const char *short_name, struct dso_id *id, struct dso *b) +static int __dso__cmp_short_name(const char *short_name, const struct dso_id *id, + const struct dso *b) { int rc = strcmp(short_name, b->short_name); return rc ?: dso_id__cmp(id, &b->id); @@ -133,6 +135,19 @@ static int dsos__cmp_long_name_id_short_name(const void *va, const void *vb) return rc; } +struct dsos__key { + const char *long_name; + const struct dso_id *id; +}; + +static int dsos__cmp_key_long_name_id(const void *vkey, const void *vdso) +{ + const struct dsos__key *key = vkey; + const struct dso *dso = *((const struct dso **)vdso); + + return __dso__cmp_long_name(key->long_name, key->id, dso); +} + /* * Find a matching entry and/or link current entry to RB tree. * Either one of the dso or name parameter must be non-NULL or the @@ -143,7 +158,11 @@ static struct dso *__dsos__find_by_longname_id(struct dsos *dsos, struct dso_id *id, bool write_locked) { - int low = 0, high = dsos->cnt - 1; + struct dsos__key key = { + .long_name = name, + .id = id, + }; + struct dso **res; if (!dsos->sorted) { if (!write_locked) { @@ -162,23 +181,12 @@ static struct dso *__dsos__find_by_longname_id(struct dsos *dsos, dsos->sorted = true; } - /* - * Find node with the matching name - */ - while (low <= high) { - int mid = (low + high) / 2; - struct dso *this = dsos->dsos[mid]; - int rc = __dso__cmp_long_name(name, id, this); + res = bsearch(&key, dsos->dsos, dsos->cnt, sizeof(struct dso *), + dsos__cmp_key_long_name_id); + if (!res) + return NULL; - if (rc == 0) { - return dso__get(this); /* Find matching dso */ - } - if (rc < 0) - high = mid - 1; - else - low = mid + 1; - } - return NULL; + return dso__get(*res); } int __dsos__add(struct dsos *dsos, struct dso *dso) -- cgit v1.2.3 From 27021649ec88cf9aa14d2ac7e7f2e6789f055978 Mon Sep 17 00:00:00 2001 From: Yoann Congal Date: Sun, 5 May 2024 10:03:43 +0200 Subject: printk: Remove redundant CONFIG_BASE_FULL CONFIG_BASE_FULL is equivalent to !CONFIG_BASE_SMALL and is enabled by default: CONFIG_BASE_SMALL is the special case to take care of. So, remove CONFIG_BASE_FULL and move the config choice to CONFIG_BASE_SMALL (which defaults to 'n') For defconfigs explicitely disabling BASE_FULL, explicitely enable BASE_SMALL. For defconfigs explicitely enabling BASE_FULL, drop it as it is the default. Signed-off-by: Yoann Congal Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240505080343.1471198-4-yoann.congal@smile.fr Signed-off-by: Petr Mladek --- arch/arm/configs/collie_defconfig | 2 +- arch/arm/configs/keystone_defconfig | 2 +- arch/arm/configs/lpc18xx_defconfig | 2 +- arch/arm/configs/moxart_defconfig | 2 +- arch/arm/configs/mps2_defconfig | 2 +- arch/arm/configs/omap1_defconfig | 2 +- arch/arm/configs/stm32_defconfig | 2 +- arch/microblaze/configs/mmu_defconfig | 2 +- arch/mips/configs/rs90_defconfig | 2 +- arch/powerpc/configs/adder875_defconfig | 2 +- arch/powerpc/configs/ep88xc_defconfig | 2 +- arch/powerpc/configs/mpc866_ads_defconfig | 2 +- arch/powerpc/configs/mpc885_ads_defconfig | 2 +- arch/powerpc/configs/tqm8xx_defconfig | 2 +- arch/riscv/configs/nommu_k210_defconfig | 2 +- arch/riscv/configs/nommu_k210_sdcard_defconfig | 2 +- arch/riscv/configs/nommu_virt_defconfig | 2 +- arch/sh/configs/edosk7705_defconfig | 2 +- arch/sh/configs/se7619_defconfig | 2 +- arch/sh/configs/se7712_defconfig | 2 +- arch/sh/configs/se7721_defconfig | 2 +- arch/sh/configs/shmin_defconfig | 2 +- init/Kconfig | 10 +++------- tools/testing/selftests/wireguard/qemu/kernel.config | 1 - 24 files changed, 25 insertions(+), 30 deletions(-) (limited to 'tools') diff --git a/arch/arm/configs/collie_defconfig b/arch/arm/configs/collie_defconfig index 01b5a5a73f03..42cb1c854118 100644 --- a/arch/arm/configs/collie_defconfig +++ b/arch/arm/configs/collie_defconfig @@ -3,7 +3,7 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_EXPERT=y -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_EPOLL is not set CONFIG_ARCH_MULTI_V4=y # CONFIG_ARCH_MULTI_V7 is not set diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig index 59c4835ffc97..c1291ca290b2 100644 --- a/arch/arm/configs/keystone_defconfig +++ b/arch/arm/configs/keystone_defconfig @@ -12,7 +12,7 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_BLK_DEV_INITRD=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EXPERT=y CONFIG_PROFILING=y diff --git a/arch/arm/configs/lpc18xx_defconfig b/arch/arm/configs/lpc18xx_defconfig index d169da9b2824..f55c231e0870 100644 --- a/arch/arm/configs/lpc18xx_defconfig +++ b/arch/arm/configs/lpc18xx_defconfig @@ -8,7 +8,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_RD_LZ4 is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_UID16 is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig index 1d41e73f4903..34d079e03b3c 100644 --- a/arch/arm/configs/moxart_defconfig +++ b/arch/arm/configs/moxart_defconfig @@ -6,7 +6,7 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SIGNALFD is not set # CONFIG_TIMERFD is not set # CONFIG_EVENTFD is not set diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig index 3ed73f184d83..e995e50537ef 100644 --- a/arch/arm/configs/mps2_defconfig +++ b/arch/arm/configs/mps2_defconfig @@ -5,7 +5,7 @@ CONFIG_LOG_BUF_SHIFT=16 CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EXPERT=y # CONFIG_UID16 is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig index 729ea8157e2a..025b595dd837 100644 --- a/arch/arm/configs/omap1_defconfig +++ b/arch/arm/configs/omap1_defconfig @@ -9,7 +9,7 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SHMEM is not set # CONFIG_KALLSYMS is not set CONFIG_PROFILING=y diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig index b9fe3fbed5ae..3baec075d1ef 100644 --- a/arch/arm/configs/stm32_defconfig +++ b/arch/arm/configs/stm32_defconfig @@ -6,7 +6,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EXPERT=y # CONFIG_UID16 is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 4da7bc4ac4a3..176314f3c9aa 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -4,7 +4,7 @@ CONFIG_AUDIT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_EXPERT=y -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y CONFIG_KALLSYMS_ALL=y CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 diff --git a/arch/mips/configs/rs90_defconfig b/arch/mips/configs/rs90_defconfig index 4b9e36d6400e..a53dd66e9b86 100644 --- a/arch/mips/configs/rs90_defconfig +++ b/arch/mips/configs/rs90_defconfig @@ -9,7 +9,7 @@ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y # CONFIG_SGETMASK_SYSCALL is not set # CONFIG_SYSFS_SYSCALL is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_TIMERFD is not set # CONFIG_AIO is not set # CONFIG_IO_URING is not set diff --git a/arch/powerpc/configs/adder875_defconfig b/arch/powerpc/configs/adder875_defconfig index 7f35d5bc1229..97f4d4851735 100644 --- a/arch/powerpc/configs/adder875_defconfig +++ b/arch/powerpc/configs/adder875_defconfig @@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/powerpc/configs/ep88xc_defconfig b/arch/powerpc/configs/ep88xc_defconfig index a98ef6a4abef..50cc59eb36cf 100644 --- a/arch/powerpc/configs/ep88xc_defconfig +++ b/arch/powerpc/configs/ep88xc_defconfig @@ -6,7 +6,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/powerpc/configs/mpc866_ads_defconfig b/arch/powerpc/configs/mpc866_ads_defconfig index 5c56d36cdfc5..6f449411abf7 100644 --- a/arch/powerpc/configs/mpc866_ads_defconfig +++ b/arch/powerpc/configs/mpc866_ads_defconfig @@ -6,7 +6,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_BUG is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_EPOLL is not set # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index 56b876e418e9..77306be62e9e 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -7,7 +7,7 @@ CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/powerpc/configs/tqm8xx_defconfig b/arch/powerpc/configs/tqm8xx_defconfig index 083c2e57520a..383c0966e92f 100644 --- a/arch/powerpc/configs/tqm8xx_defconfig +++ b/arch/powerpc/configs/tqm8xx_defconfig @@ -6,7 +6,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_MODULES=y diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index 7e75200543f4..c486d7aae01a 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_SYSFS_SYSCALL is not set # CONFIG_FHANDLE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig index 0ba353e9ca71..9da8afa01ba4 100644 --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -3,7 +3,7 @@ CONFIG_LOG_BUF_SHIFT=13 CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_SYSFS_SYSCALL is not set # CONFIG_FHANDLE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index b794e2f8144e..ab6d618c1828 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -10,7 +10,7 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set # CONFIG_FHANDLE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set # CONFIG_TIMERFD is not set diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig index 9ee35269bee2..ab3bf72264df 100644 --- a/arch/sh/configs/edosk7705_defconfig +++ b/arch/sh/configs/edosk7705_defconfig @@ -6,7 +6,7 @@ # CONFIG_PRINTK is not set # CONFIG_BUG is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig index 14d0f5ead502..4765966fec99 100644 --- a/arch/sh/configs/se7619_defconfig +++ b/arch/sh/configs/se7619_defconfig @@ -4,7 +4,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index dc854293da43..20f07aee5bde 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -7,7 +7,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_KALLSYMS_ALL=y # CONFIG_BUG is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SHMEM is not set CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index c891945b8a90..00862d3c030d 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -7,7 +7,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_KALLSYMS_ALL=y # CONFIG_BUG is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_SHMEM is not set CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig index e078b193a78a..bfeb004f130e 100644 --- a/arch/sh/configs/shmin_defconfig +++ b/arch/sh/configs/shmin_defconfig @@ -5,7 +5,7 @@ CONFIG_LOG_BUF_SHIFT=14 # CONFIG_HOTPLUG is not set # CONFIG_BUG is not set # CONFIG_ELF_CORE is not set -# CONFIG_BASE_FULL is not set +CONFIG_BASE_SMALL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set diff --git a/init/Kconfig b/init/Kconfig index 4b517523d566..52e16eb6221b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1585,11 +1585,10 @@ config PCSPKR_PLATFORM This option allows to disable the internal PC-Speaker support, saving some memory. -config BASE_FULL - default y - bool "Enable full-sized data structures for core" if EXPERT +config BASE_SMALL + bool "Enable smaller-sized data structures for core" if EXPERT help - Disabling this option reduces the size of miscellaneous core + Enabling this option reduces the size of miscellaneous core kernel data structures. This saves memory on small machines, but may reduce performance. @@ -1944,9 +1943,6 @@ config RT_MUTEXES bool default y if PREEMPT_RT -config BASE_SMALL - def_bool !BASE_FULL - config MODULE_SIG_FORMAT def_bool n select SYSTEM_DATA_VERIFICATION diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 507555714b1d..f314d3789f17 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -41,7 +41,6 @@ CONFIG_KALLSYMS=y CONFIG_BUG=y CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y CONFIG_JUMP_LABEL=y -CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_SHMEM=y CONFIG_SLUB=y -- cgit v1.2.3 From ee756ef7491eafd70f390343a1d90930af125a51 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 4 May 2024 14:38:01 -0700 Subject: perf dso: Add reference count checking and accessor functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add reference count checking to struct dso, this can help with implementing correct reference counting discipline. To avoid RC_CHK_ACCESS everywhere, add accessor functions for the variables in struct dso. The majority of the change is mechanical in nature and not easy to split up. Committer testing: 'perf test' up to this patch shows no regressions. But: util/symbol.c: In function ‘dso__load_bfd_symbols’: util/symbol.c:1683:9: error: too few arguments to function ‘dso__set_adjust_symbols’ 1683 | dso__set_adjust_symbols(dso); | ^~~~~~~~~~~~~~~~~~~~~~~ In file included from util/symbol.c:21: util/dso.h:268:20: note: declared here 268 | static inline void dso__set_adjust_symbols(struct dso *dso, bool val) | ^~~~~~~~~~~~~~~~~~~~~~~ make[6]: *** [/home/acme/git/perf-tools-next/tools/build/Makefile.build:106: /tmp/tmp.ZWHbQftdN6/util/symbol.o] Error 1 MKDIR /tmp/tmp.ZWHbQftdN6/tests/workloads/ make[6]: *** Waiting for unfinished jobs.... This was updated: - symbols__fixup_end(&dso->symbols, false); - symbols__fixup_duplicate(&dso->symbols); - dso->adjust_symbols = 1; + symbols__fixup_end(dso__symbols(dso), false); + symbols__fixup_duplicate(dso__symbols(dso)); + dso__set_adjust_symbols(dso); But not build tested with BUILD_NONDISTRO and libbfd devel files installed (binutils-devel on fedora). Add the missing argument: symbols__fixup_end(dso__symbols(dso), false); symbols__fixup_duplicate(dso__symbols(dso)); - dso__set_adjust_symbols(dso); + dso__set_adjust_symbols(dso, true); Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Changbin Du Cc: Chengen Du Cc: Colin Ian King Cc: Dima Kogan Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Thomas Richter Cc: Tiezhu Yang Cc: Yanteng Si Cc: zhaimingbing Link: https://lore.kernel.org/r/20240504213803.218974-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 6 +- tools/perf/builtin-buildid-cache.c | 2 +- tools/perf/builtin-buildid-list.c | 18 +- tools/perf/builtin-inject.c | 71 ++- tools/perf/builtin-kallsyms.c | 2 +- tools/perf/builtin-mem.c | 4 +- tools/perf/builtin-report.c | 6 +- tools/perf/builtin-script.c | 8 +- tools/perf/builtin-top.c | 4 +- tools/perf/builtin-trace.c | 2 +- tools/perf/tests/code-reading.c | 8 +- tools/perf/tests/dso-data.c | 11 +- tools/perf/tests/hists_common.c | 6 +- tools/perf/tests/hists_cumulate.c | 4 +- tools/perf/tests/hists_output.c | 2 +- tools/perf/tests/maps.c | 4 +- tools/perf/tests/symbols.c | 8 +- tools/perf/tests/vmlinux-kallsyms.c | 6 +- tools/perf/ui/browsers/annotate.c | 6 +- tools/perf/ui/browsers/hists.c | 8 +- tools/perf/ui/browsers/map.c | 4 +- tools/perf/util/annotate-data.c | 18 +- tools/perf/util/annotate.c | 17 +- tools/perf/util/auxtrace.c | 2 +- tools/perf/util/block-info.c | 2 +- tools/perf/util/bpf-event.c | 8 +- tools/perf/util/build-id.c | 38 +- tools/perf/util/callchain.c | 2 +- tools/perf/util/data-convert-json.c | 2 +- tools/perf/util/db-export.c | 6 +- tools/perf/util/disasm.c | 40 +- tools/perf/util/dlfilter.c | 12 +- tools/perf/util/dso.c | 368 ++++++++-------- tools/perf/util/dso.h | 488 +++++++++++++++++++-- tools/perf/util/dsos.c | 54 +-- tools/perf/util/event.c | 8 +- tools/perf/util/header.c | 8 +- tools/perf/util/hist.c | 4 +- tools/perf/util/intel-pt.c | 22 +- tools/perf/util/machine.c | 46 +- tools/perf/util/map.c | 77 ++-- tools/perf/util/maps.c | 14 +- tools/perf/util/print_insn.c | 2 +- tools/perf/util/probe-event.c | 25 +- .../perf/util/scripting-engines/trace-event-perl.c | 6 +- .../util/scripting-engines/trace-event-python.c | 21 +- tools/perf/util/sort.c | 19 +- tools/perf/util/srcline.c | 65 +-- tools/perf/util/symbol-elf.c | 94 ++-- tools/perf/util/symbol-minimal.c | 4 +- tools/perf/util/symbol.c | 186 ++++---- tools/perf/util/symbol_fprintf.c | 4 +- tools/perf/util/synthetic-events.c | 24 +- tools/perf/util/thread.c | 4 +- tools/perf/util/unwind-libunwind-local.c | 18 +- tools/perf/util/unwind-libunwind.c | 2 +- tools/perf/util/vdso.c | 8 +- 57 files changed, 1169 insertions(+), 739 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 83812b9d5363..50d2fb222d48 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -219,7 +219,7 @@ static int process_branch_callback(struct evsel *evsel, } if (a.map != NULL) - map__dso(a.map)->hit = 1; + dso__set_hit(map__dso(a.map)); hist__account_cycles(sample->branch_stack, al, sample, false, NULL); @@ -254,7 +254,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample, if (al->sym != NULL) { struct dso *dso = map__dso(al->map); - rb_erase_cached(&al->sym->rb_node, &dso->symbols); + rb_erase_cached(&al->sym->rb_node, dso__symbols(dso)); symbol__delete(al->sym); dso__reset_find_symbol_cache(dso); } @@ -419,7 +419,7 @@ static void hists__find_annotations(struct hists *hists, struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); struct annotation *notes; - if (he->ms.sym == NULL || map__dso(he->ms.map)->annotate_warned) + if (he->ms.sym == NULL || dso__annotate_warned(map__dso(he->ms.map))) goto find_next; if (ann->sym_hist_filter && diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index e2a40f1d9225..b0511d16aeb6 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -286,7 +286,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused) pr_warning("Problems with %s file, consider removing it from the cache\n", filename); - } else if (memcmp(dso->bid.data, bid.data, bid.size)) { + } else if (memcmp(dso__bid(dso)->data, bid.data, bid.size)) { pr_warning("Problems with %s file, consider removing it from the cache\n", filename); } diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index c9037477865a..383d5de36ce4 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -26,16 +26,18 @@ static int buildid__map_cb(struct map *map, void *arg __maybe_unused) { const struct dso *dso = map__dso(map); char bid_buf[SBUILD_ID_SIZE]; + const char *dso_long_name = dso__long_name(dso); + const char *dso_short_name = dso__short_name(dso); memset(bid_buf, 0, sizeof(bid_buf)); - if (dso->has_build_id) - build_id__sprintf(&dso->bid, bid_buf); + if (dso__has_build_id(dso)) + build_id__sprintf(dso__bid_const(dso), bid_buf); printf("%s %16" PRIx64 " %16" PRIx64, bid_buf, map__start(map), map__end(map)); - if (dso->long_name != NULL) { - printf(" %s", dso->long_name); - } else if (dso->short_name != NULL) { - printf(" %s", dso->short_name); - } + if (dso_long_name != NULL) + printf(" %s", dso_long_name); + else if (dso_short_name != NULL) + printf(" %s", dso_short_name); + printf("\n"); return 0; @@ -76,7 +78,7 @@ static int filename__fprintf_build_id(const char *name, FILE *fp) static bool dso__skip_buildid(struct dso *dso, int with_hits) { - return with_hits && !dso->hit; + return with_hits && !dso__hit(dso); } static int perf_session__list_build_ids(bool force, bool with_hits) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index ce5e28eaad90..a212678d47be 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -445,10 +445,9 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, } if (dso) { - mutex_lock(&dso->lock); - nsinfo__put(dso->nsinfo); - dso->nsinfo = nsi; - mutex_unlock(&dso->lock); + mutex_lock(dso__lock(dso)); + dso__set_nsinfo(dso, nsi); + mutex_unlock(dso__lock(dso)); } else nsinfo__put(nsi); @@ -466,8 +465,8 @@ static int perf_event__repipe_buildid_mmap(struct perf_tool *tool, dso = findnew_dso(event->mmap.pid, event->mmap.tid, event->mmap.filename, NULL, machine); - if (dso && !dso->hit) { - dso->hit = 1; + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); } dso__put(dso); @@ -492,7 +491,7 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool, event->mmap2.filename, NULL, machine); if (dso) { /* mark it not to inject build-id */ - dso->hit = 1; + dso__set_hit(dso); } dso__put(dso); } @@ -544,7 +543,7 @@ static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, event->mmap2.filename, NULL, machine); if (dso) { /* mark it not to inject build-id */ - dso->hit = 1; + dso__set_hit(dso); } dso__put(dso); perf_event__repipe(tool, event, sample, machine); @@ -554,8 +553,8 @@ static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, event->mmap2.filename, &dso_id, machine); - if (dso && !dso->hit) { - dso->hit = 1; + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); dso__inject_build_id(dso, tool, machine, sample->cpumode, event->mmap2.flags); } @@ -631,24 +630,24 @@ static int dso__read_build_id(struct dso *dso) { struct nscookie nsc; - if (dso->has_build_id) + if (dso__has_build_id(dso)) return 0; - mutex_lock(&dso->lock); - nsinfo__mountns_enter(dso->nsinfo, &nsc); - if (filename__read_build_id(dso->long_name, &dso->bid) > 0) - dso->has_build_id = true; - else if (dso->nsinfo) { - char *new_name = dso__filename_with_chroot(dso, dso->long_name); + mutex_lock(dso__lock(dso)); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + if (filename__read_build_id(dso__long_name(dso), dso__bid(dso)) > 0) + dso__set_has_build_id(dso); + else if (dso__nsinfo(dso)) { + char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); - if (new_name && filename__read_build_id(new_name, &dso->bid) > 0) - dso->has_build_id = true; + if (new_name && filename__read_build_id(new_name, dso__bid(dso)) > 0) + dso__set_has_build_id(dso); free(new_name); } nsinfo__mountns_exit(&nsc); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); - return dso->has_build_id ? 0 : -1; + return dso__has_build_id(dso) ? 0 : -1; } static struct strlist *perf_inject__parse_known_build_ids( @@ -700,14 +699,14 @@ static bool perf_inject__lookup_known_build_id(struct perf_inject *inject, dso_name = strchr(build_id, ' '); bid_len = dso_name - pos->s; dso_name = skip_spaces(dso_name); - if (strcmp(dso->long_name, dso_name)) + if (strcmp(dso__long_name(dso), dso_name)) continue; for (int ix = 0; 2 * ix + 1 < bid_len; ++ix) { - dso->bid.data[ix] = (hex(build_id[2 * ix]) << 4 | - hex(build_id[2 * ix + 1])); + dso__bid(dso)->data[ix] = (hex(build_id[2 * ix]) << 4 | + hex(build_id[2 * ix + 1])); } - dso->bid.size = bid_len / 2; - dso->has_build_id = 1; + dso__bid(dso)->size = bid_len / 2; + dso__set_has_build_id(dso); return true; } return false; @@ -720,9 +719,9 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, tool); int err; - if (is_anon_memory(dso->long_name) || flags & MAP_HUGETLB) + if (is_anon_memory(dso__long_name(dso)) || flags & MAP_HUGETLB) return 0; - if (is_no_dso_memory(dso->long_name)) + if (is_no_dso_memory(dso__long_name(dso))) return 0; if (inject->known_build_ids != NULL && @@ -730,14 +729,14 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, return 1; if (dso__read_build_id(dso) < 0) { - pr_debug("no build_id found for %s\n", dso->long_name); + pr_debug("no build_id found for %s\n", dso__long_name(dso)); return -1; } err = perf_event__synthesize_build_id(tool, dso, cpumode, perf_event__repipe, machine); if (err) { - pr_err("Can't synthesize build_id event for %s\n", dso->long_name); + pr_err("Can't synthesize build_id event for %s\n", dso__long_name(dso)); return -1; } @@ -763,8 +762,8 @@ int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, if (thread__find_map(thread, sample->cpumode, sample->ip, &al)) { struct dso *dso = map__dso(al.map); - if (!dso->hit) { - dso->hit = 1; + if (!dso__hit(dso)) { + dso__set_hit(dso); dso__inject_build_id(dso, tool, machine, sample->cpumode, map__flags(al.map)); } @@ -1146,8 +1145,8 @@ static bool dso__is_in_kernel_space(struct dso *dso) return false; return dso__is_kcore(dso) || - dso->kernel || - is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); + dso__kernel(dso) || + is_kernel_module(dso__long_name(dso), PERF_RECORD_MISC_CPUMODE_UNKNOWN); } static u64 evlist__first_id(struct evlist *evlist) @@ -1181,7 +1180,7 @@ static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_ if (!machine) return -ENOMEM; - dso->hit = 1; + dso__set_hit(dso); return perf_event__synthesize_build_id(&inject->tool, dso, cpumode, process_build_id, machine); @@ -1192,7 +1191,7 @@ static int guest_session__add_build_ids_cb(struct dso *dso, void *data) struct guest_session *gs = data; struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) return 0; return synthesize_build_id(inject, dso, gs->machine_pid); diff --git a/tools/perf/builtin-kallsyms.c b/tools/perf/builtin-kallsyms.c index 7f75c5b73f26..a3c2ffdc1af8 100644 --- a/tools/perf/builtin-kallsyms.c +++ b/tools/perf/builtin-kallsyms.c @@ -38,7 +38,7 @@ static int __cmd_kallsyms(int argc, const char **argv) dso = map__dso(map); printf("%s: %s %s %#" PRIx64 "-%#" PRIx64 " (%#" PRIx64 "-%#" PRIx64")\n", - symbol->name, dso->short_name, dso->long_name, + symbol->name, dso__short_name(dso), dso__long_name(dso), map__unmap_ip(map, symbol->start), map__unmap_ip(map, symbol->end), symbol->start, symbol->end); } diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 5b851e64e4a1..863fcd735dae 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -213,7 +213,7 @@ dump_raw_samples(struct perf_tool *tool, if (al.map != NULL) { dso = map__dso(al.map); if (dso) - dso->hit = 1; + dso__set_hit(dso); } field_sep = symbol_conf.field_sep; @@ -255,7 +255,7 @@ dump_raw_samples(struct perf_tool *tool, symbol_conf.field_sep, sample->data_src, symbol_conf.field_sep, - dso ? dso->long_name : "???", + dso ? dso__long_name(dso) : "???", al.sym ? al.sym->name : "???"); out_put: addr_location__exit(&al); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dafba6e030ef..b5525f4f7090 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -322,7 +322,7 @@ static int process_sample_event(struct perf_tool *tool, } if (al.map != NULL) - map__dso(al.map)->hit = 1; + dso__set_hit(map__dso(al.map)); if (ui__has_annotation() || rep->symbol_ipc || rep->total_cycles_mode) { hist__account_cycles(sample->branch_stack, &al, sample, @@ -609,7 +609,7 @@ static void report__warn_kptr_restrict(const struct report *rep) return; if (kernel_map == NULL || - (map__dso(kernel_map)->hit && + (dso__hit(map__dso(kernel_map)) && (kernel_kmap->ref_reloc_sym == NULL || kernel_kmap->ref_reloc_sym->addr == 0))) { const char *desc = @@ -850,7 +850,7 @@ static int maps__fprintf_task_cb(struct map *map, void *data) prot & PROT_EXEC ? 'x' : '-', map__flags(map) ? 's' : 'p', map__pgoff(map), - dso->id.ino, dso->name); + dso__id_const(dso)->ino, dso__name(dso)); if (ret < 0) return ret; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 647cb31a47c8..f7c3c3868c3c 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1017,11 +1017,11 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, to = entries[i].to; if (thread__find_map_fb(thread, sample->cpumode, from, &alf) && - !map__dso(alf.map)->adjust_symbols) + !dso__adjust_symbols(map__dso(alf.map))) from = map__dso_map_ip(alf.map, from); if (thread__find_map_fb(thread, sample->cpumode, to, &alt) && - !map__dso(alt.map)->adjust_symbols) + !dso__adjust_symbols(map__dso(alt.map))) to = map__dso_map_ip(alt.map, to); printed += fprintf(fp, " 0x%"PRIx64, from); @@ -1082,7 +1082,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); goto out; } - if (dso->data.status == DSO_DATA_STATUS_ERROR) { + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) { pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); goto out; } @@ -1094,7 +1094,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, len = dso__data_read_offset(dso, machine, offset, (u8 *)buffer, end - start + MAXINSN); - *is64bit = dso->is_64_bit; + *is64bit = dso__is_64_bit(dso); if (len <= 0) pr_debug("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n", start, end); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 5ac6dcc64cef..1d6aef51c122 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -129,7 +129,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) /* * We can't annotate with just /proc/kallsyms */ - if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { + if (dso__symtab_type(dso) == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { pr_err("Can't annotate %s: No vmlinux file was found in the " "path\n", sym->name); sleep(1); @@ -182,7 +182,7 @@ static void ui__warn_map_erange(struct map *map, struct symbol *sym, u64 ip) "Tools: %s\n\n" "Not all samples will be on the annotation output.\n\n" "Please report to linux-kernel@vger.kernel.org\n", - ip, dso->long_name, dso__symtab_origin(dso), + ip, dso__long_name(dso), dso__symtab_origin(dso), map__start(map), map__end(map), sym->start, sym->end, sym->binding == STB_GLOBAL ? 'g' : sym->binding == STB_LOCAL ? 'l' : 'w', sym->name, diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 3818d3a62779..51eca671c797 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2917,7 +2917,7 @@ static void print_location(FILE *f, struct perf_sample *sample, { if ((verbose > 0 || print_dso) && al->map) - fprintf(f, "%s@", map__dso(al->map)->long_name); + fprintf(f, "%s@", dso__long_name(map__dso(al->map))); if ((verbose > 0 || print_sym) && al->sym) fprintf(f, "%s+0x%" PRIx64, al->sym->name, diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 29d2f3ee4e10..27c82cfb7e7d 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -253,9 +253,9 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, goto out; } dso = map__dso(al.map); - pr_debug("File is: %s\n", dso->long_name); + pr_debug("File is: %s\n", dso__long_name(dso)); - if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { + if (dso__symtab_type(dso) == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { pr_debug("Unexpected kernel address - skipping\n"); goto out; } @@ -274,7 +274,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, * modules to manage long jumps. Check if the ip offset falls in stubs * sections for kernel modules. And skip module address after text end */ - if (dso->is_kmod && al.addr > dso->text_end) { + if (dso__is_kmod(dso) && al.addr > dso__text_end(dso)) { pr_debug("skipping the module address %#"PRIx64" after text end\n", al.addr); goto out; } @@ -315,7 +315,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, state->done[state->done_cnt++] = map__start(al.map); } - objdump_name = dso->long_name; + objdump_name = dso__long_name(dso); if (dso__needs_decompress(dso)) { if (dso__decompress_kmodule_path(dso, objdump_name, decomp_name, diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index 2d67422c1222..fde4eca84b6f 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -228,7 +228,8 @@ static void dsos__delete(int cnt) for (i = 0; i < cnt; i++) { struct dso *dso = dsos[i]; - unlink(dso->name); + dso__data_close(dso); + unlink(dso__name(dso)); dso__put(dso); } @@ -289,14 +290,14 @@ static int test__dso_data_cache(struct test_suite *test __maybe_unused, int subt } /* verify the first one is already open */ - TEST_ASSERT_VAL("dsos[0] is not open", dsos[0]->data.fd != -1); + TEST_ASSERT_VAL("dsos[0] is not open", dso__data(dsos[0])->fd != -1); /* open +1 dso to reach the allowed limit */ fd = dso__data_fd(dsos[i], &machine); TEST_ASSERT_VAL("failed to get fd", fd > 0); /* should force the first one to be closed */ - TEST_ASSERT_VAL("failed to close dsos[0]", dsos[0]->data.fd == -1); + TEST_ASSERT_VAL("failed to close dsos[0]", dso__data(dsos[0])->fd == -1); /* cleanup everything */ dsos__delete(dso_cnt); @@ -371,7 +372,7 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub * dso_0 should get closed, because we reached * the file descriptor limit */ - TEST_ASSERT_VAL("failed to close dso_0", dso_0->data.fd == -1); + TEST_ASSERT_VAL("failed to close dso_0", dso__data(dso_0)->fd == -1); /* open dso_0 */ fd = dso__data_fd(dso_0, &machine); @@ -381,7 +382,7 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub * dso_1 should get closed, because we reached * the file descriptor limit */ - TEST_ASSERT_VAL("failed to close dso_1", dso_1->data.fd == -1); + TEST_ASSERT_VAL("failed to close dso_1", dso__data(dso_1)->fd == -1); /* cleanup everything */ close(fd_extra); diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index d08add0f4da6..187f12f5bc21 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -146,7 +146,7 @@ struct machine *setup_fake_machine(struct machines *machines) goto out; } - symbols__insert(&dso->symbols, sym); + symbols__insert(dso__symbols(dso), sym); } dso__put(dso); @@ -183,7 +183,7 @@ void print_hists_in(struct hists *hists) pr_info("%2d: entry: %-8s [%-8s] %20s: period = %"PRIu64"\n", i, thread__comm_str(he->thread), - dso->short_name, + dso__short_name(dso), he->ms.sym->name, he->stat.period); } @@ -212,7 +212,7 @@ void print_hists_out(struct hists *hists) pr_info("%2d: entry: %8s:%5d [%-8s] %20s: period = %"PRIu64"/%"PRIu64"\n", i, thread__comm_str(he->thread), thread__tid(he->thread), - dso->short_name, + dso__short_name(dso), he->ms.sym->name, he->stat.period, he->stat_acc ? he->stat_acc->period : 0); } diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 71dacb0fec4d..1e0f5a310fd5 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -164,11 +164,11 @@ static void put_fake_samples(void) typedef int (*test_fn_t)(struct evsel *, struct machine *); #define COMM(he) (thread__comm_str(he->thread)) -#define DSO(he) (map__dso(he->ms.map)->short_name) +#define DSO(he) (dso__short_name(map__dso(he->ms.map))) #define SYM(he) (he->ms.sym->name) #define CPU(he) (he->cpu) #define DEPTH(he) (he->callchain->max_depth) -#define CDSO(cl) (map__dso(cl->ms.map)->short_name) +#define CDSO(cl) (dso__short_name(map__dso(cl->ms.map))) #define CSYM(cl) (cl->ms.sym->name) struct result { diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index ba1cccf57049..33b5cc8352a7 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -129,7 +129,7 @@ static void put_fake_samples(void) typedef int (*test_fn_t)(struct evsel *, struct machine *); #define COMM(he) (thread__comm_str(he->thread)) -#define DSO(he) (map__dso(he->ms.map)->short_name) +#define DSO(he) (dso__short_name(map__dso(he->ms.map))) #define SYM(he) (he->ms.sym->name) #define CPU(he) (he->cpu) #define PID(he) (thread__tid(he->thread)) diff --git a/tools/perf/tests/maps.c b/tools/perf/tests/maps.c index b15417a0d617..4f1f9385ea9c 100644 --- a/tools/perf/tests/maps.c +++ b/tools/perf/tests/maps.c @@ -26,7 +26,7 @@ static int check_maps_cb(struct map *map, void *data) if (map__start(map) != merged->start || map__end(map) != merged->end || - strcmp(map__dso(map)->name, merged->name) || + strcmp(dso__name(map__dso(map)), merged->name) || refcount_read(map__refcnt(map)) != 1) { return 1; } @@ -39,7 +39,7 @@ static int failed_cb(struct map *map, void *data __maybe_unused) pr_debug("\tstart: %" PRIu64 " end: %" PRIu64 " name: '%s' refcnt: %d\n", map__start(map), map__end(map), - map__dso(map)->name, + dso__name(map__dso(map)), refcount_read(map__refcnt(map))); return 0; diff --git a/tools/perf/tests/symbols.c b/tools/perf/tests/symbols.c index d208105919ed..ee20a366f32f 100644 --- a/tools/perf/tests/symbols.c +++ b/tools/perf/tests/symbols.c @@ -81,7 +81,7 @@ static int create_map(struct test_info *ti, char *filename, struct map **map_p) * If 'filename' matches a current kernel module, must use a kernel * map. Find the one that already exists. */ - if (dso && dso->kernel) { + if (dso && dso__kernel(dso) != DSO_SPACE__USER) { *map_p = find_module_map(ti->machine, dso); dso__put(dso); if (!*map_p) { @@ -116,7 +116,7 @@ static int test_dso(struct dso *dso) if (verbose > 1) dso__fprintf(dso, stderr); - for (nd = rb_first_cached(&dso->symbols); nd; nd = rb_next(nd)) { + for (nd = rb_first_cached(dso__symbols(dso)); nd; nd = rb_next(nd)) { struct symbol *sym = rb_entry(nd, struct symbol, rb_node); if (sym->type != STT_FUNC && sym->type != STT_GNU_IFUNC) @@ -145,7 +145,7 @@ static int subdivided_dso_cb(struct dso *dso, struct machine *machine __maybe_un { struct dso *text_dso = d; - if (dso != text_dso && strstarts(dso->short_name, text_dso->short_name)) + if (dso != text_dso && strstarts(dso__short_name(dso), dso__short_name(text_dso))) if (test_dso(dso) != TEST_OK) return -1; @@ -190,7 +190,7 @@ static int test_file(struct test_info *ti, char *filename) ret = test_dso(dso); /* Module dso is split into many dsos by section */ - if (ret == TEST_OK && dso->kernel) + if (ret == TEST_OK && dso__kernel(dso) != DSO_SPACE__USER) ret = process_subdivided_dso(ti->machine, dso); out_put: map__put(map); diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index fecbf851bb2e..e30fd55f8e51 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -129,7 +129,7 @@ static int test__vmlinux_matches_kallsyms_cb1(struct map *map, void *data) * cases. */ struct map *pair = maps__find_by_name(args->kallsyms.kmaps, - (dso->kernel ? dso->short_name : dso->name)); + (dso__kernel(dso) ? dso__short_name(dso) : dso__name(dso))); if (pair) { map__set_priv(pair, 1); @@ -162,11 +162,11 @@ static int test__vmlinux_matches_kallsyms_cb2(struct map *map, void *data) } pr_info("WARN: %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as", - map__start(map), map__end(map), map__pgoff(map), dso->name); + map__start(map), map__end(map), map__pgoff(map), dso__name(dso)); if (mem_end != map__end(pair)) pr_info(":\nWARN: *%" PRIx64 "-%" PRIx64 " %" PRIx64, map__start(pair), map__end(pair), map__pgoff(pair)); - pr_info(" %s\n", dso->name); + pr_info(" %s\n", dso__name(dso)); map__set_priv(pair, 1); } map__put(pair); diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 0dd429cf612c..ea986430241e 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -438,7 +438,7 @@ static int sym_title(struct symbol *sym, struct map *map, char *title, size_t sz, int percent_type) { return snprintf(title, sz, "%s %s [Percent: %s]", sym->name, - map__dso(map)->long_name, + dso__long_name(map__dso(map)), percent_type_str(percent_type)); } @@ -967,14 +967,14 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, return -1; dso = map__dso(ms->map); - if (dso->annotate_warned) + if (dso__annotate_warned(dso)) return -1; if (not_annotated || !sym->annotate2) { err = symbol__annotate2(ms, evsel, &browser.arch); if (err) { char msg[BUFSIZ]; - dso->annotate_warned = true; + dso__set_annotate_warned(dso); symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s:\n%s", sym->name, msg); return -1; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 71b32591d61a..b7219df51236 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2489,7 +2489,7 @@ add_annotate_opt(struct hist_browser *browser __maybe_unused, { struct dso *dso; - if (!ms->map || (dso = map__dso(ms->map)) == NULL || dso->annotate_warned) + if (!ms->map || (dso = map__dso(ms->map)) == NULL || dso__annotate_warned(dso)) return 0; if (!ms->sym) @@ -2608,7 +2608,7 @@ static int hists_browser__zoom_map(struct hist_browser *browser, struct map *map } else { struct dso *dso = map__dso(map); ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s DSO\"", - __map__is_kernel(map) ? "the Kernel" : dso->short_name); + __map__is_kernel(map) ? "the Kernel" : dso__short_name(dso)); browser->hists->dso_filter = dso; perf_hpp__set_elide(HISTC_DSO, true); pstack__push(browser->pstack, &browser->hists->dso_filter); @@ -2634,7 +2634,7 @@ add_dso_opt(struct hist_browser *browser, struct popup_action *act, if (asprintf(optstr, "Zoom %s %s DSO (use the 'k' hotkey to zoom directly into the kernel)", browser->hists->dso_filter ? "out of" : "into", - __map__is_kernel(map) ? "the Kernel" : map__dso(map)->short_name) < 0) + __map__is_kernel(map) ? "the Kernel" : dso__short_name(map__dso(map))) < 0) return 0; act->ms.map = map; @@ -3110,7 +3110,7 @@ do_hotkey: // key came straight from options ui__popup_menu() if (!browser->selection || !browser->selection->map || !map__dso(browser->selection->map) || - map__dso(browser->selection->map)->annotate_warned) { + dso__annotate_warned(map__dso(browser->selection->map))) { continue; } diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c index 3d1b958d8832..fba55175a935 100644 --- a/tools/perf/ui/browsers/map.c +++ b/tools/perf/ui/browsers/map.c @@ -76,7 +76,7 @@ static int map_browser__run(struct map_browser *browser) { int key; - if (ui_browser__show(&browser->b, map__dso(browser->map)->long_name, + if (ui_browser__show(&browser->b, dso__long_name(map__dso(browser->map)), "Press ESC to exit, %s / to search", verbose > 0 ? "" : "restart with -v to use") < 0) return -1; @@ -106,7 +106,7 @@ int map__browse(struct map *map) { struct map_browser mb = { .b = { - .entries = &map__dso(map)->symbols, + .entries = dso__symbols(map__dso(map)), .refresh = ui_browser__rb_tree_refresh, .seek = ui_browser__rb_tree_seek, .write = map_browser__write, diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 2c98813f95cd..faefa444af1e 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -336,7 +336,7 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, /* Check existing nodes in dso->data_types tree */ key.self.type_name = type_name; key.self.size = size; - node = rb_find(&key, &dso->data_types, data_type_cmp); + node = rb_find(&key, dso__data_types(dso), data_type_cmp); if (node) { result = rb_entry(node, struct annotated_data_type, node); free(type_name); @@ -357,7 +357,7 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, if (symbol_conf.annotate_data_member) add_member_types(result, type_die); - rb_add(&result->node, &dso->data_types, data_type_less); + rb_add(&result->node, dso__data_types(dso), data_type_less); return result; } @@ -538,7 +538,7 @@ static struct global_var_entry *global_var__find(struct data_loc_info *dloc, u64 struct dso *dso = map__dso(dloc->ms->map); struct rb_node *node; - node = rb_find((void *)(uintptr_t)addr, &dso->global_vars, global_var_cmp); + node = rb_find((void *)(uintptr_t)addr, dso__global_vars(dso), global_var_cmp); if (node == NULL) return NULL; @@ -569,7 +569,7 @@ static bool global_var__add(struct data_loc_info *dloc, u64 addr, gvar->end = addr + size; gvar->die_offset = dwarf_dieoffset(type_die); - rb_add(&gvar->node, &dso->global_vars, global_var_less); + rb_add(&gvar->node, dso__global_vars(dso), global_var_less); return true; } @@ -672,7 +672,7 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, struct dso *dso = map__dso(dloc->ms->map); Dwarf_Die var_die; - if (RB_EMPTY_ROOT(&dso->global_vars)) + if (RB_EMPTY_ROOT(dso__global_vars(dso))) global_var__collect(dloc); gvar = global_var__find(dloc, var_addr); @@ -889,7 +889,7 @@ static void update_insn_state_x86(struct type_state *state, return; tsr = &state->regs[dst->reg1]; - if (map__dso(dloc->ms->map)->kernel && + if (dso__kernel(map__dso(dloc->ms->map)) && src->segment == INSN_SEG_X86_GS && src->imm) { u64 ip = dloc->ms->sym->start + dl->al.offset; u64 var_addr; @@ -1415,7 +1415,7 @@ static int check_matching_type(struct type_state *state, } check_kernel: - if (map__dso(dloc->ms->map)->kernel) { + if (dso__kernel(map__dso(dloc->ms->map))) { u64 addr; int offset; @@ -1767,7 +1767,7 @@ struct annotated_data_type *find_data_type(struct data_loc_info *dloc) struct dso *dso = map__dso(dloc->ms->map); Dwarf_Die type_die; - dloc->di = debuginfo__new(dso->long_name); + dloc->di = debuginfo__new(dso__long_name(dso)); if (dloc->di == NULL) { pr_debug_dtp("cannot get the debug info\n"); return NULL; @@ -1901,7 +1901,7 @@ static void print_annotated_data_header(struct hist_entry *he, struct evsel *evs } printf("Annotate type: '%s' in %s (%d samples):\n", - he->mem_type->self.type_name, dso->name, nr_samples); + he->mem_type->self.type_name, dso__name(dso), nr_samples); if (evsel__is_group_event(evsel)) { struct evsel *pos; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index f5b6b5e5e757..d7d55263fc91 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1103,7 +1103,7 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel) int graph_dotted_len; char buf[512]; - filename = strdup(dso->long_name); + filename = strdup(dso__long_name(dso)); if (!filename) return -ENOMEM; @@ -1268,7 +1268,7 @@ int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel) } fprintf(fp, "%s() %s\nEvent: %s\n\n", - ms->sym->name, map__dso(ms->map)->long_name, ev_name); + ms->sym->name, dso__long_name(map__dso(ms->map)), ev_name); symbol__annotate_fprintf2(ms->sym, fp); fclose(fp); @@ -1526,7 +1526,7 @@ int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel) if (err) { char msg[BUFSIZ]; - dso->annotate_warned = true; + dso__set_annotate_warned(dso); symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s:\n%s", sym->name, msg); return -1; @@ -1535,13 +1535,12 @@ int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel) if (annotate_opts.print_lines) { srcline_full_filename = annotate_opts.full_path; symbol__calc_lines(ms, &source_line); - print_summary(&source_line, dso->long_name); + print_summary(&source_line, dso__long_name(dso)); } hists__scnprintf_title(hists, buf, sizeof(buf)); fprintf(stdout, "%s, [percent: %s]\n%s() %s\n", - buf, percent_type_str(annotate_opts.percent_type), sym->name, - dso->long_name); + buf, percent_type_str(annotate_opts.percent_type), sym->name, dso__long_name(dso)); symbol__annotate_fprintf2(sym, stdout); annotated_source__purge(symbol__annotation(sym)->src); @@ -1560,7 +1559,7 @@ int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel) if (err) { char msg[BUFSIZ]; - dso->annotate_warned = true; + dso__set_annotate_warned(dso); symbol__strerror_disassemble(ms, err, msg, sizeof(msg)); ui__error("Couldn't annotate %s:\n%s", sym->name, msg); return -1; @@ -1571,7 +1570,7 @@ int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel) if (annotate_opts.print_lines) { srcline_full_filename = annotate_opts.full_path; symbol__calc_lines(ms, &source_line); - print_summary(&source_line, dso->long_name); + print_summary(&source_line, dso__long_name(dso)); } symbol__annotate_printf(ms, evsel); @@ -2400,7 +2399,7 @@ retry: } /* This CPU access in kernel - pretend PC-relative addressing */ - if (map__dso(ms->map)->kernel && arch__is(arch, "x86") && + if (dso__kernel(map__dso(ms->map)) && arch__is(arch, "x86") && op_loc->segment == INSN_SEG_X86_GS && op_loc->imm) { dloc.var_addr = op_loc->offset; op_loc->reg1 = DWARF_REG_PC; diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index cfa2153d4611..3fd350de0051 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -2654,7 +2654,7 @@ static int addr_filter__entire_dso(struct addr_filter *filt, struct dso *dso) } filt->addr = 0; - filt->size = dso->data.file_size; + filt->size = dso__data(dso)->file_size; return 0; } diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index dec910989701..895ee8adf3b3 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -319,7 +319,7 @@ static int block_dso_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, if (map && map__dso(map)) { return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, - map__dso(map)->short_name); + dso__short_name(map__dso(map))); } return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 83709146a48a..827695cd0408 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -59,10 +59,10 @@ static int machine__process_bpf_event_load(struct machine *machine, if (map) { struct dso *dso = map__dso(map); - dso->binary_type = DSO_BINARY_TYPE__BPF_PROG_INFO; - dso->bpf_prog.id = id; - dso->bpf_prog.sub_id = i; - dso->bpf_prog.env = env; + dso__set_binary_type(dso, DSO_BINARY_TYPE__BPF_PROG_INFO); + dso__bpf_prog(dso)->id = id; + dso__bpf_prog(dso)->sub_id = i; + dso__bpf_prog(dso)->env = env; map__put(map); } } diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 864bc26b6b46..83a1581e8cf1 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -60,7 +60,7 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused, addr_location__init(&al); if (thread__find_map(thread, sample->cpumode, sample->ip, &al)) - map__dso(al.map)->hit = 1; + dso__set_hit(map__dso(al.map)); addr_location__exit(&al); thread__put(thread); @@ -272,10 +272,10 @@ char *__dso__build_id_filename(const struct dso *dso, char *bf, size_t size, bool alloc = (bf == NULL); int ret; - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) return NULL; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid_const(dso), sbuild_id); linkname = build_id_cache__linkname(sbuild_id, NULL, 0); if (!linkname) return NULL; @@ -340,25 +340,25 @@ static int machine__write_buildid_table_cb(struct dso *dso, void *data) size_t name_len; bool in_kernel = false; - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) return 0; - if (!dso->hit && !dso__is_vdso(dso)) + if (!dso__hit(dso) && !dso__is_vdso(dso)) return 0; if (dso__is_vdso(dso)) { - name = dso->short_name; - name_len = dso->short_name_len; + name = dso__short_name(dso); + name_len = dso__short_name_len(dso); } else if (dso__is_kcore(dso)) { name = args->machine->mmap_name; name_len = strlen(name); } else { - name = dso->long_name; - name_len = dso->long_name_len; + name = dso__long_name(dso); + name_len = dso__long_name_len(dso); } - in_kernel = dso->kernel || is_kernel_module(name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); - return write_buildid(name, name_len, &dso->bid, args->machine->pid, + in_kernel = dso__kernel(dso) || is_kernel_module(name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); + return write_buildid(name, name_len, dso__bid(dso), args->machine->pid, in_kernel ? args->kmisc : args->umisc, args->fd); } @@ -876,11 +876,11 @@ static bool dso__build_id_mismatch(struct dso *dso, const char *name) struct build_id bid; bool ret = false; - mutex_lock(&dso->lock); - if (filename__read_build_id_ns(name, &bid, dso->nsinfo) >= 0) + mutex_lock(dso__lock(dso)); + if (filename__read_build_id_ns(name, &bid, dso__nsinfo(dso)) >= 0) ret = !dso__build_id_equal(dso, &bid); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); return ret; } @@ -890,13 +890,13 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine, { bool is_kallsyms = dso__is_kallsyms(dso); bool is_vdso = dso__is_vdso(dso); - const char *name = dso->long_name; + const char *name = dso__long_name(dso); const char *proper_name = NULL; const char *root_dir = NULL; char *allocated_name = NULL; int ret = 0; - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) return 0; if (dso__is_kcore(dso)) { @@ -921,10 +921,10 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine, if (!is_kallsyms && dso__build_id_mismatch(dso, name)) goto out_free; - mutex_lock(&dso->lock); - ret = build_id_cache__add_b(&dso->bid, name, dso->nsinfo, + mutex_lock(dso__lock(dso)); + ret = build_id_cache__add_b(dso__bid(dso), name, dso__nsinfo(dso), is_kallsyms, is_vdso, proper_name, root_dir); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); out_free: free(allocated_name); return ret; diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 7517d16c02ec..68feed871809 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1205,7 +1205,7 @@ char *callchain_list__sym_name(struct callchain_list *cl, if (show_dso) scnprintf(bf + printed, bfsize - printed, " %s", cl->ms.map ? - map__dso(cl->ms.map)->short_name : + dso__short_name(map__dso(cl->ms.map)) : "unknown"); return bf; diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 09d57efd2d9d..3cf64f5b23ee 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -134,7 +134,7 @@ static void output_sample_callchain_entry(struct perf_tool *tool, output_json_key_string(out, false, 5, "symbol", al->sym->name); if (dso) { - const char *dso_name = dso->short_name; + const char *dso_name = dso__short_name(dso); if (dso_name && strlen(dso_name) > 0) { fputc(',', out); diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 106429155c2e..50f916374d87 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -146,10 +146,10 @@ int db_export__comm_thread(struct db_export *dbe, struct comm *comm, int db_export__dso(struct db_export *dbe, struct dso *dso, struct machine *machine) { - if (dso->db_id) + if (dso__db_id(dso)) return 0; - dso->db_id = ++dbe->dso_last_db_id; + dso__set_db_id(dso, ++dbe->dso_last_db_id); if (dbe->export_dso) return dbe->export_dso(dbe, dso, machine); @@ -184,7 +184,7 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al, err = db_export__dso(dbe, dso, maps__machine(al->maps)); if (err) return err; - *dso_db_id = dso->db_id; + *dso_db_id = dso__db_id(dso); if (!al->sym) { al->sym = symbol__new(al->addr, 0, 0, 0, "unknown"); diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 6d1125e687b7..72aec8f61b94 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1054,8 +1054,8 @@ int symbol__strerror_disassemble(struct map_symbol *ms, int errnum, char *buf, s char bf[SBUILD_ID_SIZE + 15] = " with build id "; char *build_id_msg = NULL; - if (dso->has_build_id) { - build_id__sprintf(&dso->bid, bf + 15); + if (dso__has_build_id(dso)) { + build_id__sprintf(dso__bid(dso), bf + 15); build_id_msg = bf; } scnprintf(buf, buflen, @@ -1077,11 +1077,11 @@ int symbol__strerror_disassemble(struct map_symbol *ms, int errnum, char *buf, s scnprintf(buf, buflen, "Problems while parsing the CPUID in the arch specific initialization."); break; case SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE: - scnprintf(buf, buflen, "Invalid BPF file: %s.", dso->long_name); + scnprintf(buf, buflen, "Invalid BPF file: %s.", dso__long_name(dso)); break; case SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF: scnprintf(buf, buflen, "The %s BPF file has no BTF section, compile with -g or use pahole -J.", - dso->long_name); + dso__long_name(dso)); break; default: scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum); @@ -1099,7 +1099,7 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil char *pos; int len; - if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && + if (dso__symtab_type(dso) == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX; @@ -1108,7 +1108,7 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil __symbol__join_symfs(filename, filename_size, build_id_filename); free(build_id_filename); } else { - if (dso->has_build_id) + if (dso__has_build_id(dso)) return ENOMEM; goto fallback; } @@ -1142,22 +1142,22 @@ fallback: * cache, or is just a kallsyms file, well, lets hope that this * DSO is the same as when 'perf record' ran. */ - if (dso->kernel && dso->long_name[0] == '/') - snprintf(filename, filename_size, "%s", dso->long_name); + if (dso__kernel(dso) && dso__long_name(dso)[0] == '/') + snprintf(filename, filename_size, "%s", dso__long_name(dso)); else - __symbol__join_symfs(filename, filename_size, dso->long_name); + __symbol__join_symfs(filename, filename_size, dso__long_name(dso)); - mutex_lock(&dso->lock); - if (access(filename, R_OK) && errno == ENOENT && dso->nsinfo) { + mutex_lock(dso__lock(dso)); + if (access(filename, R_OK) && errno == ENOENT && dso__nsinfo(dso)) { char *new_name = dso__filename_with_chroot(dso, filename); if (new_name) { strlcpy(filename, new_name, filename_size); free(new_name); } } - mutex_unlock(&dso->lock); - } else if (dso->binary_type == DSO_BINARY_TYPE__NOT_FOUND) { - dso->binary_type = DSO_BINARY_TYPE__BUILD_ID_CACHE; + mutex_unlock(dso__lock(dso)); + } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) { + dso__set_binary_type(dso, DSO_BINARY_TYPE__BUILD_ID_CACHE); } free(build_id_path); @@ -1425,7 +1425,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, orig_addr = addr + insn->size + op->mem.disp; addr = map__objdump_2mem(map, orig_addr); - if (map__dso(map)->kernel) { + if (dso__kernel(map__dso(map))) { /* * The kernel maps can be splitted into sections, * let's find the map first and the search the symbol. @@ -1479,7 +1479,7 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, if (args->options->objdump_path) return -1; - nsinfo__mountns_enter(dso->nsinfo, &nsc); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); fd = open(filename, O_RDONLY); nsinfo__mountns_exit(&nsc); if (fd < 0) @@ -1679,13 +1679,13 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) map__unmap_ip(map, sym->end)); pr_debug("annotating [%p] %30s : [%p] %30s\n", - dso, dso->long_name, sym, sym->name); + dso, dso__long_name(dso), sym, sym->name); - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) { return symbol__disassemble_bpf(sym, args); - } else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) { + } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_IMAGE) { return symbol__disassemble_bpf_image(sym, args); - } else if (dso->binary_type == DSO_BINARY_TYPE__NOT_FOUND) { + } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) { return -1; } else if (dso__is_kcore(dso)) { kce.kcore_filename = symfs_filename; diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c index 908e16813722..7d180bdaedbc 100644 --- a/tools/perf/util/dlfilter.c +++ b/tools/perf/util/dlfilter.c @@ -33,13 +33,13 @@ static void al_to_d_al(struct addr_location *al, struct perf_dlfilter_al *d_al) if (al->map) { struct dso *dso = map__dso(al->map); - if (symbol_conf.show_kernel_path && dso->long_name) - d_al->dso = dso->long_name; + if (symbol_conf.show_kernel_path && dso__long_name(dso)) + d_al->dso = dso__long_name(dso); else - d_al->dso = dso->name; - d_al->is_64_bit = dso->is_64_bit; - d_al->buildid_size = dso->bid.size; - d_al->buildid = dso->bid.data; + d_al->dso = dso__name(dso); + d_al->is_64_bit = dso__is_64_bit(dso); + d_al->buildid_size = dso__bid(dso)->size; + d_al->buildid = dso__bid(dso)->data; } else { d_al->dso = NULL; d_al->is_64_bit = 0; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 3caca60a6ce3..27db65e96e04 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -40,6 +40,12 @@ static const char * const debuglink_paths[] = { "/usr/lib/debug%s/%s" }; +void dso__set_nsinfo(struct dso *dso, struct nsinfo *nsi) +{ + nsinfo__put(RC_CHK_ACCESS(dso)->nsinfo); + RC_CHK_ACCESS(dso)->nsinfo = nsi; +} + char dso__symtab_origin(const struct dso *dso) { static const char origin[] = { @@ -63,14 +69,14 @@ char dso__symtab_origin(const struct dso *dso) [DSO_BINARY_TYPE__GUEST_VMLINUX] = 'V', }; - if (dso == NULL || dso->symtab_type == DSO_BINARY_TYPE__NOT_FOUND) + if (dso == NULL || dso__symtab_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) return '!'; - return origin[dso->symtab_type]; + return origin[dso__symtab_type(dso)]; } bool dso__is_object_file(const struct dso *dso) { - switch (dso->binary_type) { + switch (dso__binary_type(dso)) { case DSO_BINARY_TYPE__KALLSYMS: case DSO_BINARY_TYPE__GUEST_KALLSYMS: case DSO_BINARY_TYPE__JAVA_JIT: @@ -117,7 +123,7 @@ int dso__read_binary_type_filename(const struct dso *dso, char symfile[PATH_MAX]; unsigned int i; - len = __symbol__join_symfs(filename, size, dso->long_name); + len = __symbol__join_symfs(filename, size, dso__long_name(dso)); last_slash = filename + len; while (last_slash != filename && *last_slash != '/') last_slash--; @@ -159,12 +165,12 @@ int dso__read_binary_type_filename(const struct dso *dso, case DSO_BINARY_TYPE__FEDORA_DEBUGINFO: len = __symbol__join_symfs(filename, size, "/usr/lib/debug"); - snprintf(filename + len, size - len, "%s.debug", dso->long_name); + snprintf(filename + len, size - len, "%s.debug", dso__long_name(dso)); break; case DSO_BINARY_TYPE__UBUNTU_DEBUGINFO: len = __symbol__join_symfs(filename, size, "/usr/lib/debug"); - snprintf(filename + len, size - len, "%s", dso->long_name); + snprintf(filename + len, size - len, "%s", dso__long_name(dso)); break; case DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO: @@ -173,13 +179,13 @@ int dso__read_binary_type_filename(const struct dso *dso, * /usr/lib/debug/lib when it is expected to be in * /usr/lib/debug/usr/lib */ - if (strlen(dso->long_name) < 9 || - strncmp(dso->long_name, "/usr/lib/", 9)) { + if (strlen(dso__long_name(dso)) < 9 || + strncmp(dso__long_name(dso), "/usr/lib/", 9)) { ret = -1; break; } len = __symbol__join_symfs(filename, size, "/usr/lib/debug"); - snprintf(filename + len, size - len, "%s", dso->long_name + 4); + snprintf(filename + len, size - len, "%s", dso__long_name(dso) + 4); break; case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO: @@ -187,29 +193,29 @@ int dso__read_binary_type_filename(const struct dso *dso, const char *last_slash; size_t dir_size; - last_slash = dso->long_name + dso->long_name_len; - while (last_slash != dso->long_name && *last_slash != '/') + last_slash = dso__long_name(dso) + dso__long_name_len(dso); + while (last_slash != dso__long_name(dso) && *last_slash != '/') last_slash--; len = __symbol__join_symfs(filename, size, ""); - dir_size = last_slash - dso->long_name + 2; + dir_size = last_slash - dso__long_name(dso) + 2; if (dir_size > (size - len)) { ret = -1; break; } - len += scnprintf(filename + len, dir_size, "%s", dso->long_name); + len += scnprintf(filename + len, dir_size, "%s", dso__long_name(dso)); len += scnprintf(filename + len , size - len, ".debug%s", last_slash); break; } case DSO_BINARY_TYPE__BUILDID_DEBUGINFO: - if (!dso->has_build_id) { + if (!dso__has_build_id(dso)) { ret = -1; break; } - build_id__sprintf(&dso->bid, build_id_hex); + build_id__sprintf(dso__bid_const(dso), build_id_hex); len = __symbol__join_symfs(filename, size, "/usr/lib/debug/.build-id/"); snprintf(filename + len, size - len, "%.2s/%s.debug", build_id_hex, build_id_hex + 2); @@ -218,23 +224,23 @@ int dso__read_binary_type_filename(const struct dso *dso, case DSO_BINARY_TYPE__VMLINUX: case DSO_BINARY_TYPE__GUEST_VMLINUX: case DSO_BINARY_TYPE__SYSTEM_PATH_DSO: - __symbol__join_symfs(filename, size, dso->long_name); + __symbol__join_symfs(filename, size, dso__long_name(dso)); break; case DSO_BINARY_TYPE__GUEST_KMODULE: case DSO_BINARY_TYPE__GUEST_KMODULE_COMP: path__join3(filename, size, symbol_conf.symfs, - root_dir, dso->long_name); + root_dir, dso__long_name(dso)); break; case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE: case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP: - __symbol__join_symfs(filename, size, dso->long_name); + __symbol__join_symfs(filename, size, dso__long_name(dso)); break; case DSO_BINARY_TYPE__KCORE: case DSO_BINARY_TYPE__GUEST_KCORE: - snprintf(filename, size, "%s", dso->long_name); + snprintf(filename, size, "%s", dso__long_name(dso)); break; default: @@ -310,8 +316,8 @@ bool is_kernel_module(const char *pathname, int cpumode) bool dso__needs_decompress(struct dso *dso) { - return dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; + return dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; } int filename__decompress(const char *name, char *pathname, @@ -363,11 +369,10 @@ static int decompress_kmodule(struct dso *dso, const char *name, if (!dso__needs_decompress(dso)) return -1; - if (dso->comp == COMP_ID__NONE) + if (dso__comp(dso) == COMP_ID__NONE) return -1; - return filename__decompress(name, pathname, len, dso->comp, - &dso->load_errno); + return filename__decompress(name, pathname, len, dso__comp(dso), dso__load_errno(dso)); } int dso__decompress_kmodule_fd(struct dso *dso, const char *name) @@ -468,17 +473,17 @@ void dso__set_module_info(struct dso *dso, struct kmod_path *m, struct machine *machine) { if (machine__is_host(machine)) - dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE; + dso__set_symtab_type(dso, DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE); else - dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE; + dso__set_symtab_type(dso, DSO_BINARY_TYPE__GUEST_KMODULE); /* _KMODULE_COMP should be next to _KMODULE */ if (m->kmod && m->comp) { - dso->symtab_type++; - dso->comp = m->comp; + dso__set_symtab_type(dso, dso__symtab_type(dso) + 1); + dso__set_comp(dso, m->comp); } - dso->is_kmod = 1; + dso__set_is_kmod(dso); dso__set_short_name(dso, strdup(m->name), true); } @@ -491,13 +496,15 @@ static pthread_mutex_t dso__data_open_lock = PTHREAD_MUTEX_INITIALIZER; static void dso__list_add(struct dso *dso) { - list_add_tail(&dso->data.open_entry, &dso__data_open); + list_add_tail(&dso__data(dso)->open_entry, &dso__data_open); + dso__data(dso)->dso = dso__get(dso); dso__data_open_cnt++; } static void dso__list_del(struct dso *dso) { - list_del_init(&dso->data.open_entry); + list_del_init(&dso__data(dso)->open_entry); + dso__put(dso__data(dso)->dso); WARN_ONCE(dso__data_open_cnt <= 0, "DSO data fd counter out of bounds."); dso__data_open_cnt--; @@ -528,7 +535,7 @@ static int do_open(char *name) char *dso__filename_with_chroot(const struct dso *dso, const char *filename) { - return filename_with_chroot(nsinfo__pid(dso->nsinfo), filename); + return filename_with_chroot(nsinfo__pid(dso__nsinfo_const(dso)), filename); } static int __open_dso(struct dso *dso, struct machine *machine) @@ -541,18 +548,18 @@ static int __open_dso(struct dso *dso, struct machine *machine) if (!name) return -ENOMEM; - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); if (machine) root_dir = machine->root_dir; - if (dso__read_binary_type_filename(dso, dso->binary_type, + if (dso__read_binary_type_filename(dso, dso__binary_type(dso), root_dir, name, PATH_MAX)) goto out; if (!is_regular_file(name)) { char *new_name; - if (errno != ENOENT || dso->nsinfo == NULL) + if (errno != ENOENT || dso__nsinfo(dso) == NULL) goto out; new_name = dso__filename_with_chroot(dso, name); @@ -568,7 +575,7 @@ static int __open_dso(struct dso *dso, struct machine *machine) size_t len = sizeof(newpath); if (dso__decompress_kmodule_path(dso, name, newpath, len) < 0) { - fd = -dso->load_errno; + fd = -(*dso__load_errno(dso)); goto out; } @@ -582,7 +589,7 @@ static int __open_dso(struct dso *dso, struct machine *machine) unlink(name); out: - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); free(name); return fd; } @@ -601,13 +608,13 @@ static int open_dso(struct dso *dso, struct machine *machine) int fd; struct nscookie nsc; - if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE) { - mutex_lock(&dso->lock); - nsinfo__mountns_enter(dso->nsinfo, &nsc); - mutex_unlock(&dso->lock); + if (dso__binary_type(dso) != DSO_BINARY_TYPE__BUILD_ID_CACHE) { + mutex_lock(dso__lock(dso)); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + mutex_unlock(dso__lock(dso)); } fd = __open_dso(dso, machine); - if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE) + if (dso__binary_type(dso) != DSO_BINARY_TYPE__BUILD_ID_CACHE) nsinfo__mountns_exit(&nsc); if (fd >= 0) { @@ -624,10 +631,10 @@ static int open_dso(struct dso *dso, struct machine *machine) static void close_data_fd(struct dso *dso) { - if (dso->data.fd >= 0) { - close(dso->data.fd); - dso->data.fd = -1; - dso->data.file_size = 0; + if (dso__data(dso)->fd >= 0) { + close(dso__data(dso)->fd); + dso__data(dso)->fd = -1; + dso__data(dso)->file_size = 0; dso__list_del(dso); } } @@ -646,10 +653,10 @@ static void close_dso(struct dso *dso) static void close_first_dso(void) { - struct dso *dso; + struct dso_data *dso_data; - dso = list_first_entry(&dso__data_open, struct dso, data.open_entry); - close_dso(dso); + dso_data = list_first_entry(&dso__data_open, struct dso_data, open_entry); + close_dso(dso_data->dso); } static rlim_t get_fd_limit(void) @@ -728,28 +735,29 @@ static void try_to_open_dso(struct dso *dso, struct machine *machine) DSO_BINARY_TYPE__NOT_FOUND, }; int i = 0; + struct dso_data *dso_data = dso__data(dso); - if (dso->data.fd >= 0) + if (dso_data->fd >= 0) return; - if (dso->binary_type != DSO_BINARY_TYPE__NOT_FOUND) { - dso->data.fd = open_dso(dso, machine); + if (dso__binary_type(dso) != DSO_BINARY_TYPE__NOT_FOUND) { + dso_data->fd = open_dso(dso, machine); goto out; } do { - dso->binary_type = binary_type_data[i++]; + dso__set_binary_type(dso, binary_type_data[i++]); - dso->data.fd = open_dso(dso, machine); - if (dso->data.fd >= 0) + dso_data->fd = open_dso(dso, machine); + if (dso_data->fd >= 0) goto out; - } while (dso->binary_type != DSO_BINARY_TYPE__NOT_FOUND); + } while (dso__binary_type(dso) != DSO_BINARY_TYPE__NOT_FOUND); out: - if (dso->data.fd >= 0) - dso->data.status = DSO_DATA_STATUS_OK; + if (dso_data->fd >= 0) + dso_data->status = DSO_DATA_STATUS_OK; else - dso->data.status = DSO_DATA_STATUS_ERROR; + dso_data->status = DSO_DATA_STATUS_ERROR; } /** @@ -763,7 +771,7 @@ out: */ int dso__data_get_fd(struct dso *dso, struct machine *machine) { - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; if (pthread_mutex_lock(&dso__data_open_lock) < 0) @@ -771,10 +779,10 @@ int dso__data_get_fd(struct dso *dso, struct machine *machine) try_to_open_dso(dso, machine); - if (dso->data.fd < 0) + if (dso__data(dso)->fd < 0) pthread_mutex_unlock(&dso__data_open_lock); - return dso->data.fd; + return dso__data(dso)->fd; } void dso__data_put_fd(struct dso *dso __maybe_unused) @@ -786,10 +794,10 @@ bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by) { u32 flag = 1 << by; - if (dso->data.status_seen & flag) + if (dso__data(dso)->status_seen & flag) return true; - dso->data.status_seen |= flag; + dso__data(dso)->status_seen |= flag; return false; } @@ -799,12 +807,13 @@ static ssize_t bpf_read(struct dso *dso, u64 offset, char *data) { struct bpf_prog_info_node *node; ssize_t size = DSO__DATA_CACHE_SIZE; + struct dso_bpf_prog *dso_bpf_prog = dso__bpf_prog(dso); u64 len; u8 *buf; - node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, dso->bpf_prog.id); + node = perf_env__find_bpf_prog_info(dso_bpf_prog->env, dso_bpf_prog->id); if (!node || !node->info_linear) { - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; return -1; } @@ -822,14 +831,15 @@ static ssize_t bpf_read(struct dso *dso, u64 offset, char *data) static int bpf_size(struct dso *dso) { struct bpf_prog_info_node *node; + struct dso_bpf_prog *dso_bpf_prog = dso__bpf_prog(dso); - node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, dso->bpf_prog.id); + node = perf_env__find_bpf_prog_info(dso_bpf_prog->env, dso_bpf_prog->id); if (!node || !node->info_linear) { - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; return -1; } - dso->data.file_size = node->info_linear->info.jited_prog_len; + dso__data(dso)->file_size = node->info_linear->info.jited_prog_len; return 0; } #endif // HAVE_LIBBPF_SUPPORT @@ -837,10 +847,10 @@ static int bpf_size(struct dso *dso) static void dso_cache__free(struct dso *dso) { - struct rb_root *root = &dso->data.cache; + struct rb_root *root = &dso__data(dso)->cache; struct rb_node *next = rb_first(root); - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); while (next) { struct dso_cache *cache; @@ -849,12 +859,12 @@ dso_cache__free(struct dso *dso) rb_erase(&cache->rb_node, root); free(cache); } - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); } static struct dso_cache *__dso_cache__find(struct dso *dso, u64 offset) { - const struct rb_root *root = &dso->data.cache; + const struct rb_root *root = &dso__data(dso)->cache; struct rb_node * const *p = &root->rb_node; const struct rb_node *parent = NULL; struct dso_cache *cache; @@ -880,13 +890,13 @@ static struct dso_cache *__dso_cache__find(struct dso *dso, u64 offset) static struct dso_cache * dso_cache__insert(struct dso *dso, struct dso_cache *new) { - struct rb_root *root = &dso->data.cache; + struct rb_root *root = &dso__data(dso)->cache; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct dso_cache *cache; u64 offset = new->offset; - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); while (*p != NULL) { u64 end; @@ -907,7 +917,7 @@ dso_cache__insert(struct dso *dso, struct dso_cache *new) cache = NULL; out: - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); return cache; } @@ -932,18 +942,18 @@ static ssize_t file_read(struct dso *dso, struct machine *machine, pthread_mutex_lock(&dso__data_open_lock); /* - * dso->data.fd might be closed if other thread opened another + * dso__data(dso)->fd might be closed if other thread opened another * file (dso) due to open file limit (RLIMIT_NOFILE). */ try_to_open_dso(dso, machine); - if (dso->data.fd < 0) { - dso->data.status = DSO_DATA_STATUS_ERROR; + if (dso__data(dso)->fd < 0) { + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; ret = -errno; goto out; } - ret = pread(dso->data.fd, data, DSO__DATA_CACHE_SIZE, offset); + ret = pread(dso__data(dso)->fd, data, DSO__DATA_CACHE_SIZE, offset); out: pthread_mutex_unlock(&dso__data_open_lock); return ret; @@ -963,11 +973,11 @@ static struct dso_cache *dso_cache__populate(struct dso *dso, return NULL; } #ifdef HAVE_LIBBPF_SUPPORT - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) *ret = bpf_read(dso, cache_offset, cache->data); else #endif - if (dso->binary_type == DSO_BINARY_TYPE__OOL) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__OOL) *ret = DSO__DATA_CACHE_SIZE; else *ret = file_read(dso, machine, cache_offset, cache->data); @@ -1056,25 +1066,25 @@ static int file_size(struct dso *dso, struct machine *machine) pthread_mutex_lock(&dso__data_open_lock); /* - * dso->data.fd might be closed if other thread opened another + * dso__data(dso)->fd might be closed if other thread opened another * file (dso) due to open file limit (RLIMIT_NOFILE). */ try_to_open_dso(dso, machine); - if (dso->data.fd < 0) { + if (dso__data(dso)->fd < 0) { ret = -errno; - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; goto out; } - if (fstat(dso->data.fd, &st) < 0) { + if (fstat(dso__data(dso)->fd, &st) < 0) { ret = -errno; pr_err("dso cache fstat failed: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); - dso->data.status = DSO_DATA_STATUS_ERROR; + dso__data(dso)->status = DSO_DATA_STATUS_ERROR; goto out; } - dso->data.file_size = st.st_size; + dso__data(dso)->file_size = st.st_size; out: pthread_mutex_unlock(&dso__data_open_lock); @@ -1083,13 +1093,13 @@ out: int dso__data_file_size(struct dso *dso, struct machine *machine) { - if (dso->data.file_size) + if (dso__data(dso)->file_size) return 0; - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; #ifdef HAVE_LIBBPF_SUPPORT - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) return bpf_size(dso); #endif return file_size(dso, machine); @@ -1108,7 +1118,7 @@ off_t dso__data_size(struct dso *dso, struct machine *machine) return -1; /* For now just estimate dso data size is close to file size */ - return dso->data.file_size; + return dso__data(dso)->file_size; } static ssize_t data_read_write_offset(struct dso *dso, struct machine *machine, @@ -1119,7 +1129,7 @@ static ssize_t data_read_write_offset(struct dso *dso, struct machine *machine, return -1; /* Check the offset sanity. */ - if (offset > dso->data.file_size) + if (offset > dso__data(dso)->file_size) return -1; if (offset + size < offset) @@ -1142,7 +1152,7 @@ static ssize_t data_read_write_offset(struct dso *dso, struct machine *machine, ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine, u64 offset, u8 *data, ssize_t size) { - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; return data_read_write_offset(dso, machine, offset, data, size, true); @@ -1182,7 +1192,7 @@ ssize_t dso__data_write_cache_offs(struct dso *dso, struct machine *machine, { u8 *data = (u8 *)data_in; /* cast away const to use same fns for r/w */ - if (dso->data.status == DSO_DATA_STATUS_ERROR) + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR) return -1; return data_read_write_offset(dso, machine, offset, data, size, false); @@ -1235,7 +1245,7 @@ struct dso *machine__findnew_kernel(struct machine *machine, const char *name, */ if (dso != NULL) { dso__set_short_name(dso, short_name, false); - dso->kernel = dso_type; + dso__set_kernel(dso, dso_type); } return dso; @@ -1243,7 +1253,7 @@ struct dso *machine__findnew_kernel(struct machine *machine, const char *name, static void dso__set_long_name_id(struct dso *dso, const char *name, bool name_allocated) { - struct dsos *dsos = dso->dsos; + struct dsos *dsos = dso__dsos(dso); if (name == NULL) return; @@ -1256,12 +1266,12 @@ static void dso__set_long_name_id(struct dso *dso, const char *name, bool name_a down_write(&dsos->lock); } - if (dso->long_name_allocated) - free((char *)dso->long_name); + if (dso__long_name_allocated(dso)) + free((char *)dso__long_name(dso)); - dso->long_name = name; - dso->long_name_len = strlen(name); - dso->long_name_allocated = name_allocated; + RC_CHK_ACCESS(dso)->long_name = name; + RC_CHK_ACCESS(dso)->long_name_len = strlen(name); + dso__set_long_name_allocated(dso, name_allocated); if (dsos) { dsos->sorted = false; @@ -1307,14 +1317,15 @@ bool dso_id__empty(const struct dso_id *id) void __dso__inject_id(struct dso *dso, struct dso_id *id) { - struct dsos *dsos = dso->dsos; + struct dsos *dsos = dso__dsos(dso); + struct dso_id *dso_id = dso__id(dso); /* dsos write lock held by caller. */ - dso->id.maj = id->maj; - dso->id.min = id->min; - dso->id.ino = id->ino; - dso->id.ino_generation = id->ino_generation; + dso_id->maj = id->maj; + dso_id->min = id->min; + dso_id->ino = id->ino; + dso_id->ino_generation = id->ino_generation; if (dsos) dsos->sorted = false; @@ -1334,7 +1345,7 @@ int dso_id__cmp(const struct dso_id *a, const struct dso_id *b) int dso__cmp_id(struct dso *a, struct dso *b) { - return __dso_id__cmp(&a->id, &b->id); + return __dso_id__cmp(dso__id(a), dso__id(b)); } void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) @@ -1344,7 +1355,7 @@ void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated) { - struct dsos *dsos = dso->dsos; + struct dsos *dsos = dso__dsos(dso); if (name == NULL) return; @@ -1356,12 +1367,12 @@ void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated) */ down_write(&dsos->lock); } - if (dso->short_name_allocated) - free((char *)dso->short_name); + if (dso__short_name_allocated(dso)) + free((char *)dso__short_name(dso)); - dso->short_name = name; - dso->short_name_len = strlen(name); - dso->short_name_allocated = name_allocated; + RC_CHK_ACCESS(dso)->short_name = name; + RC_CHK_ACCESS(dso)->short_name_len = strlen(name); + dso__set_short_name_allocated(dso, name_allocated); if (dsos) { dsos->sorted = false; @@ -1374,40 +1385,44 @@ int dso__name_len(const struct dso *dso) if (!dso) return strlen("[unknown]"); if (verbose > 0) - return dso->long_name_len; + return dso__long_name_len(dso); - return dso->short_name_len; + return dso__short_name_len(dso); } bool dso__loaded(const struct dso *dso) { - return dso->loaded; + return RC_CHK_ACCESS(dso)->loaded; } bool dso__sorted_by_name(const struct dso *dso) { - return dso->sorted_by_name; + return RC_CHK_ACCESS(dso)->sorted_by_name; } void dso__set_sorted_by_name(struct dso *dso) { - dso->sorted_by_name = true; + RC_CHK_ACCESS(dso)->sorted_by_name = true; } struct dso *dso__new_id(const char *name, struct dso_id *id) { - struct dso *dso = calloc(1, sizeof(*dso) + strlen(name) + 1); + RC_STRUCT(dso) *dso = zalloc(sizeof(*dso) + strlen(name) + 1); + struct dso *res; + struct dso_data *data; - if (dso != NULL) { + if (!dso) + return NULL; + + if (ADD_RC_CHK(res, dso)) { strcpy(dso->name, name); if (id) dso->id = *id; - dso__set_long_name_id(dso, dso->name, false); - dso__set_short_name(dso, dso->name, false); + dso__set_long_name_id(res, dso->name, false); + dso__set_short_name(res, dso->name, false); dso->symbols = RB_ROOT_CACHED; dso->symbol_names = NULL; dso->symbol_names_len = 0; - dso->data.cache = RB_ROOT; dso->inlined_nodes = RB_ROOT_CACHED; dso->srclines = RB_ROOT_CACHED; dso->data_types = RB_ROOT; @@ -1427,12 +1442,16 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) dso->is_kmod = 0; dso->needs_swap = DSO_SWAP__UNSET; dso->comp = COMP_ID__NONE; - INIT_LIST_HEAD(&dso->data.open_entry); mutex_init(&dso->lock); refcount_set(&dso->refcnt, 1); + data = &dso->data; + data->cache = RB_ROOT; + data->fd = -1; + data->status = DSO_DATA_STATUS_UNKNOWN; + INIT_LIST_HEAD(&data->open_entry); + data->dso = NULL; /* Set when on the open_entry list. */ } - - return dso; + return res; } struct dso *dso__new(const char *name) @@ -1442,71 +1461,78 @@ struct dso *dso__new(const char *name) void dso__delete(struct dso *dso) { - if (dso->dsos) - pr_err("DSO %s is still in rbtree when being deleted!\n", dso->long_name); + if (dso__dsos(dso)) + pr_err("DSO %s is still in rbtree when being deleted!\n", dso__long_name(dso)); /* free inlines first, as they reference symbols */ - inlines__tree_delete(&dso->inlined_nodes); - srcline__tree_delete(&dso->srclines); - symbols__delete(&dso->symbols); - dso->symbol_names_len = 0; - zfree(&dso->symbol_names); - annotated_data_type__tree_delete(&dso->data_types); - global_var_type__tree_delete(&dso->global_vars); - - if (dso->short_name_allocated) { - zfree((char **)&dso->short_name); - dso->short_name_allocated = false; + inlines__tree_delete(&RC_CHK_ACCESS(dso)->inlined_nodes); + srcline__tree_delete(&RC_CHK_ACCESS(dso)->srclines); + symbols__delete(&RC_CHK_ACCESS(dso)->symbols); + RC_CHK_ACCESS(dso)->symbol_names_len = 0; + zfree(&RC_CHK_ACCESS(dso)->symbol_names); + annotated_data_type__tree_delete(dso__data_types(dso)); + global_var_type__tree_delete(dso__global_vars(dso)); + + if (RC_CHK_ACCESS(dso)->short_name_allocated) { + zfree((char **)&RC_CHK_ACCESS(dso)->short_name); + RC_CHK_ACCESS(dso)->short_name_allocated = false; } - if (dso->long_name_allocated) { - zfree((char **)&dso->long_name); - dso->long_name_allocated = false; + if (RC_CHK_ACCESS(dso)->long_name_allocated) { + zfree((char **)&RC_CHK_ACCESS(dso)->long_name); + RC_CHK_ACCESS(dso)->long_name_allocated = false; } dso__data_close(dso); - auxtrace_cache__free(dso->auxtrace_cache); + auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache); dso_cache__free(dso); dso__free_a2l(dso); - zfree(&dso->symsrc_filename); - nsinfo__zput(dso->nsinfo); - mutex_destroy(&dso->lock); - free(dso); + zfree(&RC_CHK_ACCESS(dso)->symsrc_filename); + nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo); + mutex_destroy(dso__lock(dso)); + RC_CHK_FREE(dso); } struct dso *dso__get(struct dso *dso) { - if (dso) - refcount_inc(&dso->refcnt); - return dso; + struct dso *result; + + if (RC_CHK_GET(result, dso)) + refcount_inc(&RC_CHK_ACCESS(dso)->refcnt); + + return result; } void dso__put(struct dso *dso) { - if (dso && refcount_dec_and_test(&dso->refcnt)) + if (dso && refcount_dec_and_test(&RC_CHK_ACCESS(dso)->refcnt)) dso__delete(dso); + else + RC_CHK_PUT(dso); } void dso__set_build_id(struct dso *dso, struct build_id *bid) { - dso->bid = *bid; - dso->has_build_id = 1; + RC_CHK_ACCESS(dso)->bid = *bid; + RC_CHK_ACCESS(dso)->has_build_id = 1; } bool dso__build_id_equal(const struct dso *dso, struct build_id *bid) { - if (dso->bid.size > bid->size && dso->bid.size == BUILD_ID_SIZE) { + const struct build_id *dso_bid = dso__bid_const(dso); + + if (dso_bid->size > bid->size && dso_bid->size == BUILD_ID_SIZE) { /* * For the backward compatibility, it allows a build-id has * trailing zeros. */ - return !memcmp(dso->bid.data, bid->data, bid->size) && - !memchr_inv(&dso->bid.data[bid->size], 0, - dso->bid.size - bid->size); + return !memcmp(dso_bid->data, bid->data, bid->size) && + !memchr_inv(&dso_bid->data[bid->size], 0, + dso_bid->size - bid->size); } - return dso->bid.size == bid->size && - memcmp(dso->bid.data, bid->data, dso->bid.size) == 0; + return dso_bid->size == bid->size && + memcmp(dso_bid->data, bid->data, dso_bid->size) == 0; } void dso__read_running_kernel_build_id(struct dso *dso, struct machine *machine) @@ -1516,8 +1542,8 @@ void dso__read_running_kernel_build_id(struct dso *dso, struct machine *machine) if (machine__is_default_guest(machine)) return; sprintf(path, "%s/sys/kernel/notes", machine->root_dir); - if (sysfs__read_build_id(path, &dso->bid) == 0) - dso->has_build_id = true; + if (sysfs__read_build_id(path, dso__bid(dso)) == 0) + dso__set_has_build_id(dso); } int dso__kernel_module_get_build_id(struct dso *dso, @@ -1528,14 +1554,14 @@ int dso__kernel_module_get_build_id(struct dso *dso, * kernel module short names are of the form "[module]" and * we need just "module" here. */ - const char *name = dso->short_name + 1; + const char *name = dso__short_name(dso) + 1; snprintf(filename, sizeof(filename), "%s/sys/module/%.*s/notes/.note.gnu.build-id", root_dir, (int)strlen(name) - 1, name); - if (sysfs__read_build_id(filename, &dso->bid) == 0) - dso->has_build_id = true; + if (sysfs__read_build_id(filename, dso__bid(dso)) == 0) + dso__set_has_build_id(dso); return 0; } @@ -1544,21 +1570,21 @@ static size_t dso__fprintf_buildid(struct dso *dso, FILE *fp) { char sbuild_id[SBUILD_ID_SIZE]; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); return fprintf(fp, "%s", sbuild_id); } size_t dso__fprintf(struct dso *dso, FILE *fp) { struct rb_node *nd; - size_t ret = fprintf(fp, "dso: %s (", dso->short_name); + size_t ret = fprintf(fp, "dso: %s (", dso__short_name(dso)); - if (dso->short_name != dso->long_name) - ret += fprintf(fp, "%s, ", dso->long_name); + if (dso__short_name(dso) != dso__long_name(dso)) + ret += fprintf(fp, "%s, ", dso__long_name(dso)); ret += fprintf(fp, "%sloaded, ", dso__loaded(dso) ? "" : "NOT "); ret += dso__fprintf_buildid(dso, fp); ret += fprintf(fp, ")\n"); - for (nd = rb_first_cached(&dso->symbols); nd; nd = rb_next(nd)) { + for (nd = rb_first_cached(dso__symbols(dso)); nd; nd = rb_next(nd)) { struct symbol *pos = rb_entry(nd, struct symbol, rb_node); ret += symbol__fprintf(pos, fp); } @@ -1582,7 +1608,7 @@ enum dso_type dso__type(struct dso *dso, struct machine *machine) int dso__strerror_load(struct dso *dso, char *buf, size_t buflen) { - int idx, errnum = dso->load_errno; + int idx, errnum = *dso__load_errno(dso); /* * This must have a same ordering as the enum dso_load_errno. */ diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index b22dec8b3f3a..f9689dd60de3 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -11,6 +11,7 @@ #include #include "build-id.h" #include "mutex.h" +#include struct machine; struct map; @@ -100,26 +101,27 @@ enum dso_load_errno { __DSO_LOAD_ERRNO__END, }; -#define DSO__SWAP(dso, type, val) \ -({ \ - type ____r = val; \ - BUG_ON(dso->needs_swap == DSO_SWAP__UNSET); \ - if (dso->needs_swap == DSO_SWAP__YES) { \ - switch (sizeof(____r)) { \ - case 2: \ - ____r = bswap_16(val); \ - break; \ - case 4: \ - ____r = bswap_32(val); \ - break; \ - case 8: \ - ____r = bswap_64(val); \ - break; \ - default: \ - BUG_ON(1); \ - } \ - } \ - ____r; \ +#define DSO__SWAP(dso, type, val) \ +({ \ + type ____r = val; \ + enum dso_swap_type ___dst = dso__needs_swap(dso); \ + BUG_ON(___dst == DSO_SWAP__UNSET); \ + if (___dst == DSO_SWAP__YES) { \ + switch (sizeof(____r)) { \ + case 2: \ + ____r = bswap_16(val); \ + break; \ + case 4: \ + ____r = bswap_32(val); \ + break; \ + case 8: \ + ____r = bswap_64(val); \ + break; \ + default: \ + BUG_ON(1); \ + } \ + } \ + ____r; \ }) #define DSO__DATA_CACHE_SIZE 4096 @@ -142,9 +144,29 @@ struct dso_cache { char data[]; }; +struct dso_data { + struct rb_root cache; + struct list_head open_entry; + struct dso *dso; + int fd; + int status; + u32 status_seen; + u64 file_size; + u64 elf_base_addr; + u64 debug_frame_offset; + u64 eh_frame_hdr_addr; + u64 eh_frame_hdr_offset; +}; + +struct dso_bpf_prog { + u32 id; + u32 sub_id; + struct perf_env *env; +}; + struct auxtrace_cache; -struct dso { +DECLARE_RC_STRUCT(dso) { struct mutex lock; struct dsos *dsos; struct rb_root_cached symbols; @@ -176,24 +198,9 @@ struct dso { u64 db_id; }; /* bpf prog information */ - struct { - struct perf_env *env; - u32 id; - u32 sub_id; - } bpf_prog; + struct dso_bpf_prog bpf_prog; /* dso data file */ - struct { - struct rb_root cache; - struct list_head open_entry; - u64 file_size; - u64 elf_base_addr; - u64 debug_frame_offset; - u64 eh_frame_hdr_addr; - u64 eh_frame_hdr_offset; - int fd; - int status; - u32 status_seen; - } data; + struct dso_data data; struct dso_id id; unsigned int a2l_fails; int comp; @@ -229,11 +236,388 @@ struct dso { * @n: the 'struct rb_node *' to use as a temporary storage */ #define dso__for_each_symbol(dso, pos, n) \ - symbols__for_each_entry(&(dso)->symbols, pos, n) + symbols__for_each_entry(dso__symbols(dso), pos, n) + +static inline void *dso__a2l(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->a2l; +} + +static inline void dso__set_a2l(struct dso *dso, void *val) +{ + RC_CHK_ACCESS(dso)->a2l = val; +} + +static inline unsigned int dso__a2l_fails(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->a2l_fails; +} + +static inline void dso__set_a2l_fails(struct dso *dso, unsigned int val) +{ + RC_CHK_ACCESS(dso)->a2l_fails = val; +} + +static inline bool dso__adjust_symbols(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->adjust_symbols; +} + +static inline void dso__set_adjust_symbols(struct dso *dso, bool val) +{ + RC_CHK_ACCESS(dso)->adjust_symbols = val; +} + +static inline bool dso__annotate_warned(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->annotate_warned; +} + +static inline void dso__set_annotate_warned(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->annotate_warned = 1; +} + +static inline struct auxtrace_cache *dso__auxtrace_cache(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->auxtrace_cache; +} + +static inline void dso__set_auxtrace_cache(struct dso *dso, struct auxtrace_cache *cache) +{ + RC_CHK_ACCESS(dso)->auxtrace_cache = cache; +} + +static inline struct build_id *dso__bid(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->bid; +} + +static inline const struct build_id *dso__bid_const(const struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->bid; +} + +static inline struct dso_bpf_prog *dso__bpf_prog(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->bpf_prog; +} + +static inline bool dso__has_build_id(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->has_build_id; +} + +static inline void dso__set_has_build_id(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->has_build_id = true; +} + +static inline bool dso__has_srcline(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->has_srcline; +} + +static inline void dso__set_has_srcline(struct dso *dso, bool val) +{ + RC_CHK_ACCESS(dso)->has_srcline = val; +} + +static inline int dso__comp(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->comp; +} + +static inline void dso__set_comp(struct dso *dso, int comp) +{ + RC_CHK_ACCESS(dso)->comp = comp; +} + +static inline struct dso_data *dso__data(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->data; +} + +static inline u64 dso__db_id(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->db_id; +} + +static inline void dso__set_db_id(struct dso *dso, u64 db_id) +{ + RC_CHK_ACCESS(dso)->db_id = db_id; +} + +static inline struct dsos *dso__dsos(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->dsos; +} + +static inline void dso__set_dsos(struct dso *dso, struct dsos *dsos) +{ + RC_CHK_ACCESS(dso)->dsos = dsos; +} + +static inline bool dso__header_build_id(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->header_build_id; +} + +static inline void dso__set_header_build_id(struct dso *dso, bool val) +{ + RC_CHK_ACCESS(dso)->header_build_id = val; +} + +static inline bool dso__hit(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->hit; +} + +static inline void dso__set_hit(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->hit = 1; +} + +static inline struct dso_id *dso__id(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->id; +} + +static inline const struct dso_id *dso__id_const(const struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->id; +} + +static inline struct rb_root_cached *dso__inlined_nodes(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->inlined_nodes; +} + +static inline bool dso__is_64_bit(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->is_64_bit; +} + +static inline void dso__set_is_64_bit(struct dso *dso, bool is) +{ + RC_CHK_ACCESS(dso)->is_64_bit = is; +} + +static inline bool dso__is_kmod(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->is_kmod; +} + +static inline void dso__set_is_kmod(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->is_kmod = 1; +} + +static inline enum dso_space_type dso__kernel(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->kernel; +} + +static inline void dso__set_kernel(struct dso *dso, enum dso_space_type kernel) +{ + RC_CHK_ACCESS(dso)->kernel = kernel; +} + +static inline u64 dso__last_find_result_addr(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->last_find_result.addr; +} + +static inline void dso__set_last_find_result_addr(struct dso *dso, u64 addr) +{ + RC_CHK_ACCESS(dso)->last_find_result.addr = addr; +} + +static inline struct symbol *dso__last_find_result_symbol(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->last_find_result.symbol; +} + +static inline void dso__set_last_find_result_symbol(struct dso *dso, struct symbol *symbol) +{ + RC_CHK_ACCESS(dso)->last_find_result.symbol = symbol; +} + +static inline enum dso_load_errno *dso__load_errno(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->load_errno; +} static inline void dso__set_loaded(struct dso *dso) { - dso->loaded = true; + RC_CHK_ACCESS(dso)->loaded = true; +} + +static inline struct mutex *dso__lock(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->lock; +} + +static inline const char *dso__long_name(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->long_name; +} + +static inline bool dso__long_name_allocated(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->long_name_allocated; +} + +static inline void dso__set_long_name_allocated(struct dso *dso, bool allocated) +{ + RC_CHK_ACCESS(dso)->long_name_allocated = allocated; +} + +static inline u16 dso__long_name_len(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->long_name_len; +} + +static inline const char *dso__name(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->name; +} + +static inline enum dso_swap_type dso__needs_swap(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->needs_swap; +} + +static inline void dso__set_needs_swap(struct dso *dso, enum dso_swap_type type) +{ + RC_CHK_ACCESS(dso)->needs_swap = type; +} + +static inline struct nsinfo *dso__nsinfo(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->nsinfo; +} + +static inline const struct nsinfo *dso__nsinfo_const(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->nsinfo; +} + +static inline struct nsinfo **dso__nsinfo_ptr(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->nsinfo; +} + +void dso__set_nsinfo(struct dso *dso, struct nsinfo *nsi); + +static inline u8 dso__rel(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->rel; +} + +static inline void dso__set_rel(struct dso *dso, u8 rel) +{ + RC_CHK_ACCESS(dso)->rel = rel; +} + +static inline const char *dso__short_name(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->short_name; +} + +static inline bool dso__short_name_allocated(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->short_name_allocated; +} + +static inline void dso__set_short_name_allocated(struct dso *dso, bool allocated) +{ + RC_CHK_ACCESS(dso)->short_name_allocated = allocated; +} + +static inline u16 dso__short_name_len(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->short_name_len; +} + +static inline struct rb_root_cached *dso__srclines(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->srclines; +} + +static inline struct rb_root *dso__data_types(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->data_types; +} + +static inline struct rb_root *dso__global_vars(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->global_vars; +} + +static inline struct rb_root_cached *dso__symbols(struct dso *dso) +{ + return &RC_CHK_ACCESS(dso)->symbols; +} + +static inline struct symbol **dso__symbol_names(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symbol_names; +} + +static inline void dso__set_symbol_names(struct dso *dso, struct symbol **names) +{ + RC_CHK_ACCESS(dso)->symbol_names = names; +} + +static inline size_t dso__symbol_names_len(struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symbol_names_len; +} + +static inline void dso__set_symbol_names_len(struct dso *dso, size_t len) +{ + RC_CHK_ACCESS(dso)->symbol_names_len = len; +} + +static inline const char *dso__symsrc_filename(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symsrc_filename; +} + +static inline void dso__set_symsrc_filename(struct dso *dso, char *val) +{ + RC_CHK_ACCESS(dso)->symsrc_filename = val; +} + +static inline enum dso_binary_type dso__symtab_type(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->symtab_type; +} + +static inline void dso__set_symtab_type(struct dso *dso, enum dso_binary_type bt) +{ + RC_CHK_ACCESS(dso)->symtab_type = bt; +} + +static inline u64 dso__text_end(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->text_end; +} + +static inline void dso__set_text_end(struct dso *dso, u64 val) +{ + RC_CHK_ACCESS(dso)->text_end = val; +} + +static inline u64 dso__text_offset(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->text_offset; +} + +static inline void dso__set_text_offset(struct dso *dso, u64 val) +{ + RC_CHK_ACCESS(dso)->text_offset = val; } int dso_id__cmp(const struct dso_id *a, const struct dso_id *b); @@ -265,7 +649,7 @@ bool dso__loaded(const struct dso *dso); static inline bool dso__has_symbols(const struct dso *dso) { - return !RB_EMPTY_ROOT(&dso->symbols.rb_root); + return !RB_EMPTY_ROOT(&RC_CHK_ACCESS(dso)->symbols.rb_root); } char *dso__filename_with_chroot(const struct dso *dso, const char *filename); @@ -381,21 +765,33 @@ void dso__reset_find_symbol_cache(struct dso *dso); size_t dso__fprintf_symbols_by_name(struct dso *dso, FILE *fp); size_t dso__fprintf(struct dso *dso, FILE *fp); +static inline enum dso_binary_type dso__binary_type(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->binary_type; +} + +static inline void dso__set_binary_type(struct dso *dso, enum dso_binary_type bt) +{ + RC_CHK_ACCESS(dso)->binary_type = bt; +} + static inline bool dso__is_vmlinux(const struct dso *dso) { - return dso->binary_type == DSO_BINARY_TYPE__VMLINUX || - dso->binary_type == DSO_BINARY_TYPE__GUEST_VMLINUX; + enum dso_binary_type bt = dso__binary_type(dso); + + return bt == DSO_BINARY_TYPE__VMLINUX || bt == DSO_BINARY_TYPE__GUEST_VMLINUX; } static inline bool dso__is_kcore(const struct dso *dso) { - return dso->binary_type == DSO_BINARY_TYPE__KCORE || - dso->binary_type == DSO_BINARY_TYPE__GUEST_KCORE; + enum dso_binary_type bt = dso__binary_type(dso); + + return bt == DSO_BINARY_TYPE__KCORE || bt == DSO_BINARY_TYPE__GUEST_KCORE; } static inline bool dso__is_kallsyms(const struct dso *dso) { - return dso->kernel && dso->long_name[0] != '/'; + return RC_CHK_ACCESS(dso)->kernel && RC_CHK_ACCESS(dso)->long_name[0] != '/'; } bool dso__is_object_file(const struct dso *dso); diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index 2e4e86dc6c17..ab3d0c01dd63 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -29,7 +29,7 @@ static void dsos__purge(struct dsos *dsos) for (unsigned int i = 0; i < dsos->cnt; i++) { struct dso *dso = dsos->dsos[i]; - dso->dsos = NULL; + dso__set_dsos(dso, NULL); dso__put(dso); } @@ -73,22 +73,22 @@ static int dsos__read_build_ids_cb(struct dso *dso, void *data) struct dsos__read_build_ids_cb_args *args = data; struct nscookie nsc; - if (args->with_hits && !dso->hit && !dso__is_vdso(dso)) + if (args->with_hits && !dso__hit(dso) && !dso__is_vdso(dso)) return 0; - if (dso->has_build_id) { + if (dso__has_build_id(dso)) { args->have_build_id = true; return 0; } - nsinfo__mountns_enter(dso->nsinfo, &nsc); - if (filename__read_build_id(dso->long_name, &dso->bid) > 0) { + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + if (filename__read_build_id(dso__long_name(dso), dso__bid(dso)) > 0) { args->have_build_id = true; - dso->has_build_id = true; - } else if (errno == ENOENT && dso->nsinfo) { - char *new_name = dso__filename_with_chroot(dso, dso->long_name); + dso__set_has_build_id(dso); + } else if (errno == ENOENT && dso__nsinfo(dso)) { + char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); - if (new_name && filename__read_build_id(new_name, &dso->bid) > 0) { + if (new_name && filename__read_build_id(new_name, dso__bid(dso)) > 0) { args->have_build_id = true; - dso->has_build_id = true; + dso__set_has_build_id(dso); } free(new_name); } @@ -110,27 +110,27 @@ bool dsos__read_build_ids(struct dsos *dsos, bool with_hits) static int __dso__cmp_long_name(const char *long_name, const struct dso_id *id, const struct dso *b) { - int rc = strcmp(long_name, b->long_name); - return rc ?: dso_id__cmp(id, &b->id); + int rc = strcmp(long_name, dso__long_name(b)); + return rc ?: dso_id__cmp(id, dso__id_const(b)); } static int __dso__cmp_short_name(const char *short_name, const struct dso_id *id, const struct dso *b) { - int rc = strcmp(short_name, b->short_name); - return rc ?: dso_id__cmp(id, &b->id); + int rc = strcmp(short_name, dso__short_name(b)); + return rc ?: dso_id__cmp(id, dso__id_const(b)); } static int dsos__cmp_long_name_id_short_name(const void *va, const void *vb) { const struct dso *a = *((const struct dso **)va); const struct dso *b = *((const struct dso **)vb); - int rc = strcmp(a->long_name, b->long_name); + int rc = strcmp(dso__long_name(a), dso__long_name(b)); if (!rc) { - rc = dso_id__cmp(&a->id, &b->id); + rc = dso_id__cmp(dso__id_const(a), dso__id_const(b)); if (!rc) - rc = strcmp(a->short_name, b->short_name); + rc = strcmp(dso__short_name(a), dso__short_name(b)); } return rc; } @@ -209,7 +209,7 @@ int __dsos__add(struct dsos *dsos, struct dso *dso) &dsos->dsos[dsos->cnt - 1]) <= 0; } - dso->dsos = dsos; + dso__set_dsos(dso, dsos); return 0; } @@ -275,7 +275,7 @@ static void dso__set_basename(struct dso *dso) char *base, *lname; int tid; - if (sscanf(dso->long_name, "/tmp/perf-%d.map", &tid) == 1) { + if (sscanf(dso__long_name(dso), "/tmp/perf-%d.map", &tid) == 1) { if (asprintf(&base, "[JIT] tid %d", tid) < 0) return; } else { @@ -283,7 +283,7 @@ static void dso__set_basename(struct dso *dso) * basename() may modify path buffer, so we must pass * a copy. */ - lname = strdup(dso->long_name); + lname = strdup(dso__long_name(dso)); if (!lname) return; @@ -322,7 +322,7 @@ static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struc { struct dso *dso = __dsos__find_id(dsos, name, id, false, /*write_locked=*/true); - if (dso && dso_id__empty(&dso->id) && !dso_id__empty(id)) + if (dso && dso_id__empty(dso__id(dso)) && !dso_id__empty(id)) __dso__inject_id(dso, id); return dso ? dso : __dsos__addnew_id(dsos, name, id); @@ -351,8 +351,8 @@ static int dsos__fprintf_buildid_cb(struct dso *dso, void *data) if (args->skip && args->skip(dso, args->parm)) return 0; - build_id__sprintf(&dso->bid, sbuild_id); - args->ret += fprintf(args->fp, "%-40s %s\n", sbuild_id, dso->long_name); + build_id__sprintf(dso__bid(dso), sbuild_id); + args->ret += fprintf(args->fp, "%-40s %s\n", sbuild_id, dso__long_name(dso)); return 0; } @@ -396,7 +396,7 @@ size_t dsos__fprintf(struct dsos *dsos, FILE *fp) static int dsos__hit_all_cb(struct dso *dso, void *data __maybe_unused) { - dso->hit = true; + dso__set_hit(dso); return 0; } @@ -432,7 +432,7 @@ struct dso *dsos__findnew_module_dso(struct dsos *dsos, dso__set_basename(dso); dso__set_module_info(dso, m, machine); dso__set_long_name(dso, strdup(filename), true); - dso->kernel = DSO_SPACE__KERNEL; + dso__set_kernel(dso, DSO_SPACE__KERNEL); __dsos__add(dsos, dso); up_write(&dsos->lock); @@ -455,8 +455,8 @@ static int dsos__find_kernel_dso_cb(struct dso *dso, void *data) * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN. * is_kernel_module() treats it as a kernel cpumode. */ - if (!dso->kernel || - is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN)) + if (!dso__kernel(dso) || + is_kernel_module(dso__long_name(dso), PERF_RECORD_MISC_CPUMODE_UNKNOWN)) return 0; *res = dso__get(dso); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 198903157f9e..f32f9abf6344 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -726,7 +726,7 @@ int machine__resolve(struct machine *machine, struct addr_location *al, dso = al->map ? map__dso(al->map) : NULL; dump_printf(" ...... dso: %s\n", dso - ? dso->long_name + ? dso__long_name(dso) : (al->level == 'H' ? "[hypervisor]" : "")); if (thread__is_filtered(thread)) @@ -750,10 +750,10 @@ int machine__resolve(struct machine *machine, struct addr_location *al, if (al->map) { if (symbol_conf.dso_list && (!dso || !(strlist__has_entry(symbol_conf.dso_list, - dso->short_name) || - (dso->short_name != dso->long_name && + dso__short_name(dso)) || + (dso__short_name(dso) != dso__long_name(dso) && strlist__has_entry(symbol_conf.dso_list, - dso->long_name))))) { + dso__long_name(dso)))))) { al->filtered |= (1 << HIST_FILTER__DSO); } diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3fe28edc3d01..55e9553861d0 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2308,7 +2308,7 @@ static int __event_process_build_id(struct perf_record_header_build_id *bev, build_id__init(&bid, bev->data, size); dso__set_build_id(dso, &bid); - dso->header_build_id = 1; + dso__set_header_build_id(dso, true); if (dso_space != DSO_SPACE__USER) { struct kmod_path m = { .name = NULL, }; @@ -2316,13 +2316,13 @@ static int __event_process_build_id(struct perf_record_header_build_id *bev, if (!kmod_path__parse_name(&m, filename) && m.kmod) dso__set_module_info(dso, &m, machine); - dso->kernel = dso_space; + dso__set_kernel(dso, dso_space); free(m.name); } - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); pr_debug("build id event received for %s: %s [%zu]\n", - dso->long_name, sbuild_id, size); + dso__long_name(dso), sbuild_id, size); dso__put(dso); } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 9d43f8ae412d..55ea6afcc437 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2136,7 +2136,7 @@ static bool hists__filter_entry_by_dso(struct hists *hists, struct hist_entry *he) { if (hists->dso_filter != NULL && - (he->ms.map == NULL || map__dso(he->ms.map) != hists->dso_filter)) { + (he->ms.map == NULL || !RC_CHK_EQUAL(map__dso(he->ms.map), hists->dso_filter))) { he->filtered |= (1 << HIST_FILTER__DSO); return true; } @@ -2816,7 +2816,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh } if (dso) printed += scnprintf(bf + printed, size - printed, - ", DSO: %s", dso->short_name); + ", DSO: %s", dso__short_name(dso)); if (socket_id > -1) printed += scnprintf(bf + printed, size - printed, ", Processor Socket: %d", socket_id); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 4db9a098f592..d6d7b7512505 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -598,15 +598,15 @@ static struct auxtrace_cache *intel_pt_cache(struct dso *dso, struct auxtrace_cache *c; unsigned int bits; - if (dso->auxtrace_cache) - return dso->auxtrace_cache; + if (dso__auxtrace_cache(dso)) + return dso__auxtrace_cache(dso); bits = intel_pt_cache_size(dso, machine); /* Ignoring cache creation failure */ c = auxtrace_cache__new(bits, sizeof(struct intel_pt_cache_entry), 200); - dso->auxtrace_cache = c; + dso__set_auxtrace_cache(dso, c); return c; } @@ -650,7 +650,7 @@ intel_pt_cache_lookup(struct dso *dso, struct machine *machine, u64 offset) if (!c) return NULL; - return auxtrace_cache__lookup(dso->auxtrace_cache, offset); + return auxtrace_cache__lookup(dso__auxtrace_cache(dso), offset); } static void intel_pt_cache_invalidate(struct dso *dso, struct machine *machine, @@ -661,7 +661,7 @@ static void intel_pt_cache_invalidate(struct dso *dso, struct machine *machine, if (!c) return; - auxtrace_cache__remove(dso->auxtrace_cache, offset); + auxtrace_cache__remove(dso__auxtrace_cache(dso), offset); } static inline bool intel_pt_guest_kernel_ip(uint64_t ip) @@ -821,8 +821,8 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, } dso = map__dso(al.map); - if (dso->data.status == DSO_DATA_STATUS_ERROR && - dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) { + if (dso__data(dso)->status == DSO_DATA_STATUS_ERROR && + dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) { ret = -ENOENT; goto out_ret; } @@ -855,7 +855,7 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, /* Load maps to ensure dso->is_64_bit has been updated */ map__load(al.map); - x86_64 = dso->is_64_bit; + x86_64 = dso__is_64_bit(dso); while (1) { len = dso__data_read_offset(dso, machine, @@ -1010,7 +1010,7 @@ static int __intel_pt_pgd_ip(uint64_t ip, void *data) offset = map__map_ip(al.map, ip); - res = intel_pt_match_pgd_ip(ptq->pt, ip, offset, map__dso(al.map)->long_name); + res = intel_pt_match_pgd_ip(ptq->pt, ip, offset, dso__long_name(map__dso(al.map))); addr_location__exit(&al); return res; } @@ -3418,7 +3418,7 @@ static int intel_pt_text_poke(struct intel_pt *pt, union perf_event *event) } dso = map__dso(al.map); - if (!dso || !dso->auxtrace_cache) + if (!dso || !dso__auxtrace_cache(dso)) continue; offset = map__map_ip(al.map, addr); @@ -3438,7 +3438,7 @@ static int intel_pt_text_poke(struct intel_pt *pt, union perf_event *event) } else { intel_pt_cache_invalidate(dso, machine, offset); intel_pt_log("Invalidated instruction cache for %s at %#"PRIx64"\n", - dso->long_name, addr); + dso__long_name(dso), addr); } } out: diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index c5c895131bca..0b8fb14f5ff6 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -694,7 +694,7 @@ static int machine__process_ksymbol_register(struct machine *machine, err = -ENOMEM; goto out; } - dso->kernel = DSO_SPACE__KERNEL; + dso__set_kernel(dso, DSO_SPACE__KERNEL); map = map__new2(0, dso); dso__put(dso); if (!map) { @@ -702,8 +702,8 @@ static int machine__process_ksymbol_register(struct machine *machine, goto out; } if (event->ksymbol.ksym_type == PERF_RECORD_KSYMBOL_TYPE_OOL) { - dso->binary_type = DSO_BINARY_TYPE__OOL; - dso->data.file_size = event->ksymbol.len; + dso__set_binary_type(dso, DSO_BINARY_TYPE__OOL); + dso__data(dso)->file_size = event->ksymbol.len; dso__set_loaded(dso); } @@ -718,7 +718,7 @@ static int machine__process_ksymbol_register(struct machine *machine, dso__set_loaded(dso); if (is_bpf_image(event->ksymbol.name)) { - dso->binary_type = DSO_BINARY_TYPE__BPF_IMAGE; + dso__set_binary_type(dso, DSO_BINARY_TYPE__BPF_IMAGE); dso__set_long_name(dso, "", false); } } else { @@ -888,17 +888,17 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) size_t printed = 0; struct dso *kdso = machine__kernel_dso(machine); - if (kdso->has_build_id) { + if (dso__has_build_id(kdso)) { char filename[PATH_MAX]; - if (dso__build_id_filename(kdso, filename, sizeof(filename), - false)) + + if (dso__build_id_filename(kdso, filename, sizeof(filename), false)) printed += fprintf(fp, "[0] %s\n", filename); } - for (i = 0; i < vmlinux_path__nr_entries; ++i) - printed += fprintf(fp, "[%d] %s\n", - i + kdso->has_build_id, vmlinux_path[i]); - + for (i = 0; i < vmlinux_path__nr_entries; ++i) { + printed += fprintf(fp, "[%d] %s\n", i + dso__has_build_id(kdso), + vmlinux_path[i]); + } return printed; } @@ -948,7 +948,7 @@ static struct dso *machine__get_kernel(struct machine *machine) DSO_SPACE__KERNEL_GUEST); } - if (kernel != NULL && (!kernel->has_build_id)) + if (kernel != NULL && (!dso__has_build_id(kernel))) dso__read_running_kernel_build_id(kernel, machine); return kernel; @@ -1313,8 +1313,8 @@ static char *get_kernel_version(const char *root_dir) static bool is_kmod_dso(struct dso *dso) { - return dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE; + return dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE; } static int maps__set_module_path(struct maps *maps, const char *path, struct kmod_path *m) @@ -1341,8 +1341,8 @@ static int maps__set_module_path(struct maps *maps, const char *path, struct kmo * we need to update the symtab_type if needed. */ if (m->comp && is_kmod_dso(dso)) { - dso->symtab_type++; - dso->comp = m->comp; + dso__set_symtab_type(dso, dso__symtab_type(dso)); + dso__set_comp(dso, m->comp); } map__put(map); return 0; @@ -1643,13 +1643,13 @@ static int machine__process_kernel_mmap_event(struct machine *machine, if (kernel == NULL) goto out_problem; - kernel->kernel = dso_space; + dso__set_kernel(kernel, dso_space); if (__machine__create_kernel_maps(machine, kernel) < 0) { dso__put(kernel); goto out_problem; } - if (strstr(kernel->long_name, "vmlinux")) + if (strstr(dso__long_name(kernel), "vmlinux")) dso__set_short_name(kernel, "[kernel.vmlinux]", false); if (machine__update_kernel_mmap(machine, xm->start, xm->end) < 0) { @@ -2031,14 +2031,14 @@ static char *callchain_srcline(struct map_symbol *ms, u64 ip) return srcline; dso = map__dso(map); - srcline = srcline__tree_find(&dso->srclines, ip); + srcline = srcline__tree_find(dso__srclines(dso), ip); if (!srcline) { bool show_sym = false; bool show_addr = callchain_param.key == CCKEY_ADDRESS; srcline = get_srcline(dso, map__rip_2objdump(map, ip), ms->sym, show_sym, show_addr, ip); - srcline__tree_insert(&dso->srclines, ip, srcline); + srcline__tree_insert(dso__srclines(dso), ip, srcline); } return srcline; @@ -2836,12 +2836,12 @@ static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms addr = map__rip_2objdump(map, addr); dso = map__dso(map); - inline_node = inlines__tree_find(&dso->inlined_nodes, addr); + inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr); if (!inline_node) { inline_node = dso__parse_addr_inlines(dso, addr, sym); if (!inline_node) return ret; - inlines__tree_insert(&dso->inlined_nodes, inline_node); + inlines__tree_insert(dso__inlined_nodes(dso), inline_node); } ilist_ms = (struct map_symbol) { @@ -3130,7 +3130,7 @@ char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, ch if (sym == NULL) return NULL; - *modp = __map__is_kmodule(map) ? (char *)map__dso(map)->short_name : NULL; + *modp = __map__is_kmodule(map) ? (char *)dso__short_name(map__dso(map)) : NULL; *addrp = map__unmap_ip(map, sym->start); return sym->name; } diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index cca871959f87..117c4bb78b35 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -168,7 +168,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, if (dso == NULL) goto out_delete; - assert(!dso->kernel); + assert(!dso__kernel(dso)); map__init(result, start, start + len, pgoff, dso); if (anon || no_dso) { @@ -182,10 +182,9 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, if (!(prot & PROT_EXEC)) dso__set_loaded(dso); } - mutex_lock(&dso->lock); - nsinfo__put(dso->nsinfo); - dso->nsinfo = nsi; - mutex_unlock(&dso->lock); + mutex_lock(dso__lock(dso)); + dso__set_nsinfo(dso, nsi); + mutex_unlock(dso__lock(dso)); if (build_id__is_defined(bid)) { dso__set_build_id(dso, bid); @@ -197,9 +196,9 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, * have it missing. */ header_bid_dso = dsos__find(&machine->dsos, filename, false); - if (header_bid_dso && header_bid_dso->header_build_id) { - dso__set_build_id(dso, &header_bid_dso->bid); - dso->header_build_id = 1; + if (header_bid_dso && dso__header_build_id(header_bid_dso)) { + dso__set_build_id(dso, dso__bid(header_bid_dso)); + dso__set_header_build_id(dso, 1); } } dso__put(dso); @@ -221,7 +220,7 @@ struct map *map__new2(u64 start, struct dso *dso) struct map *result; RC_STRUCT(map) *map; - map = calloc(1, sizeof(*map) + (dso->kernel ? sizeof(struct kmap) : 0)); + map = calloc(1, sizeof(*map) + (dso__kernel(dso) ? sizeof(struct kmap) : 0)); if (ADD_RC_CHK(result, map)) { /* * ->end will be filled after we load all the symbols @@ -234,7 +233,7 @@ struct map *map__new2(u64 start, struct dso *dso) bool __map__is_kernel(const struct map *map) { - if (!map__dso(map)->kernel) + if (!dso__kernel(map__dso(map))) return false; return machine__kernel_map(maps__machine(map__kmaps((struct map *)map))) == map; } @@ -251,7 +250,7 @@ bool __map__is_bpf_prog(const struct map *map) const char *name; struct dso *dso = map__dso(map); - if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) return true; /* @@ -259,7 +258,7 @@ bool __map__is_bpf_prog(const struct map *map) * type of DSO_BINARY_TYPE__BPF_PROG_INFO. In such cases, we can * guess the type based on name. */ - name = dso->short_name; + name = dso__short_name(dso); return name && (strstr(name, "bpf_prog_") == name); } @@ -268,7 +267,7 @@ bool __map__is_bpf_image(const struct map *map) const char *name; struct dso *dso = map__dso(map); - if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_IMAGE) return true; /* @@ -276,7 +275,7 @@ bool __map__is_bpf_image(const struct map *map) * type of DSO_BINARY_TYPE__BPF_IMAGE. In such cases, we can * guess the type based on name. */ - name = dso->short_name; + name = dso__short_name(dso); return name && is_bpf_image(name); } @@ -284,7 +283,7 @@ bool __map__is_ool(const struct map *map) { const struct dso *dso = map__dso(map); - return dso && dso->binary_type == DSO_BINARY_TYPE__OOL; + return dso && dso__binary_type(dso) == DSO_BINARY_TYPE__OOL; } bool map__has_symbols(const struct map *map) @@ -315,7 +314,7 @@ void map__put(struct map *map) void map__fixup_start(struct map *map) { struct dso *dso = map__dso(map); - struct rb_root_cached *symbols = &dso->symbols; + struct rb_root_cached *symbols = dso__symbols(dso); struct rb_node *nd = rb_first_cached(symbols); if (nd != NULL) { @@ -328,7 +327,7 @@ void map__fixup_start(struct map *map) void map__fixup_end(struct map *map) { struct dso *dso = map__dso(map); - struct rb_root_cached *symbols = &dso->symbols; + struct rb_root_cached *symbols = dso__symbols(dso); struct rb_node *nd = rb_last(&symbols->rb_root); if (nd != NULL) { @@ -342,7 +341,7 @@ void map__fixup_end(struct map *map) int map__load(struct map *map) { struct dso *dso = map__dso(map); - const char *name = dso->long_name; + const char *name = dso__long_name(dso); int nr; if (dso__loaded(dso)) @@ -350,10 +349,10 @@ int map__load(struct map *map) nr = dso__load(dso, map); if (nr < 0) { - if (dso->has_build_id) { + if (dso__has_build_id(dso)) { char sbuild_id[SBUILD_ID_SIZE]; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); pr_debug("%s with build id %s not found", name, sbuild_id); } else pr_debug("Failed to open %s", name); @@ -415,7 +414,7 @@ struct map *map__clone(struct map *from) size_t size = sizeof(RC_STRUCT(map)); struct dso *dso = map__dso(from); - if (dso && dso->kernel) + if (dso && dso__kernel(dso)) size += sizeof(struct kmap); map = memdup(RC_CHK_ACCESS(from), size); @@ -432,14 +431,14 @@ size_t map__fprintf(struct map *map, FILE *fp) const struct dso *dso = map__dso(map); return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s\n", - map__start(map), map__end(map), map__pgoff(map), dso->name); + map__start(map), map__end(map), map__pgoff(map), dso__name(dso)); } static bool prefer_dso_long_name(const struct dso *dso, bool print_off) { - return dso->long_name && + return dso__long_name(dso) && (symbol_conf.show_kernel_path || - (print_off && (dso->name[0] == '[' || dso__is_kcore(dso)))); + (print_off && (dso__name(dso)[0] == '[' || dso__is_kcore(dso)))); } static size_t __map__fprintf_dsoname(struct map *map, bool print_off, FILE *fp) @@ -450,9 +449,9 @@ static size_t __map__fprintf_dsoname(struct map *map, bool print_off, FILE *fp) if (dso) { if (prefer_dso_long_name(dso, print_off)) - dsoname = dso->long_name; + dsoname = dso__long_name(dso); else - dsoname = dso->name; + dsoname = dso__name(dso); } if (symbol_conf.pad_output_len_dso) { @@ -545,14 +544,14 @@ u64 map__rip_2objdump(struct map *map, u64 rip) } } - if (!dso->adjust_symbols) + if (!dso__adjust_symbols(dso)) return rip; - if (dso->rel) + if (dso__rel(dso)) return rip - map__pgoff(map); - if (dso->kernel == DSO_SPACE__USER) - return rip + dso->text_offset; + if (dso__kernel(dso) == DSO_SPACE__USER) + return rip + dso__text_offset(dso); return map__unmap_ip(map, rip) - map__reloc(map); } @@ -573,14 +572,14 @@ u64 map__objdump_2mem(struct map *map, u64 ip) { const struct dso *dso = map__dso(map); - if (!dso->adjust_symbols) + if (!dso__adjust_symbols(dso)) return map__unmap_ip(map, ip); - if (dso->rel) + if (dso__rel(dso)) return map__unmap_ip(map, ip + map__pgoff(map)); - if (dso->kernel == DSO_SPACE__USER) - return map__unmap_ip(map, ip - dso->text_offset); + if (dso__kernel(dso) == DSO_SPACE__USER) + return map__unmap_ip(map, ip - dso__text_offset(dso)); return ip + map__reloc(map); } @@ -590,14 +589,14 @@ u64 map__objdump_2rip(struct map *map, u64 ip) { const struct dso *dso = map__dso(map); - if (!dso->adjust_symbols) + if (!dso__adjust_symbols(dso)) return ip; - if (dso->rel) + if (dso__rel(dso)) return ip + map__pgoff(map); - if (dso->kernel == DSO_SPACE__USER) - return ip - dso->text_offset; + if (dso__kernel(dso) == DSO_SPACE__USER) + return ip - dso__text_offset(dso); return map__map_ip(map, ip + map__reloc(map)); } @@ -613,7 +612,7 @@ struct kmap *__map__kmap(struct map *map) { const struct dso *dso = map__dso(map); - if (!dso || !dso->kernel) + if (!dso || !dso__kernel(dso)) return NULL; return (struct kmap *)(&RC_CHK_ACCESS(map)[1]); } diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index ac9fb880ddc7..61eb742d91e3 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -76,7 +76,7 @@ static void check_invariants(const struct maps *maps __maybe_unused) /* Expect at least 1 reference count. */ assert(refcount_read(map__refcnt(map)) > 0); - if (map__dso(map) && map__dso(map)->kernel) + if (map__dso(map) && dso__kernel(map__dso(map))) assert(RC_CHK_EQUAL(map__kmap(map)->kmaps, maps)); if (i > 0) { @@ -341,7 +341,7 @@ static int map__strcmp(const void *a, const void *b) const struct map *map_b = *(const struct map * const *)b; const struct dso *dso_a = map__dso(map_a); const struct dso *dso_b = map__dso(map_b); - int ret = strcmp(dso_a->short_name, dso_b->short_name); + int ret = strcmp(dso__short_name(dso_a), dso__short_name(dso_b)); if (ret == 0 && RC_CHK_ACCESS(map_a) != RC_CHK_ACCESS(map_b)) { /* Ensure distinct but name equal maps have an order. */ @@ -482,7 +482,7 @@ static int __maps__insert(struct maps *maps, struct map *new) } if (map__end(new) < map__start(new)) RC_CHK_ACCESS(maps)->ends_broken = true; - if (dso && dso->kernel) { + if (dso && dso__kernel(dso)) { struct kmap *kmap = map__kmap(new); if (kmap) @@ -766,7 +766,7 @@ sort_again: if (use_browser) { pr_debug("overlapping maps in %s (disable tui for more info)\n", - map__dso(new)->name); + dso__name(map__dso(new))); } else if (verbose >= 2) { pr_debug("overlapping maps:\n"); map__fprintf(new, fp); @@ -989,7 +989,7 @@ static int map__strcmp_name(const void *name, const void *b) { const struct dso *dso = map__dso(*(const struct map **)b); - return strcmp(name, dso->short_name); + return strcmp(name, dso__short_name(dso)); } struct map *maps__find_by_name(struct maps *maps, const char *name) @@ -1008,7 +1008,7 @@ struct map *maps__find_by_name(struct maps *maps, const char *name) if (i < maps__nr_maps(maps) && maps__maps_by_name(maps)) { struct dso *dso = map__dso(maps__maps_by_name(maps)[i]); - if (dso && strcmp(dso->short_name, name) == 0) { + if (dso && strcmp(dso__short_name(dso), name) == 0) { result = map__get(maps__maps_by_name(maps)[i]); done = true; } @@ -1045,7 +1045,7 @@ struct map *maps__find_by_name(struct maps *maps, const char *name) struct map *pos = maps_by_address[i]; struct dso *dso = map__dso(pos); - if (dso && strcmp(dso->short_name, name) == 0) { + if (dso && strcmp(dso__short_name(dso), name) == 0) { result = map__get(pos); break; } diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index aab11d8e1b1d..a950e9157d2d 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -104,7 +104,7 @@ static bool is64bitip(struct machine *machine, struct addr_location *al) const struct dso *dso = al->map ? map__dso(al->map) : NULL; if (dso) - return dso->is_64_bit; + return dso__is_64_bit(dso); return machine__is(machine, "x86_64") || machine__normalized_is(machine, "arm64") || diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8a73c9464b70..a17c9b8a7a79 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -159,8 +159,8 @@ static int kernel_get_module_map_cb(struct map *map, void *data) { struct kernel_get_module_map_cb_args *args = data; struct dso *dso = map__dso(map); - const char *short_name = dso->short_name; /* short_name is "[module]" */ - u16 short_name_len = dso->short_name_len; + const char *short_name = dso__short_name(dso); + u16 short_name_len = dso__short_name_len(dso); if (strncmp(short_name + 1, args->module, short_name_len - 2) == 0 && args->module[short_name_len - 2] == '\0') { @@ -202,10 +202,9 @@ struct map *get_target_map(const char *target, struct nsinfo *nsi, bool user) map = dso__new_map(target); dso = map ? map__dso(map) : NULL; if (dso) { - mutex_lock(&dso->lock); - nsinfo__put(dso->nsinfo); - dso->nsinfo = nsinfo__get(nsi); - mutex_unlock(&dso->lock); + mutex_lock(dso__lock(dso)); + dso__set_nsinfo(dso, nsinfo__get(nsi)); + mutex_unlock(dso__lock(dso)); } return map; } else { @@ -368,11 +367,11 @@ static int kernel_get_module_dso(const char *module, struct dso **pdso) map = machine__kernel_map(host_machine); dso = map__dso(map); - if (!dso->has_build_id) + if (!dso__has_build_id(dso)) dso__read_running_kernel_build_id(dso, host_machine); vmlinux_name = symbol_conf.vmlinux_name; - dso->load_errno = 0; + *dso__load_errno(dso) = 0; if (vmlinux_name) ret = dso__load_vmlinux(dso, map, vmlinux_name, false); else @@ -499,7 +498,7 @@ static struct debuginfo *open_from_debuginfod(struct dso *dso, struct nsinfo *ns if (!c) return NULL; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); fd = debuginfod_find_debuginfo(c, (const unsigned char *)sbuild_id, 0, &path); if (fd >= 0) @@ -542,7 +541,7 @@ static struct debuginfo *open_debuginfo(const char *module, struct nsinfo *nsi, if (!module || !strchr(module, '/')) { err = kernel_get_module_dso(module, &dso); if (err < 0) { - if (!dso || dso->load_errno == 0) { + if (!dso || *dso__load_errno(dso) == 0) { if (!str_error_r(-err, reason, STRERR_BUFSIZE)) strcpy(reason, "(unknown)"); } else @@ -559,7 +558,7 @@ static struct debuginfo *open_debuginfo(const char *module, struct nsinfo *nsi, } return NULL; } - path = dso->long_name; + path = dso__long_name(dso); } nsinfo__mountns_enter(nsi, &nsc); ret = debuginfo__new(path); @@ -3795,8 +3794,8 @@ int show_available_funcs(const char *target, struct nsinfo *nsi, /* Show all (filtered) symbols */ setup_pager(); - for (size_t i = 0; i < dso->symbol_names_len; i++) { - struct symbol *pos = dso->symbol_names[i]; + for (size_t i = 0; i < dso__symbol_names_len(dso); i++) { + struct symbol *pos = dso__symbol_names(dso)[i]; if (strfilter__compare(_filter, pos->name)) printf("%s\n", pos->name); diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index b072ac5d3bc2..e16257d5ab2c 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -320,10 +320,10 @@ static SV *perl_process_callchain(struct perf_sample *sample, const char *dsoname = "[unknown]"; if (dso) { - if (symbol_conf.show_kernel_path && dso->long_name) - dsoname = dso->long_name; + if (symbol_conf.show_kernel_path && dso__long_name(dso)) + dsoname = dso__long_name(dso); else - dsoname = dso->name; + dsoname = dso__name(dso); } if (!hv_stores(elem, "dso", newSVpv(dsoname,0))) { hv_undef(elem); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 8aa301948de5..c2caa5720299 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -393,10 +393,10 @@ static const char *get_dsoname(struct map *map) struct dso *dso = map ? map__dso(map) : NULL; if (dso) { - if (symbol_conf.show_kernel_path && dso->long_name) - dsoname = dso->long_name; + if (symbol_conf.show_kernel_path && dso__long_name(dso)) + dsoname = dso__long_name(dso); else - dsoname = dso->name; + dsoname = dso__name(dso); } return dsoname; @@ -799,8 +799,9 @@ static void set_sym_in_dict(PyObject *dict, struct addr_location *al, if (al->map) { struct dso *dso = map__dso(al->map); - pydict_set_item_string_decref(dict, dso_field, _PyUnicode_FromString(dso->name)); - build_id__sprintf(&dso->bid, sbuild_id); + pydict_set_item_string_decref(dict, dso_field, + _PyUnicode_FromString(dso__name(dso))); + build_id__sprintf(dso__bid(dso), sbuild_id); pydict_set_item_string_decref(dict, dso_bid_field, _PyUnicode_FromString(sbuild_id)); pydict_set_item_string_decref(dict, dso_map_start, @@ -1246,14 +1247,14 @@ static int python_export_dso(struct db_export *dbe, struct dso *dso, char sbuild_id[SBUILD_ID_SIZE]; PyObject *t; - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); t = tuple_new(5); - tuple_set_d64(t, 0, dso->db_id); + tuple_set_d64(t, 0, dso__db_id(dso)); tuple_set_d64(t, 1, machine->db_id); - tuple_set_string(t, 2, dso->short_name); - tuple_set_string(t, 3, dso->long_name); + tuple_set_string(t, 2, dso__short_name(dso)); + tuple_set_string(t, 3, dso__long_name(dso)); tuple_set_string(t, 4, sbuild_id); call_object(tables->dso_handler, t, "dso_table"); @@ -1273,7 +1274,7 @@ static int python_export_symbol(struct db_export *dbe, struct symbol *sym, t = tuple_new(6); tuple_set_d64(t, 0, *sym_db_id); - tuple_set_d64(t, 1, dso->db_id); + tuple_set_d64(t, 1, dso__db_id(dso)); tuple_set_d64(t, 2, sym->start); tuple_set_d64(t, 3, sym->end); tuple_set_s32(t, 4, sym->binding); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index add8601c57fd..704664e5b4ea 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -239,11 +239,11 @@ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r) return cmp_null(dso_r, dso_l); if (verbose > 0) { - dso_name_l = dso_l->long_name; - dso_name_r = dso_r->long_name; + dso_name_l = dso__long_name(dso_l); + dso_name_r = dso__long_name(dso_r); } else { - dso_name_l = dso_l->short_name; - dso_name_r = dso_r->short_name; + dso_name_l = dso__short_name(dso_l); + dso_name_r = dso__short_name(dso_r); } return strcmp(dso_name_l, dso_name_r); @@ -262,7 +262,7 @@ static int _hist_entry__dso_snprintf(struct map *map, char *bf, const char *dso_name = "[unknown]"; if (dso) - dso_name = verbose > 0 ? dso->long_name : dso->short_name; + dso_name = verbose > 0 ? dso__long_name(dso) : dso__short_name(dso); return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name); } @@ -364,7 +364,7 @@ static int _hist_entry__sym_snprintf(struct map_symbol *ms, char o = dso ? dso__symtab_origin(dso) : '!'; u64 rip = ip; - if (dso && dso->kernel && dso->adjust_symbols) + if (dso && dso__kernel(dso) && dso__adjust_symbols(dso)) rip = map__unmap_ip(map, ip); ret += repsep_snprintf(bf, size, "%-#*llx %c ", @@ -1586,8 +1586,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) */ if ((left->cpumode != PERF_RECORD_MISC_KERNEL) && - (!(map__flags(l_map) & MAP_SHARED)) && !l_dso->id.maj && !l_dso->id.min && - !l_dso->id.ino && !l_dso->id.ino_generation) { + (!(map__flags(l_map) & MAP_SHARED)) && !dso__id(l_dso)->maj && !dso__id(l_dso)->min && + !dso__id(l_dso)->ino && !dso__id(l_dso)->ino_generation) { /* userspace anonymous */ if (thread__pid(left->thread) > thread__pid(right->thread)) @@ -1626,7 +1626,8 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf, if ((he->cpumode != PERF_RECORD_MISC_KERNEL) && map && !(map__prot(map) & PROT_EXEC) && (map__flags(map) & MAP_SHARED) && - (dso->id.maj || dso->id.min || dso->id.ino || dso->id.ino_generation)) + (dso__id(dso)->maj || dso__id(dso)->min || dso__id(dso)->ino || + dso__id(dso)->ino_generation)) level = 's'; else if (!map) level = 'X'; diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 7addc34afcf5..9d670d8c1c08 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -27,14 +27,14 @@ bool srcline_full_filename; char *srcline__unknown = (char *)"??:0"; -static const char *dso__name(struct dso *dso) +static const char *srcline_dso_name(struct dso *dso) { const char *dso_name; - if (dso->symsrc_filename) - dso_name = dso->symsrc_filename; + if (dso__symsrc_filename(dso)) + dso_name = dso__symsrc_filename(dso); else - dso_name = dso->long_name; + dso_name = dso__long_name(dso); if (dso_name[0] == '[') return NULL; @@ -638,7 +638,7 @@ static int addr2line(const char *dso_name, u64 addr, struct inline_node *node, struct symbol *sym __maybe_unused) { - struct child_process *a2l = dso->a2l; + struct child_process *a2l = dso__a2l(dso); char *record_function = NULL; char *record_filename = NULL; unsigned int record_line_nr = 0; @@ -655,8 +655,9 @@ static int addr2line(const char *dso_name, u64 addr, if (!filename__has_section(dso_name, ".debug_line")) goto out; - dso->a2l = addr2line_subprocess_init(symbol_conf.addr2line_path, dso_name); - a2l = dso->a2l; + dso__set_a2l(dso, + addr2line_subprocess_init(symbol_conf.addr2line_path, dso_name)); + a2l = dso__a2l(dso); } if (a2l == NULL) { @@ -770,7 +771,7 @@ out: free(record_function); free(record_filename); if (io.eof) { - dso->a2l = NULL; + dso__set_a2l(dso, NULL); addr2line_subprocess_cleanup(a2l); } return ret; @@ -778,14 +779,14 @@ out: void dso__free_a2l(struct dso *dso) { - struct child_process *a2l = dso->a2l; + struct child_process *a2l = dso__a2l(dso); if (!a2l) return; addr2line_subprocess_cleanup(a2l); - dso->a2l = NULL; + dso__set_a2l(dso, NULL); } #endif /* HAVE_LIBBFD_SUPPORT */ @@ -823,33 +824,34 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, char *srcline; const char *dso_name; - if (!dso->has_srcline) + if (!dso__has_srcline(dso)) goto out; - dso_name = dso__name(dso); + dso_name = srcline_dso_name(dso); if (dso_name == NULL) - goto out; + goto out_err; if (!addr2line(dso_name, addr, &file, &line, dso, unwind_inlines, NULL, sym)) - goto out; + goto out_err; srcline = srcline_from_fileline(file, line); free(file); if (!srcline) - goto out; + goto out_err; - dso->a2l_fails = 0; + dso__set_a2l_fails(dso, 0); return srcline; -out: - if (dso->a2l_fails && ++dso->a2l_fails > A2L_FAIL_LIMIT) { - dso->has_srcline = 0; +out_err: + dso__set_a2l_fails(dso, dso__a2l_fails(dso) + 1); + if (dso__a2l_fails(dso) > A2L_FAIL_LIMIT) { + dso__set_has_srcline(dso, false); dso__free_a2l(dso); } - +out: if (!show_addr) return (show_sym && sym) ? strndup(sym->name, sym->namelen) : SRCLINE_UNKNOWN; @@ -858,7 +860,7 @@ out: if (asprintf(&srcline, "%s+%" PRIu64, show_sym ? sym->name : "", ip - sym->start) < 0) return SRCLINE_UNKNOWN; - } else if (asprintf(&srcline, "%s[%" PRIx64 "]", dso->short_name, addr) < 0) + } else if (asprintf(&srcline, "%s[%" PRIx64 "]", dso__short_name(dso), addr) < 0) return SRCLINE_UNKNOWN; return srcline; } @@ -869,22 +871,23 @@ char *get_srcline_split(struct dso *dso, u64 addr, unsigned *line) char *file = NULL; const char *dso_name; - if (!dso->has_srcline) - goto out; + if (!dso__has_srcline(dso)) + return NULL; - dso_name = dso__name(dso); + dso_name = srcline_dso_name(dso); if (dso_name == NULL) - goto out; + goto out_err; if (!addr2line(dso_name, addr, &file, line, dso, true, NULL, NULL)) - goto out; + goto out_err; - dso->a2l_fails = 0; + dso__set_a2l_fails(dso, 0); return file; -out: - if (dso->a2l_fails && ++dso->a2l_fails > A2L_FAIL_LIMIT) { - dso->has_srcline = 0; +out_err: + dso__set_a2l_fails(dso, dso__a2l_fails(dso) + 1); + if (dso__a2l_fails(dso) > A2L_FAIL_LIMIT) { + dso__set_has_srcline(dso, false); dso__free_a2l(dso); } @@ -982,7 +985,7 @@ struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr, { const char *dso_name; - dso_name = dso__name(dso); + dso_name = srcline_dso_name(dso); if (dso_name == NULL) return NULL; diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 0b91f813c4fa..3be5e8d1e278 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -174,7 +174,7 @@ static inline bool elf_sec__is_data(const GElf_Shdr *shdr, static bool elf_sec__filter(GElf_Shdr *shdr, Elf_Data *secstrs) { - return elf_sec__is_text(shdr, secstrs) || + return elf_sec__is_text(shdr, secstrs) || elf_sec__is_data(shdr, secstrs); } @@ -312,8 +312,8 @@ static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name) * DWARF DW_compile_unit has this, but we don't always have access * to it... */ - if (!want_demangle(dso->kernel || kmodule)) - return demangled; + if (!want_demangle(dso__kernel(dso) || kmodule)) + return demangled; demangled = cxx_demangle_sym(elf_name, verbose > 0, verbose > 0); if (demangled == NULL) { @@ -470,7 +470,7 @@ static bool get_plt_sizes(struct dso *dso, GElf_Ehdr *ehdr, GElf_Shdr *shdr_plt, } if (*plt_entry_size) return true; - pr_debug("Missing PLT entry size for %s\n", dso->long_name); + pr_debug("Missing PLT entry size for %s\n", dso__long_name(dso)); return false; } @@ -654,7 +654,7 @@ static int dso__synthesize_plt_got_symbols(struct dso *dso, Elf *elf, sym = symbol__new(shdr.sh_offset + i, shdr.sh_entsize, STB_GLOBAL, STT_FUNC, buf); if (!sym) goto out; - symbols__insert(&dso->symbols, sym); + symbols__insert(dso__symbols(dso), sym); } err = 0; out: @@ -708,7 +708,7 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss) plt_sym = symbol__new(shdr_plt.sh_offset, plt_header_size, STB_GLOBAL, STT_FUNC, ".plt"); if (!plt_sym) goto out_elf_end; - symbols__insert(&dso->symbols, plt_sym); + symbols__insert(dso__symbols(dso), plt_sym); /* Only x86 has .plt.got */ if (machine_is_x86(ehdr.e_machine) && @@ -830,7 +830,7 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss) goto out_elf_end; plt_offset += plt_entry_size; - symbols__insert(&dso->symbols, f); + symbols__insert(dso__symbols(dso), f); ++nr; } @@ -840,7 +840,7 @@ out_elf_end: if (err == 0) return nr; pr_debug("%s: problems reading %s PLT info.\n", - __func__, dso->long_name); + __func__, dso__long_name(dso)); return 0; } @@ -1175,19 +1175,19 @@ static int dso__swap_init(struct dso *dso, unsigned char eidata) { static unsigned int const endian = 1; - dso->needs_swap = DSO_SWAP__NO; + dso__set_needs_swap(dso, DSO_SWAP__NO); switch (eidata) { case ELFDATA2LSB: /* We are big endian, DSO is little endian. */ if (*(unsigned char const *)&endian != 1) - dso->needs_swap = DSO_SWAP__YES; + dso__set_needs_swap(dso, DSO_SWAP__YES); break; case ELFDATA2MSB: /* We are little endian, DSO is big endian. */ if (*(unsigned char const *)&endian != 0) - dso->needs_swap = DSO_SWAP__YES; + dso__set_needs_swap(dso, DSO_SWAP__YES); break; default: @@ -1238,11 +1238,11 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, if (fd < 0) return -1; - type = dso->symtab_type; + type = dso__symtab_type(dso); } else { fd = open(name, O_RDONLY); if (fd < 0) { - dso->load_errno = errno; + *dso__load_errno(dso) = errno; return -1; } } @@ -1250,37 +1250,37 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); if (elf == NULL) { pr_debug("%s: cannot read %s ELF file.\n", __func__, name); - dso->load_errno = DSO_LOAD_ERRNO__INVALID_ELF; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__INVALID_ELF; goto out_close; } if (gelf_getehdr(elf, &ehdr) == NULL) { - dso->load_errno = DSO_LOAD_ERRNO__INVALID_ELF; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__INVALID_ELF; pr_debug("%s: cannot get elf header.\n", __func__); goto out_elf_end; } if (dso__swap_init(dso, ehdr.e_ident[EI_DATA])) { - dso->load_errno = DSO_LOAD_ERRNO__INTERNAL_ERROR; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__INTERNAL_ERROR; goto out_elf_end; } /* Always reject images with a mismatched build-id: */ - if (dso->has_build_id && !symbol_conf.ignore_vmlinux_buildid) { + if (dso__has_build_id(dso) && !symbol_conf.ignore_vmlinux_buildid) { u8 build_id[BUILD_ID_SIZE]; struct build_id bid; int size; size = elf_read_build_id(elf, build_id, BUILD_ID_SIZE); if (size <= 0) { - dso->load_errno = DSO_LOAD_ERRNO__CANNOT_READ_BUILDID; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__CANNOT_READ_BUILDID; goto out_elf_end; } build_id__init(&bid, build_id, size); if (!dso__build_id_equal(dso, &bid)) { pr_debug("%s: build id mismatch for %s.\n", __func__, name); - dso->load_errno = DSO_LOAD_ERRNO__MISMATCHING_BUILDID; + *dso__load_errno(dso) = DSO_LOAD_ERRNO__MISMATCHING_BUILDID; goto out_elf_end; } } @@ -1305,14 +1305,14 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, if (ss->opdshdr.sh_type != SHT_PROGBITS) ss->opdsec = NULL; - if (dso->kernel == DSO_SPACE__USER) + if (dso__kernel(dso) == DSO_SPACE__USER) ss->adjust_symbols = true; else ss->adjust_symbols = elf__needs_adjust_symbols(ehdr); ss->name = strdup(name); if (!ss->name) { - dso->load_errno = errno; + *dso__load_errno(dso) = errno; goto out_elf_end; } @@ -1432,7 +1432,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, if (adjust_kernel_syms) sym->st_value -= shdr->sh_addr - shdr->sh_offset; - if (strcmp(section_name, (curr_dso->short_name + dso->short_name_len)) == 0) + if (strcmp(section_name, (dso__short_name(curr_dso) + dso__short_name_len(dso))) == 0) return 0; if (strcmp(section_name, ".text") == 0) { @@ -1441,7 +1441,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, * kallsyms and identity maps. Overwrite it to * map to the kernel dso. */ - if (*remap_kernel && dso->kernel && !kmodule) { + if (*remap_kernel && dso__kernel(dso) && !kmodule) { *remap_kernel = false; map__set_start(map, shdr->sh_addr + ref_reloc(kmap)); map__set_end(map, map__start(map) + shdr->sh_size); @@ -1489,7 +1489,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, return 0; } - snprintf(dso_name, sizeof(dso_name), "%s%s", dso->short_name, section_name); + snprintf(dso_name, sizeof(dso_name), "%s%s", dso__short_name(dso), section_name); curr_map = maps__find_by_name(kmaps, dso_name); if (curr_map == NULL) { @@ -1501,17 +1501,17 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, curr_dso = dso__new(dso_name); if (curr_dso == NULL) return -1; - curr_dso->kernel = dso->kernel; - curr_dso->long_name = dso->long_name; - curr_dso->long_name_len = dso->long_name_len; - curr_dso->binary_type = dso->binary_type; - curr_dso->adjust_symbols = dso->adjust_symbols; + dso__set_kernel(curr_dso, dso__kernel(dso)); + RC_CHK_ACCESS(curr_dso)->long_name = dso__long_name(dso); + RC_CHK_ACCESS(curr_dso)->long_name_len = dso__long_name_len(dso); + dso__set_binary_type(curr_dso, dso__binary_type(dso)); + dso__set_adjust_symbols(curr_dso, dso__adjust_symbols(dso)); curr_map = map__new2(start, curr_dso); dso__put(curr_dso); if (curr_map == NULL) return -1; - if (curr_dso->kernel) + if (dso__kernel(curr_dso)) map__kmap(curr_map)->kmaps = kmaps; if (adjust_kernel_syms) { @@ -1521,7 +1521,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, } else { map__set_mapping_type(curr_map, MAPPING_TYPE__IDENTITY); } - curr_dso->symtab_type = dso->symtab_type; + dso__set_symtab_type(curr_dso, dso__symtab_type(dso)); if (maps__insert(kmaps, curr_map)) return -1; /* @@ -1547,7 +1547,7 @@ static int dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, int kmodule, int dynsym) { - struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL; + struct kmap *kmap = dso__kernel(dso) ? map__kmap(map) : NULL; struct maps *kmaps = kmap ? map__kmaps(map) : NULL; struct map *curr_map = map; struct dso *curr_dso = dso; @@ -1581,8 +1581,8 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, if (elf_section_by_name(runtime_ss->elf, &runtime_ss->ehdr, &tshdr, ".text", NULL)) { - dso->text_offset = tshdr.sh_addr - tshdr.sh_offset; - dso->text_end = tshdr.sh_offset + tshdr.sh_size; + dso__set_text_offset(dso, tshdr.sh_addr - tshdr.sh_offset); + dso__set_text_end(dso, tshdr.sh_offset + tshdr.sh_size); } if (runtime_ss->opdsec) @@ -1641,16 +1641,16 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, * attempted to prelink vdso to its virtual address. */ if (dso__is_vdso(dso)) - map__set_reloc(map, map__start(map) - dso->text_offset); + map__set_reloc(map, map__start(map) - dso__text_offset(dso)); - dso->adjust_symbols = runtime_ss->adjust_symbols || ref_reloc(kmap); + dso__set_adjust_symbols(dso, runtime_ss->adjust_symbols || ref_reloc(kmap)); /* * Initial kernel and module mappings do not map to the dso. * Flag the fixups. */ - if (dso->kernel) { + if (dso__kernel(dso)) { remap_kernel = true; - adjust_kernel_syms = dso->adjust_symbols; + adjust_kernel_syms = dso__adjust_symbols(dso); } if (kmodule && adjust_kernel_syms) @@ -1743,7 +1743,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, (sym.st_value & 1)) --sym.st_value; - if (dso->kernel) { + if (dso__kernel(dso)) { if (dso__process_kernel_symbol(dso, map, &sym, &shdr, kmaps, kmap, &curr_dso, &curr_map, section_name, adjust_kernel_syms, kmodule, &remap_kernel, max_text_sh_offset)) @@ -1792,7 +1792,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, arch__sym_update(f, &sym); - __symbols__insert(&curr_dso->symbols, f, dso->kernel); + __symbols__insert(dso__symbols(curr_dso), f, dso__kernel(dso)); nr++; } @@ -1800,8 +1800,8 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, * For misannotated, zeroed, ASM function sizes. */ if (nr > 0) { - symbols__fixup_end(&dso->symbols, false); - symbols__fixup_duplicate(&dso->symbols); + symbols__fixup_end(dso__symbols(dso), false); + symbols__fixup_duplicate(dso__symbols(dso)); if (kmap) { /* * We need to fixup this here too because we create new @@ -1821,16 +1821,16 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, int nr = 0; int err = -1; - dso->symtab_type = syms_ss->type; - dso->is_64_bit = syms_ss->is_64_bit; - dso->rel = syms_ss->ehdr.e_type == ET_REL; + dso__set_symtab_type(dso, syms_ss->type); + dso__set_is_64_bit(dso, syms_ss->is_64_bit); + dso__set_rel(dso, syms_ss->ehdr.e_type == ET_REL); /* * Modules may already have symbols from kallsyms, but those symbols * have the wrong values for the dso maps, so remove them. */ if (kmodule && syms_ss->symtab) - symbols__delete(&dso->symbols); + symbols__delete(dso__symbols(dso)); if (!syms_ss->symtab) { /* @@ -1838,7 +1838,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, * to using kallsyms. The vmlinux runtime symbols aren't * of much use. */ - if (dso->kernel) + if (dso__kernel(dso)) return err; } else { err = dso__load_sym_internal(dso, map, syms_ss, runtime_ss, diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 1da8b713509c..c6f369b5d893 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -273,7 +273,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, out_close: close(fd); out_errno: - dso->load_errno = errno; + RC_CHK_ACCESS(dso)->load_errno = errno; return -1; } @@ -348,7 +348,7 @@ int dso__load_sym(struct dso *dso, struct map *map __maybe_unused, ret = fd__is_64_bit(ss->fd); if (ret >= 0) - dso->is_64_bit = ret; + RC_CHK_ACCESS(dso)->is_64_bit = ret; if (filename__read_build_id(ss->name, &bid) > 0) dso__set_build_id(dso, &bid); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 9ebdb8e13c0b..7772a4d3e66c 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -532,52 +532,52 @@ static struct symbol *symbols__find_by_name(struct symbol *symbols[], void dso__reset_find_symbol_cache(struct dso *dso) { - dso->last_find_result.addr = 0; - dso->last_find_result.symbol = NULL; + dso__set_last_find_result_addr(dso, 0); + dso__set_last_find_result_symbol(dso, NULL); } void dso__insert_symbol(struct dso *dso, struct symbol *sym) { - __symbols__insert(&dso->symbols, sym, dso->kernel); + __symbols__insert(dso__symbols(dso), sym, dso__kernel(dso)); /* update the symbol cache if necessary */ - if (dso->last_find_result.addr >= sym->start && - (dso->last_find_result.addr < sym->end || + if (dso__last_find_result_addr(dso) >= sym->start && + (dso__last_find_result_addr(dso) < sym->end || sym->start == sym->end)) { - dso->last_find_result.symbol = sym; + dso__set_last_find_result_symbol(dso, sym); } } void dso__delete_symbol(struct dso *dso, struct symbol *sym) { - rb_erase_cached(&sym->rb_node, &dso->symbols); + rb_erase_cached(&sym->rb_node, dso__symbols(dso)); symbol__delete(sym); dso__reset_find_symbol_cache(dso); } struct symbol *dso__find_symbol(struct dso *dso, u64 addr) { - if (dso->last_find_result.addr != addr || dso->last_find_result.symbol == NULL) { - dso->last_find_result.addr = addr; - dso->last_find_result.symbol = symbols__find(&dso->symbols, addr); + if (dso__last_find_result_addr(dso) != addr || dso__last_find_result_symbol(dso) == NULL) { + dso__set_last_find_result_addr(dso, addr); + dso__set_last_find_result_symbol(dso, symbols__find(dso__symbols(dso), addr)); } - return dso->last_find_result.symbol; + return dso__last_find_result_symbol(dso); } struct symbol *dso__find_symbol_nocache(struct dso *dso, u64 addr) { - return symbols__find(&dso->symbols, addr); + return symbols__find(dso__symbols(dso), addr); } struct symbol *dso__first_symbol(struct dso *dso) { - return symbols__first(&dso->symbols); + return symbols__first(dso__symbols(dso)); } struct symbol *dso__last_symbol(struct dso *dso) { - return symbols__last(&dso->symbols); + return symbols__last(dso__symbols(dso)); } struct symbol *dso__next_symbol(struct symbol *sym) @@ -587,11 +587,11 @@ struct symbol *dso__next_symbol(struct symbol *sym) struct symbol *dso__next_symbol_by_name(struct dso *dso, size_t *idx) { - if (*idx + 1 >= dso->symbol_names_len) + if (*idx + 1 >= dso__symbol_names_len(dso)) return NULL; ++*idx; - return dso->symbol_names[*idx]; + return dso__symbol_names(dso)[*idx]; } /* @@ -599,27 +599,29 @@ struct symbol *dso__next_symbol_by_name(struct dso *dso, size_t *idx) */ struct symbol *dso__find_symbol_by_name(struct dso *dso, const char *name, size_t *idx) { - struct symbol *s = symbols__find_by_name(dso->symbol_names, dso->symbol_names_len, - name, SYMBOL_TAG_INCLUDE__NONE, idx); - if (!s) - s = symbols__find_by_name(dso->symbol_names, dso->symbol_names_len, - name, SYMBOL_TAG_INCLUDE__DEFAULT_ONLY, idx); + struct symbol *s = symbols__find_by_name(dso__symbol_names(dso), + dso__symbol_names_len(dso), + name, SYMBOL_TAG_INCLUDE__NONE, idx); + if (!s) { + s = symbols__find_by_name(dso__symbol_names(dso), dso__symbol_names_len(dso), + name, SYMBOL_TAG_INCLUDE__DEFAULT_ONLY, idx); + } return s; } void dso__sort_by_name(struct dso *dso) { - mutex_lock(&dso->lock); + mutex_lock(dso__lock(dso)); if (!dso__sorted_by_name(dso)) { size_t len; - dso->symbol_names = symbols__sort_by_name(&dso->symbols, &len); - if (dso->symbol_names) { - dso->symbol_names_len = len; + dso__set_symbol_names(dso, symbols__sort_by_name(dso__symbols(dso), &len)); + if (dso__symbol_names(dso)) { + dso__set_symbol_names_len(dso, len); dso__set_sorted_by_name(dso); } } - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); } /* @@ -746,7 +748,7 @@ static int map__process_kallsym_symbol(void *arg, const char *name, { struct symbol *sym; struct dso *dso = arg; - struct rb_root_cached *root = &dso->symbols; + struct rb_root_cached *root = dso__symbols(dso); if (!symbol_type__filter(type)) return 0; @@ -786,8 +788,8 @@ static int maps__split_kallsyms_for_kcore(struct maps *kmaps, struct dso *dso) { struct symbol *pos; int count = 0; - struct rb_root_cached old_root = dso->symbols; - struct rb_root_cached *root = &dso->symbols; + struct rb_root_cached *root = dso__symbols(dso); + struct rb_root_cached old_root = *root; struct rb_node *next = rb_first_cached(root); if (!kmaps) @@ -821,13 +823,13 @@ static int maps__split_kallsyms_for_kcore(struct maps *kmaps, struct dso *dso) pos->end = map__end(curr_map); if (pos->end) pos->end -= map__start(curr_map) - map__pgoff(curr_map); - symbols__insert(&curr_map_dso->symbols, pos); + symbols__insert(dso__symbols(curr_map_dso), pos); ++count; map__put(curr_map); } /* Symbols have been adjusted */ - dso->adjust_symbols = 1; + dso__set_adjust_symbols(dso, true); return count; } @@ -844,7 +846,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, struct map *curr_map = map__get(initial_map); struct symbol *pos; int count = 0, moved = 0; - struct rb_root_cached *root = &dso->symbols; + struct rb_root_cached *root = dso__symbols(dso); struct rb_node *next = rb_first_cached(root); int kernel_range = 0; bool x86_64; @@ -871,9 +873,9 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, *module++ = '\0'; curr_map_dso = map__dso(curr_map); - if (strcmp(curr_map_dso->short_name, module)) { + if (strcmp(dso__short_name(curr_map_dso), module)) { if (!RC_CHK_EQUAL(curr_map, initial_map) && - dso->kernel == DSO_SPACE__KERNEL_GUEST && + dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST && machine__is_default_guest(machine)) { /* * We assume all symbols of a module are @@ -896,7 +898,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, goto discard_symbol; } curr_map_dso = map__dso(curr_map); - if (curr_map_dso->loaded && + if (dso__loaded(curr_map_dso) && !machine__is_default_guest(machine)) goto discard_symbol; } @@ -932,7 +934,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, goto add_symbol; } - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) snprintf(dso_name, sizeof(dso_name), "[guest.kernel].%d", kernel_range++); @@ -946,7 +948,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, if (ndso == NULL) return -1; - ndso->kernel = dso->kernel; + dso__set_kernel(ndso, dso__kernel(dso)); curr_map = map__new2(pos->start, ndso); if (curr_map == NULL) { @@ -971,7 +973,7 @@ add_symbol: struct dso *curr_map_dso = map__dso(curr_map); rb_erase_cached(&pos->rb_node, root); - symbols__insert(&curr_map_dso->symbols, pos); + symbols__insert(dso__symbols(curr_map_dso), pos); ++moved; } else ++count; @@ -983,7 +985,7 @@ discard_symbol: } if (!RC_CHK_EQUAL(curr_map, initial_map) && - dso->kernel == DSO_SPACE__KERNEL_GUEST && + dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST && machine__is_default_guest(maps__machine(kmaps))) { dso__set_loaded(map__dso(curr_map)); } @@ -1157,7 +1159,7 @@ static int do_validate_kcore_modules_cb(struct map *old_map, void *data) dso = map__dso(old_map); /* Module must be in memory at the same address */ - mi = find_module(dso->short_name, modules); + mi = find_module(dso__short_name(dso), modules); if (!mi || mi->start != map__start(old_map)) return -EINVAL; @@ -1326,7 +1328,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map, &is_64_bit); if (err) goto out_err; - dso->is_64_bit = is_64_bit; + dso__set_is_64_bit(dso, is_64_bit); if (list_empty(&md.maps)) { err = -EINVAL; @@ -1418,10 +1420,10 @@ static int dso__load_kcore(struct dso *dso, struct map *map, * Set the data type and long name so that kcore can be read via * dso__data_read_addr(). */ - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) - dso->binary_type = DSO_BINARY_TYPE__GUEST_KCORE; + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) + dso__set_binary_type(dso, DSO_BINARY_TYPE__GUEST_KCORE); else - dso->binary_type = DSO_BINARY_TYPE__KCORE; + dso__set_binary_type(dso, DSO_BINARY_TYPE__KCORE); dso__set_long_name(dso, strdup(kcore_filename), true); close(fd); @@ -1482,13 +1484,13 @@ int __dso__load_kallsyms(struct dso *dso, const char *filename, if (kallsyms__delta(kmap, filename, &delta)) return -1; - symbols__fixup_end(&dso->symbols, true); - symbols__fixup_duplicate(&dso->symbols); + symbols__fixup_end(dso__symbols(dso), true); + symbols__fixup_duplicate(dso__symbols(dso)); - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) - dso->symtab_type = DSO_BINARY_TYPE__GUEST_KALLSYMS; + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) + dso__set_symtab_type(dso, DSO_BINARY_TYPE__GUEST_KALLSYMS); else - dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS; + dso__set_symtab_type(dso, DSO_BINARY_TYPE__KALLSYMS); if (!no_kcore && !dso__load_kcore(dso, map, filename)) return maps__split_kallsyms_for_kcore(kmap->kmaps, dso); @@ -1544,7 +1546,7 @@ static int dso__load_perf_map(const char *map_path, struct dso *dso) if (sym == NULL) goto out_delete_line; - symbols__insert(&dso->symbols, sym); + symbols__insert(dso__symbols(dso), sym); nr_syms++; } @@ -1670,15 +1672,15 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) if (!symbol) goto out_free; - symbols__insert(&dso->symbols, symbol); + symbols__insert(dso__symbols(dso), symbol); } #ifdef bfd_get_section #undef bfd_asymbol_section #endif - symbols__fixup_end(&dso->symbols, false); - symbols__fixup_duplicate(&dso->symbols); - dso->adjust_symbols = 1; + symbols__fixup_end(dso__symbols(dso), false); + symbols__fixup_duplicate(dso__symbols(dso)); + dso__set_adjust_symbols(dso, true); err = 0; out_free: @@ -1701,17 +1703,17 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, case DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO: case DSO_BINARY_TYPE__BUILDID_DEBUGINFO: case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO: - return !kmod && dso->kernel == DSO_SPACE__USER; + return !kmod && dso__kernel(dso) == DSO_SPACE__USER; case DSO_BINARY_TYPE__KALLSYMS: case DSO_BINARY_TYPE__VMLINUX: case DSO_BINARY_TYPE__KCORE: - return dso->kernel == DSO_SPACE__KERNEL; + return dso__kernel(dso) == DSO_SPACE__KERNEL; case DSO_BINARY_TYPE__GUEST_KALLSYMS: case DSO_BINARY_TYPE__GUEST_VMLINUX: case DSO_BINARY_TYPE__GUEST_KCORE: - return dso->kernel == DSO_SPACE__KERNEL_GUEST; + return dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST; case DSO_BINARY_TYPE__GUEST_KMODULE: case DSO_BINARY_TYPE__GUEST_KMODULE_COMP: @@ -1721,7 +1723,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, * kernel modules know their symtab type - it's set when * creating a module dso in machine__addnew_module_map(). */ - return kmod && dso->symtab_type == type; + return kmod && dso__symtab_type(dso) == type; case DSO_BINARY_TYPE__BUILD_ID_CACHE: case DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO: @@ -1789,18 +1791,19 @@ int dso__load(struct dso *dso, struct map *map) struct build_id bid; struct nscookie nsc; char newmapname[PATH_MAX]; - const char *map_path = dso->long_name; + const char *map_path = dso__long_name(dso); - mutex_lock(&dso->lock); - perfmap = strncmp(dso->name, "/tmp/perf-", 10) == 0; + mutex_lock(dso__lock(dso)); + perfmap = strncmp(dso__name(dso), "/tmp/perf-", 10) == 0; if (perfmap) { - if (dso->nsinfo && (dso__find_perf_map(newmapname, - sizeof(newmapname), &dso->nsinfo) == 0)) { + if (dso__nsinfo(dso) && + (dso__find_perf_map(newmapname, sizeof(newmapname), + dso__nsinfo_ptr(dso)) == 0)) { map_path = newmapname; } } - nsinfo__mountns_enter(dso->nsinfo, &nsc); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); /* check again under the dso->lock */ if (dso__loaded(dso)) { @@ -1808,15 +1811,15 @@ int dso__load(struct dso *dso, struct map *map) goto out; } - kmod = dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE || - dso->symtab_type == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; + kmod = dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE || + dso__symtab_type(dso) == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE || + dso__symtab_type(dso) == DSO_BINARY_TYPE__GUEST_KMODULE_COMP; - if (dso->kernel && !kmod) { - if (dso->kernel == DSO_SPACE__KERNEL) + if (dso__kernel(dso) && !kmod) { + if (dso__kernel(dso) == DSO_SPACE__KERNEL) ret = dso__load_kernel_sym(dso, map); - else if (dso->kernel == DSO_SPACE__KERNEL_GUEST) + else if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) ret = dso__load_guest_kernel_sym(dso, map); machine = maps__machine(map__kmaps(map)); @@ -1825,12 +1828,13 @@ int dso__load(struct dso *dso, struct map *map) goto out; } - dso->adjust_symbols = 0; + dso__set_adjust_symbols(dso, false); if (perfmap) { ret = dso__load_perf_map(map_path, dso); - dso->symtab_type = ret > 0 ? DSO_BINARY_TYPE__JAVA_JIT : - DSO_BINARY_TYPE__NOT_FOUND; + dso__set_symtab_type(dso, ret > 0 + ? DSO_BINARY_TYPE__JAVA_JIT + : DSO_BINARY_TYPE__NOT_FOUND); goto out; } @@ -1845,9 +1849,9 @@ int dso__load(struct dso *dso, struct map *map) * Read the build id if possible. This is required for * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work */ - if (!dso->has_build_id && - is_regular_file(dso->long_name)) { - __symbol__join_symfs(name, PATH_MAX, dso->long_name); + if (!dso__has_build_id(dso) && + is_regular_file(dso__long_name(dso))) { + __symbol__join_symfs(name, PATH_MAX, dso__long_name(dso)); if (filename__read_build_id(name, &bid) > 0) dso__set_build_id(dso, &bid); } @@ -1881,7 +1885,7 @@ int dso__load(struct dso *dso, struct map *map) nsinfo__mountns_exit(&nsc); is_reg = is_regular_file(name); - if (!is_reg && errno == ENOENT && dso->nsinfo) { + if (!is_reg && errno == ENOENT && dso__nsinfo(dso)) { char *new_name = dso__filename_with_chroot(dso, name); if (new_name) { is_reg = is_regular_file(new_name); @@ -1898,7 +1902,7 @@ int dso__load(struct dso *dso, struct map *map) sirc = symsrc__init(ss, dso, name, symtab_type); if (nsexit) - nsinfo__mountns_enter(dso->nsinfo, &nsc); + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); if (bfdrc == 0) { ret = 0; @@ -1911,8 +1915,8 @@ int dso__load(struct dso *dso, struct map *map) if (!syms_ss && symsrc__has_symtab(ss)) { syms_ss = ss; next_slot = true; - if (!dso->symsrc_filename) - dso->symsrc_filename = strdup(name); + if (!dso__symsrc_filename(dso)) + dso__set_symsrc_filename(dso, strdup(name)); } if (!runtime_ss && symsrc__possibly_runtime(ss)) { @@ -1959,11 +1963,11 @@ int dso__load(struct dso *dso, struct map *map) symsrc__destroy(&ss_[ss_pos - 1]); out_free: free(name); - if (ret < 0 && strstr(dso->name, " (deleted)") != NULL) + if (ret < 0 && strstr(dso__name(dso), " (deleted)") != NULL) ret = 0; out: dso__set_loaded(dso); - mutex_unlock(&dso->lock); + mutex_unlock(dso__lock(dso)); nsinfo__mountns_exit(&nsc); return ret; @@ -1982,7 +1986,7 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, else symbol__join_symfs(symfs_vmlinux, vmlinux); - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) symtab_type = DSO_BINARY_TYPE__GUEST_VMLINUX; else symtab_type = DSO_BINARY_TYPE__VMLINUX; @@ -1995,10 +1999,10 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, * an incorrect long name unless we set it here first. */ dso__set_long_name(dso, vmlinux, vmlinux_allocated); - if (dso->kernel == DSO_SPACE__KERNEL_GUEST) - dso->binary_type = DSO_BINARY_TYPE__GUEST_VMLINUX; + if (dso__kernel(dso) == DSO_SPACE__KERNEL_GUEST) + dso__set_binary_type(dso, DSO_BINARY_TYPE__GUEST_VMLINUX); else - dso->binary_type = DSO_BINARY_TYPE__VMLINUX; + dso__set_binary_type(dso, DSO_BINARY_TYPE__VMLINUX); err = dso__load_sym(dso, map, &ss, &ss, 0); symsrc__destroy(&ss); @@ -2091,7 +2095,7 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map) bool is_host = false; char path[PATH_MAX]; - if (!dso->has_build_id) { + if (!dso__has_build_id(dso)) { /* * Last resort, if we don't have a build-id and couldn't find * any vmlinux file, try the running kernel kallsyms table. @@ -2116,7 +2120,7 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map) goto proc_kallsyms; } - build_id__sprintf(&dso->bid, sbuild_id); + build_id__sprintf(dso__bid(dso), sbuild_id); /* Find kallsyms in build-id cache with kcore */ scnprintf(path, sizeof(path), "%s/%s/%s", @@ -2209,7 +2213,7 @@ do_kallsyms: free(kallsyms_allocated_filename); if (err > 0 && !dso__is_kcore(dso)) { - dso->binary_type = DSO_BINARY_TYPE__KALLSYMS; + dso__set_binary_type(dso, DSO_BINARY_TYPE__KALLSYMS); dso__set_long_name(dso, DSO__NAME_KALLSYMS, false); map__fixup_start(map); map__fixup_end(map); @@ -2252,7 +2256,7 @@ static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map) if (err > 0) pr_debug("Using %s for symbols\n", kallsyms_filename); if (err > 0 && !dso__is_kcore(dso)) { - dso->binary_type = DSO_BINARY_TYPE__GUEST_KALLSYMS; + dso__set_binary_type(dso, DSO_BINARY_TYPE__GUEST_KALLSYMS); dso__set_long_name(dso, machine->mmap_name, false); map__fixup_start(map); map__fixup_end(map); diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c index 088f4abf230f..53e1af4ed9ac 100644 --- a/tools/perf/util/symbol_fprintf.c +++ b/tools/perf/util/symbol_fprintf.c @@ -64,8 +64,8 @@ size_t dso__fprintf_symbols_by_name(struct dso *dso, { size_t ret = 0; - for (size_t i = 0; i < dso->symbol_names_len; i++) { - struct symbol *pos = dso->symbol_names[i]; + for (size_t i = 0; i < dso__symbol_names_len(dso); i++) { + struct symbol *pos = dso__symbol_names(dso)[i]; ret += fprintf(fp, "%s\n", pos->name); } diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 2a0289c14959..5498048f56ea 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -385,8 +385,8 @@ static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event, id.ino_generation = event->ino_generation; dso = dsos__findnew_id(&machine->dsos, event->filename, &id); - if (dso && dso->has_build_id) { - bid = dso->bid; + if (dso && dso__has_build_id(dso)) { + bid = *dso__bid(dso); rc = 0; goto out; } @@ -407,7 +407,7 @@ out: event->__reserved_1 = 0; event->__reserved_2 = 0; - if (dso && !dso->has_build_id) + if (dso && !dso__has_build_id(dso)) dso__set_build_id(dso, &bid); } else { if (event->filename[0] == '/') { @@ -684,7 +684,7 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) dso = map__dso(map); if (symbol_conf.buildid_mmap2) { - size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); + size = PERF_ALIGN(dso__long_name_len(dso) + 1, sizeof(u64)); event->mmap2.header.type = PERF_RECORD_MMAP2; event->mmap2.header.size = (sizeof(event->mmap2) - (sizeof(event->mmap2.filename) - size)); @@ -694,11 +694,11 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) event->mmap2.len = map__size(map); event->mmap2.pid = args->machine->pid; - memcpy(event->mmap2.filename, dso->long_name, dso->long_name_len + 1); + memcpy(event->mmap2.filename, dso__long_name(dso), dso__long_name_len(dso) + 1); perf_record_mmap2__read_build_id(&event->mmap2, args->machine, false); } else { - size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); + size = PERF_ALIGN(dso__long_name_len(dso) + 1, sizeof(u64)); event->mmap.header.type = PERF_RECORD_MMAP; event->mmap.header.size = (sizeof(event->mmap) - (sizeof(event->mmap.filename) - size)); @@ -708,7 +708,7 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) event->mmap.len = map__size(map); event->mmap.pid = args->machine->pid; - memcpy(event->mmap.filename, dso->long_name, dso->long_name_len + 1); + memcpy(event->mmap.filename, dso__long_name(dso), dso__long_name_len(dso) + 1); } if (perf_tool__process_synth_event(args->tool, event, args->machine, args->process) != 0) @@ -2231,20 +2231,20 @@ int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 union perf_event ev; size_t len; - if (!pos->hit) + if (!dso__hit(pos)) return 0; memset(&ev, 0, sizeof(ev)); - len = pos->long_name_len + 1; + len = dso__long_name_len(pos) + 1; len = PERF_ALIGN(len, NAME_ALIGN); - ev.build_id.size = min(pos->bid.size, sizeof(pos->bid.data)); - memcpy(&ev.build_id.build_id, pos->bid.data, ev.build_id.size); + ev.build_id.size = min(dso__bid(pos)->size, sizeof(dso__bid(pos)->data)); + memcpy(&ev.build_id.build_id, dso__bid(pos)->data, ev.build_id.size); ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID; ev.build_id.header.misc = misc | PERF_RECORD_MISC_BUILD_ID_SIZE; ev.build_id.pid = machine->pid; ev.build_id.header.size = sizeof(ev.build_id) + len; - memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); + memcpy(&ev.build_id.filename, dso__long_name(pos), dso__long_name_len(pos)); return process(tool, &ev, NULL, machine); } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 1aa8962dcf52..0a473112f881 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -457,14 +457,14 @@ int thread__memcpy(struct thread *thread, struct machine *machine, dso = map__dso(al.map); - if (!dso || dso->data.status == DSO_DATA_STATUS_ERROR || map__load(al.map) < 0) { + if (!dso || dso__data(dso)->status == DSO_DATA_STATUS_ERROR || map__load(al.map) < 0) { addr_location__exit(&al); return -1; } offset = map__map_ip(al.map, ip); if (is64bit) - *is64bit = dso->is_64_bit; + *is64bit = dso__is_64_bit(dso); addr_location__exit(&al); diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 6a5ac0faa6f4..cde267ea3e99 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -329,27 +329,27 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct unwind_info *ui, }; int ret, fd; - if (dso->data.eh_frame_hdr_offset == 0) { + if (dso__data(dso)->eh_frame_hdr_offset == 0) { fd = dso__data_get_fd(dso, ui->machine); if (fd < 0) return -EINVAL; /* Check the .eh_frame section for unwinding info */ ret = elf_section_address_and_offset(fd, ".eh_frame_hdr", - &dso->data.eh_frame_hdr_addr, - &dso->data.eh_frame_hdr_offset); - dso->data.elf_base_addr = elf_base_address(fd); + &dso__data(dso)->eh_frame_hdr_addr, + &dso__data(dso)->eh_frame_hdr_offset); + dso__data(dso)->elf_base_addr = elf_base_address(fd); dso__data_put_fd(dso); - if (ret || dso->data.eh_frame_hdr_offset == 0) + if (ret || dso__data(dso)->eh_frame_hdr_offset == 0) return -EINVAL; } maps__for_each_map(thread__maps(ui->thread), read_unwind_spec_eh_frame_maps_cb, &args); - args.base_addr -= dso->data.elf_base_addr; + args.base_addr -= dso__data(dso)->elf_base_addr; /* Address of .eh_frame_hdr */ - *segbase = args.base_addr + dso->data.eh_frame_hdr_addr; - ret = unwind_spec_ehframe(dso, ui->machine, dso->data.eh_frame_hdr_offset, + *segbase = args.base_addr + dso__data(dso)->eh_frame_hdr_addr; + ret = unwind_spec_ehframe(dso, ui->machine, dso__data(dso)->eh_frame_hdr_offset, table_data, fde_count); if (ret) return ret; @@ -460,7 +460,7 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, return -EINVAL; } - pr_debug("unwind: find_proc_info dso %s\n", dso->name); + pr_debug("unwind: find_proc_info dso %s\n", dso__name(dso)); /* Check the .eh_frame section for unwinding info */ if (!read_unwind_spec_eh_frame(dso, ui, &table_data, &segbase, &fde_count)) { diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index 2728eb4f13ea..cb8be6acfb6f 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -25,7 +25,7 @@ int unwind__prepare_access(struct maps *maps, struct map *map, bool *initialized return 0; if (maps__addr_space(maps)) { - pr_debug("unwind: thread map already set, dso=%s\n", dso->name); + pr_debug("unwind: thread map already set, dso=%s\n", dso__name(dso)); if (initialized) *initialized = true; return 0; diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index 35532dcbff74..1b6f8f6db7aa 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -148,7 +148,7 @@ static int machine__thread_dso_type_maps_cb(struct map *map, void *data) struct machine__thread_dso_type_maps_cb_args *args = data; struct dso *dso = map__dso(map); - if (!dso || dso->long_name[0] != '/') + if (!dso || dso__long_name(dso)[0] != '/') return 0; args->dso_type = dso__type(dso, args->machine); @@ -361,7 +361,7 @@ struct dso *machine__findnew_vdso(struct machine *machine, bool dso__is_vdso(struct dso *dso) { - return !strcmp(dso->short_name, DSO__NAME_VDSO) || - !strcmp(dso->short_name, DSO__NAME_VDSO32) || - !strcmp(dso->short_name, DSO__NAME_VDSOX32); + return !strcmp(dso__short_name(dso), DSO__NAME_VDSO) || + !strcmp(dso__short_name(dso), DSO__NAME_VDSO32) || + !strcmp(dso__short_name(dso), DSO__NAME_VDSOX32); } -- cgit v1.2.3 From 7fdc33f84261466591fc00e66168d9c809e7e4c0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 6 May 2024 11:01:01 -0700 Subject: perf map: Add missing dso__put() in map__new() A dso__put() is needed for the dsos__find() when the map is created and a buildid is sought. Fixes: f649ed80f3cabbf1 ("perf dsos: Tidy reference counting and locking") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Changbin Du Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tiezhu Yang Link: https://lore.kernel.org/r/20240506180104.485674-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 117c4bb78b35..e1d14936a60d 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -200,6 +200,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, dso__set_build_id(dso, dso__bid(header_bid_dso)); dso__set_header_build_id(dso, 1); } + dso__put(header_bid_dso); } dso__put(dso); } -- cgit v1.2.3 From ee5061f82449f7ac867a39daae64c9f9681156e8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 6 May 2024 11:01:02 -0700 Subject: perf symbol-elf: Ensure dso__put() in machine__process_ksymbol_register() The dso__put() after the map creation causes a use after put in dso__set_loaded(). To ensure there is a +1 reference count on both sides of the if-else, do a dso__get() on the found map's dso. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tiezhu Yang Link: https://lore.kernel.org/r/20240506180104.485674-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 0b8fb14f5ff6..a3ff2ab154bd 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -683,7 +683,7 @@ static int machine__process_ksymbol_register(struct machine *machine, struct perf_sample *sample __maybe_unused) { struct symbol *sym; - struct dso *dso; + struct dso *dso = NULL; struct map *map = maps__find(machine__kernel_maps(machine), event->ksymbol.addr); int err = 0; @@ -696,7 +696,6 @@ static int machine__process_ksymbol_register(struct machine *machine, } dso__set_kernel(dso, DSO_SPACE__KERNEL); map = map__new2(0, dso); - dso__put(dso); if (!map) { err = -ENOMEM; goto out; @@ -722,7 +721,7 @@ static int machine__process_ksymbol_register(struct machine *machine, dso__set_long_name(dso, "", false); } } else { - dso = map__dso(map); + dso = dso__get(map__dso(map)); } sym = symbol__new(map__map_ip(map, map__start(map)), @@ -735,6 +734,7 @@ static int machine__process_ksymbol_register(struct machine *machine, dso__insert_symbol(dso, sym); out: map__put(map); + dso__put(dso); return err; } -- cgit v1.2.3 From 23106e318888849e95babe4c2bdc8a1a948ac36d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 6 May 2024 11:01:03 -0700 Subject: perf symbol-elf: dso__load_sym_internal() reference count fixes dso__load_sym_internal() passed curr_mapp as an out argument to dso__process_kernel_symbol(). The out argument was never used so remove it to simplify the reference counting logic. Simplify reference counting issues with curr_dso by ensuring the value it points to has a +1 reference count, and then putting as necessary. This avoids some reference counting games when the dso is created making the code more obviously correct with some possible introduced overhead due to the reference counting get/puts. This, however, silences reference count checking and we can always optimize from a seemingly correct point. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tiezhu Yang Link: https://lore.kernel.org/r/20240506180104.485674-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 51 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 3be5e8d1e278..e398abfd13a0 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1419,7 +1419,7 @@ void __weak arch__sym_update(struct symbol *s __maybe_unused, static int dso__process_kernel_symbol(struct dso *dso, struct map *map, GElf_Sym *sym, GElf_Shdr *shdr, struct maps *kmaps, struct kmap *kmap, - struct dso **curr_dsop, struct map **curr_mapp, + struct dso **curr_dsop, const char *section_name, bool adjust_kernel_syms, bool kmodule, bool *remap_kernel, u64 max_text_sh_offset) @@ -1470,8 +1470,8 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, map__set_pgoff(map, shdr->sh_offset); } - *curr_mapp = map; - *curr_dsop = dso; + dso__put(*curr_dsop); + *curr_dsop = dso__get(dso); return 0; } @@ -1484,8 +1484,8 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, */ if (kmodule && adjust_kernel_syms && is_exe_text(shdr->sh_flags) && shdr->sh_offset <= max_text_sh_offset) { - *curr_mapp = map; - *curr_dsop = dso; + dso__put(*curr_dsop); + *curr_dsop = dso__get(dso); return 0; } @@ -1507,10 +1507,10 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, dso__set_binary_type(curr_dso, dso__binary_type(dso)); dso__set_adjust_symbols(curr_dso, dso__adjust_symbols(dso)); curr_map = map__new2(start, curr_dso); - dso__put(curr_dso); - if (curr_map == NULL) + if (curr_map == NULL) { + dso__put(curr_dso); return -1; - + } if (dso__kernel(curr_dso)) map__kmap(curr_map)->kmaps = kmaps; @@ -1524,21 +1524,15 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, dso__set_symtab_type(curr_dso, dso__symtab_type(dso)); if (maps__insert(kmaps, curr_map)) return -1; - /* - * Add it before we drop the reference to curr_map, i.e. while - * we still are sure to have a reference to this DSO via - * *curr_map->dso. - */ dsos__add(&maps__machine(kmaps)->dsos, curr_dso); - /* kmaps already got it */ - map__put(curr_map); dso__set_loaded(curr_dso); - *curr_mapp = curr_map; + dso__put(*curr_dsop); *curr_dsop = curr_dso; } else { - *curr_dsop = map__dso(curr_map); - map__put(curr_map); + dso__put(*curr_dsop); + *curr_dsop = dso__get(map__dso(curr_map)); } + map__put(curr_map); return 0; } @@ -1549,11 +1543,9 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, { struct kmap *kmap = dso__kernel(dso) ? map__kmap(map) : NULL; struct maps *kmaps = kmap ? map__kmaps(map) : NULL; - struct map *curr_map = map; - struct dso *curr_dso = dso; + struct dso *curr_dso = NULL; Elf_Data *symstrs, *secstrs, *secstrs_run, *secstrs_sym; uint32_t nr_syms; - int err = -1; uint32_t idx; GElf_Ehdr ehdr; GElf_Shdr shdr; @@ -1656,6 +1648,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, if (kmodule && adjust_kernel_syms) max_text_sh_offset = max_text_section(runtime_ss->elf, &runtime_ss->ehdr); + curr_dso = dso__get(dso); elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) { struct symbol *f; const char *elf_name = elf_sym__name(&sym, symstrs); @@ -1744,9 +1737,13 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, --sym.st_value; if (dso__kernel(dso)) { - if (dso__process_kernel_symbol(dso, map, &sym, &shdr, kmaps, kmap, &curr_dso, &curr_map, - section_name, adjust_kernel_syms, kmodule, - &remap_kernel, max_text_sh_offset)) + if (dso__process_kernel_symbol(dso, map, &sym, &shdr, + kmaps, kmap, &curr_dso, + section_name, + adjust_kernel_syms, + kmodule, + &remap_kernel, + max_text_sh_offset)) goto out_elf_end; } else if ((used_opd && runtime_ss->adjust_symbols) || (!used_opd && syms_ss->adjust_symbols)) { @@ -1795,6 +1792,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, __symbols__insert(dso__symbols(curr_dso), f, dso__kernel(dso)); nr++; } + dso__put(curr_dso); /* * For misannotated, zeroed, ASM function sizes. @@ -1810,9 +1808,10 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, maps__fixup_end(kmaps); } } - err = nr; + return nr; out_elf_end: - return err; + dso__put(curr_dso); + return -1; } int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, -- cgit v1.2.3 From 37862d6fdced70a72b5a06d8f3440f7c567d5272 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 6 May 2024 11:01:04 -0700 Subject: perf dso: Use container_of() to avoid a pointer in 'struct dso_data' The dso pointer in 'struct dso_data' is necessary for reference count checking to account for the dso_data forming a global list of open dso's with references to the dso. The dso pointer also allows for the indirection that reference count checking needs. Outside of reference count checking the indirection isn't needed and container_of() is more efficient and saves space. The reference count won't be increased by placing items onto the global list, matching how things were before the reference count checking change, but we assert the dso is in dsos holding it live (and that the set of open dsos is a subset of all dsos for the machine). Update the DSO data tests so that they use a dsos struct to make the invariant true. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tiezhu Yang Link: https://lore.kernel.org/r/20240506180104.485674-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/dso-data.c | 60 ++++++++++++++++++++++----------------------- tools/perf/util/dso.c | 16 +++++++++++- tools/perf/util/dso.h | 2 ++ 3 files changed, 46 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index fde4eca84b6f..5286ae8bd2d7 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -10,6 +10,7 @@ #include #include #include "dso.h" +#include "dsos.h" #include "machine.h" #include "symbol.h" #include "tests.h" @@ -123,9 +124,10 @@ static int test__dso_data(struct test_suite *test __maybe_unused, int subtest __ TEST_ASSERT_VAL("No test file", file); memset(&machine, 0, sizeof(machine)); + dsos__init(&machine.dsos); - dso = dso__new((const char *)file); - + dso = dso__new(file); + TEST_ASSERT_VAL("Failed to add dso", !dsos__add(&machine.dsos, dso)); TEST_ASSERT_VAL("Failed to access to dso", dso__data_fd(dso, &machine) >= 0); @@ -170,6 +172,7 @@ static int test__dso_data(struct test_suite *test __maybe_unused, int subtest __ } dso__put(dso); + dsos__exit(&machine.dsos); unlink(file); return 0; } @@ -199,41 +202,35 @@ static long open_files_cnt(void) return nr - 1; } -static struct dso **dsos; - -static int dsos__create(int cnt, int size) +static int dsos__create(int cnt, int size, struct dsos *dsos) { int i; - dsos = malloc(sizeof(*dsos) * cnt); - TEST_ASSERT_VAL("failed to alloc dsos array", dsos); + dsos__init(dsos); for (i = 0; i < cnt; i++) { - char *file; + struct dso *dso; + char *file = test_file(size); - file = test_file(size); TEST_ASSERT_VAL("failed to get dso file", file); - - dsos[i] = dso__new(file); - TEST_ASSERT_VAL("failed to get dso", dsos[i]); + dso = dso__new(file); + TEST_ASSERT_VAL("failed to get dso", dso); + TEST_ASSERT_VAL("failed to add dso", !dsos__add(dsos, dso)); + dso__put(dso); } return 0; } -static void dsos__delete(int cnt) +static void dsos__delete(struct dsos *dsos) { - int i; - - for (i = 0; i < cnt; i++) { - struct dso *dso = dsos[i]; + for (unsigned int i = 0; i < dsos->cnt; i++) { + struct dso *dso = dsos->dsos[i]; dso__data_close(dso); unlink(dso__name(dso)); - dso__put(dso); } - - free(dsos); + dsos__exit(dsos); } static int set_fd_limit(int n) @@ -267,10 +264,10 @@ static int test__dso_data_cache(struct test_suite *test __maybe_unused, int subt /* and this is now our dso open FDs limit */ dso_cnt = limit / 2; TEST_ASSERT_VAL("failed to create dsos\n", - !dsos__create(dso_cnt, TEST_FILE_SIZE)); + !dsos__create(dso_cnt, TEST_FILE_SIZE, &machine.dsos)); for (i = 0; i < (dso_cnt - 1); i++) { - struct dso *dso = dsos[i]; + struct dso *dso = machine.dsos.dsos[i]; /* * Open dsos via dso__data_fd(), it opens the data @@ -290,17 +287,17 @@ static int test__dso_data_cache(struct test_suite *test __maybe_unused, int subt } /* verify the first one is already open */ - TEST_ASSERT_VAL("dsos[0] is not open", dso__data(dsos[0])->fd != -1); + TEST_ASSERT_VAL("dsos[0] is not open", dso__data(machine.dsos.dsos[0])->fd != -1); /* open +1 dso to reach the allowed limit */ - fd = dso__data_fd(dsos[i], &machine); + fd = dso__data_fd(machine.dsos.dsos[i], &machine); TEST_ASSERT_VAL("failed to get fd", fd > 0); /* should force the first one to be closed */ - TEST_ASSERT_VAL("failed to close dsos[0]", dso__data(dsos[0])->fd == -1); + TEST_ASSERT_VAL("failed to close dsos[0]", dso__data(machine.dsos.dsos[0])->fd == -1); /* cleanup everything */ - dsos__delete(dso_cnt); + dsos__delete(&machine.dsos); /* Make sure we did not leak any file descriptor. */ nr_end = open_files_cnt(); @@ -325,9 +322,9 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub long nr_end, nr = open_files_cnt(), lim = new_limit(3); int fd, fd_extra; -#define dso_0 (dsos[0]) -#define dso_1 (dsos[1]) -#define dso_2 (dsos[2]) +#define dso_0 (machine.dsos.dsos[0]) +#define dso_1 (machine.dsos.dsos[1]) +#define dso_2 (machine.dsos.dsos[2]) /* Rest the internal dso open counter limit. */ reset_fd_limit(); @@ -347,7 +344,8 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub TEST_ASSERT_VAL("failed to set file limit", !set_fd_limit((lim))); - TEST_ASSERT_VAL("failed to create dsos\n", !dsos__create(3, TEST_FILE_SIZE)); + TEST_ASSERT_VAL("failed to create dsos\n", + !dsos__create(3, TEST_FILE_SIZE, &machine.dsos)); /* open dso_0 */ fd = dso__data_fd(dso_0, &machine); @@ -386,7 +384,7 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub /* cleanup everything */ close(fd_extra); - dsos__delete(3); + dsos__delete(&machine.dsos); /* Make sure we did not leak any file descriptor. */ nr_end = open_files_cnt(); diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 27db65e96e04..dde706b71da7 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -497,14 +497,20 @@ static pthread_mutex_t dso__data_open_lock = PTHREAD_MUTEX_INITIALIZER; static void dso__list_add(struct dso *dso) { list_add_tail(&dso__data(dso)->open_entry, &dso__data_open); +#ifdef REFCNT_CHECKING dso__data(dso)->dso = dso__get(dso); +#endif + /* Assume the dso is part of dsos, hence the optional reference count above. */ + assert(dso__dsos(dso)); dso__data_open_cnt++; } static void dso__list_del(struct dso *dso) { list_del_init(&dso__data(dso)->open_entry); +#ifdef REFCNT_CHECKING dso__put(dso__data(dso)->dso); +#endif WARN_ONCE(dso__data_open_cnt <= 0, "DSO data fd counter out of bounds."); dso__data_open_cnt--; @@ -654,9 +660,15 @@ static void close_dso(struct dso *dso) static void close_first_dso(void) { struct dso_data *dso_data; + struct dso *dso; dso_data = list_first_entry(&dso__data_open, struct dso_data, open_entry); - close_dso(dso_data->dso); +#ifdef REFCNT_CHECKING + dso = dso_data->dso; +#else + dso = container_of(dso_data, struct dso, data); +#endif + close_dso(dso); } static rlim_t get_fd_limit(void) @@ -1449,7 +1461,9 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) data->fd = -1; data->status = DSO_DATA_STATUS_UNKNOWN; INIT_LIST_HEAD(&data->open_entry); +#ifdef REFCNT_CHECKING data->dso = NULL; /* Set when on the open_entry list. */ +#endif } return res; } diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index f9689dd60de3..df2c98402af3 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -147,7 +147,9 @@ struct dso_cache { struct dso_data { struct rb_root cache; struct list_head open_entry; +#ifdef REFCNT_CHECKING struct dso *dso; +#endif int fd; int status; u32 status_seen; -- cgit v1.2.3 From 6d75d75d77cac61552c6c0e95b335dfe0380c153 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 19:21:25 +0000 Subject: kselftest: Add mechanism for reporting a KSFT_ result code Currently there's no helper which a test can use to report it's result as a KSFT_ result code, we can report a boolean pass/fail but not a skip. This is sometimes a useful idiom so let's add a helper ksft_test_result_report() which translates into the relevant report types. Due to the use of va_args in the result reporting functions this is done as a macro rather than an inline function as one might expect, none of the alternatives looked particularly great. Resolved merge conflict in next betwwen the following commits: f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn") 5d3a9274f0d1 ("kselftest: Add mechanism for reporting a KSFT_ result code") Reported-by: Stephen Rothwell Shuah Khan Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 14bbab0cce13..29efce369a36 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -20,6 +20,7 @@ * and finally report the pass/fail/skip/xfail state of the test with one of: * * ksft_test_result(condition, fmt, ...); + * ksft_test_result_report(result, fmt, ...); * ksft_test_result_pass(fmt, ...); * ksft_test_result_fail(fmt, ...); * ksft_test_result_skip(fmt, ...); @@ -305,6 +306,27 @@ void ksft_test_result_code(int exit_code, const char *test_name, printf("\n"); } +/** + * ksft_test_result() - Report test success based on truth of condition + * + * @condition: if true, report test success, otherwise failure. + */ +#define ksft_test_result_report(result, fmt, ...) do { \ + switch (result) { \ + case KSFT_PASS: \ + ksft_test_result_pass(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_FAIL: \ + ksft_test_result_fail(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_XFAIL: \ + ksft_test_result_xfail(fmt, ##__VA_ARGS__); \ + break; \ + case KSFT_SKIP: \ + ksft_test_result_skip(fmt, ##__VA_ARGS__); \ + break; \ + } } while (0) + static inline __noreturn int ksft_exit_pass(void) { ksft_print_cnts(); -- cgit v1.2.3 From d6283d08c75b2d403fd05feb9bbe79e063db391c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 19:21:26 +0000 Subject: kselftest/tty: Report a consistent test name for the one test we run Currently the tty_tstamp_update test reports a different exit message for every path it can exit via. This can be confusing for automated systems as the string that gets logged is interpreted as a test name so if the test status changes they can't tell that it's the same test case that was run, they can see that the overall status of the test program is a failure but it's not clear that it was running the same test. Change all the messages that are logged to be diagnostic prints and log the name of the program as the test name. Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/tty/tty_tstamp_update.c | 48 +++++++++++++++++-------- 1 file changed, 33 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/tty/tty_tstamp_update.c b/tools/testing/selftests/tty/tty_tstamp_update.c index 0ee97943dccc..9e1a40f5db17 100644 --- a/tools/testing/selftests/tty/tty_tstamp_update.c +++ b/tools/testing/selftests/tty/tty_tstamp_update.c @@ -47,42 +47,60 @@ int main(int argc, char **argv) int r; char tty[PATH_MAX] = {}; struct stat st1, st2; + int result = KSFT_FAIL; ksft_print_header(); ksft_set_plan(1); r = readlink("/proc/self/fd/0", tty, PATH_MAX); - if (r < 0) - ksft_exit_fail_msg("readlink on /proc/self/fd/0 failed: %m\n"); + if (r < 0) { + ksft_print_msg("readlink on /proc/self/fd/0 failed: %m\n"); + goto out; + } + + if (!tty_valid(tty)) { + ksft_print_msg("invalid tty path '%s'\n", tty); + result = KSFT_SKIP; + goto out; - if (!tty_valid(tty)) - ksft_exit_skip("invalid tty path '%s'\n", tty); + } r = stat(tty, &st1); - if (r < 0) - ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty); + if (r < 0) { + ksft_print_msg("stat failed on tty path '%s': %m\n", tty); + goto out; + } /* We need to wait at least 8 seconds in order to observe timestamp change */ /* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fbf47635315ab308c9b58a1ea0906e711a9228de */ sleep(10); r = write_dev_tty(); - if (r < 0) - ksft_exit_fail_msg("failed to write to /dev/tty: %s\n", - strerror(-r)); + if (r < 0) { + ksft_print_msg("failed to write to /dev/tty: %s\n", + strerror(-r)); + goto out; + } r = stat(tty, &st2); - if (r < 0) - ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty); + if (r < 0) { + ksft_print_msg("stat failed on tty path '%s': %m\n", tty); + goto out; + } /* We wrote to the terminal so timestamps should have been updated */ if (st1.st_atim.tv_sec == st2.st_atim.tv_sec && st1.st_mtim.tv_sec == st2.st_mtim.tv_sec) { - ksft_test_result_fail("tty timestamps not updated\n"); - ksft_exit_fail(); + ksft_print_msg("tty timestamps not updated\n"); + goto out; } - ksft_test_result_pass( + ksft_print_msg( "timestamps of terminal '%s' updated after write to /dev/tty\n", tty); - return EXIT_SUCCESS; + result = KSFT_PASS; + +out: + ksft_test_result_report(result, "tty_tstamp_update\n"); + + ksft_finished(); } -- cgit v1.2.3 From 113ad23f6e5bd77636e6fafec3b29563398faf48 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 27 Mar 2024 23:46:34 +0500 Subject: selftests: x86: test_vsyscall: reorder code to reduce #ifdef blocks There are multiple #ifdef blocks inside functions where they return just 0 if #ifdef is false. This makes number of tests counting difficult. Move those functions inside one #ifdef block and move all of them together. This is preparatory patch for next patch to convert this into TAP format. So in this patch, we are just moving functions around without any changes. With and without this patch, the output of this patch is same. Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/test_vsyscall.c | 176 +++++++++++++--------------- 1 file changed, 83 insertions(+), 93 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 47cab972807c..c78b0648aa52 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -21,6 +21,7 @@ #include #include "helpers.h" +#include "../kselftest.h" #ifdef __x86_64__ # define VSYS(x) (x) @@ -39,18 +40,6 @@ /* max length of lines in /proc/self/maps - anything longer is skipped here */ #define MAPS_LINE_LEN 128 -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - /* vsyscalls and vDSO */ bool vsyscall_map_r = false, vsyscall_map_x = false; @@ -96,64 +85,6 @@ static void init_vdso(void) printf("[WARN]\tfailed to find getcpu in vDSO\n"); } -static int init_vsys(void) -{ -#ifdef __x86_64__ - int nerrs = 0; - FILE *maps; - char line[MAPS_LINE_LEN]; - bool found = false; - - maps = fopen("/proc/self/maps", "r"); - if (!maps) { - printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); - vsyscall_map_r = true; - return 0; - } - - while (fgets(line, MAPS_LINE_LEN, maps)) { - char r, x; - void *start, *end; - char name[MAPS_LINE_LEN]; - - /* sscanf() is safe here as strlen(name) >= strlen(line) */ - if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", - &start, &end, &r, &x, name) != 5) - continue; - - if (strcmp(name, "[vsyscall]")) - continue; - - printf("\tvsyscall map: %s", line); - - if (start != (void *)0xffffffffff600000 || - end != (void *)0xffffffffff601000) { - printf("[FAIL]\taddress range is nonsense\n"); - nerrs++; - } - - printf("\tvsyscall permissions are %c-%c\n", r, x); - vsyscall_map_r = (r == 'r'); - vsyscall_map_x = (x == 'x'); - - found = true; - break; - } - - fclose(maps); - - if (!found) { - printf("\tno vsyscall map in /proc/self/maps\n"); - vsyscall_map_r = false; - vsyscall_map_x = false; - } - - return nerrs; -#else - return 0; -#endif -} - /* syscalls */ static inline long sys_gtod(struct timeval *tv, struct timezone *tz) { @@ -176,17 +107,6 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node, return syscall(SYS_getcpu, cpu, node, cache); } -static jmp_buf jmpbuf; -static volatile unsigned long segv_err; - -static void sigsegv(int sig, siginfo_t *info, void *ctx_void) -{ - ucontext_t *ctx = (ucontext_t *)ctx_void; - - segv_err = ctx->uc_mcontext.gregs[REG_ERR]; - siglongjmp(jmpbuf, 1); -} - static double tv_diff(const struct timeval *a, const struct timeval *b) { return (double)(a->tv_sec - b->tv_sec) + @@ -396,9 +316,33 @@ static int test_getcpu(int cpu) return nerrs; } +#ifdef __x86_64__ + +static jmp_buf jmpbuf; +static volatile unsigned long segv_err; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + + segv_err = ctx->uc_mcontext.gregs[REG_ERR]; + siglongjmp(jmpbuf, 1); +} + static int test_vsys_r(void) { -#ifdef __x86_64__ printf("[RUN]\tChecking read access to the vsyscall page\n"); bool can_read; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -420,14 +364,12 @@ static int test_vsys_r(void) printf("[OK]\tWe do not have read access: #PF(0x%lx)\n", segv_err); } -#endif return 0; } static int test_vsys_x(void) { -#ifdef __x86_64__ if (vsyscall_map_x) { /* We already tested this adequately. */ return 0; @@ -454,8 +396,6 @@ static int test_vsys_x(void) segv_err); return 1; } -#endif - return 0; } @@ -472,7 +412,6 @@ static int test_vsys_x(void) */ static int test_process_vm_readv(void) { -#ifdef __x86_64__ char buf[4096]; struct iovec local, remote; int ret; @@ -504,12 +443,63 @@ static int test_process_vm_readv(void) printf("[FAIL]\tprocess_rm_readv() succeeded, but it should have failed in this configuration\n"); return 1; } -#endif - return 0; } -#ifdef __x86_64__ +static int init_vsys(void) +{ + int nerrs = 0; + FILE *maps; + char line[MAPS_LINE_LEN]; + bool found = false; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); + vsyscall_map_r = true; + return 0; + } + + while (fgets(line, MAPS_LINE_LEN, maps)) { + char r, x; + void *start, *end; + char name[MAPS_LINE_LEN]; + + /* sscanf() is safe here as strlen(name) >= strlen(line) */ + if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", + &start, &end, &r, &x, name) != 5) + continue; + + if (strcmp(name, "[vsyscall]")) + continue; + + printf("\tvsyscall map: %s", line); + + if (start != (void *)0xffffffffff600000 || + end != (void *)0xffffffffff601000) { + printf("[FAIL]\taddress range is nonsense\n"); + nerrs++; + } + + printf("\tvsyscall permissions are %c-%c\n", r, x); + vsyscall_map_r = (r == 'r'); + vsyscall_map_x = (x == 'x'); + + found = true; + break; + } + + fclose(maps); + + if (!found) { + printf("\tno vsyscall map in /proc/self/maps\n"); + vsyscall_map_r = false; + vsyscall_map_x = false; + } + + return nerrs; +} + static volatile sig_atomic_t num_vsyscall_traps; static void sigtrap(int sig, siginfo_t *info, void *ctx_void) @@ -559,20 +549,20 @@ int main(int argc, char **argv) int nerrs = 0; init_vdso(); +#ifdef __x86_64__ nerrs += init_vsys(); +#endif nerrs += test_gtod(); nerrs += test_time(); nerrs += test_getcpu(0); nerrs += test_getcpu(1); +#ifdef __x86_64__ sethandler(SIGSEGV, sigsegv, 0); nerrs += test_vsys_r(); nerrs += test_vsys_x(); - nerrs += test_process_vm_readv(); - -#ifdef __x86_64__ nerrs += test_emulation(); #endif -- cgit v1.2.3 From d17e752b827491d67ea9ad6d2e67dfe57a226071 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 27 Mar 2024 23:46:35 +0500 Subject: selftests: x86: test_vsyscall: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Add more logic code to skip the tests if particular configuration isn't available to make sure that either we skip each test or mark it pass/fail. Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/test_vsyscall.c | 356 +++++++++++++--------------- 1 file changed, 168 insertions(+), 188 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index c78b0648aa52..d4c8e8d79d38 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -23,6 +23,12 @@ #include "helpers.h" #include "../kselftest.h" +#ifdef __x86_64__ +#define TOTAL_TESTS 13 +#else +#define TOTAL_TESTS 8 +#endif + #ifdef __x86_64__ # define VSYS(x) (x) #else @@ -64,25 +70,25 @@ static void init_vdso(void) if (!vdso) vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); if (!vdso) { - printf("[WARN]\tfailed to find vDSO\n"); + ksft_print_msg("[WARN] failed to find vDSO\n"); return; } vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday"); if (!vdso_gtod) - printf("[WARN]\tfailed to find gettimeofday in vDSO\n"); + ksft_print_msg("[WARN] failed to find gettimeofday in vDSO\n"); vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); if (!vdso_gettime) - printf("[WARN]\tfailed to find clock_gettime in vDSO\n"); + ksft_print_msg("[WARN] failed to find clock_gettime in vDSO\n"); vdso_time = (time_func_t)dlsym(vdso, "__vdso_time"); if (!vdso_time) - printf("[WARN]\tfailed to find time in vDSO\n"); + ksft_print_msg("[WARN] failed to find time in vDSO\n"); vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); if (!vdso_getcpu) - printf("[WARN]\tfailed to find getcpu in vDSO\n"); + ksft_print_msg("[WARN] failed to find getcpu in vDSO\n"); } /* syscalls */ @@ -113,81 +119,70 @@ static double tv_diff(const struct timeval *a, const struct timeval *b) (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6; } -static int check_gtod(const struct timeval *tv_sys1, - const struct timeval *tv_sys2, - const struct timezone *tz_sys, - const char *which, - const struct timeval *tv_other, - const struct timezone *tz_other) +static void check_gtod(const struct timeval *tv_sys1, + const struct timeval *tv_sys2, + const struct timezone *tz_sys, + const char *which, + const struct timeval *tv_other, + const struct timezone *tz_other) { - int nerrs = 0; double d1, d2; - if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) { - printf("[FAIL] %s tz mismatch\n", which); - nerrs++; - } + if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || + tz_sys->tz_dsttime != tz_other->tz_dsttime)) + ksft_print_msg("%s tz mismatch\n", which); d1 = tv_diff(tv_other, tv_sys1); d2 = tv_diff(tv_sys2, tv_other); - printf("\t%s time offsets: %lf %lf\n", which, d1, d2); - if (d1 < 0 || d2 < 0) { - printf("[FAIL]\t%s time was inconsistent with the syscall\n", which); - nerrs++; - } else { - printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which); - } + ksft_print_msg("%s time offsets: %lf %lf\n", which, d1, d2); - return nerrs; + ksft_test_result(!(d1 < 0 || d2 < 0), "%s gettimeofday()'s timeval\n", which); } -static int test_gtod(void) +static void test_gtod(void) { struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys; struct timezone tz_sys, tz_vdso, tz_vsys; long ret_vdso = -1; long ret_vsys = -1; - int nerrs = 0; - printf("[RUN]\ttest gettimeofday()\n"); + ksft_print_msg("test gettimeofday()\n"); if (sys_gtod(&tv_sys1, &tz_sys) != 0) - err(1, "syscall gettimeofday"); + ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno)); if (vdso_gtod) ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso); if (vsyscall_map_x) ret_vsys = vgtod(&tv_vsys, &tz_vsys); if (sys_gtod(&tv_sys2, &tz_sys) != 0) - err(1, "syscall gettimeofday"); + ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno)); if (vdso_gtod) { - if (ret_vdso == 0) { - nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); - } else { - printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso); - nerrs++; - } + if (ret_vdso == 0) + check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); + else + ksft_test_result_fail("vDSO gettimeofday() failed: %ld\n", ret_vdso); + } else { + ksft_test_result_skip("vdso_gtod isn't set\n"); } if (vsyscall_map_x) { - if (ret_vsys == 0) { - nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); - } else { - printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys); - nerrs++; - } + if (ret_vsys == 0) + check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); + else + ksft_test_result_fail("vsys gettimeofday() failed: %ld\n", ret_vsys); + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } - - return nerrs; } -static int test_time(void) { - int nerrs = 0; - - printf("[RUN]\ttest time()\n"); +static void test_time(void) +{ long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0; long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1; + + ksft_print_msg("test time()\n"); t_sys1 = sys_time(&t2_sys1); if (vdso_time) t_vdso = vdso_time(&t2_vdso); @@ -195,56 +190,60 @@ static int test_time(void) { t_vsys = vtime(&t2_vsys); t_sys2 = sys_time(&t2_sys2); if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { - printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2); - nerrs++; - return nerrs; + ksft_print_msg("syscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", + t_sys1, t2_sys1, t_sys2, t2_sys2); + ksft_test_result_skip("vdso_time\n"); + ksft_test_result_skip("vdso_time\n"); + return; } if (vdso_time) { - if (t_vdso < 0 || t_vdso != t2_vdso) { - printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso); - nerrs++; - } else if (t_vdso < t_sys1 || t_vdso > t_sys2) { - printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2); - nerrs++; - } else { - printf("[OK]\tvDSO time() is okay\n"); - } + if (t_vdso < 0 || t_vdso != t2_vdso) + ksft_test_result_fail("vDSO failed (ret:%ld output:%ld)\n", + t_vdso, t2_vdso); + else if (t_vdso < t_sys1 || t_vdso > t_sys2) + ksft_test_result_fail("vDSO returned the wrong time (%ld %ld %ld)\n", + t_sys1, t_vdso, t_sys2); + else + ksft_test_result_pass("vDSO time() is okay\n"); + } else { + ksft_test_result_skip("vdso_time isn't set\n"); } if (vsyscall_map_x) { - if (t_vsys < 0 || t_vsys != t2_vsys) { - printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); - nerrs++; - } else if (t_vsys < t_sys1 || t_vsys > t_sys2) { - printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2); - nerrs++; - } else { - printf("[OK]\tvsyscall time() is okay\n"); - } + if (t_vsys < 0 || t_vsys != t2_vsys) + ksft_test_result_fail("vsyscall failed (ret:%ld output:%ld)\n", + t_vsys, t2_vsys); + else if (t_vsys < t_sys1 || t_vsys > t_sys2) + ksft_test_result_fail("vsyscall returned the wrong time (%ld %ld %ld)\n", + t_sys1, t_vsys, t_sys2); + else + ksft_test_result_pass("vsyscall time() is okay\n"); + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } - - return nerrs; } -static int test_getcpu(int cpu) +static void test_getcpu(int cpu) { - int nerrs = 0; + unsigned int cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; long ret_sys, ret_vdso = -1, ret_vsys = -1; + unsigned int node = 0; + bool have_node = false; + cpu_set_t cpuset; - printf("[RUN]\tgetcpu() on CPU %d\n", cpu); + ksft_print_msg("getcpu() on CPU %d\n", cpu); - cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { - printf("[SKIP]\tfailed to force CPU %d\n", cpu); - return nerrs; + ksft_print_msg("failed to force CPU %d\n", cpu); + ksft_test_result_skip("vdso_getcpu\n"); + ksft_test_result_skip("vsyscall_map_x\n"); + + return; } - unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; - unsigned node = 0; - bool have_node = false; ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); if (vdso_getcpu) ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); @@ -252,10 +251,9 @@ static int test_getcpu(int cpu) ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); if (ret_sys == 0) { - if (cpu_sys != cpu) { - printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu); - nerrs++; - } + if (cpu_sys != cpu) + ksft_print_msg("syscall reported CPU %hu but should be %d\n", + cpu_sys, cpu); have_node = true; node = node_sys; @@ -263,57 +261,53 @@ static int test_getcpu(int cpu) if (vdso_getcpu) { if (ret_vdso) { - printf("[FAIL]\tvDSO getcpu() failed\n"); - nerrs++; + ksft_test_result_fail("vDSO getcpu() failed\n"); } else { if (!have_node) { have_node = true; node = node_vdso; } - if (cpu_vdso != cpu) { - printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu); - nerrs++; - } else { - printf("[OK]\tvDSO reported correct CPU\n"); - } - - if (node_vdso != node) { - printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node); - nerrs++; + if (cpu_vdso != cpu || node_vdso != node) { + if (cpu_vdso != cpu) + ksft_print_msg("vDSO reported CPU %hu but should be %d\n", + cpu_vdso, cpu); + if (node_vdso != node) + ksft_print_msg("vDSO reported node %hu but should be %hu\n", + node_vdso, node); + ksft_test_result_fail("Wrong values\n"); } else { - printf("[OK]\tvDSO reported correct node\n"); + ksft_test_result_pass("vDSO reported correct CPU and node\n"); } } + } else { + ksft_test_result_skip("vdso_getcpu isn't set\n"); } if (vsyscall_map_x) { if (ret_vsys) { - printf("[FAIL]\tvsyscall getcpu() failed\n"); - nerrs++; + ksft_test_result_fail("vsyscall getcpu() failed\n"); } else { if (!have_node) { have_node = true; node = node_vsys; } - if (cpu_vsys != cpu) { - printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu); - nerrs++; + if (cpu_vsys != cpu || node_vsys != node) { + if (cpu_vsys != cpu) + ksft_print_msg("vsyscall reported CPU %hu but should be %d\n", + cpu_vsys, cpu); + if (node_vsys != node) + ksft_print_msg("vsyscall reported node %hu but should be %hu\n", + node_vsys, node); + ksft_test_result_fail("Wrong values\n"); } else { - printf("[OK]\tvsyscall reported correct CPU\n"); - } - - if (node_vsys != node) { - printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node); - nerrs++; - } else { - printf("[OK]\tvsyscall reported correct node\n"); + ksft_test_result_pass("vsyscall reported correct CPU and node\n"); } } + } else { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); } - - return nerrs; } #ifdef __x86_64__ @@ -325,12 +319,13 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { struct sigaction sa; + memset(&sa, 0, sizeof(sa)); sa.sa_sigaction = handler; sa.sa_flags = SA_SIGINFO | flags; sigemptyset(&sa.sa_mask); if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); + ksft_exit_fail_msg("sigaction failed\n"); } static void sigsegv(int sig, siginfo_t *info, void *ctx_void) @@ -341,9 +336,9 @@ static void sigsegv(int sig, siginfo_t *info, void *ctx_void) siglongjmp(jmpbuf, 1); } -static int test_vsys_r(void) +static void test_vsys_r(void) { - printf("[RUN]\tChecking read access to the vsyscall page\n"); + ksft_print_msg("Checking read access to the vsyscall page\n"); bool can_read; if (sigsetjmp(jmpbuf, 1) == 0) { *(volatile int *)0xffffffffff600000; @@ -352,30 +347,25 @@ static int test_vsys_r(void) can_read = false; } - if (can_read && !vsyscall_map_r) { - printf("[FAIL]\tWe have read access, but we shouldn't\n"); - return 1; - } else if (!can_read && vsyscall_map_r) { - printf("[FAIL]\tWe don't have read access, but we should\n"); - return 1; - } else if (can_read) { - printf("[OK]\tWe have read access\n"); - } else { - printf("[OK]\tWe do not have read access: #PF(0x%lx)\n", - segv_err); - } - - return 0; + if (can_read && !vsyscall_map_r) + ksft_test_result_fail("We have read access, but we shouldn't\n"); + else if (!can_read && vsyscall_map_r) + ksft_test_result_fail("We don't have read access, but we should\n"); + else if (can_read) + ksft_test_result_pass("We have read access\n"); + else + ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err); } -static int test_vsys_x(void) +static void test_vsys_x(void) { if (vsyscall_map_x) { /* We already tested this adequately. */ - return 0; + ksft_test_result_pass("vsyscall_map_x is true\n"); + return; } - printf("[RUN]\tMake sure that vsyscalls really page fault\n"); + ksft_print_msg("Make sure that vsyscalls really page fault\n"); bool can_exec; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -385,18 +375,14 @@ static int test_vsys_x(void) can_exec = false; } - if (can_exec) { - printf("[FAIL]\tExecuting the vsyscall did not page fault\n"); - return 1; - } else if (segv_err & (1 << 4)) { /* INSTR */ - printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n", - segv_err); - } else { - printf("[FAIL]\tExecution failed with the wrong error: #PF(0x%lx)\n", - segv_err); - return 1; - } - return 0; + if (can_exec) + ksft_test_result_fail("Executing the vsyscall did not page fault\n"); + else if (segv_err & (1 << 4)) /* INSTR */ + ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n", + segv_err); + else + ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n", + segv_err); } /* @@ -410,13 +396,13 @@ static int test_vsys_x(void) * fact that ptrace() ever worked was a nice courtesy of old kernels, * but the code to support it is fairly gross. */ -static int test_process_vm_readv(void) +static void test_process_vm_readv(void) { char buf[4096]; struct iovec local, remote; int ret; - printf("[RUN]\tprocess_vm_readv() from vsyscall page\n"); + ksft_print_msg("process_vm_readv() from vsyscall page\n"); local.iov_base = buf; local.iov_len = 4096; @@ -428,25 +414,18 @@ static int test_process_vm_readv(void) * We expect process_vm_readv() to work if and only if the * vsyscall page is readable. */ - printf("[%s]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", vsyscall_map_r ? "FAIL" : "OK", ret, errno); - return vsyscall_map_r ? 1 : 0; + ksft_test_result(!vsyscall_map_r, + "process_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno); + return; } - if (vsyscall_map_r) { - if (!memcmp(buf, remote.iov_base, sizeof(buf))) { - printf("[OK]\tIt worked and read correct data\n"); - } else { - printf("[FAIL]\tIt worked but returned incorrect data\n"); - return 1; - } - } else { - printf("[FAIL]\tprocess_rm_readv() succeeded, but it should have failed in this configuration\n"); - return 1; - } - return 0; + if (vsyscall_map_r) + ksft_test_result(!memcmp(buf, remote.iov_base, sizeof(buf)), "Read data\n"); + else + ksft_test_result_fail("process_rm_readv() succeeded, but it should have failed in this configuration\n"); } -static int init_vsys(void) +static void init_vsys(void) { int nerrs = 0; FILE *maps; @@ -455,9 +434,9 @@ static int init_vsys(void) maps = fopen("/proc/self/maps", "r"); if (!maps) { - printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); + ksft_test_result_skip("Could not open /proc/self/maps -- assuming vsyscall is r-x\n"); vsyscall_map_r = true; - return 0; + return; } while (fgets(line, MAPS_LINE_LEN, maps)) { @@ -473,15 +452,15 @@ static int init_vsys(void) if (strcmp(name, "[vsyscall]")) continue; - printf("\tvsyscall map: %s", line); + ksft_print_msg("vsyscall map: %s", line); if (start != (void *)0xffffffffff600000 || end != (void *)0xffffffffff601000) { - printf("[FAIL]\taddress range is nonsense\n"); + ksft_print_msg("address range is nonsense\n"); nerrs++; } - printf("\tvsyscall permissions are %c-%c\n", r, x); + ksft_print_msg("vsyscall permissions are %c-%c\n", r, x); vsyscall_map_r = (r == 'r'); vsyscall_map_x = (x == 'x'); @@ -492,12 +471,12 @@ static int init_vsys(void) fclose(maps); if (!found) { - printf("\tno vsyscall map in /proc/self/maps\n"); + ksft_print_msg("no vsyscall map in /proc/self/maps\n"); vsyscall_map_r = false; vsyscall_map_x = false; } - return nerrs; + ksft_test_result(!nerrs, "vsyscall map\n"); } static volatile sig_atomic_t num_vsyscall_traps; @@ -511,15 +490,17 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) num_vsyscall_traps++; } -static int test_emulation(void) +static void test_emulation(void) { time_t tmp; bool is_native; - if (!vsyscall_map_x) - return 0; + if (!vsyscall_map_x) { + ksft_test_result_skip("vsyscall_map_x isn't set\n"); + return; + } - printf("[RUN]\tchecking that vsyscalls are emulated\n"); + ksft_print_msg("checking that vsyscalls are emulated\n"); sethandler(SIGTRAP, sigtrap, 0); set_eflags(get_eflags() | X86_EFLAGS_TF); vtime(&tmp); @@ -535,36 +516,35 @@ static int test_emulation(void) */ is_native = (num_vsyscall_traps > 1); - printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n", - (is_native ? "FAIL" : "OK"), - (is_native ? "native" : "emulated"), - (int)num_vsyscall_traps); - - return is_native; + ksft_test_result(!is_native, "vsyscalls are %s (%d instructions in vsyscall page)\n", + (is_native ? "native" : "emulated"), (int)num_vsyscall_traps); } #endif int main(int argc, char **argv) { - int nerrs = 0; + int total_tests = TOTAL_TESTS; + + ksft_print_header(); + ksft_set_plan(total_tests); init_vdso(); #ifdef __x86_64__ - nerrs += init_vsys(); + init_vsys(); #endif - nerrs += test_gtod(); - nerrs += test_time(); - nerrs += test_getcpu(0); - nerrs += test_getcpu(1); + test_gtod(); + test_time(); + test_getcpu(0); + test_getcpu(1); #ifdef __x86_64__ sethandler(SIGSEGV, sigsegv, 0); - nerrs += test_vsys_r(); - nerrs += test_vsys_x(); - nerrs += test_process_vm_readv(); - nerrs += test_emulation(); + test_vsys_r(); + test_vsys_x(); + test_process_vm_readv(); + test_emulation(); #endif - return nerrs ? 1 : 0; + ksft_finished(); } -- cgit v1.2.3 From 5549a79835a24dc9a5618e2a5bb164060a0eaf50 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 27 Mar 2024 23:46:36 +0500 Subject: selftests: x86: test_mremap_vdso: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/test_mremap_vdso.c | 43 +++++++++++++------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c index f0d876d48277..d53959e03593 100644 --- a/tools/testing/selftests/x86/test_mremap_vdso.c +++ b/tools/testing/selftests/x86/test_mremap_vdso.c @@ -19,6 +19,7 @@ #include #include #include +#include "../kselftest.h" #define PAGE_SIZE 4096 @@ -29,13 +30,13 @@ static int try_to_remap(void *vdso_addr, unsigned long size) /* Searching for memory location where to remap */ dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (dest_addr == MAP_FAILED) { - printf("[WARN]\tmmap failed (%d): %m\n", errno); + ksft_print_msg("WARN: mmap failed (%d): %m\n", errno); return 0; } - printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n", - vdso_addr, (unsigned long)vdso_addr + size, - dest_addr, (unsigned long)dest_addr + size); + ksft_print_msg("Moving vDSO: [%p, %#lx] -> [%p, %#lx]\n", + vdso_addr, (unsigned long)vdso_addr + size, + dest_addr, (unsigned long)dest_addr + size); fflush(stdout); new_addr = mremap(vdso_addr, size, size, @@ -43,10 +44,10 @@ static int try_to_remap(void *vdso_addr, unsigned long size) if ((unsigned long)new_addr == (unsigned long)-1) { munmap(dest_addr, size); if (errno == EINVAL) { - printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n"); + ksft_print_msg("vDSO partial move failed, will try with bigger size\n"); return -1; /* Retry with larger */ } - printf("[FAIL]\tmremap failed (%d): %m\n", errno); + ksft_print_msg("[FAIL]\tmremap failed (%d): %m\n", errno); return 1; } @@ -58,11 +59,12 @@ int main(int argc, char **argv, char **envp) { pid_t child; + ksft_print_header(); + ksft_set_plan(1); + child = fork(); - if (child == -1) { - printf("[WARN]\tfailed to fork (%d): %m\n", errno); - return 1; - } + if (child == -1) + ksft_exit_fail_msg("failed to fork (%d): %m\n", errno); if (child == 0) { unsigned long vdso_size = PAGE_SIZE; @@ -70,9 +72,9 @@ int main(int argc, char **argv, char **envp) int ret = -1; auxval = getauxval(AT_SYSINFO_EHDR); - printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval); + ksft_print_msg("AT_SYSINFO_EHDR is %#lx\n", auxval); if (!auxval || auxval == -ENOENT) { - printf("[WARN]\tgetauxval failed\n"); + ksft_print_msg("WARN: getauxval failed\n"); return 0; } @@ -92,16 +94,13 @@ int main(int argc, char **argv, char **envp) int status; if (waitpid(child, &status, 0) != child || - !WIFEXITED(status)) { - printf("[FAIL]\tmremap() of the vDSO does not work on this kernel!\n"); - return 1; - } else if (WEXITSTATUS(status) != 0) { - printf("[FAIL]\tChild failed with %d\n", - WEXITSTATUS(status)); - return 1; - } - printf("[OK]\n"); + !WIFEXITED(status)) + ksft_test_result_fail("mremap() of the vDSO does not work on this kernel!\n"); + else if (WEXITSTATUS(status) != 0) + ksft_test_result_fail("Child failed with %d\n", WEXITSTATUS(status)); + else + ksft_test_result_pass("%s\n", __func__); } - return 0; + ksft_finished(); } -- cgit v1.2.3 From fa04b7ffc23f2f6b84add32f7d31222434116365 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 27 Mar 2024 16:53:52 +0500 Subject: selftests/dmabuf-heap: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Improve the TAP messages as well. Reviewed-by: T.J. Mercier Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c | 247 +++++++++------------ 1 file changed, 101 insertions(+), 146 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c index 890a8236a8ba..5f541522364f 100644 --- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c +++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c @@ -15,6 +15,7 @@ #include #include #include +#include "../kselftest.h" #define DEVPATH "/dev/dma_heap" @@ -90,14 +91,13 @@ static int dmabuf_heap_open(char *name) char buf[256]; ret = snprintf(buf, 256, "%s/%s", DEVPATH, name); - if (ret < 0) { - printf("snprintf failed!\n"); - return ret; - } + if (ret < 0) + ksft_exit_fail_msg("snprintf failed! %d\n", ret); fd = open(buf, O_RDWR); if (fd < 0) - printf("open %s failed!\n", buf); + ksft_exit_fail_msg("open %s failed: %s\n", buf, strerror(errno)); + return fd; } @@ -140,7 +140,7 @@ static int dmabuf_sync(int fd, int start_stop) #define ONE_MEG (1024 * 1024) -static int test_alloc_and_import(char *heap_name) +static void test_alloc_and_import(char *heap_name) { int heap_fd = -1, dmabuf_fd = -1, importer_fd = -1; uint32_t handle = 0; @@ -148,27 +148,19 @@ static int test_alloc_and_import(char *heap_name) int ret; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing allocation and importing: "); + ksft_print_msg("Testing allocation and importing:\n"); ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0, &dmabuf_fd); if (ret) { - printf("FAIL (Allocation Failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (Allocation Failed!) %d\n", ret); + return; } + /* mmap and write a simple pattern */ - p = mmap(NULL, - ONE_MEG, - PROT_READ | PROT_WRITE, - MAP_SHARED, - dmabuf_fd, - 0); + p = mmap(NULL, ONE_MEG, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd, 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed): %s\n", strerror(errno)); + goto close_and_return; } dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START); @@ -178,71 +170,64 @@ static int test_alloc_and_import(char *heap_name) importer_fd = open_vgem(); if (importer_fd < 0) { - ret = importer_fd; - printf("(Could not open vgem - skipping): "); + ksft_test_result_skip("Could not open vgem %d\n", importer_fd); } else { ret = import_vgem_fd(importer_fd, dmabuf_fd, &handle); - if (ret < 0) { - printf("FAIL (Failed to import buffer)\n"); - goto out; - } + ksft_test_result(ret >= 0, "Import buffer %d\n", ret); } ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START); if (ret < 0) { - printf("FAIL (DMA_BUF_SYNC_START failed!)\n"); + ksft_print_msg("FAIL (DMA_BUF_SYNC_START failed!) %d\n", ret); goto out; } memset(p, 0xff, ONE_MEG); ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_END); if (ret < 0) { - printf("FAIL (DMA_BUF_SYNC_END failed!)\n"); + ksft_print_msg("FAIL (DMA_BUF_SYNC_END failed!) %d\n", ret); goto out; } close_handle(importer_fd, handle); - ret = 0; - printf(" OK\n"); + ksft_test_result_pass("%s dmabuf sync succeeded\n", __func__); + return; + out: - if (p) - munmap(p, ONE_MEG); - if (importer_fd >= 0) - close(importer_fd); - if (dmabuf_fd >= 0) - close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + ksft_test_result_fail("%s dmabuf sync failed\n", __func__); + munmap(p, ONE_MEG); + close(importer_fd); - return ret; +close_and_return: + close(dmabuf_fd); + close(heap_fd); } -static int test_alloc_zeroed(char *heap_name, size_t size) +static void test_alloc_zeroed(char *heap_name, size_t size) { int heap_fd = -1, dmabuf_fd[32]; - int i, j, ret; + int i, j, k, ret; void *p = NULL; char *c; - printf(" Testing alloced %ldk buffers are zeroed: ", size / 1024); + ksft_print_msg("Testing alloced %ldk buffers are zeroed:\n", size / 1024); heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; /* Allocate and fill a bunch of buffers */ for (i = 0; i < 32; i++) { ret = dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]); - if (ret < 0) { - printf("FAIL (Allocation (%i) failed)\n", i); - goto out; + if (ret) { + ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret); + goto close_and_return; } + /* mmap and fill with simple pattern */ p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno)); + goto close_and_return; } + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START); memset(p, 0xff, size); dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); @@ -251,48 +236,47 @@ static int test_alloc_zeroed(char *heap_name, size_t size) /* close them all */ for (i = 0; i < 32; i++) close(dmabuf_fd[i]); + ksft_test_result_pass("Allocate and fill a bunch of buffers\n"); /* Allocate and validate all buffers are zeroed */ for (i = 0; i < 32; i++) { ret = dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]); if (ret < 0) { - printf("FAIL (Allocation (%i) failed)\n", i); - goto out; + ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret); + goto close_and_return; } /* mmap and validate everything is zero */ p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0); if (p == MAP_FAILED) { - printf("FAIL (mmap() failed!)\n"); - ret = -1; - goto out; + ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno)); + goto close_and_return; } + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START); c = (char *)p; for (j = 0; j < size; j++) { if (c[j] != 0) { - printf("FAIL (Allocated buffer not zeroed @ %i)\n", j); - break; + ksft_print_msg("FAIL (Allocated buffer not zeroed @ %i)\n", j); + dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); + munmap(p, size); + goto out; } } dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END); munmap(p, size); } - /* close them all */ - for (i = 0; i < 32; i++) - close(dmabuf_fd[i]); - - close(heap_fd); - printf("OK\n"); - return 0; out: - while (i > 0) { - close(dmabuf_fd[i]); - i--; - } + ksft_test_result(i == 32, "Allocate and validate all buffers are zeroed\n"); + +close_and_return: + /* close them all */ + for (k = 0; k < i; k++) + close(dmabuf_fd[k]); + close(heap_fd); - return ret; + return; } /* Test the ioctl version compatibility w/ a smaller structure then expected */ @@ -360,126 +344,97 @@ static int dmabuf_heap_alloc_newer(int fd, size_t len, unsigned int flags, return ret; } -static int test_alloc_compat(char *heap_name) +static void test_alloc_compat(char *heap_name) { - int heap_fd = -1, dmabuf_fd = -1; - int ret; + int ret, heap_fd = -1, dmabuf_fd = -1; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing (theoretical)older alloc compat: "); + ksft_print_msg("Testing (theoretical) older alloc compat:\n"); ret = dmabuf_heap_alloc_older(heap_fd, ONE_MEG, 0, &dmabuf_fd); - if (ret) { - printf("FAIL (Older compat allocation failed!)\n"); - ret = -1; - goto out; - } - close(dmabuf_fd); - printf("OK\n"); + if (dmabuf_fd >= 0) + close(dmabuf_fd); + ksft_test_result(!ret, "dmabuf_heap_alloc_older\n"); - printf(" Testing (theoretical)newer alloc compat: "); + ksft_print_msg("Testing (theoretical) newer alloc compat:\n"); ret = dmabuf_heap_alloc_newer(heap_fd, ONE_MEG, 0, &dmabuf_fd); - if (ret) { - printf("FAIL (Newer compat allocation failed!)\n"); - ret = -1; - goto out; - } - printf("OK\n"); -out: if (dmabuf_fd >= 0) close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + ksft_test_result(!ret, "dmabuf_heap_alloc_newer\n"); - return ret; + close(heap_fd); } -static int test_alloc_errors(char *heap_name) +static void test_alloc_errors(char *heap_name) { int heap_fd = -1, dmabuf_fd = -1; int ret; heap_fd = dmabuf_heap_open(heap_name); - if (heap_fd < 0) - return -1; - printf(" Testing expected error cases: "); + ksft_print_msg("Testing expected error cases:\n"); ret = dmabuf_heap_alloc(0, ONE_MEG, 0x111111, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid fd)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid fd %d\n", ret); ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0x111111, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid heap flags)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret); ret = dmabuf_heap_alloc_fdflags(heap_fd, ONE_MEG, ~(O_RDWR | O_CLOEXEC), 0, &dmabuf_fd); - if (!ret) { - printf("FAIL (Did not see expected error (invalid fd flags)!)\n"); - ret = -1; - goto out; - } + ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret); - printf("OK\n"); - ret = 0; -out: if (dmabuf_fd >= 0) close(dmabuf_fd); - if (heap_fd >= 0) - close(heap_fd); + close(heap_fd); +} - return ret; +static int numer_of_heaps(void) +{ + DIR *d = opendir(DEVPATH); + struct dirent *dir; + int heaps = 0; + + while ((dir = readdir(d))) { + if (!strncmp(dir->d_name, ".", 2)) + continue; + if (!strncmp(dir->d_name, "..", 3)) + continue; + heaps++; + } + + return heaps; } int main(void) { - DIR *d; struct dirent *dir; - int ret = -1; + DIR *d; + + ksft_print_header(); d = opendir(DEVPATH); if (!d) { - printf("No %s directory?\n", DEVPATH); - return -1; + ksft_print_msg("No %s directory?\n", DEVPATH); + return KSFT_SKIP; } - while ((dir = readdir(d)) != NULL) { + ksft_set_plan(11 * numer_of_heaps()); + + while ((dir = readdir(d))) { if (!strncmp(dir->d_name, ".", 2)) continue; if (!strncmp(dir->d_name, "..", 3)) continue; - printf("Testing heap: %s\n", dir->d_name); - printf("=======================================\n"); - ret = test_alloc_and_import(dir->d_name); - if (ret) - break; - - ret = test_alloc_zeroed(dir->d_name, 4 * 1024); - if (ret) - break; - - ret = test_alloc_zeroed(dir->d_name, ONE_MEG); - if (ret) - break; - - ret = test_alloc_compat(dir->d_name); - if (ret) - break; - - ret = test_alloc_errors(dir->d_name); - if (ret) - break; + ksft_print_msg("Testing heap: %s\n", dir->d_name); + ksft_print_msg("=======================================\n"); + test_alloc_and_import(dir->d_name); + test_alloc_zeroed(dir->d_name, 4 * 1024); + test_alloc_zeroed(dir->d_name, ONE_MEG); + test_alloc_compat(dir->d_name); + test_alloc_errors(dir->d_name); } closedir(d); - return ret; + ksft_finished(); } -- cgit v1.2.3 From 8780bc88d4c86cdb7e9591bde9dd1110156a203b Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Tue, 27 Feb 2024 08:21:41 +0100 Subject: selftests/resctrl: Add cleanup function to test framework Resctrl selftests use very similar functions to cleanup after themselves. This creates a lot of code duplication. Also not being hooked to the test framework means that ctrl-c handler isn't aware of what test is currently running and executes all cleanups even though only one is needed. Add a function pointer to the resctrl_test struct and attach to it cleanup functions from individual tests. Signed-off-by: Maciej Wieczor-Retman Reviewed-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 1 + tools/testing/selftests/resctrl/cmt_test.c | 1 + tools/testing/selftests/resctrl/mba_test.c | 1 + tools/testing/selftests/resctrl/mbm_test.c | 1 + tools/testing/selftests/resctrl/resctrl.h | 2 ++ 5 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 4cb991be8e31..8fa4348ab461 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -373,6 +373,7 @@ struct resctrl_test l3_cat_test = { .resource = "L3", .feature_check = test_resource_feature_check, .run_test = cat_run_test, + .cleanup = cat_test_cleanup, }; struct resctrl_test l3_noncont_cat_test = { diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index a81f91222a89..a01ccf86e6ce 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -178,4 +178,5 @@ struct resctrl_test cmt_test = { .resource = "L3", .feature_check = cmt_feature_check, .run_test = cmt_run_test, + .cleanup = cmt_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 7946e32e85c8..189fbe20dc7b 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -180,4 +180,5 @@ struct resctrl_test mba_test = { .vendor_specific = ARCH_INTEL, .feature_check = mba_feature_check, .run_test = mba_run_test, + .cleanup = mba_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index d67ffa3ec63a..73d6a8b989f5 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -150,4 +150,5 @@ struct resctrl_test mbm_test = { .vendor_specific = ARCH_INTEL, .feature_check = mbm_feature_check, .run_test = mbm_run_test, + .cleanup = mbm_test_cleanup, }; diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 2051bd135e0d..221c94532733 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -72,6 +72,7 @@ struct user_params { * @disabled: Test is disabled * @feature_check: Callback to check required resctrl features * @run_test: Callback to run the test + * @cleanup: Callback to cleanup after the test */ struct resctrl_test { const char *name; @@ -82,6 +83,7 @@ struct resctrl_test { bool (*feature_check)(const struct resctrl_test *test); int (*run_test)(const struct resctrl_test *test, const struct user_params *uparams); + void (*cleanup)(void); }; /* -- cgit v1.2.3 From e6487230e952cfd4070c61141f011608841f36eb Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Tue, 27 Feb 2024 08:21:42 +0100 Subject: selftests/resctrl: Simplify cleanup in ctrl-c handler Ctrl-c handler isn't aware of what test is currently running. Because of that it executes all cleanups even if they aren't necessary. Since the ctrl-c handler uses the sa_sigaction system no parameters can be passed to it as function arguments. Add a global variable to make ctrl-c handler aware of the currently run test and only execute the correct cleanup callback. Signed-off-by: Maciej Wieczor-Retman Reviewed-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl.h | 3 +-- tools/testing/selftests/resctrl/resctrl_tests.c | 14 +++----------- tools/testing/selftests/resctrl/resctrl_val.c | 8 ++++++-- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 221c94532733..bc486f92aceb 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -158,7 +158,6 @@ int resctrl_val(const struct resctrl_test *test, const struct user_params *uparams, const char * const *benchmark_cmd, struct resctrl_val_param *param); -void tests_cleanup(void); void mbm_test_cleanup(void); void mba_test_cleanup(void); unsigned long create_bit_mask(unsigned int start, unsigned int len); @@ -168,7 +167,7 @@ int get_mask_no_shareable(const char *cache_type, unsigned long *mask); int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size); int resource_info_unsigned_get(const char *resource, const char *filename, unsigned int *val); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); -int signal_handler_register(void); +int signal_handler_register(const struct resctrl_test *test); void signal_handler_unregister(void); void cat_test_cleanup(void); unsigned int count_bits(unsigned long n); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index f3dc1b9696e7..0590daec2f44 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -81,19 +81,11 @@ static void cmd_help(void) printf("\t-h: help\n"); } -void tests_cleanup(void) -{ - mbm_test_cleanup(); - mba_test_cleanup(); - cmt_test_cleanup(); - cat_test_cleanup(); -} - -static int test_prepare(void) +static int test_prepare(const struct resctrl_test *test) { int res; - res = signal_handler_register(); + res = signal_handler_register(test); if (res) { ksft_print_msg("Failed to register signal handler\n"); return res; @@ -136,7 +128,7 @@ static void run_single_test(const struct resctrl_test *test, const struct user_p ksft_print_msg("Starting %s test ...\n", test->name); - if (test_prepare()) { + if (test_prepare(test)) { ksft_exit_fail_msg("Abnormal failure when preparing for the test\n"); return; } diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 5a49f07a6c85..445f306d4c2f 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -62,6 +62,7 @@ struct imc_counter_config { static char mbm_total_path[1024]; static int imcs; static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; +static const struct resctrl_test *current_test; void membw_initialize_perf_event_attr(int i, int j) { @@ -472,7 +473,8 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) if (bm_pid) kill(bm_pid, SIGKILL); umount_resctrlfs(); - tests_cleanup(); + if (current_test && current_test->cleanup) + current_test->cleanup(); ksft_print_msg("Ending\n\n"); exit(EXIT_SUCCESS); @@ -482,13 +484,14 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) * Register CTRL-C handler for parent, as it has to kill * child process before exiting. */ -int signal_handler_register(void) +int signal_handler_register(const struct resctrl_test *test) { struct sigaction sigact = {}; int ret = 0; bm_pid = 0; + current_test = test; sigact.sa_sigaction = ctrlc_handler; sigemptyset(&sigact.sa_mask); sigact.sa_flags = SA_SIGINFO; @@ -510,6 +513,7 @@ void signal_handler_unregister(void) { struct sigaction sigact = {}; + current_test = NULL; sigact.sa_handler = SIG_DFL; sigemptyset(&sigact.sa_mask); if (sigaction(SIGINT, &sigact, NULL) || -- cgit v1.2.3 From 6cd368982cf3f972e5298025af0a9d2b69045cfe Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Tue, 27 Feb 2024 08:21:43 +0100 Subject: selftests/resctrl: Move cleanups out of individual tests Every test calls its cleanup function at the end of it's test function. After the cleanup function pointer is added to the test framework this can be simplified to executing the callback function at the end of the generic test running function. Make test cleanup functions static and call them from the end of run_single_test() from the resctrl_test's cleanup function pointer. Signed-off-by: Maciej Wieczor-Retman Reviewed-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 7 ++----- tools/testing/selftests/resctrl/cmt_test.c | 3 +-- tools/testing/selftests/resctrl/mba_test.c | 7 ++----- tools/testing/selftests/resctrl/mbm_test.c | 7 ++----- tools/testing/selftests/resctrl/resctrl.h | 4 ---- tools/testing/selftests/resctrl/resctrl_tests.c | 6 ++++-- 6 files changed, 11 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 8fa4348ab461..c7686fb6641a 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -128,7 +128,7 @@ static int check_results(struct resctrl_val_param *param, const char *cache_type return fail; } -void cat_test_cleanup(void) +static void cat_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -284,13 +284,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param ret = cat_test(test, uparams, ¶m, span, start_mask); if (ret) - goto out; + return ret; ret = check_results(¶m, test->resource, cache_total_size, full_cache_mask, start_mask); -out: - cat_test_cleanup(); - return ret; } diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index a01ccf86e6ce..a44e6fcd37b7 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -91,7 +91,7 @@ static int check_results(struct resctrl_val_param *param, size_t span, int no_of MAX_DIFF, MAX_DIFF_PERCENT, runs - 1, true); } -void cmt_test_cleanup(void) +static void cmt_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -161,7 +161,6 @@ static int cmt_run_test(const struct resctrl_test *test, const struct user_param ksft_print_msg("Intel CMT may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n"); out: - cmt_test_cleanup(); free(span_str); return ret; diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 189fbe20dc7b..5d6af9e8afed 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -137,7 +137,7 @@ static int check_results(void) return show_mba_info(bw_imc, bw_resc); } -void mba_test_cleanup(void) +static void mba_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -158,13 +158,10 @@ static int mba_run_test(const struct resctrl_test *test, const struct user_param ret = resctrl_val(test, uparams, uparams->benchmark_cmd, ¶m); if (ret) - goto out; + return ret; ret = check_results(); -out: - mba_test_cleanup(); - return ret; } diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 73d6a8b989f5..3059ccc51a5a 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -105,7 +105,7 @@ static int mbm_setup(const struct resctrl_test *test, return ret; } -void mbm_test_cleanup(void) +static void mbm_test_cleanup(void) { remove(RESULT_FILE_NAME); } @@ -126,15 +126,12 @@ static int mbm_run_test(const struct resctrl_test *test, const struct user_param ret = resctrl_val(test, uparams, uparams->benchmark_cmd, ¶m); if (ret) - goto out; + return ret; ret = check_results(DEFAULT_SPAN); if (ret && (get_vendor() == ARCH_INTEL)) ksft_print_msg("Intel MBM may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n"); -out: - mbm_test_cleanup(); - return ret; } diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index bc486f92aceb..00d51fa7531c 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -158,8 +158,6 @@ int resctrl_val(const struct resctrl_test *test, const struct user_params *uparams, const char * const *benchmark_cmd, struct resctrl_val_param *param); -void mbm_test_cleanup(void); -void mba_test_cleanup(void); unsigned long create_bit_mask(unsigned int start, unsigned int len); unsigned int count_contiguous_bits(unsigned long val, unsigned int *start); int get_full_cbm(const char *cache_type, unsigned long *mask); @@ -169,9 +167,7 @@ int resource_info_unsigned_get(const char *resource, const char *filename, unsig void ctrlc_handler(int signum, siginfo_t *info, void *ptr); int signal_handler_register(const struct resctrl_test *test); void signal_handler_unregister(void); -void cat_test_cleanup(void); unsigned int count_bits(unsigned long n); -void cmt_test_cleanup(void); void perf_event_attr_initialize(struct perf_event_attr *pea, __u64 config); void perf_event_initialize_read_format(struct perf_event_read *pe_read); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 0590daec2f44..348d17cb2a84 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -100,8 +100,10 @@ static int test_prepare(const struct resctrl_test *test) return 0; } -static void test_cleanup(void) +static void test_cleanup(const struct resctrl_test *test) { + if (test->cleanup) + test->cleanup(); umount_resctrlfs(); signal_handler_unregister(); } @@ -143,7 +145,7 @@ static void run_single_test(const struct resctrl_test *test, const struct user_p ksft_test_result(!ret, "%s: test\n", test->name); cleanup: - test_cleanup(); + test_cleanup(test); } static void init_user_params(struct user_params *uparams) -- cgit v1.2.3 From 449a3c6c394bdd2a72a8ee8f3c23c51a680a59e2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 25 Mar 2024 14:29:09 +0000 Subject: kselftest/clone3: Make test names for set_tid test stable The test results reported for the clone3_set_tid tests interact poorly with automation for running kselftest since the reported test names include TIDs dynamically allocated at runtime. A lot of automation for running kselftest will compare runs by looking at the test name to identify if the same test is being run so changing names make it look like the testsuite has been updated to include new tests. This makes the results display less clearly and breaks cases like bisection. Address this by providing a brief description of the tests and logging that along with the stable parameters for the test currently logged. The TIDs are already logged separately in existing logging except for the final test which has a new log message added. We also tweak the formatting of the logging of expected/actual values for clarity. There are still issues with the logging of skipped tests (many are simply not logged at all when skipped and all are logged with different names) but these are less disruptive since the skips are all based on not being run as root, a condition likely to be stable for a given test system. Acked-by: Christian Brauner Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/clone3/clone3_set_tid.c | 117 ++++++++++++++---------- 1 file changed, 69 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index ed785afb6077..9ae38733cb6e 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -114,7 +114,8 @@ static int call_clone3_set_tid(pid_t *set_tid, return WEXITSTATUS(status); } -static void test_clone3_set_tid(pid_t *set_tid, +static void test_clone3_set_tid(const char *desc, + pid_t *set_tid, size_t set_tid_size, int flags, int expected, @@ -129,17 +130,13 @@ static void test_clone3_set_tid(pid_t *set_tid, ret = call_clone3_set_tid(set_tid, set_tid_size, flags, expected_pid, wait_for_it); ksft_print_msg( - "[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n", + "[%d] clone3() with CLONE_SET_TID %d says: %d - expected %d\n", getpid(), set_tid[0], ret, expected); - if (ret != expected) - ksft_test_result_fail( - "[%d] Result (%d) is different than expected (%d)\n", - getpid(), ret, expected); - else - ksft_test_result_pass( - "[%d] Result (%d) matches expectation (%d)\n", - getpid(), ret, expected); + + ksft_test_result(ret == expected, "%s with %d TIDs and flags 0x%x\n", + desc, set_tid_size, flags); } + int main(int argc, char *argv[]) { FILE *f; @@ -172,73 +169,91 @@ int main(int argc, char *argv[]) /* Try invalid settings */ memset(&set_tid, 0, sizeof(set_tid)); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, - -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); /* * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 * nested PID namespace. */ - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, 0 TID", + set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); memset(&set_tid, 0xff, sizeof(set_tid)); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, - -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); /* * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 * nested PID namespace. */ - test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("invalid size, TID all 1s", + set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); memset(&set_tid, 0, sizeof(set_tid)); /* Try with an invalid PID */ set_tid[0] = 0; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, 0 TID", + set_tid, 1, 0, -EINVAL, 0, 0); set_tid[0] = -1; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, -1 TID", + set_tid, 1, 0, -EINVAL, 0, 0); /* Claim that the set_tid array actually contains 2 elements. */ - test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("2 TIDs, -1 and 0", + set_tid, 2, 0, -EINVAL, 0, 0); /* Try it in a new PID namespace */ if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("valid size, -1 TID", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* Try with a valid PID (1) this should return -EEXIST. */ set_tid[0] = 1; if (uid == 0) - test_clone3_set_tid(set_tid, 1, 0, -EEXIST, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, 0, -EEXIST, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* Try it in a new PID namespace */ if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, 0, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, CLONE_NEWPID, 0, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); /* pid_max should fail everywhere */ set_tid[0] = pid_max; - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("set TID to maximum", + set_tid, 1, 0, -EINVAL, 0, 0); if (uid == 0) - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("set TID to maximum", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); else ksft_test_result_skip("Clone3() with set_tid requires root\n"); @@ -262,10 +277,12 @@ int main(int argc, char *argv[]) /* After the child has finished, its PID should be free. */ set_tid[0] = pid; - test_clone3_set_tid(set_tid, 1, 0, 0, 0, 0); + test_clone3_set_tid("reallocate child TID", + set_tid, 1, 0, 0, 0, 0); /* This should fail as there is no PID 1 in that namespace */ - test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("duplicate child TID", + set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); /* * Creating a process with PID 1 in the newly created most nested @@ -274,7 +291,8 @@ int main(int argc, char *argv[]) */ set_tid[0] = 1; set_tid[1] = pid; - test_clone3_set_tid(set_tid, 2, CLONE_NEWPID, 0, pid, 0); + test_clone3_set_tid("create PID 1 in new NS", + set_tid, 2, CLONE_NEWPID, 0, pid, 0); ksft_print_msg("unshare PID namespace\n"); if (unshare(CLONE_NEWPID) == -1) @@ -284,7 +302,8 @@ int main(int argc, char *argv[]) set_tid[0] = pid; /* This should fail as there is no PID 1 in that namespace */ - test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + test_clone3_set_tid("duplicate PID 1", + set_tid, 1, 0, -EINVAL, 0, 0); /* Let's create a PID 1 */ ns_pid = fork(); @@ -295,21 +314,25 @@ int main(int argc, char *argv[]) */ set_tid[0] = 43; set_tid[1] = -1; - test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + test_clone3_set_tid("check leak on invalid TID -1", + set_tid, 2, 0, -EINVAL, 0, 0); set_tid[0] = 43; set_tid[1] = pid; - test_clone3_set_tid(set_tid, 2, 0, 0, 43, 0); + test_clone3_set_tid("check leak on invalid specific TID", + set_tid, 2, 0, 0, 43, 0); ksft_print_msg("Child in PID namespace has PID %d\n", getpid()); set_tid[0] = 2; - test_clone3_set_tid(set_tid, 1, 0, 0, 2, 0); + test_clone3_set_tid("create PID 2 in child NS", + set_tid, 1, 0, 0, 2, 0); set_tid[0] = 1; set_tid[1] = -1; set_tid[2] = pid; /* This should fail as there is invalid PID at level '1'. */ - test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("fail due to invalid TID at level 1", + set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0); set_tid[0] = 1; set_tid[1] = 42; @@ -319,13 +342,15 @@ int main(int argc, char *argv[]) * namespaces. Again assuming this is running in the host's * PID namespace. Not yet nested. */ - test_clone3_set_tid(set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0); + test_clone3_set_tid("fail due to too few active PID NSs", + set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0); /* * This should work and from the parent we should see * something like 'NSpid: pid 42 1'. */ - test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true); + test_clone3_set_tid("verify that we have 3 PID NSs", + set_tid, 3, CLONE_NEWPID, 0, 42, true); child_exit(ksft_cnt.ksft_fail); } @@ -380,14 +405,10 @@ int main(int argc, char *argv[]) ksft_cnt.ksft_pass += 6 - (ksft_cnt.ksft_fail - WEXITSTATUS(status)); ksft_cnt.ksft_fail = WEXITSTATUS(status); - if (ns3 == pid && ns2 == 42 && ns1 == 1) - ksft_test_result_pass( - "PIDs in all namespaces as expected (%d,%d,%d)\n", - ns3, ns2, ns1); - else - ksft_test_result_fail( - "PIDs in all namespaces not as expected (%d,%d,%d)\n", - ns3, ns2, ns1); + ksft_print_msg("Expecting PIDs %d, 42, 1\n", pid); + ksft_print_msg("Have PIDs in namespaces: %d, %d, %d\n", ns3, ns2, ns1); + ksft_test_result(ns3 == pid && ns2 == 42 && ns1 == 1, + "PIDs in all namespaces as expected\n"); out: ret = 0; -- cgit v1.2.3 From b4970a8c50c4a8310e907d25abbce5094564633b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 4 Apr 2024 20:55:10 +0500 Subject: kselftest: Add missing signature to the comments The comment on top of the file is used by many developers to glance over all the available functions. Add the recently added ksft_perror() to it. Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 29efce369a36..8e9e4b86645a 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -16,6 +16,7 @@ * For each test, report any progress, debugging, etc with: * * ksft_print_msg(fmt, ...); + * ksft_perror(msg); * * and finally report the pass/fail/skip/xfail state of the test with one of: * -- cgit v1.2.3 From 86483f8b4e8d3eb364bfa9049b46fea3578af050 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 4 Apr 2024 21:14:32 +0500 Subject: selftests: add ksft_exit_fail_perror() Add a version of ksft_exit_fail_msg() which prints the errno and its string form with ease. There is no benefit of exit message without errno. Whenever some error occurs, instead of printing errno manually, this function would be very helpful. In the next TAP ports or new tests, this function will be used instead of ksft_exit_fail_msg() as it prints errno. Resolved merge conflict found in next between the following commits: f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn") f07041728422 ("selftests: add ksft_exit_fail_perror()") Reported-by: Stephen Rothwell Shuah Khan Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 8e9e4b86645a..0a983b27771e 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -41,6 +41,7 @@ * the program is aborting before finishing all tests): * * ksft_exit_fail_msg(fmt, ...); + * ksft_exit_fail_perror(msg); * */ #ifndef __KSELFTEST_H @@ -376,6 +377,19 @@ static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, exit(KSFT_FAIL); } +static inline void ksft_exit_fail_perror(const char *msg) +{ +#ifndef NOLIBC + ksft_exit_fail_msg("%s: %s (%d)\n", msg, strerror(errno), errno); +#else + /* + * nolibc doesn't provide strerror() and it seems + * inappropriate to add one, just print the errno. + */ + ksft_exit_fail_msg("%s: %d)\n", msg, errno); +#endif +} + static inline __noreturn int ksft_exit_xfail(void) { ksft_print_cnts(); -- cgit v1.2.3 From 9c84b890b8f60c4dfb69f4ba206949db72fd2416 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 4 Apr 2024 21:14:33 +0500 Subject: selftests: exec: Use new ksft_exit_fail_perror() helper Use ksft_exit_fail_perror() to print the value of errno and its string form. This is the first user of the ksft_exit_fail_perror() and proves the usefulness of this API. Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/exec/recursion-depth.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c index b2f37d86a5f6..438c8ff2fd26 100644 --- a/tools/testing/selftests/exec/recursion-depth.c +++ b/tools/testing/selftests/exec/recursion-depth.c @@ -37,25 +37,25 @@ int main(void) ksft_test_result_skip("error: unshare, errno %d\n", errno); ksft_finished(); } - ksft_exit_fail_msg("error: unshare, errno %d\n", errno); + ksft_exit_fail_perror("error: unshare"); } if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL) == -1) - ksft_exit_fail_msg("error: mount '/', errno %d\n", errno); + ksft_exit_fail_perror("error: mount '/'"); /* Require "exec" filesystem. */ if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) - ksft_exit_fail_msg("error: mount ramfs, errno %d\n", errno); + ksft_exit_fail_perror("error: mount ramfs"); #define FILENAME "/tmp/1" fd = creat(FILENAME, 0700); if (fd == -1) - ksft_exit_fail_msg("error: creat, errno %d\n", errno); + ksft_exit_fail_perror("error: creat"); #define S "#!" FILENAME "\n" if (write(fd, S, strlen(S)) != strlen(S)) - ksft_exit_fail_msg("error: write, errno %d\n", errno); + ksft_exit_fail_perror("error: write"); close(fd); -- cgit v1.2.3 From 57c6c58919c929b30820b400334e6291602b4477 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 25 Mar 2024 16:15:50 +0000 Subject: tracing/selftests: Support log output when generating KTAP output When -v is specified ftracetest will dump logs of test execution to the console which if -K is also specified for KTAP output will result in output that is not properly KTAP formatted. All that's required for KTAP formatting is that anything we log have a '#' at the start of the line so we can improve things by washing the output through a simple read loop. This will help automated parsers when verbose mode is enabled. Signed-off-by: Mark Brown Reviewed-by: Muhammad Usama Anjum Acked-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/ftracetest | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 25d4e0fca385..cce72f8b03dc 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -255,7 +255,13 @@ prlog() { # messages [ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE } catlog() { #file - cat $1 + if [ "${KTAP}" = "1" ]; then + cat $1 | while read line ; do + echo "# $line" + done + else + cat $1 + fi [ "$LOG_FILE" ] && cat $1 | strip_esc >> $LOG_FILE } prlog "=== Ftrace unit tests ===" -- cgit v1.2.3 From c1b121eafd9b94079300bc1f21a5bf82e72a99b4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 25 Mar 2024 16:15:51 +0000 Subject: tracing/selftests: Default to verbose mode when running in kselftest In order to facilitate debugging of issues from automated runs of the ftrace selftests turn on verbose logging by default when run from the kselftest runner. This is primarily used by automated systems where developers may not have direct access to the system so defaulting to providing diagnostic information which might help debug problems seems like a good idea. When tests pass no extra output is generated, when they fail a full log of the test run is provided. Since this really is rather verbose when there are a large number of test failures or output is slow (eg, with a serial console) this could substantially increase the run time for the tests which might present problems with timeout detection for affected systems, hopefully we keep the tests running well enough that this is not too much of an issue. Signed-off-by: Mark Brown Reviewed-by: Muhammad Usama Anjum Acked-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/ftracetest-ktap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/ftracetest-ktap b/tools/testing/selftests/ftrace/ftracetest-ktap index b3284679ef3a..14e62ef3f3b9 100755 --- a/tools/testing/selftests/ftrace/ftracetest-ktap +++ b/tools/testing/selftests/ftrace/ftracetest-ktap @@ -5,4 +5,4 @@ # # Copyright (C) Arm Ltd., 2023 -./ftracetest -K +./ftracetest -K -v -- cgit v1.2.3 From 7b8674cae80f68304ec6628e455c169a7dc2ad0d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 9 Apr 2024 23:24:51 +0100 Subject: selftests/clone3: Fix compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shuah reported a compiler warning with an Ubuntu GCC 13 build, I've been unable to reproduce it but hopefully this fixes the issue: clone3_set_tid.c:136:43: warning: format ‘%d’ expects argument of type ‘int’, but argument 3 has type ‘size_t’ {aka ‘long unsigned int’} [-Wformat=] Reported-by: Shuah Khan Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/clone3/clone3_set_tid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index 9ae38733cb6e..fbf813a5a06f 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -133,7 +133,7 @@ static void test_clone3_set_tid(const char *desc, "[%d] clone3() with CLONE_SET_TID %d says: %d - expected %d\n", getpid(), set_tid[0], ret, expected); - ksft_test_result(ret == expected, "%s with %d TIDs and flags 0x%x\n", + ksft_test_result(ret == expected, "%s with %zu TIDs and flags 0x%x\n", desc, set_tid_size, flags); } -- cgit v1.2.3 From 698eb790e016427bf3b2e55693e653222af13691 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 9 Apr 2024 21:39:43 +0100 Subject: selftests/clone3: Check that the child exited cleanly When the child exits during the clone3() selftest we use WEXITSTATUS() to get the exit status from the process without first checking WIFEXITED() to see if the result will be valid. This can lead to incorrect results, for example if the child exits due to signal. Add a WIFEXTED() check and report any non-standard exit as a failure, using EXIT_FAILURE as the exit status for call_clone3() since we otherwise report 0 or negative errnos. Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/clone3/clone3.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 3c9bf0cd82a8..0e0e5dfa97c6 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -98,6 +98,11 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) ksft_print_msg("Child returned %s\n", strerror(errno)); return -errno; } + if (!WIFEXITED(status)) { + ksft_print_msg("Child did not exit normally, status 0x%x\n", + status); + return EXIT_FAILURE; + } if (WEXITSTATUS(status)) return WEXITSTATUS(status); -- cgit v1.2.3 From 6a5695119e0a8755d907a9a76b08307e41b98fec Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 9 Apr 2024 21:40:32 +0100 Subject: selftests/clone3: Correct log message for waitpid() failures When logging an error from calling waitpid() on the child we print a misleading error message saying that the error we report was returned by the chilld. Fix this to say the error is from waitpid(). Applied after fixing merge conflict: Shuah Khan Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/clone3/clone3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 0e0e5dfa97c6..e61f07973ce5 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -95,7 +95,7 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) getpid(), pid); if (waitpid(-1, &status, __WALL) < 0) { - ksft_print_msg("Child returned %s\n", strerror(errno)); + ksft_print_msg("waitpid() returned %s\n", strerror(errno)); return -errno; } if (!WIFEXITED(status)) { -- cgit v1.2.3 From 557f1375275e04ef92d22aa14203e4b763fbd22b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Sun, 14 Apr 2024 11:26:53 +0500 Subject: selftests: Mark ksft_exit_fail_perror() as __noreturn Let the compilers (clang) know that this function would just call exit() and would never return. It is needed to avoid false positive static analysis errors. All similar functions calling exit() unconditionally have been marked as __noreturn. Signed-off-by: Muhammad Usama Anjum Reviewed-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 0a983b27771e..3d2256885cff 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -377,7 +377,7 @@ static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, exit(KSFT_FAIL); } -static inline void ksft_exit_fail_perror(const char *msg) +static inline __noreturn void ksft_exit_fail_perror(const char *msg) { #ifndef NOLIBC ksft_exit_fail_msg("%s: %s (%d)\n", msg, strerror(errno), errno); -- cgit v1.2.3 From c7e84706fd3be0b56ae23c6a8930a9e5615a869a Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 18 Apr 2024 20:31:45 +0500 Subject: selftests: cpufreq: conform test to TAP This test outputs lots of information. Let's conform the core part of the test to TAP and leave the information printing messages for now. Include ktap_helpers.sh to print conformed logs. Use KSFT_* macros to return the correct exit code for the kselftest framework and CIs to understand the exit status. Signed-off-by: Muhammad Usama Anjum Acked-by: Viresh Kumar Signed-off-by: Shuah Khan --- tools/testing/selftests/cpufreq/cpufreq.sh | 3 +- tools/testing/selftests/cpufreq/main.sh | 47 ++++++++++++++++++------------ tools/testing/selftests/cpufreq/module.sh | 6 ++-- 3 files changed, 31 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh index b583a2fb4504..a8b1dbc0a3a5 100755 --- a/tools/testing/selftests/cpufreq/cpufreq.sh +++ b/tools/testing/selftests/cpufreq/cpufreq.sh @@ -178,8 +178,7 @@ cpufreq_basic_tests() count=$(count_cpufreq_managed_cpus) if [ $count = 0 ]; then - printf "No cpu is managed by cpufreq core, exiting\n" - exit; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting\n" else printf "CPUFreq manages: $count CPUs\n\n" fi diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index 60ce18ed0666..a0eb84cf7167 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -7,15 +7,15 @@ source governor.sh source module.sh source special-tests.sh +DIR="$(dirname $(readlink -f "$0"))" +source "${DIR}"/../kselftest/ktap_helpers.sh + FUNC=basic # do basic tests by default OUTFILE=cpufreq_selftest SYSFS= CPUROOT= CPUFREQROOT= -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - helpme() { printf "Usage: $0 [-h] [-todg args] @@ -32,7 +32,7 @@ helpme() [-d \"] [-g \"] \n" - exit 2 + exit "${KSFT_FAIL}" } prerequisite() @@ -40,8 +40,8 @@ prerequisite() msg="skip all tests:" if [ $UID != 0 ]; then - echo $msg must be run as root >&2 - exit $ksft_skip + ktap_skip_all "$msg must be run as root" + exit "${KSFT_SKIP}" fi taskset -p 01 $$ @@ -49,21 +49,21 @@ prerequisite() SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'` if [ ! -d "$SYSFS" ]; then - echo $msg sysfs is not mounted >&2 - exit 2 + ktap_skip_all "$msg sysfs is not mounted" + exit "${KSFT_SKIP}" fi CPUROOT=$SYSFS/devices/system/cpu CPUFREQROOT="$CPUROOT/cpufreq" if ! ls $CPUROOT/cpu* > /dev/null 2>&1; then - echo $msg cpus not available in sysfs >&2 - exit 2 + ktap_skip_all "$msg cpus not available in sysfs" + exit "${KSFT_SKIP}" fi if ! ls $CPUROOT/cpufreq > /dev/null 2>&1; then - echo $msg cpufreq directory not available in sysfs >&2 - exit 2 + ktap_skip_all "$msg cpufreq directory not available in sysfs" + exit "${KSFT_SKIP}" fi } @@ -105,8 +105,7 @@ do_test() count=$(count_cpufreq_managed_cpus) if [ $count = 0 -a $FUNC != "modtest" ]; then - echo "No cpu is managed by cpufreq core, exiting" - exit 2; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting" fi case "$FUNC" in @@ -125,8 +124,7 @@ do_test() "modtest") # Do we have modules in place? if [ -z $DRIVER_MOD ] && [ -z $GOVERNOR_MOD ]; then - echo "No driver or governor module passed with -d or -g" - exit 2; + ktap_exit_fail_msg "No driver or governor module passed with -d or -g" fi if [ $DRIVER_MOD ]; then @@ -137,8 +135,7 @@ do_test() fi else if [ $count = 0 ]; then - echo "No cpu is managed by cpufreq core, exiting" - exit 2; + ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting" fi module_governor_test $GOVERNOR_MOD @@ -162,7 +159,7 @@ do_test() ;; *) - echo "Invalid [-f] function type" + ktap_print_msg "Invalid [-f] function type" helpme ;; esac @@ -186,13 +183,25 @@ dmesg_dumps() dmesg >> $1.dmesg_full.txt } +ktap_print_header + # Parse arguments parse_arguments $@ +ktap_set_plan 1 + # Make sure all requirements are met prerequisite # Run requested functions clear_dumps $OUTFILE do_test | tee -a $OUTFILE.txt +if [ "${PIPESTATUS[0]}" -ne 0 ]; then + exit ${PIPESTATUS[0]}; +fi dmesg_dumps $OUTFILE + +ktap_test_pass "Completed successfully" + +ktap_print_totals +exit "${KSFT_PASS}" diff --git a/tools/testing/selftests/cpufreq/module.sh b/tools/testing/selftests/cpufreq/module.sh index 22563cd122e7..7f2667e0ae2d 100755 --- a/tools/testing/selftests/cpufreq/module.sh +++ b/tools/testing/selftests/cpufreq/module.sh @@ -24,16 +24,14 @@ test_basic_insmod_rmmod() # insert module insmod $1 if [ $? != 0 ]; then - printf "Insmod $1 failed\n" - exit; + ktap_exit_fail_msg "Insmod $1 failed\n" fi printf "Removing $1 module\n" # remove module rmmod $1 if [ $? != 0 ]; then - printf "rmmod $1 failed\n" - exit; + ktap_exit_fail_msg "rmmod $1 failed\n" fi printf "\n" -- cgit v1.2.3 From 45d5a2b1886a3ff0fe5627ebee84c089db7ff5f2 Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. Prado" Date: Mon, 15 Apr 2024 11:32:15 -0400 Subject: selftests: ktap_helpers: Make it POSIX-compliant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple uses of bash specific syntax in the script. Change them to the equivalent POSIX syntax. This doesn't change functionality and allows non-bash test scripts to make use of these helpers. Reported-by: Mike Looijmans Closes: https://lore.kernel.org/all/efae4037-c22a-40be-8ba9-7c1c12ece042@topic.nl/ Fixes: 2dd0b5a8fcc4 ("selftests: ktap_helpers: Add a helper to finish the test") Fixes: 14571ab1ad21 ("kselftest: Add new test for detecting unprobed Devicetree devices") Signed-off-by: Nícolas F. R. A. Prado Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest/ktap_helpers.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh index f2fbb914e058..79a125eb24c2 100644 --- a/tools/testing/selftests/kselftest/ktap_helpers.sh +++ b/tools/testing/selftests/kselftest/ktap_helpers.sh @@ -43,7 +43,7 @@ __ktap_test() { directive="$3" # optional local directive_str= - [[ ! -z "$directive" ]] && directive_str="# $directive" + [ ! -z "$directive" ] && directive_str="# $directive" echo $result $KTAP_TESTNO $description $directive_str @@ -99,7 +99,7 @@ ktap_exit_fail_msg() { ktap_finished() { ktap_print_totals - if [ $(("$KTAP_CNT_PASS" + "$KTAP_CNT_SKIP")) -eq "$KSFT_NUM_TESTS" ]; then + if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP)) -eq "$KSFT_NUM_TESTS" ]; then exit "$KSFT_PASS" else exit "$KSFT_FAIL" -- cgit v1.2.3 From 5b1c8b1e56ff8b5e9c1a09606af3627bb55933cf Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. Prado" Date: Mon, 15 Apr 2024 11:32:16 -0400 Subject: selftests: power_supply: Make it POSIX-compliant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is one use of bash specific syntax in the script. Change it to the equivalent POSIX syntax. This doesn't change functionality and allows the test to be run on shells other than bash. Reported-by: Mike Looijmans Closes: https://lore.kernel.org/all/efae4037-c22a-40be-8ba9-7c1c12ece042@topic.nl/ Fixes: 4a679c5afca0 ("selftests: Add test to verify power supply properties") Signed-off-by: Nícolas F. R. A. Prado Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/power_supply/test_power_supply_properties.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/power_supply/test_power_supply_properties.sh b/tools/testing/selftests/power_supply/test_power_supply_properties.sh index df272dfe1d2a..a66b1313ed88 100755 --- a/tools/testing/selftests/power_supply/test_power_supply_properties.sh +++ b/tools/testing/selftests/power_supply/test_power_supply_properties.sh @@ -23,7 +23,7 @@ count_tests() { total_tests=0 for i in $SUPPLIES; do - total_tests=$(("$total_tests" + "$NUM_TESTS")) + total_tests=$((total_tests + NUM_TESTS)) done echo "$total_tests" -- cgit v1.2.3 From 5ca6110661bd601e6a791eafa92c028af1816797 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:04 -0700 Subject: selftests/clone3: ksft_exit functions do not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the calls to ksft_exit_{pass,fail}(), as __noreturn prevents the compiler from warning that a caller of the ksft_exit functions does not return a value because the program will terminate upon calling these functions. Just removing 'return' would have resulted in !ret ? ksft_exit_pass() : ksft_exit_fail(); so convert that into the more idiomatic if (ret) ksft_exit_fail(); ksft_exit_pass(); Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/clone3/clone3_clear_sighand.c | 2 +- tools/testing/selftests/clone3/clone3_set_tid.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c index 54a8b2445be9..ce0426786828 100644 --- a/tools/testing/selftests/clone3/clone3_clear_sighand.c +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -120,5 +120,5 @@ int main(int argc, char **argv) test_clone3_clear_sighand(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index fbf813a5a06f..bfb0da2b4fdd 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -412,5 +412,7 @@ int main(int argc, char *argv[]) out: ret = 0; - return !ret ? ksft_exit_pass() : ksft_exit_fail(); + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); } -- cgit v1.2.3 From e84b354e6ea15ce04b4fe766e75ab1d4379df5c4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:05 -0700 Subject: selftests/ipc: ksft_exit functions do not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the calls to ksft_exit_...(), as __noreturn prevents the compiler from warning that a caller of the ksft_exit functions does not return a value because the program will terminate upon calling these functions. Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/ipc/msgque.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c index 656c43c24044..c75ea4094870 100644 --- a/tools/testing/selftests/ipc/msgque.c +++ b/tools/testing/selftests/ipc/msgque.c @@ -198,13 +198,12 @@ int main(int argc, char **argv) struct msgque_data msgque; if (getuid() != 0) - return ksft_exit_skip( - "Please run the test as root - Exiting.\n"); + ksft_exit_skip("Please run the test as root - Exiting.\n"); msgque.key = ftok(argv[0], 822155650); if (msgque.key == -1) { printf("Can't make key: %d\n", -errno); - return ksft_exit_fail(); + ksft_exit_fail(); } msgque.msq_id = msgget(msgque.key, IPC_CREAT | IPC_EXCL | 0666); @@ -243,13 +242,13 @@ int main(int argc, char **argv) printf("Failed to test queue: %d\n", err); goto err_out; } - return ksft_exit_pass(); + ksft_exit_pass(); err_destroy: if (msgctl(msgque.msq_id, IPC_RMID, NULL)) { printf("Failed to destroy queue: %d\n", -errno); - return ksft_exit_fail(); + ksft_exit_fail(); } err_out: - return ksft_exit_fail(); + ksft_exit_fail(); } -- cgit v1.2.3 From a9c91ecddc76fe7900213bf090d5df4035802cba Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:06 -0700 Subject: selftests: membarrier: ksft_exit_pass() does not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the calls to ksft_exit_pass(), as __noreturn prevents the compiler from warning that a caller of ksft_exit_pass() does not return a value because the program will terminate upon calling these functions. Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/membarrier/membarrier_test_multi_thread.c | 2 +- tools/testing/selftests/membarrier/membarrier_test_single_thread.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c index a9cc17facfb3..4e14dba81234 100644 --- a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c +++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c @@ -69,5 +69,5 @@ int main(int argc, char **argv) /* Multi-threaded */ test_mt_membarrier(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c index 4cdc8b1d124c..fa3f1d6c37a0 100644 --- a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c +++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c @@ -24,5 +24,5 @@ int main(int argc, char **argv) test_membarrier_get_registrations(/*cmd=*/0); - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From 69e545edbe8b17c26aa06ef7e430d0be7f08d876 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:07 -0700 Subject: selftests/mm: ksft_exit functions do not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the calls to ksft_exit_...(), as __noreturn prevents the compiler from warning that a caller of the ksft_exit functions does not return a value because the program will terminate upon calling these functions. Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/mm/compaction_test.c | 6 +++--- tools/testing/selftests/mm/cow.c | 2 +- tools/testing/selftests/mm/gup_longterm.c | 2 +- tools/testing/selftests/mm/gup_test.c | 4 ++-- tools/testing/selftests/mm/ksm_functional_tests.c | 2 +- tools/testing/selftests/mm/madv_populate.c | 2 +- tools/testing/selftests/mm/mkdirty.c | 2 +- tools/testing/selftests/mm/pagemap_ioctl.c | 4 ++-- tools/testing/selftests/mm/soft-dirty.c | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 533999b6c284..4f42eb7d7636 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -177,7 +177,7 @@ int main(int argc, char **argv) ksft_print_header(); if (prereq() || geteuid()) - return ksft_exit_skip("Prerequisites unsatisfied\n"); + ksft_exit_skip("Prerequisites unsatisfied\n"); ksft_set_plan(1); @@ -225,7 +225,7 @@ int main(int argc, char **argv) } if (check_compaction(mem_free, hugepage_size) == 0) - return ksft_exit_pass(); + ksft_exit_pass(); - return ksft_exit_fail(); + ksft_exit_fail(); } diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 363bf5f801be..fe078d6e1806 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1779,5 +1779,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index ad168d35b23b..d7eaca5bbe9b 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -456,5 +456,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 18a49c70d4c6..bd335cf9bc0e 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -228,7 +228,7 @@ int main(int argc, char **argv) break; } ksft_test_result_skip("Please run this test as root\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); @@ -267,5 +267,5 @@ int main(int argc, char **argv) free(tid); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d615767e396b..508287560c45 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -646,5 +646,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index 17bcb07f19f3..ef7d911da13e 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -307,5 +307,5 @@ int main(int argc, char **argv) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c index 301abb99e027..b8a7efe9204e 100644 --- a/tools/testing/selftests/mm/mkdirty.c +++ b/tools/testing/selftests/mm/mkdirty.c @@ -375,5 +375,5 @@ int main(void) if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index d59517ed3d48..2d785aca72a5 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1484,7 +1484,7 @@ int main(int argc, char *argv[]) ksft_print_header(); if (init_uffd()) - return ksft_exit_pass(); + ksft_exit_pass(); ksft_set_plan(115); @@ -1660,5 +1660,5 @@ int main(int argc, char *argv[]) userfaultfd_tests(); close(pagemap_fd); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 7dbfa53d93a0..d9dbf879748b 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -209,5 +209,5 @@ int main(int argc, char **argv) close(pagemap_fd); - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From a3bf0755f01568493e1f2e83ff98284c8c12e35d Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:08 -0700 Subject: selftests: pidfd: ksft_exit functions do not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the calls to ksft_exit_{pass,fail}(), as __noreturn prevents the compiler from warning that a caller of the ksft_exit functions does not return a value because the program will terminate upon calling these functions. Just removing 'return' would have resulted in !ret ? ksft_exit_pass() : ksft_exit_fail(); so convert that into the more idiomatic if (ret) ksft_exit_fail(); ksft_exit_pass(); Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd_fdinfo_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_open_test.c | 4 +++- tools/testing/selftests/pidfd/pidfd_poll_test.c | 2 +- tools/testing/selftests/pidfd/pidfd_test.c | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index 01cc37bf611c..f062a986e382 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -307,5 +307,5 @@ int main(int argc, char **argv) test_pidfd_fdinfo_nspid(); test_pidfd_dead_fdinfo(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index 8a59438ccc78..c62564c264b1 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -159,5 +159,7 @@ on_error: if (pidfd >= 0) close(pidfd); - return !ret ? ksft_exit_pass() : ksft_exit_fail(); + if (ret) + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c index 610811275357..55d74a50358f 100644 --- a/tools/testing/selftests/pidfd/pidfd_poll_test.c +++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c @@ -112,5 +112,5 @@ int main(int argc, char **argv) } ksft_test_result_pass("pidfd poll test: pass\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index c081ae91313a..9faa686f90e4 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -572,5 +572,5 @@ int main(int argc, char **argv) test_pidfd_send_signal_exited_fail(); test_pidfd_send_signal_recycled_pid_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From 47b59f3603d49d4898aafb256405b16ccdc68cc7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:09 -0700 Subject: selftests/resctrl: ksft_exit_skip() does not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the calls to ksft_exit_skip(), as __noreturn prevents the compiler from warning that a caller of ksft_exit_skip() does not return a value because the program will terminate upon calling these functions. Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl_tests.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 348d17cb2a84..ecbb7605a981 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -247,13 +247,13 @@ last_arg: * 2. We execute perf commands */ if (geteuid() != 0) - return ksft_exit_skip("Not running as root. Skipping...\n"); + ksft_exit_skip("Not running as root. Skipping...\n"); if (!check_resctrlfs_support()) - return ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n"); + ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n"); if (umount_resctrlfs()) - return ksft_exit_skip("resctrl FS unmount failed.\n"); + ksft_exit_skip("resctrl FS unmount failed.\n"); filter_dmesg(); -- cgit v1.2.3 From 102690be45b9345d7c3d8f578dae2e51c02b794c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:10 -0700 Subject: selftests: sync: ksft_exit_pass() does not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the call to ksft_exit_pass(), as __noreturn prevents the compiler from warning that a caller of ksft_exit_pass() does not return a value because the program will terminate upon calling these functions (which is what the comment alluded to as well). Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/sync/sync_test.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/sync/sync_test.c b/tools/testing/selftests/sync/sync_test.c index 414a617db993..93db5aa246a3 100644 --- a/tools/testing/selftests/sync/sync_test.c +++ b/tools/testing/selftests/sync/sync_test.c @@ -109,6 +109,5 @@ int main(void) ksft_exit_fail_msg("%d out of %d sync tests failed\n", err, ksft_test_num()); - /* need this return to keep gcc happy */ - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From bc7e5d23be8925c3049f0232f83e2118e1cf373b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:11 -0700 Subject: selftests: timers: ksft_exit functions do not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the calls to ksft_exit_...(), as __noreturn prevents the compiler from warning that a caller of the ksft_exit functions does not return a value because the program will terminate upon calling these functions. Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/adjtick.c | 4 ++-- tools/testing/selftests/timers/alarmtimer-suspend.c | 4 ++-- tools/testing/selftests/timers/change_skew.c | 4 ++-- tools/testing/selftests/timers/freq-step.c | 4 ++-- tools/testing/selftests/timers/leap-a-day.c | 10 +++++----- tools/testing/selftests/timers/leapcrash.c | 4 ++-- tools/testing/selftests/timers/mqueue-lat.c | 4 ++-- tools/testing/selftests/timers/posix_timers.c | 12 ++++++------ tools/testing/selftests/timers/raw_skew.c | 6 +++--- tools/testing/selftests/timers/set-2038.c | 4 ++-- tools/testing/selftests/timers/set-tai.c | 4 ++-- tools/testing/selftests/timers/set-timer-lat.c | 4 ++-- tools/testing/selftests/timers/set-tz.c | 4 ++-- tools/testing/selftests/timers/skew_consistency.c | 4 ++-- tools/testing/selftests/timers/threadtest.c | 2 +- tools/testing/selftests/timers/valid-adjtimex.c | 6 +++--- 16 files changed, 40 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c index 47e05fdc32c5..205b76a4abb4 100644 --- a/tools/testing/selftests/timers/adjtick.c +++ b/tools/testing/selftests/timers/adjtick.c @@ -205,7 +205,7 @@ int main(int argc, char **argv) adjtimex(&tx1); if (err) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index 4332b494103d..ad52e608b88e 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -173,6 +173,6 @@ int main(void) timer_delete(tm1); } if (final_ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c index 992a77f2a74c..4421cd562c24 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -89,8 +89,8 @@ int main(int argc, char **argv) if (ret) { printf("[FAIL]"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c index 4b76450d78d1..73b636f89fdc 100644 --- a/tools/testing/selftests/timers/freq-step.c +++ b/tools/testing/selftests/timers/freq-step.c @@ -257,7 +257,7 @@ int main(int argc, char **argv) set_frequency(0.0); if (fails) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 23eb398c8140..986abbdb1521 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -268,7 +268,7 @@ int main(int argc, char **argv) if (ret < 0) { printf("Error: Problem setting STA_INS/STA_DEL!: %s\n", time_state_str(ret)); - return ksft_exit_fail(); + ksft_exit_fail(); } /* Validate STA_INS was set */ @@ -277,7 +277,7 @@ int main(int argc, char **argv) if (tx.status != STA_INS && tx.status != STA_DEL) { printf("Error: STA_INS/STA_DEL not set!: %s\n", time_state_str(ret)); - return ksft_exit_fail(); + ksft_exit_fail(); } if (tai_time) { @@ -295,7 +295,7 @@ int main(int argc, char **argv) se.sigev_value.sival_int = 0; if (timer_create(CLOCK_REALTIME, &se, &tm1) == -1) { printf("Error: timer_create failed\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } its1.it_value.tv_sec = next_leap; its1.it_value.tv_nsec = 0; @@ -366,7 +366,7 @@ int main(int argc, char **argv) if (error_found) { printf("Errors observed\n"); clear_time_state(); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("\n"); if ((iterations != -1) && !(--iterations)) @@ -374,5 +374,5 @@ int main(int argc, char **argv) } clear_time_state(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c index f70802c5dd0d..8fd065eec904 100644 --- a/tools/testing/selftests/timers/leapcrash.c +++ b/tools/testing/selftests/timers/leapcrash.c @@ -87,7 +87,7 @@ int main(void) tv.tv_usec = 0; if (settimeofday(&tv, NULL)) { printf("Error: You're likely not running with proper (ie: root) permissions\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } tx.modes = 0; adjtimex(&tx); @@ -104,5 +104,5 @@ int main(void) fflush(stdout); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c index 7916cf5cc6ff..f3179a605bba 100644 --- a/tools/testing/selftests/timers/mqueue-lat.c +++ b/tools/testing/selftests/timers/mqueue-lat.c @@ -107,8 +107,8 @@ int main(int argc, char **argv) ret = mqueue_lat_test(); if (ret < 0) { printf("[FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index c001dd79179d..07c81c0093c0 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -260,16 +260,16 @@ int main(int argc, char **argv) ksft_print_msg("based timers if other threads run on the CPU...\n"); if (check_itimer(ITIMER_VIRTUAL) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_itimer(ITIMER_PROF) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_itimer(ITIMER_REAL) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); /* * It's unfortunately hard to reliably test a timer expiration @@ -281,10 +281,10 @@ int main(int argc, char **argv) * find a better solution. */ if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) - return ksft_exit_fail(); + ksft_exit_fail(); if (check_timer_distribution() < 0) - return ksft_exit_fail(); + ksft_exit_fail(); ksft_finished(); } diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index 6eba203f9da7..030143eb09b4 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -137,11 +137,11 @@ int main(int argc, char **argv) if (tx1.offset || tx2.offset || tx1.freq != tx2.freq || tx1.tick != tx2.tick) { printf(" [SKIP]\n"); - return ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n"); + ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n"); } printf(" [FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf(" [OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c index 688cfd81b531..f7d978721b9e 100644 --- a/tools/testing/selftests/timers/set-2038.c +++ b/tools/testing/selftests/timers/set-2038.c @@ -128,6 +128,6 @@ out: /* restore clock */ settime(start); if (ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-tai.c b/tools/testing/selftests/timers/set-tai.c index 8c4179ee2ca2..5b67462efcd6 100644 --- a/tools/testing/selftests/timers/set-tai.c +++ b/tools/testing/selftests/timers/set-tai.c @@ -61,9 +61,9 @@ int main(int argc, char **argv) ret = get_tai(); if (ret != i) { printf("[FAILED] expected: %i got %i\n", i, ret); - return ksft_exit_fail(); + ksft_exit_fail(); } } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 50da45437daa..7ce240c89b21 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -278,6 +278,6 @@ int main(void) ret |= do_timer_oneshot(clock_id, 0); } if (ret) - return ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_fail(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/set-tz.c b/tools/testing/selftests/timers/set-tz.c index 62bd33eb16f0..20daaf1782b7 100644 --- a/tools/testing/selftests/timers/set-tz.c +++ b/tools/testing/selftests/timers/set-tz.c @@ -102,9 +102,9 @@ int main(int argc, char **argv) printf("[OK]\n"); set_tz(min, dst); - return ksft_exit_pass(); + ksft_exit_pass(); err: set_tz(min, dst); - return ksft_exit_fail(); + ksft_exit_fail(); } diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index 63913f75b384..c8e6bffe4e0a 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -70,8 +70,8 @@ int main(int argc, char **argv) if (ret) { printf("[FAILED]\n"); - return ksft_exit_fail(); + ksft_exit_fail(); } printf("[OK]\n"); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index 80aed4bf06fb..76b38e41d9c7 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -189,5 +189,5 @@ out: /* die */ if (ret) ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index d13ebde20322..d500884801d8 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -320,10 +320,10 @@ int validate_set_offset(void) int main(int argc, char **argv) { if (validate_freq()) - return ksft_exit_fail(); + ksft_exit_fail(); if (validate_set_offset()) - return ksft_exit_fail(); + ksft_exit_fail(); - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From 8860d86f52b16718bfc38de73a589898112e3776 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:12 -0700 Subject: selftests: x86: ksft_exit_pass() does not return After commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn"), ksft_exit_...() functions are marked as __noreturn, which means the return type should not be 'int' but 'void' because they are not returning anything (and never were since exit() has always been called). To facilitate updating the return type of these functions, remove 'return' before the call to ksft_exit_pass(), as __noreturn prevents the compiler from warning that a caller of ksft_exit_pass() does not return a value because the program will terminate upon calling these functions. Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/x86/lam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 215b8150b7cc..820db4864292 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -1237,5 +1237,5 @@ int main(int argc, char **argv) ksft_set_plan(tests_cnt); - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From eb116f80002a402fa3ebb8da48023b39cb471c97 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 24 Apr 2024 10:24:13 -0700 Subject: selftests: kselftest: Make ksft_exit functions return void instead of int Commit f7d5bcd35d42 ("selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn") marked functions that call exit() as __noreturn but it did not change the return type of these functions from 'void' to 'int' like it should have (since a noreturn function by definition cannot return an integer because it does not return...) because there were many tests that return the result of the ksft_exit functions, even though it has never been used due to calling exit(). Now that all uses of 'return ksft_exit...()' have been cleaned up properly, change the types of the ksft_exit...() functions to void to match their __noreturn nature. Reviewed-by: Muhammad Usama Anjum Reviewed-by: Thomas Gleixner Signed-off-by: Nathan Chancellor Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 3d2256885cff..76c2a6945d3e 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -329,13 +329,13 @@ void ksft_test_result_code(int exit_code, const char *test_name, break; \ } } while (0) -static inline __noreturn int ksft_exit_pass(void) +static inline __noreturn void ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline __noreturn int ksft_exit_fail(void) +static inline __noreturn void ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -362,7 +362,7 @@ static inline __noreturn int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) void ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -390,19 +390,19 @@ static inline __noreturn void ksft_exit_fail_perror(const char *msg) #endif } -static inline __noreturn int ksft_exit_xfail(void) +static inline __noreturn void ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline __noreturn int ksft_exit_xpass(void) +static inline __noreturn void ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ...) { int saved_errno = errno; va_list args; -- cgit v1.2.3 From 019baf635eb6ffe8d6c1343f81788f02a7e0ed98 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 May 2024 18:58:20 -0700 Subject: selftests/binderfs: use the Makefile's rules, not Make's implicit rules First of all, in order to build with clang at all, one must first apply Valentin Obst's build fix for LLVM [1]. Once that is done, then when building with clang, via: make LLVM=1 -C tools/testing/selftests ...the following error occurs: clang: error: cannot specify -o when generating multiple output files This is because clang, unlike gcc, won't accept invocations of this form: clang file1.c header2.h While trying to fix this, I noticed that: a) selftests/lib.mk already avoids the problem, and b) The binderfs Makefile indavertently bypasses the selftests/lib.mk build system, and quitely uses Make's implicit build rules for .c files instead. The Makefile attempts to set up both a dependency and a source file, neither of which was needed, because lib.mk is able to automatically handle both. This line: binderfs_test: binderfs_test.c ...causes Make's implicit rules to run, which builds binderfs_test without ever looking at lib.mk. Fix this by simply deleting the "binderfs_test:" Makefile target and letting lib.mk handle it instead. [1] https://lore.kernel.org/all/20240329-selftests-libmk-llvm-rfc-v1-1-2f9ed7d1c49f@valentinobst.de/ Fixes: 6e29225af902 ("binderfs: port tests to test harness infrastructure") Cc: Christian Brauner Signed-off-by: John Hubbard Reviewed-by: Christian Brauner Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/binderfs/Makefile | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/binderfs/Makefile b/tools/testing/selftests/filesystems/binderfs/Makefile index c2f7cef919c0..eb4c3b411934 100644 --- a/tools/testing/selftests/filesystems/binderfs/Makefile +++ b/tools/testing/selftests/filesystems/binderfs/Makefile @@ -3,6 +3,4 @@ CFLAGS += $(KHDR_INCLUDES) -pthread TEST_GEN_PROGS := binderfs_test -binderfs_test: binderfs_test.c ../../kselftest.h ../../kselftest_harness.h - include ../../lib.mk -- cgit v1.2.3 From d8171aa4ca72f1a67bf3c14c59441d63c1d2585f Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 May 2024 19:17:12 -0700 Subject: selftests/resctrl: fix clang build failure: use LOCAL_HDRS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First of all, in order to build with clang at all, one must first apply Valentin Obst's build fix for LLVM [1]. Once that is done, then when building with clang, via: make LLVM=1 -C tools/testing/selftests ...the following error occurs: clang: error: cannot specify -o when generating multiple output files This is because clang, unlike gcc, won't accept invocations of this form: clang file1.c header2.h Fix this by using selftests/lib.mk facilities for tracking local header file dependencies: add them to LOCAL_HDRS, leaving only the .c files to be passed to the compiler. [1] https://lore.kernel.org/all/20240329-selftests-libmk-llvm-rfc-v1-1-2f9ed7d1c49f@valentinobst.de/ Fixes: 8e289f454289 ("selftests/resctrl: Add resctrl.h into build deps") Cc: Ilpo Järvinen Signed-off-by: John Hubbard Acked-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile index 2deac2031de9..021863f86053 100644 --- a/tools/testing/selftests/resctrl/Makefile +++ b/tools/testing/selftests/resctrl/Makefile @@ -5,6 +5,8 @@ CFLAGS += $(KHDR_INCLUDES) TEST_GEN_PROGS := resctrl_tests +LOCAL_HDRS += $(wildcard *.h) + include ../lib.mk -$(OUTPUT)/resctrl_tests: $(wildcard *.[ch]) +$(OUTPUT)/resctrl_tests: $(wildcard *.c) -- cgit v1.2.3 From d4e6fbd245c48b272cc591d1c5e7c07aedd7f071 Mon Sep 17 00:00:00 2001 From: Valentin Obst Date: Fri, 29 Mar 2024 11:49:43 +0100 Subject: selftests: default to host arch for LLVM builds Align the behavior for gcc and clang builds by interpreting unset `ARCH` and `CROSS_COMPILE` variables in `LLVM` builds as a sign that the user wants to build for the host architecture. This patch preserves the properties that setting the `ARCH` variable to an unknown value will trigger an error that complains about insufficient information, and that a set `CROSS_COMPILE` variable will override the target triple that is determined based on presence/absence of `ARCH`. When compiling with clang, i.e., `LLVM` is set, an unset `ARCH` variable in combination with an unset `CROSS_COMPILE` variable, i.e., compiling for the host architecture, leads to compilation failures since `lib.mk` can not determine the clang target triple. In this case, the following error message is displayed for each subsystem that does not set `ARCH` in its own Makefile before including `lib.mk` (lines wrapped at 75 chrs): make[1]: Entering directory '/mnt/build/linux/tools/testing/selftests/ sysctl' ../lib.mk:33: *** Specify CROSS_COMPILE or add '--target=' option to lib.mk. Stop. make[1]: Leaving directory '/mnt/build/linux/tools/testing/selftests/ sysctl' In the same scenario a gcc build would default to the host architecture, i.e., it would use plain `gcc`. Fixes: 795285ef2425 ("selftests: Fix clang cross compilation") Reviewed-by: Mark Brown Signed-off-by: Valentin Obst Reviewed-by: John Hubbard Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index da2cade3bab0..8ae203d8ed7f 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -7,6 +7,8 @@ else ifneq ($(filter -%,$(LLVM)),) LLVM_SUFFIX := $(LLVM) endif +CLANG := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) + CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl @@ -18,7 +20,13 @@ CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu CLANG_TARGET_FLAGS_x86_64 := x86_64-linux-gnu -CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +# Default to host architecture if ARCH is not explicitly given. +ifeq ($(ARCH),) +CLANG_TARGET_FLAGS := $(shell $(CLANG) -print-target-triple) +else +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) +endif ifeq ($(CROSS_COMPILE),) ifeq ($(CLANG_TARGET_FLAGS),) @@ -30,7 +38,7 @@ else CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) endif # CROSS_COMPILE -CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +CC := $(CLANG) $(CLANG_FLAGS) -fintegrated-as else CC := $(CROSS_COMPILE)gcc endif # LLVM -- cgit v1.2.3 From 8e6d9ae2e09f1f6ba65614a5e5c5a2a2e335dcba Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 3 May 2024 17:50:45 -0700 Subject: selftests/bpf: Use bpf_tracing.h instead of bpf_tcp_helpers.h The bpf programs that this patch changes require the BPF_PROG macro. The BPF_PROG macro is defined in the libbpf's bpf_tracing.h. Some tests include bpf_tcp_helpers.h which includes bpf_tracing.h. They don't need other things from bpf_tcp_helpers.h other than bpf_tracing.h. This patch simplifies it by directly including the bpf_tracing.h. The motivation of this unnecessary code churn is to retire the bpf_tcp_helpers.h by directly using vmlinux.h. Right now, the main usage of the bpf_tcp_helpers.h is the partial kernel socket definitions (e.g. socket, sock, tcp_sock). While the test cases continue to grow, fields are kept adding to those partial socket definitions (e.g. the recent bpf_cc_cubic.c test which tried to extend bpf_tcp_helpers.c but eventually used the vmlinux.h instead). The idea is to retire bpf_tcp_helpers.c and consistently use vmlinux.h for the tests that require the kernel sockets. This patch tackles the obvious tests that can directly use bpf_tracing.h instead of bpf_tcp_helpers.h. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240504005045.848376-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/progs/timer.c | 3 ++- tools/testing/selftests/bpf/progs/timer_failure.c | 2 +- tools/testing/selftests/bpf/progs/timer_mim.c | 2 +- tools/testing/selftests/bpf/progs/timer_mim_reject.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index f615da97df26..4c677c001258 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -2,9 +2,10 @@ /* Copyright (c) 2021 Facebook */ #include #include +#include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_failure.c b/tools/testing/selftests/bpf/progs/timer_failure.c index 0996c2486f05..5a2e9dabf1c6 100644 --- a/tools/testing/selftests/bpf/progs/timer_failure.c +++ b/tools/testing/selftests/bpf/progs/timer_failure.c @@ -5,8 +5,8 @@ #include #include #include +#include #include "bpf_misc.h" -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c index 2fee7ab105ef..50ebc3f68522 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim.c +++ b/tools/testing/selftests/bpf/progs/timer_mim.c @@ -4,7 +4,7 @@ #include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c index 5d648e3d8a41..dd3f1ed6d6e6 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim_reject.c +++ b/tools/testing/selftests/bpf/progs/timer_mim_reject.c @@ -4,7 +4,7 @@ #include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { -- cgit v1.2.3 From e549b39a0ab8880d7ae6c6495b00fc1cb8f36174 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 6 May 2024 16:50:22 +0200 Subject: selftests/bpf: Fix pointer arithmetic in test_xdp_do_redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cast operation has a higher precedence than addition. The code here wants to zero the 2nd half of the 64-bit metadata, but due to a pointer arithmetic mistake, it writes the zero at offset 16 instead. Just adding parentheses around "data + 4" would fix this, but I think this will be slightly better readable with array syntax. I was unable to test this with tools/testing/selftests/bpf/vmtest.sh, because my glibc is newer than glibc in the provided VM image. So I just checked the difference in the compiled code. objdump -S tools/testing/selftests/bpf/xdp_do_redirect.test.o: - *((__u32 *)data) = 0x42; /* metadata test value */ + ((__u32 *)data)[0] = 0x42; /* metadata test value */ be7: 48 8d 85 30 fc ff ff lea -0x3d0(%rbp),%rax bee: c7 00 42 00 00 00 movl $0x42,(%rax) - *((__u32 *)data + 4) = 0; + ((__u32 *)data)[1] = 0; bf4: 48 8d 85 30 fc ff ff lea -0x3d0(%rbp),%rax - bfb: 48 83 c0 10 add $0x10,%rax + bfb: 48 83 c0 04 add $0x4,%rax bff: c7 00 00 00 00 00 movl $0x0,(%rax) Fixes: 5640b6d89434 ("selftests/bpf: fix "metadata marker" getting overwritten by the netstack") Signed-off-by: Michal Schmidt Signed-off-by: Andrii Nakryiko Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20240506145023.214248-1-mschmidt@redhat.com --- tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index 498d3bdaa4b0..bad0ea167be7 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -107,8 +107,8 @@ void test_xdp_do_redirect(void) .attach_point = BPF_TC_INGRESS); memcpy(&data[sizeof(__u64)], &pkt_udp, sizeof(pkt_udp)); - *((__u32 *)data) = 0x42; /* metadata test value */ - *((__u32 *)data + 4) = 0; + ((__u32 *)data)[0] = 0x42; /* metadata test value */ + ((__u32 *)data)[1] = 0; skel = test_xdp_do_redirect__open(); if (!ASSERT_OK_PTR(skel, "skel")) -- cgit v1.2.3 From 41b307ad756e1b7b618bf9d9c1cce3595705ede4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 5 May 2024 16:00:54 -0700 Subject: bpftool, selftests/hid/bpf: Fix 29 clang warnings When building either tools/bpf/bpftool, or tools/testing/selftests/hid, (the same Makefile is used for these), clang generates many instances of the following: "clang: warning: -lLLVM-17: 'linker' input unused" Quentin points out that the LLVM version is only required in $(LIBS), not in $(CFLAGS), so the fix is to remove it from CFLAGS. Suggested-by: Quentin Monnet Signed-off-by: John Hubbard Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240505230054.13813-1-jhubbard@nvidia.com --- tools/bpf/bpftool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index b67454b45a49..dfa4f1bebbb3 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -147,7 +147,7 @@ ifeq ($(feature-llvm),1) # If LLVM is available, use it for JIT disassembly CFLAGS += -DHAVE_LLVM_SUPPORT LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets - CFLAGS += $(shell $(LLVM_CONFIG) --cflags --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + CFLAGS += $(shell $(LLVM_CONFIG) --cflags) LIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) LIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) -- cgit v1.2.3 From 5ec9a7d13f49b9c1c5ba854244d1f2ba414cf139 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:47 +0100 Subject: selftests/bpf: XOR and OR range computation tests. Added a test for bound computation in XOR and OR when non constant values are used and both registers have bounded ranges. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-5-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_bounds.c | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 960998f16306..7d570acf23ee 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -885,6 +885,48 @@ l1_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("bounds check for non const xor src dst") +__success __log_level(2) +__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_xor_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 ^= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("bounds check for non const or src dst") +__success __log_level(2) +__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_or_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 |= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("bounds checks after 32-bit truncation. test 1") __success __failure_unpriv __msg_unpriv("R0 leaks addr") -- cgit v1.2.3 From 92956786b4e26ea22e5b3c1c86cc71f5c9b3b9d8 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:49 +0100 Subject: selftests/bpf: MUL range computation tests. Added a test for bound computation in MUL when non constant values are used and both registers have bounded ranges. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Acked-by: Andrii Nakryiko Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20240506141849.185293-7-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_bounds.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 7d570acf23ee..a0bb7fb40ea5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -927,6 +927,27 @@ __naked void non_const_or_src_dst(void) : __clobber_all); } +SEC("socket") +__description("bounds check for non const mul regs") +__success __log_level(2) +__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__naked void non_const_mul_regs(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xff; \ + r0 &= 0x0f; \ + r0 *= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("bounds checks after 32-bit truncation. test 1") __success __failure_unpriv __msg_unpriv("R0 leaks addr") -- cgit v1.2.3 From a7def2e51c667578140d9aa3282533463ed3df91 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:30 +0200 Subject: selftests/hid: import base_device.py from hid-tools We need to slightly change base_device.py for supporting HID-BPF, so instead of monkey patching, let's just embed it in the kernel tree. Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-10-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/base.py | 2 +- tools/testing/selftests/hid/tests/base_device.py | 412 +++++++++++++++++++++++ 2 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/hid/tests/base_device.py (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/base.py b/tools/testing/selftests/hid/tests/base.py index 51433063b227..6bb5b887baaf 100644 --- a/tools/testing/selftests/hid/tests/base.py +++ b/tools/testing/selftests/hid/tests/base.py @@ -12,7 +12,7 @@ import time import logging -from hidtools.device.base_device import BaseDevice, EvdevMatch, SysfsFile +from .base_device import BaseDevice, EvdevMatch, SysfsFile from pathlib import Path from typing import Final, List, Tuple diff --git a/tools/testing/selftests/hid/tests/base_device.py b/tools/testing/selftests/hid/tests/base_device.py new file mode 100644 index 000000000000..092c7c4e62ef --- /dev/null +++ b/tools/testing/selftests/hid/tests/base_device.py @@ -0,0 +1,412 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# -*- coding: utf-8 -*- +# +# Copyright (c) 2017 Benjamin Tissoires +# Copyright (c) 2017 Red Hat, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import fcntl +import functools +import libevdev +import os + +try: + import pyudev +except ImportError: + raise ImportError("UHID is not supported due to missing pyudev dependency") + +import logging + +import hidtools.hid as hid +from hidtools.uhid import UHIDDevice +from hidtools.util import BusType + +from pathlib import Path +from typing import Any, ClassVar, Dict, List, Optional, Type, Union + +logger = logging.getLogger("hidtools.device.base_device") + + +class SysfsFile(object): + def __init__(self, path): + self.path = path + + def __set_value(self, value): + with open(self.path, "w") as f: + return f.write(f"{value}\n") + + def __get_value(self): + with open(self.path) as f: + return f.read().strip() + + @property + def int_value(self) -> int: + return int(self.__get_value()) + + @int_value.setter + def int_value(self, v: int) -> None: + self.__set_value(v) + + @property + def str_value(self) -> str: + return self.__get_value() + + @str_value.setter + def str_value(self, v: str) -> None: + self.__set_value(v) + + +class LED(object): + def __init__(self, sys_path): + self.max_brightness = SysfsFile(sys_path / "max_brightness").int_value + self.__brightness = SysfsFile(sys_path / "brightness") + + @property + def brightness(self) -> int: + return self.__brightness.int_value + + @brightness.setter + def brightness(self, value: int) -> None: + self.__brightness.int_value = value + + +class PowerSupply(object): + """Represents Linux power_supply_class sysfs nodes.""" + + def __init__(self, sys_path): + self._capacity = SysfsFile(sys_path / "capacity") + self._status = SysfsFile(sys_path / "status") + self._type = SysfsFile(sys_path / "type") + + @property + def capacity(self) -> int: + return self._capacity.int_value + + @property + def status(self) -> str: + return self._status.str_value + + @property + def type(self) -> str: + return self._type.str_value + + +class HIDIsReady(object): + """ + Companion class that binds to a kernel mechanism + and that allows to know when a uhid device is ready or not. + + See :meth:`is_ready` for details. + """ + + def __init__(self: "HIDIsReady", uhid: UHIDDevice) -> None: + self.uhid = uhid + + def is_ready(self: "HIDIsReady") -> bool: + """ + Overwrite in subclasses: should return True or False whether + the attached uhid device is ready or not. + """ + return False + + +class UdevHIDIsReady(HIDIsReady): + _pyudev_context: ClassVar[Optional[pyudev.Context]] = None + _pyudev_monitor: ClassVar[Optional[pyudev.Monitor]] = None + _uhid_devices: ClassVar[Dict[int, bool]] = {} + + def __init__(self: "UdevHIDIsReady", uhid: UHIDDevice) -> None: + super().__init__(uhid) + self._init_pyudev() + + @classmethod + def _init_pyudev(cls: Type["UdevHIDIsReady"]) -> None: + if cls._pyudev_context is None: + cls._pyudev_context = pyudev.Context() + cls._pyudev_monitor = pyudev.Monitor.from_netlink(cls._pyudev_context) + cls._pyudev_monitor.filter_by("hid") + cls._pyudev_monitor.start() + + UHIDDevice._append_fd_to_poll( + cls._pyudev_monitor.fileno(), cls._cls_udev_event_callback + ) + + @classmethod + def _cls_udev_event_callback(cls: Type["UdevHIDIsReady"]) -> None: + if cls._pyudev_monitor is None: + return + event: pyudev.Device + for event in iter(functools.partial(cls._pyudev_monitor.poll, 0.02), None): + if event.action not in ["bind", "remove"]: + return + + logger.debug(f"udev event: {event.action} -> {event}") + + id = int(event.sys_path.strip().split(".")[-1], 16) + + cls._uhid_devices[id] = event.action == "bind" + + def is_ready(self: "UdevHIDIsReady") -> bool: + try: + return self._uhid_devices[self.uhid.hid_id] + except KeyError: + return False + + +class EvdevMatch(object): + def __init__( + self: "EvdevMatch", + *, + requires: List[Any] = [], + excludes: List[Any] = [], + req_properties: List[Any] = [], + excl_properties: List[Any] = [], + ) -> None: + self.requires = requires + self.excludes = excludes + self.req_properties = req_properties + self.excl_properties = excl_properties + + def is_a_match(self: "EvdevMatch", evdev: libevdev.Device) -> bool: + for m in self.requires: + if not evdev.has(m): + return False + for m in self.excludes: + if evdev.has(m): + return False + for p in self.req_properties: + if not evdev.has_property(p): + return False + for p in self.excl_properties: + if evdev.has_property(p): + return False + return True + + +class EvdevDevice(object): + """ + Represents an Evdev node and its properties. + This is a stub for the libevdev devices, as they are relying on + uevent to get the data, saving us some ioctls to fetch the names + and properties. + """ + + def __init__(self: "EvdevDevice", sysfs: Path) -> None: + self.sysfs = sysfs + self.event_node: Any = None + self.libevdev: Optional[libevdev.Device] = None + + self.uevents = {} + # all of the interesting properties are stored in the input uevent, so in the parent + # so convert the uevent file of the parent input node into a dict + with open(sysfs.parent / "uevent") as f: + for line in f.readlines(): + key, value = line.strip().split("=") + self.uevents[key] = value.strip('"') + + # we open all evdev nodes in order to not miss any event + self.open() + + @property + def name(self: "EvdevDevice") -> str: + assert "NAME" in self.uevents + + return self.uevents["NAME"] + + @property + def evdev(self: "EvdevDevice") -> Path: + return Path("/dev/input") / self.sysfs.name + + def matches_application( + self: "EvdevDevice", application: str, matches: Dict[str, EvdevMatch] + ) -> bool: + if self.libevdev is None: + return False + + if application in matches: + return matches[application].is_a_match(self.libevdev) + + logger.error( + f"application '{application}' is unknown, please update/fix hid-tools" + ) + assert False # hid-tools likely needs an update + + def open(self: "EvdevDevice") -> libevdev.Device: + self.event_node = open(self.evdev, "rb") + self.libevdev = libevdev.Device(self.event_node) + + assert self.libevdev.fd is not None + + fd = self.libevdev.fd.fileno() + flag = fcntl.fcntl(fd, fcntl.F_GETFD) + fcntl.fcntl(fd, fcntl.F_SETFL, flag | os.O_NONBLOCK) + + return self.libevdev + + def close(self: "EvdevDevice") -> None: + if self.libevdev is not None and self.libevdev.fd is not None: + self.libevdev.fd.close() + self.libevdev = None + if self.event_node is not None: + self.event_node.close() + self.event_node = None + + +class BaseDevice(UHIDDevice): + # default _application_matches that matches nothing. This needs + # to be set in the subclasses to have get_evdev() working + _application_matches: Dict[str, EvdevMatch] = {} + + def __init__( + self, + name, + application, + rdesc_str: Optional[str] = None, + rdesc: Optional[Union[hid.ReportDescriptor, str, bytes]] = None, + input_info=None, + ) -> None: + self._kernel_is_ready: HIDIsReady = UdevHIDIsReady(self) + if rdesc_str is None and rdesc is None: + raise Exception("Please provide at least a rdesc or rdesc_str") + super().__init__() + if name is None: + name = f"uhid gamepad test {self.__class__.__name__}" + if input_info is None: + input_info = (BusType.USB, 1, 2) + self.name = name + self.info = input_info + self.default_reportID = None + self.opened = False + self.started = False + self.application = application + self._input_nodes: Optional[list[EvdevDevice]] = None + if rdesc is None: + assert rdesc_str is not None + self.rdesc = hid.ReportDescriptor.from_human_descr(rdesc_str) # type: ignore + else: + self.rdesc = rdesc # type: ignore + + @property + def power_supply_class(self: "BaseDevice") -> Optional[PowerSupply]: + ps = self.walk_sysfs("power_supply", "power_supply/*") + if ps is None or len(ps) < 1: + return None + + return PowerSupply(ps[0]) + + @property + def led_classes(self: "BaseDevice") -> List[LED]: + leds = self.walk_sysfs("led", "**/max_brightness") + if leds is None: + return [] + + return [LED(led.parent) for led in leds] + + @property + def kernel_is_ready(self: "BaseDevice") -> bool: + return self._kernel_is_ready.is_ready() and self.started + + @property + def input_nodes(self: "BaseDevice") -> List[EvdevDevice]: + if self._input_nodes is not None: + return self._input_nodes + + if not self.kernel_is_ready or not self.started: + return [] + + self._input_nodes = [ + EvdevDevice(path) + for path in self.walk_sysfs("input", "input/input*/event*") + ] + return self._input_nodes + + def match_evdev_rule(self, application, evdev): + """Replace this in subclasses if the device has multiple reports + of the same type and we need to filter based on the actual evdev + node. + + returning True will append the corresponding report to + `self.input_nodes[type]` + returning False will ignore this report / type combination + for the device. + """ + return True + + def open(self): + self.opened = True + + def _close_all_opened_evdev(self): + if self._input_nodes is not None: + for e in self._input_nodes: + e.close() + + def __del__(self): + self._close_all_opened_evdev() + + def close(self): + self.opened = False + + def start(self, flags): + self.started = True + + def stop(self): + self.started = False + self._close_all_opened_evdev() + + def next_sync_events(self, application=None): + evdev = self.get_evdev(application) + if evdev is not None: + return list(evdev.events()) + return [] + + @property + def application_matches(self: "BaseDevice") -> Dict[str, EvdevMatch]: + return self._application_matches + + @application_matches.setter + def application_matches(self: "BaseDevice", data: Dict[str, EvdevMatch]) -> None: + self._application_matches = data + + def get_evdev(self, application=None): + if application is None: + application = self.application + + if len(self.input_nodes) == 0: + return None + + assert self._input_nodes is not None + + if len(self._input_nodes) == 1: + evdev = self._input_nodes[0] + if self.match_evdev_rule(application, evdev.libevdev): + return evdev.libevdev + else: + for _evdev in self._input_nodes: + if _evdev.matches_application(application, self.application_matches): + if self.match_evdev_rule(application, _evdev.libevdev): + return _evdev.libevdev + + def is_ready(self): + """Returns whether a UHID device is ready. Can be overwritten in + subclasses to add extra conditions on when to consider a UHID + device ready. This can be: + + - we need to wait on different types of input devices to be ready + (Touch Screen and Pen for example) + - we need to have at least 4 LEDs present + (len(self.uhdev.leds_classes) == 4) + - or any other combinations""" + return self.kernel_is_ready -- cgit v1.2.3 From e906463087cec0a179ddcafe08aeef5899af6b00 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:31 +0200 Subject: selftests/hid: add support for HID-BPF pre-loading before starting a test few required changes: - we need to count how many times a udev 'bind' event happens - we need to tell `udev-hid-bpf` to not automatically attach the provided HID-BPF objects - we need to manually attach the ones from the kernel tree, and wait for the second udev 'bind' event to happen Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-11-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/base.py | 85 +++++++++++++++++++++--- tools/testing/selftests/hid/tests/base_device.py | 23 +++++-- 2 files changed, 93 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/base.py b/tools/testing/selftests/hid/tests/base.py index 6bb5b887baaf..2d006c0f5fcd 100644 --- a/tools/testing/selftests/hid/tests/base.py +++ b/tools/testing/selftests/hid/tests/base.py @@ -8,6 +8,7 @@ import libevdev import os import pytest +import subprocess import time import logging @@ -157,6 +158,17 @@ class BaseTestCase: # for example ("playstation", "hid-playstation") kernel_modules: List[Tuple[str, str]] = [] + # List of in kernel HID-BPF object files to load + # before starting the test + # Any existing pre-loaded HID-BPF module will be removed + # before the ones in this list will be manually loaded. + # Each Element is a tuple '(hid_bpf_object, rdesc_fixup_present)', + # for example '("xppen-ArtistPro16Gen2.bpf.o", True)' + # If 'rdesc_fixup_present' is True, the test needs to wait + # for one unbind and rebind before it can be sure the kernel is + # ready + hid_bpfs: List[Tuple[str, bool]] = [] + def assertInputEventsIn(self, expected_events, effective_events): effective_events = effective_events.copy() for ev in expected_events: @@ -211,8 +223,6 @@ class BaseTestCase: # we don't know beforehand the name of the module from modinfo sysfs_path = Path("/sys/module") / kernel_module.replace("-", "_") if not sysfs_path.exists(): - import subprocess - ret = subprocess.run(["/usr/sbin/modprobe", kernel_module]) if ret.returncode != 0: pytest.skip( @@ -225,6 +235,60 @@ class BaseTestCase: self._load_kernel_module(kernel_driver, kernel_module) yield + def load_hid_bpfs(self): + script_dir = Path(os.path.dirname(os.path.realpath(__file__))) + root_dir = (script_dir / "../../../../..").resolve() + bpf_dir = root_dir / "drivers/hid/bpf/progs" + + wait = False + for _, rdesc_fixup in self.hid_bpfs: + if rdesc_fixup: + wait = True + + for hid_bpf, _ in self.hid_bpfs: + # We need to start `udev-hid-bpf` in the background + # and dispatch uhid events in case the kernel needs + # to fetch features on the device + process = subprocess.Popen( + [ + "udev-hid-bpf", + "--verbose", + "add", + str(self.uhdev.sys_path), + str(bpf_dir / hid_bpf), + ], + ) + while process.poll() is None: + self.uhdev.dispatch(1) + + if process.poll() != 0: + pytest.fail( + f"Couldn't insert hid-bpf program '{hid_bpf}', marking the test as failed" + ) + + if wait: + # the HID-BPF program exports a rdesc fixup, so it needs to be + # unbound by the kernel and then rebound. + # Ensure we get the bound event exactly 2 times (one for the normal + # uhid loading, and then the reload from HID-BPF) + now = time.time() + while self.uhdev.kernel_ready_count < 2 and time.time() - now < 2: + self.uhdev.dispatch(1) + + if self.uhdev.kernel_ready_count < 2: + pytest.fail( + f"Couldn't insert hid-bpf programs, marking the test as failed" + ) + + def unload_hid_bpfs(self): + ret = subprocess.run( + ["udev-hid-bpf", "--verbose", "remove", str(self.uhdev.sys_path)], + ) + if ret.returncode != 0: + pytest.fail( + f"Couldn't unload hid-bpf programs, marking the test as failed" + ) + @pytest.fixture() def new_uhdev(self, load_kernel_module): return self.create_device() @@ -248,12 +312,18 @@ class BaseTestCase: now = time.time() while not self.uhdev.is_ready() and time.time() - now < 5: self.uhdev.dispatch(1) + + if self.hid_bpfs: + self.load_hid_bpfs() + if self.uhdev.get_evdev() is None: logger.warning( f"available list of input nodes: (default application is '{self.uhdev.application}')" ) logger.warning(self.uhdev.input_nodes) yield + if self.hid_bpfs: + self.unload_hid_bpfs() self.uhdev = None except PermissionError: pytest.skip("Insufficient permissions, run me as root") @@ -313,8 +383,6 @@ class HIDTestUdevRule(object): self.reload_udev_rules() def reload_udev_rules(self): - import subprocess - subprocess.run("udevadm control --reload-rules".split()) subprocess.run("systemd-hwdb update".split()) @@ -330,10 +398,11 @@ class HIDTestUdevRule(object): delete=False, ) as f: f.write( - 'KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1"\n' - ) - f.write( - 'KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1"\n' + """ +KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1" +KERNELS=="*hid*", ENV{HID_NAME}=="*uhid test *", ENV{HID_BPF_IGNORE_DEVICE}="1" +KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1" +""" ) self.rulesfile = f diff --git a/tools/testing/selftests/hid/tests/base_device.py b/tools/testing/selftests/hid/tests/base_device.py index 092c7c4e62ef..e0515be97f83 100644 --- a/tools/testing/selftests/hid/tests/base_device.py +++ b/tools/testing/selftests/hid/tests/base_device.py @@ -35,7 +35,7 @@ from hidtools.uhid import UHIDDevice from hidtools.util import BusType from pathlib import Path -from typing import Any, ClassVar, Dict, List, Optional, Type, Union +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, Union logger = logging.getLogger("hidtools.device.base_device") @@ -126,7 +126,7 @@ class HIDIsReady(object): class UdevHIDIsReady(HIDIsReady): _pyudev_context: ClassVar[Optional[pyudev.Context]] = None _pyudev_monitor: ClassVar[Optional[pyudev.Monitor]] = None - _uhid_devices: ClassVar[Dict[int, bool]] = {} + _uhid_devices: ClassVar[Dict[int, Tuple[bool, int]]] = {} def __init__(self: "UdevHIDIsReady", uhid: UHIDDevice) -> None: super().__init__(uhid) @@ -150,20 +150,25 @@ class UdevHIDIsReady(HIDIsReady): return event: pyudev.Device for event in iter(functools.partial(cls._pyudev_monitor.poll, 0.02), None): - if event.action not in ["bind", "remove"]: + if event.action not in ["bind", "remove", "unbind"]: return logger.debug(f"udev event: {event.action} -> {event}") id = int(event.sys_path.strip().split(".")[-1], 16) - cls._uhid_devices[id] = event.action == "bind" + device_ready, count = cls._uhid_devices.get(id, (False, 0)) - def is_ready(self: "UdevHIDIsReady") -> bool: + ready = event.action == "bind" + if not device_ready and ready: + count += 1 + cls._uhid_devices[id] = (ready, count) + + def is_ready(self: "UdevHIDIsReady") -> Tuple[bool, int]: try: return self._uhid_devices[self.uhid.hid_id] except KeyError: - return False + return (False, 0) class EvdevMatch(object): @@ -317,7 +322,11 @@ class BaseDevice(UHIDDevice): @property def kernel_is_ready(self: "BaseDevice") -> bool: - return self._kernel_is_ready.is_ready() and self.started + return self._kernel_is_ready.is_ready()[0] and self.started + + @property + def kernel_ready_count(self: "BaseDevice") -> int: + return self._kernel_is_ready.is_ready()[1] @property def input_nodes(self: "BaseDevice") -> List[EvdevDevice]: -- cgit v1.2.3 From e14d88d9b8dae40c6f612c6fc74b7d03d12f3c94 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:32 +0200 Subject: selftests/hid: tablets: reduce the number of pen state All the *_WITH*BUTTON states were almost identical except for the button itself. I need to add a new device with a third button, and adding a bunch of states is going to be quite cumbersome. So convert the `button` parameter of PenState as a boolean, and store which button is the target as an argument to all functions that need it. Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-12-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/test_tablet.py | 272 +++++++---------------- 1 file changed, 81 insertions(+), 191 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index 903f19f7cbe9..df1134e5713c 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -44,58 +44,28 @@ class PenState(Enum): We extend it with the various buttons when we need to check them. """ - PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, None - PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, None - PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, BtnPressed.PRIMARY_PRESSED - PEN_IS_IN_RANGE_WITH_SECOND_BUTTON = ( - BtnTouch.UP, - ToolType.PEN, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, None - PEN_IS_IN_CONTACT_WITH_BUTTON = ( - BtnTouch.DOWN, - ToolType.PEN, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON = ( - BtnTouch.DOWN, - ToolType.PEN, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, None - PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = ( - BtnTouch.UP, - ToolType.RUBBER, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_SECOND_BUTTON = ( - BtnTouch.UP, - ToolType.RUBBER, - BtnPressed.SECONDARY_PRESSED, - ) - PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, None - PEN_IS_ERASING_WITH_BUTTON = ( - BtnTouch.DOWN, - ToolType.RUBBER, - BtnPressed.PRIMARY_PRESSED, - ) - PEN_IS_ERASING_WITH_SECOND_BUTTON = ( - BtnTouch.DOWN, - ToolType.RUBBER, - BtnPressed.SECONDARY_PRESSED, - ) - - def __init__(self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[BtnPressed]): + PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, False + PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, False + PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, True + PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, False + PEN_IS_IN_CONTACT_WITH_BUTTON = BtnTouch.DOWN, ToolType.PEN, True + PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, False + PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = BtnTouch.UP, ToolType.RUBBER, True + PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, False + PEN_IS_ERASING_WITH_BUTTON = BtnTouch.DOWN, ToolType.RUBBER, True + + def __init__( + self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[bool] + ): self.touch = touch # type: ignore self.tool = tool # type: ignore self.button = button # type: ignore @classmethod - def from_evdev(cls, evdev) -> "PenState": + def from_evdev(cls, evdev, test_button) -> "PenState": touch = BtnTouch(evdev.value[libevdev.EV_KEY.BTN_TOUCH]) tool = None - button = None + button = False if ( evdev.value[libevdev.EV_KEY.BTN_TOOL_RUBBER] and not evdev.value[libevdev.EV_KEY.BTN_TOOL_PEN] @@ -112,19 +82,20 @@ class PenState(Enum): ): raise ValueError("2 tools are not allowed") - # we take only the highest button in account - for b in [libevdev.EV_KEY.BTN_STYLUS, libevdev.EV_KEY.BTN_STYLUS2]: - if bool(evdev.value[b]): - button = BtnPressed(b) + # we take only the provided button into account + if test_button is not None: + button = bool(evdev.value[test_button.value]) # the kernel tends to insert an EV_SYN once removing the tool, so # the button will be released after if tool is None: - button = None + button = False return cls((touch, tool, button)) # type: ignore - def apply(self, events: List[libevdev.InputEvent], strict: bool) -> "PenState": + def apply( + self, events: List[libevdev.InputEvent], strict: bool, test_button: BtnPressed + ) -> "PenState": if libevdev.EV_SYN.SYN_REPORT in events: raise ValueError("EV_SYN is in the event sequence") touch = self.touch @@ -148,19 +119,16 @@ class PenState(Enum): raise ValueError(f"duplicated BTN_TOOL_* in {events}") tool_found = True tool = ToolType(ev.code) if ev.value else None - elif ev in ( - libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS), - libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS2), - ): + elif test_button is not None and ev in (test_button.value,): if button_found: raise ValueError(f"duplicated BTN_STYLUS* in {events}") button_found = True - button = BtnPressed(ev.code) if ev.value else None + button = bool(ev.value) # the kernel tends to insert an EV_SYN once removing the tool, so # the button will be released after if tool is None: - button = None + button = False new_state = PenState((touch, tool, button)) # type: ignore if strict: @@ -183,11 +151,9 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_ERASING, ) @@ -195,7 +161,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_CONTACT, ) @@ -204,7 +169,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE, ) @@ -236,21 +200,6 @@ class PenState(Enum): PenState.PEN_IS_IN_RANGE_WITH_BUTTON, ) - if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_OUT_OF_RANGE, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - ) - - if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - ) - return tuple() def historically_tolerated_transitions(self) -> Tuple["PenState", ...]: @@ -263,11 +212,9 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_ERASING, ) @@ -275,7 +222,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, PenState.PEN_IS_OUT_OF_RANGE, PenState.PEN_IS_IN_CONTACT, ) @@ -284,7 +230,6 @@ class PenState(Enum): return ( PenState.PEN_IS_IN_CONTACT, PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, PenState.PEN_IS_IN_RANGE, PenState.PEN_IS_OUT_OF_RANGE, ) @@ -319,22 +264,6 @@ class PenState(Enum): PenState.PEN_IS_OUT_OF_RANGE, ) - if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_OUT_OF_RANGE, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - ) - - if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - return ( - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_OUT_OF_RANGE, - ) - return tuple() @staticmethod @@ -402,9 +331,9 @@ class PenState(Enum): } @staticmethod - def legal_transitions_with_primary_button() -> Dict[str, Tuple["PenState", ...]]: + def legal_transitions_with_button() -> Dict[str, Tuple["PenState", ...]]: """We revisit the Windows Pen Implementation state machine: - we now have a primary button. + we now have a button. """ return { "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_BUTTON,), @@ -450,56 +379,6 @@ class PenState(Enum): ), } - @staticmethod - def legal_transitions_with_secondary_button() -> Dict[str, Tuple["PenState", ...]]: - """We revisit the Windows Pen Implementation state machine: - we now have a secondary button. - Note: we don't looks for 2 buttons interactions. - """ - return { - "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,), - "hover-button -> out-of-range": ( - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_OUT_OF_RANGE, - ), - "in-range -> button-press": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - ), - "in-range -> button-press -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> touch -> button-press -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - ), - "in-range -> touch -> button-press -> release -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> button-press -> touch -> release -> button-release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_RANGE, - ), - "in-range -> button-press -> touch -> button-release -> release": ( - PenState.PEN_IS_IN_RANGE, - PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON, - PenState.PEN_IS_IN_CONTACT, - PenState.PEN_IS_IN_RANGE, - ), - } - @staticmethod def tolerated_transitions() -> Dict[str, Tuple["PenState", ...]]: """This is not adhering to the Windows Pen Implementation state machine @@ -616,10 +495,21 @@ class Pen(object): evdev.value[axis] == value ), f"assert evdev.value[{axis}] ({evdev.value[axis]}) != {value}" - def assert_expected_input_events(self, evdev): + def assert_expected_input_events(self, evdev, button): assert evdev.value[libevdev.EV_ABS.ABS_X] == self.x assert evdev.value[libevdev.EV_ABS.ABS_Y] == self.y - assert self.current_state == PenState.from_evdev(evdev) + + # assert no other buttons than the tested ones are set + buttons = [ + BtnPressed.PRIMARY_PRESSED, + BtnPressed.SECONDARY_PRESSED, + ] + if button is not None: + buttons.remove(button) + for b in buttons: + assert evdev.value[b.value] is None or evdev.value[b.value] == False + + assert self.current_state == PenState.from_evdev(evdev, button) class PenDigitizer(base.UHIDTestDevice): @@ -647,7 +537,7 @@ class PenDigitizer(base.UHIDTestDevice): continue self.fields = [f.usage_name for f in r] - def move_to(self, pen, state): + def move_to(self, pen, state, button): # fill in the previous values if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: pen.restore() @@ -690,29 +580,17 @@ class PenDigitizer(base.UHIDTestDevice): pen.inrange = True pen.invert = False pen.eraser = False - pen.barrelswitch = True - pen.secondarybarrelswitch = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: pen.tipswitch = True pen.inrange = True pen.invert = False pen.eraser = False - pen.barrelswitch = True - pen.secondarybarrelswitch = False - elif state == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON: - pen.tipswitch = False - pen.inrange = True - pen.invert = False - pen.eraser = False - pen.barrelswitch = False - pen.secondarybarrelswitch = True - elif state == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON: - pen.tipswitch = True - pen.inrange = True - pen.invert = False - pen.eraser = False - pen.barrelswitch = False - pen.secondarybarrelswitch = True + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: pen.tipswitch = False pen.inrange = True @@ -730,7 +608,7 @@ class PenDigitizer(base.UHIDTestDevice): pen.current_state = state - def event(self, pen): + def event(self, pen, button): rs = [] r = self.create_report(application=self.cur_application, data=pen) self.call_input_event(r) @@ -771,17 +649,17 @@ class BaseTest: def create_device(self): raise Exception("please reimplement me in subclasses") - def post(self, uhdev, pen): - r = uhdev.event(pen) + def post(self, uhdev, pen, test_button): + r = uhdev.event(pen, test_button) events = uhdev.next_sync_events() self.debug_reports(r, uhdev, events) return events def validate_transitions( - self, from_state, pen, evdev, events, allow_intermediate_states + self, from_state, pen, evdev, events, allow_intermediate_states, button ): # check that the final state is correct - pen.assert_expected_input_events(evdev) + pen.assert_expected_input_events(evdev, button) state = from_state @@ -794,12 +672,14 @@ class BaseTest: events = events[idx + 1 :] # now check for a valid transition - state = state.apply(sync_events, not allow_intermediate_states) + state = state.apply(sync_events, not allow_intermediate_states, button) if events: - state = state.apply(sync_events, not allow_intermediate_states) + state = state.apply(sync_events, not allow_intermediate_states, button) - def _test_states(self, state_list, scribble, allow_intermediate_states): + def _test_states( + self, state_list, scribble, allow_intermediate_states, button=None + ): """Internal method to test against a list of transition between states. state_list is a list of PenState objects @@ -812,10 +692,10 @@ class BaseTest: cur_state = PenState.PEN_IS_OUT_OF_RANGE p = Pen(50, 60) - uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE) - events = self.post(uhdev, p) + uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE, button) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) cur_state = p.current_state @@ -824,18 +704,18 @@ class BaseTest: if scribble and cur_state != PenState.PEN_IS_OUT_OF_RANGE: p.x += 1 p.y -= 1 - events = self.post(uhdev, p) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) assert len(events) >= 3 # X, Y, SYN - uhdev.move_to(p, state) + uhdev.move_to(p, state, button) if scribble and state != PenState.PEN_IS_OUT_OF_RANGE: p.x += 1 p.y -= 1 - events = self.post(uhdev, p) + events = self.post(uhdev, p, button) self.validate_transitions( - cur_state, p, evdev, events, allow_intermediate_states + cur_state, p, evdev, events, allow_intermediate_states, button ) cur_state = p.current_state @@ -874,12 +754,17 @@ class BaseTest: "state_list", [ pytest.param(v, id=k) - for k, v in PenState.legal_transitions_with_primary_button().items() + for k, v in PenState.legal_transitions_with_button().items() ], ) def test_valid_primary_button_pen_states(self, state_list, scribble): """Rework the transition state machine by adding the primary button.""" - self._test_states(state_list, scribble, allow_intermediate_states=False) + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.PRIMARY_PRESSED, + ) @pytest.mark.skip_if_uhdev( lambda uhdev: "Secondary Barrel Switch" not in uhdev.fields, @@ -890,12 +775,17 @@ class BaseTest: "state_list", [ pytest.param(v, id=k) - for k, v in PenState.legal_transitions_with_secondary_button().items() + for k, v in PenState.legal_transitions_with_button().items() ], ) def test_valid_secondary_button_pen_states(self, state_list, scribble): """Rework the transition state machine by adding the secondary button.""" - self._test_states(state_list, scribble, allow_intermediate_states=False) + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.SECONDARY_PRESSED, + ) @pytest.mark.skip_if_uhdev( lambda uhdev: "Invert" not in uhdev.fields, @@ -956,7 +846,7 @@ class BaseTest: class GXTP_pen(PenDigitizer): - def event(self, pen): + def event(self, pen, test_button): if not hasattr(self, "prev_tip_state"): self.prev_tip_state = False @@ -977,7 +867,7 @@ class GXTP_pen(PenDigitizer): if pen.eraser: internal_pen.invert = False - return super().event(internal_pen) + return super().event(internal_pen, test_button) class USIPen(PenDigitizer): -- cgit v1.2.3 From 03899011df4b2bb0f9b3ac57b1044b161a336f31 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:33 +0200 Subject: selftests/hid: tablets: add a couple of XP-PEN tablets Those tablets don't need special initialization, but are reporting the events with the wrong usages: - tip switch is used when the eraser should be used - eraser is used instead of the secondary barrel switch Add tests for those so we don't regress in the future. Currently we set x/y tilt to 0 to not trigger the bpf program compensate_coordinates_by_tilt() Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-13-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/test_tablet.py | 246 +++++++++++++++++++++++ 1 file changed, 246 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index df1134e5713c..e265f1d4e089 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -874,6 +874,229 @@ class USIPen(PenDigitizer): pass +class XPPen_ArtistPro16Gen2_28bd_095b(PenDigitizer): + """ + Pen with two buttons and a rubber end, but which reports + the second button as an eraser + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x28BD, 0x095B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + + def move_to(self, pen, state, button): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.x = 0 + pen.y = 0 + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: + pen.tipswitch = False + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_ERASING: + pen.tipswitch = True + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + + pen.xtilt = 0 + pen.ytilt = 0 + pen.current_state = state + + +class XPPen_Artist24_28bd_093a(PenDigitizer): + """ + Pen that reports secondary barrel switch through eraser + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x28BD, 0x093A), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + self.previous_state = PenState.PEN_IS_OUT_OF_RANGE + + def move_to(self, pen, state, button, debug=True): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + if debug: + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.eraser = button == BtnPressed.SECONDARY_PRESSED + + pen.current_state = state + + def send_intermediate_state(self, pen, state, button): + intermediate_pen = copy.copy(pen) + self.move_to(intermediate_pen, state, button, debug=False) + return super().event(intermediate_pen, button) + + def event(self, pen, button): + rs = [] + + # the pen reliably sends in-range events in a normal case (non emulation of eraser mode) + if self.previous_state == PenState.PEN_IS_IN_CONTACT: + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button) + ) + + if button == BtnPressed.SECONDARY_PRESSED: + if self.previous_state == PenState.PEN_IS_IN_RANGE: + if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + if pen.current_state == PenState.PEN_IS_IN_RANGE: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_CONTACT: + if pen.current_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, button + ) + ) + + if self.previous_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + if pen.current_state == PenState.PEN_IS_IN_CONTACT: + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_OUT_OF_RANGE, button + ) + ) + rs.extend( + self.send_intermediate_state( + pen, PenState.PEN_IS_IN_RANGE, button + ) + ) + + rs.extend(super().event(pen, button)) + self.previous_state = pen.current_state + return rs + + ################################################################################ # # Windows 7 compatible devices @@ -1052,3 +1275,26 @@ class TestGoodix_27c6_0e00(BaseTest.TestTablet): rdesc="05 0d 09 04 a1 01 85 01 09 22 a1 02 55 0e 65 11 35 00 15 00 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 54 15 00 25 7f 75 08 95 01 81 02 85 02 09 55 95 01 25 0a b1 02 85 03 06 00 ff 09 c5 15 00 26 ff 00 75 08 96 00 01 b1 02 c0 05 0d 09 02 a1 01 09 20 a1 00 85 08 05 01 a4 09 30 35 00 46 e6 09 15 00 26 04 20 55 0d 65 13 75 10 95 01 81 02 09 31 46 9a 06 26 60 15 81 02 b4 05 0d 09 38 95 01 75 08 15 00 25 01 81 02 09 30 75 10 26 ff 0f 81 02 09 31 81 02 09 42 09 44 09 5a 09 3c 09 45 09 32 75 01 95 06 25 01 81 02 95 02 81 03 09 3d 55 0e 65 14 36 d8 dc 46 28 23 16 d8 dc 26 28 23 95 01 75 10 81 02 09 3e 81 02 09 41 15 00 27 a0 8c 00 00 35 00 47 a0 8c 00 00 81 02 05 20 0a 53 04 65 00 16 01 f8 26 ff 07 75 10 95 01 81 02 0a 54 04 81 02 0a 55 04 81 02 0a 57 04 81 02 0a 58 04 81 02 0a 59 04 81 02 0a 72 04 81 02 0a 73 04 81 02 0a 74 04 81 02 05 0d 09 3b 15 00 25 64 75 08 81 02 09 5b 25 ff 75 40 81 02 06 00 ff 09 5b 75 20 81 02 05 0d 09 5c 26 ff 00 75 08 81 02 09 5e 81 02 09 70 a1 02 15 01 25 06 09 72 09 73 09 74 09 75 09 76 09 77 81 20 c0 06 00 ff 09 01 15 00 27 ff ff 00 00 75 10 95 01 81 02 85 09 09 81 a1 02 09 81 15 01 25 04 09 82 09 83 09 84 09 85 81 20 c0 85 10 09 5c a1 02 15 00 25 01 75 08 95 01 09 38 b1 02 09 5c 26 ff 00 b1 02 09 5d 75 01 95 01 25 01 b1 02 95 07 b1 03 c0 85 11 09 5e a1 02 09 38 15 00 25 01 75 08 95 01 b1 02 09 5e 26 ff 00 b1 02 09 5f 75 01 25 01 b1 02 75 07 b1 03 c0 85 12 09 70 a1 02 75 08 95 01 15 00 25 01 09 38 b1 02 09 70 a1 02 25 06 09 72 09 73 09 74 09 75 09 76 09 77 b1 20 c0 09 71 75 01 25 01 b1 02 75 07 b1 03 c0 85 13 09 80 15 00 25 ff 75 40 95 01 b1 02 85 14 09 44 a1 02 09 38 75 08 95 01 25 01 b1 02 15 01 25 03 09 44 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 5a a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 45 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 c0 85 15 75 08 95 01 05 0d 09 90 a1 02 09 38 25 01 b1 02 09 91 75 10 26 ff 0f b1 02 09 92 75 40 25 ff b1 02 05 06 09 2a 75 08 26 ff 00 a1 02 09 2d b1 02 09 2e b1 02 c0 c0 85 16 05 06 09 2b a1 02 05 0d 25 01 09 38 b1 02 05 06 09 2b a1 02 09 2d 26 ff 00 b1 02 09 2e b1 02 c0 c0 85 17 06 00 ff 09 01 a1 02 05 0d 09 38 75 08 95 01 25 01 b1 02 06 00 ff 09 01 75 10 27 ff ff 00 00 b1 02 c0 85 18 05 0d 09 38 75 08 95 01 15 00 25 01 b1 02 c0 c0 06 f0 ff 09 01 a1 01 85 0e 09 01 15 00 25 ff 75 08 95 40 91 02 09 01 15 00 25 ff 75 08 95 40 81 02 c0", input_info=(BusType.I2C, 0x27C6, 0x0E00), ) + + +class TestXPPen_ArtistPro16Gen2_28bd_095b(BaseTest.TestTablet): + hid_bpfs = [("XPPen__ArtistPro16Gen2.bpf.o", True)] + + def create_device(self): + dev = XPPen_ArtistPro16Gen2_28bd_095b( + "uhid test XPPen Artist Pro 16 Gen2 28bd 095b", + rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 09 3c 15 00 25 01 75 01 95 04 81 02 95 01 81 03 09 32 15 00 25 01 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 ff 34 26 ff 7f 81 02 09 31 46 20 21 26 ff 7f 81 02 b4 09 30 45 00 26 ff 3f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0", + input_info=(BusType.USB, 0x28BD, 0x095B), + ) + return dev + + +class TestXPPen_Artist24_28bd_093a(BaseTest.TestTablet): + hid_bpfs = [("XPPen__Artist24.bpf.o", True)] + + def create_device(self): + return XPPen_Artist24_28bd_093a( + "uhid test XPPen Artist 24 28bd 093a", + rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 15 00 25 01 75 01 95 03 81 02 95 02 81 03 09 32 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 f0 50 26 ff 7f 81 02 09 31 46 91 2d 26 ff 7f 81 02 b4 09 30 45 00 26 ff 1f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0", + input_info=(BusType.USB, 0x28BD, 0x093A), + ) -- cgit v1.2.3 From 1b2c3caf7839adff892d8397995803d93e347974 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:34 +0200 Subject: selftests/hid: tablets: also check for XP-Pen offset correction The values are taken from the HID-BPF file. Basically we are recomputing the array provided there. Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-14-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/test_tablet.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index e265f1d4e089..ae0eda9cd3d8 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -957,10 +957,24 @@ class XPPen_ArtistPro16Gen2_28bd_095b(PenDigitizer): pen.eraser = False pen.barrelswitch = False - pen.xtilt = 0 - pen.ytilt = 0 pen.current_state = state + def event(self, pen, test_button): + import math + + pen_copy = copy.copy(pen) + width = 13.567 + height = 8.480 + tip_height = 0.055677699 + hx = tip_height * (32767 / width) + hy = tip_height * (32767 / height) + if pen_copy.xtilt != 0: + pen_copy.x += round(hx * math.sin(math.radians(pen_copy.xtilt))) + if pen_copy.ytilt != 0: + pen_copy.y += round(hy * math.sin(math.radians(pen_copy.ytilt))) + + return super().event(pen_copy, test_button) + class XPPen_Artist24_28bd_093a(PenDigitizer): """ -- cgit v1.2.3 From 51de9ee0a6c7f0d06fa7b80ff2ef9f3f661c3eb6 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:35 +0200 Subject: selftests/hid: add Huion Kamvas Pro 19 tests This tablets gets a lot of things wrong: - the secondary button is reported through Secondary Tip Switch - the third button is reported through Invert We need to add some out of proximity intermediate state when moving back and forth with the eraser mode as it can only be triggered by physically returning the pen, meaning that the tolerated transitions can never happen. Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-15-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/test_tablet.py | 191 +++++++++++++++++++++++ 1 file changed, 191 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index ae0eda9cd3d8..a9e2de1e8861 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -35,6 +35,7 @@ class BtnPressed(Enum): PRIMARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS SECONDARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS2 + THIRD_PRESSED = libevdev.EV_KEY.BTN_STYLUS3 class PenState(Enum): @@ -503,6 +504,7 @@ class Pen(object): buttons = [ BtnPressed.PRIMARY_PRESSED, BtnPressed.SECONDARY_PRESSED, + BtnPressed.THIRD_PRESSED, ] if button is not None: buttons.remove(button) @@ -787,6 +789,27 @@ class BaseTest: button=BtnPressed.SECONDARY_PRESSED, ) + @pytest.mark.skip_if_uhdev( + lambda uhdev: "Third Barrel Switch" not in uhdev.fields, + "Device not compatible, missing Third Barrel Switch usage", + ) + @pytest.mark.parametrize("scribble", [True, False], ids=["scribble", "static"]) + @pytest.mark.parametrize( + "state_list", + [ + pytest.param(v, id=k) + for k, v in PenState.legal_transitions_with_button().items() + ], + ) + def test_valid_third_button_pen_states(self, state_list, scribble): + """Rework the transition state machine by adding the secondary button.""" + self._test_states( + state_list, + scribble, + allow_intermediate_states=False, + button=BtnPressed.THIRD_PRESSED, + ) + @pytest.mark.skip_if_uhdev( lambda uhdev: "Invert" not in uhdev.fields, "Device not compatible, missing Invert usage", @@ -1111,6 +1134,163 @@ class XPPen_Artist24_28bd_093a(PenDigitizer): return rs +class Huion_Kamvas_Pro_19_256c_006b(PenDigitizer): + """ + Pen that reports secondary barrel switch through secondary TipSwtich + and 3rd button through Invert + """ + + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Stylus", + physical=None, + input_info=(BusType.USB, 0x256C, 0x006B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + self.fields.append("Secondary Barrel Switch") + self.fields.append("Third Barrel Switch") + self.previous_state = PenState.PEN_IS_OUT_OF_RANGE + + def move_to(self, pen, state, button, debug=True): + # fill in the previous values + if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE: + pen.restore() + + if debug: + print(f"\n *** pen is moving to {state} ***") + + if state == PenState.PEN_IS_OUT_OF_RANGE: + pen.backup() + pen.tipswitch = False + pen.tippressure = 0 + pen.azimuth = 0 + pen.inrange = False + pen.width = 0 + pen.height = 0 + pen.invert = False + pen.eraser = False + pen.xtilt = 0 + pen.ytilt = 0 + pen.twist = 0 + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_RANGE: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_CONTACT: + pen.tipswitch = True + pen.inrange = True + pen.invert = False + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + pen.tipswitch = False + pen.inrange = True + pen.eraser = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED + pen.invert = button == BtnPressed.THIRD_PRESSED + elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON: + pen.tipswitch = True + pen.inrange = True + pen.eraser = False + assert button is not None + pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED + pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED + pen.invert = button == BtnPressed.THIRD_PRESSED + elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT: + pen.tipswitch = False + pen.inrange = True + pen.invert = True + pen.eraser = False + pen.barrelswitch = False + pen.secondarytipswitch = False + elif state == PenState.PEN_IS_ERASING: + pen.tipswitch = False + pen.inrange = True + pen.invert = False + pen.eraser = True + pen.barrelswitch = False + pen.secondarytipswitch = False + + pen.current_state = state + + def call_input_event(self, report): + if report[0] == 0x0a: + # ensures the original second Eraser usage is null + report[1] &= 0xdf + + # ensures the original last bit is equal to bit 6 (In Range) + if report[1] & 0x40: + report[1] |= 0x80 + + super().call_input_event(report) + + def send_intermediate_state(self, pen, state, test_button): + intermediate_pen = copy.copy(pen) + self.move_to(intermediate_pen, state, test_button, debug=False) + return super().event(intermediate_pen, test_button) + + def event(self, pen, button): + rs = [] + + # it's not possible to go between eraser mode or not without + # going out-of-prox: the eraser mode is activated by presenting + # the tail of the pen + if self.previous_state in ( + PenState.PEN_IS_IN_RANGE, + PenState.PEN_IS_IN_RANGE_WITH_BUTTON, + PenState.PEN_IS_IN_CONTACT, + PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, + ) and pen.current_state in ( + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON, + PenState.PEN_IS_ERASING, + PenState.PEN_IS_ERASING_WITH_BUTTON, + ): + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button) + ) + + # same than above except from eraser to normal + if self.previous_state in ( + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT, + PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON, + PenState.PEN_IS_ERASING, + PenState.PEN_IS_ERASING_WITH_BUTTON, + ) and pen.current_state in ( + PenState.PEN_IS_IN_RANGE, + PenState.PEN_IS_IN_RANGE_WITH_BUTTON, + PenState.PEN_IS_IN_CONTACT, + PenState.PEN_IS_IN_CONTACT_WITH_BUTTON, + ): + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button) + ) + + if self.previous_state == PenState.PEN_IS_OUT_OF_RANGE: + if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON: + rs.extend( + self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button) + ) + + rs.extend(super().event(pen, button)) + self.previous_state = pen.current_state + return rs + + ################################################################################ # # Windows 7 compatible devices @@ -1312,3 +1492,14 @@ class TestXPPen_Artist24_28bd_093a(BaseTest.TestTablet): rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 15 00 25 01 75 01 95 03 81 02 95 02 81 03 09 32 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 f0 50 26 ff 7f 81 02 09 31 46 91 2d 26 ff 7f 81 02 b4 09 30 45 00 26 ff 1f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0", input_info=(BusType.USB, 0x28BD, 0x093A), ) + + +class TestHuion_Kamvas_Pro_19_256c_006b(BaseTest.TestTablet): + hid_bpfs = [("Huion__Kamvas-Pro-19.bpf.o", True)] + + def create_device(self): + return Huion_Kamvas_Pro_19_256c_006b( + "uhid test HUION Huion Tablet_GT1902", + rdesc="05 0d 09 02 a1 01 85 0a 09 20 a1 01 09 42 09 44 09 43 09 3c 09 45 15 00 25 01 75 01 95 06 81 02 09 32 75 01 95 01 81 02 81 03 05 01 09 30 09 31 55 0d 65 33 26 ff 7f 35 00 46 00 08 75 10 95 02 81 02 05 0d 09 30 26 ff 3f 75 10 95 01 81 02 09 3d 09 3e 15 a6 25 5a 75 08 95 02 81 02 c0 c0 05 0d 09 04 a1 01 85 04 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 56 55 00 65 00 27 ff ff ff 7f 95 01 75 20 81 02 09 54 25 7f 95 01 75 08 81 02 75 08 95 08 81 03 85 05 09 55 25 0a 75 08 95 01 b1 02 06 00 ff 09 c5 85 06 15 00 26 ff 00 75 08 96 00 01 b1 02 c0", + input_info=(BusType.USB, 0x256C, 0x006B), + ) -- cgit v1.2.3 From c6b03c736a523902bb53bb9897f5c75292b3424b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:36 +0200 Subject: selftests/hid: import base_gamepad.py from hid-tools We need to slightly change base_device.py for supporting HID-BPF, so instead of monkey patching, let's just embed it in the kernel tree. Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-16-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/base_gamepad.py | 238 ++++++++++++++++++++++ tools/testing/selftests/hid/tests/test_gamepad.py | 5 +- 2 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/hid/tests/base_gamepad.py (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/base_gamepad.py b/tools/testing/selftests/hid/tests/base_gamepad.py new file mode 100644 index 000000000000..ec74d75767a2 --- /dev/null +++ b/tools/testing/selftests/hid/tests/base_gamepad.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: GPL-2.0 +import libevdev + +from .base_device import BaseDevice +from hidtools.util import BusType + + +class InvalidHIDCommunication(Exception): + pass + + +class GamepadData(object): + pass + + +class AxisMapping(object): + """Represents a mapping between a HID type + and an evdev event""" + + def __init__(self, hid, evdev=None): + self.hid = hid.lower() + + if evdev is None: + evdev = f"ABS_{hid.upper()}" + + self.evdev = libevdev.evbit("EV_ABS", evdev) + + +class BaseGamepad(BaseDevice): + buttons_map = { + 1: "BTN_SOUTH", + 2: "BTN_EAST", + 3: "BTN_C", + 4: "BTN_NORTH", + 5: "BTN_WEST", + 6: "BTN_Z", + 7: "BTN_TL", + 8: "BTN_TR", + 9: "BTN_TL2", + 10: "BTN_TR2", + 11: "BTN_SELECT", + 12: "BTN_START", + 13: "BTN_MODE", + 14: "BTN_THUMBL", + 15: "BTN_THUMBR", + } + + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("z"), + "y": AxisMapping("Rz"), + }, + } + + def __init__(self, rdesc, application="Game Pad", name=None, input_info=None): + assert rdesc is not None + super().__init__(name, application, input_info=input_info, rdesc=rdesc) + self.buttons = (1, 2, 3) + self._buttons = {} + self.left = (127, 127) + self.right = (127, 127) + self.hat_switch = 15 + assert self.parsed_rdesc is not None + + self.fields = [] + for r in self.parsed_rdesc.input_reports.values(): + if r.application_name == self.application: + self.fields.extend([f.usage_name for f in r]) + + def store_axes(self, which, gamepad, data): + amap = self.axes_map[which] + x, y = data + setattr(gamepad, amap["x"].hid, x) + setattr(gamepad, amap["y"].hid, y) + + def create_report( + self, + *, + left=(None, None), + right=(None, None), + hat_switch=None, + buttons=None, + reportID=None, + application="Game Pad", + ): + """ + Return an input report for this device. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + (expressed in 1/8 of circle, 0 being North, 2 East) + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + :param reportID: the numeric report ID for this report, if needed + :param application: the application used to report the values + """ + if buttons is not None: + for i, b in buttons.items(): + if i not in self.buttons: + raise InvalidHIDCommunication( + f"button {i} is not part of this {self.application}" + ) + if b is not None: + self._buttons[i] = b + + def replace_none_in_tuple(item, default): + if item is None: + item = (None, None) + + if None in item: + if item[0] is None: + item = (default[0], item[1]) + if item[1] is None: + item = (item[0], default[1]) + + return item + + right = replace_none_in_tuple(right, self.right) + self.right = right + left = replace_none_in_tuple(left, self.left) + self.left = left + + if hat_switch is None: + hat_switch = self.hat_switch + else: + self.hat_switch = hat_switch + + reportID = reportID or self.default_reportID + + gamepad = GamepadData() + for i, b in self._buttons.items(): + gamepad.__setattr__(f"b{i}", int(b) if b is not None else 0) + + self.store_axes("left_stick", gamepad, left) + self.store_axes("right_stick", gamepad, right) + gamepad.hatswitch = hat_switch # type: ignore ### gamepad is by default empty + return super().create_report( + gamepad, reportID=reportID, application=application + ) + + def event( + self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None + ): + """ + Send an input event on the default report ID. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + """ + r = self.create_report( + left=left, right=right, hat_switch=hat_switch, buttons=buttons + ) + self.call_input_event(r) + return [r] + + +class JoystickGamepad(BaseGamepad): + buttons_map = { + 1: "BTN_TRIGGER", + 2: "BTN_THUMB", + 3: "BTN_THUMB2", + 4: "BTN_TOP", + 5: "BTN_TOP2", + 6: "BTN_PINKIE", + 7: "BTN_BASE", + 8: "BTN_BASE2", + 9: "BTN_BASE3", + 10: "BTN_BASE4", + 11: "BTN_BASE5", + 12: "BTN_BASE6", + 13: "BTN_DEAD", + } + + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("rudder"), + "y": AxisMapping("throttle"), + }, + } + + def __init__(self, rdesc, application="Joystick", name=None, input_info=None): + super().__init__(rdesc, application, name, input_info) + + def create_report( + self, + *, + left=(None, None), + right=(None, None), + hat_switch=None, + buttons=None, + reportID=None, + application=None, + ): + """ + Return an input report for this device. + + :param left: a tuple of absolute (x, y) value of the left joypad + where ``None`` is "leave unchanged" + :param right: a tuple of absolute (x, y) value of the right joypad + where ``None`` is "leave unchanged" + :param hat_switch: an absolute angular value of the hat switch + where ``None`` is "leave unchanged" + :param buttons: a dict of index/bool for the button states, + where ``None`` is "leave unchanged" + :param reportID: the numeric report ID for this report, if needed + :param application: the application for this report, if needed + """ + if application is None: + application = "Joystick" + return super().create_report( + left=left, + right=right, + hat_switch=hat_switch, + buttons=buttons, + reportID=reportID, + application=application, + ) + + def store_right_joystick(self, gamepad, data): + gamepad.rudder, gamepad.throttle = data diff --git a/tools/testing/selftests/hid/tests/test_gamepad.py b/tools/testing/selftests/hid/tests/test_gamepad.py index 26c74040b796..900fff044348 100644 --- a/tools/testing/selftests/hid/tests/test_gamepad.py +++ b/tools/testing/selftests/hid/tests/test_gamepad.py @@ -10,7 +10,10 @@ from . import base import libevdev import pytest -from hidtools.device.base_gamepad import AsusGamepad, SaitekGamepad +from .base_gamepad import ( + AsusGamepad, + SaitekGamepad, +) import logging -- cgit v1.2.3 From aa7e560454a90d4fe9924500f1ae2a3779806b85 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:37 +0200 Subject: selftests/hid: move the gamepads definitions in the test file More in line with the other test_* files. No code change Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-17-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/test_gamepad.py | 415 +++++++++++++++++++++- 1 file changed, 411 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/test_gamepad.py b/tools/testing/selftests/hid/tests/test_gamepad.py index 900fff044348..bd30dadbeb7d 100644 --- a/tools/testing/selftests/hid/tests/test_gamepad.py +++ b/tools/testing/selftests/hid/tests/test_gamepad.py @@ -10,10 +10,8 @@ from . import base import libevdev import pytest -from .base_gamepad import ( - AsusGamepad, - SaitekGamepad, -) +from .base_gamepad import BaseGamepad, JoystickGamepad +from hidtools.util import BusType import logging @@ -202,6 +200,415 @@ class BaseTest: ) +class SaitekGamepad(JoystickGamepad): + # fmt: off + report_descriptor = [ + 0x05, 0x01, # Usage Page (Generic Desktop) 0 + 0x09, 0x04, # Usage (Joystick) 2 + 0xa1, 0x01, # Collection (Application) 4 + 0x09, 0x01, # .Usage (Pointer) 6 + 0xa1, 0x00, # .Collection (Physical) 8 + 0x85, 0x01, # ..Report ID (1) 10 + 0x09, 0x30, # ..Usage (X) 12 + 0x15, 0x00, # ..Logical Minimum (0) 14 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 16 + 0x35, 0x00, # ..Physical Minimum (0) 19 + 0x46, 0xff, 0x00, # ..Physical Maximum (255) 21 + 0x75, 0x08, # ..Report Size (8) 24 + 0x95, 0x01, # ..Report Count (1) 26 + 0x81, 0x02, # ..Input (Data,Var,Abs) 28 + 0x09, 0x31, # ..Usage (Y) 30 + 0x81, 0x02, # ..Input (Data,Var,Abs) 32 + 0x05, 0x02, # ..Usage Page (Simulation Controls) 34 + 0x09, 0xba, # ..Usage (Rudder) 36 + 0x81, 0x02, # ..Input (Data,Var,Abs) 38 + 0x09, 0xbb, # ..Usage (Throttle) 40 + 0x81, 0x02, # ..Input (Data,Var,Abs) 42 + 0x05, 0x09, # ..Usage Page (Button) 44 + 0x19, 0x01, # ..Usage Minimum (1) 46 + 0x29, 0x0c, # ..Usage Maximum (12) 48 + 0x25, 0x01, # ..Logical Maximum (1) 50 + 0x45, 0x01, # ..Physical Maximum (1) 52 + 0x75, 0x01, # ..Report Size (1) 54 + 0x95, 0x0c, # ..Report Count (12) 56 + 0x81, 0x02, # ..Input (Data,Var,Abs) 58 + 0x95, 0x01, # ..Report Count (1) 60 + 0x75, 0x00, # ..Report Size (0) 62 + 0x81, 0x03, # ..Input (Cnst,Var,Abs) 64 + 0x05, 0x01, # ..Usage Page (Generic Desktop) 66 + 0x09, 0x39, # ..Usage (Hat switch) 68 + 0x25, 0x07, # ..Logical Maximum (7) 70 + 0x46, 0x3b, 0x01, # ..Physical Maximum (315) 72 + 0x55, 0x00, # ..Unit Exponent (0) 75 + 0x65, 0x44, # ..Unit (Degrees^4,EngRotation) 77 + 0x75, 0x04, # ..Report Size (4) 79 + 0x81, 0x42, # ..Input (Data,Var,Abs,Null) 81 + 0x65, 0x00, # ..Unit (None) 83 + 0xc0, # .End Collection 85 + 0x05, 0x0f, # .Usage Page (Vendor Usage Page 0x0f) 86 + 0x09, 0x92, # .Usage (Vendor Usage 0x92) 88 + 0xa1, 0x02, # .Collection (Logical) 90 + 0x85, 0x02, # ..Report ID (2) 92 + 0x09, 0xa0, # ..Usage (Vendor Usage 0xa0) 94 + 0x09, 0x9f, # ..Usage (Vendor Usage 0x9f) 96 + 0x25, 0x01, # ..Logical Maximum (1) 98 + 0x45, 0x00, # ..Physical Maximum (0) 100 + 0x75, 0x01, # ..Report Size (1) 102 + 0x95, 0x02, # ..Report Count (2) 104 + 0x81, 0x02, # ..Input (Data,Var,Abs) 106 + 0x75, 0x06, # ..Report Size (6) 108 + 0x95, 0x01, # ..Report Count (1) 110 + 0x81, 0x03, # ..Input (Cnst,Var,Abs) 112 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 114 + 0x75, 0x07, # ..Report Size (7) 116 + 0x25, 0x7f, # ..Logical Maximum (127) 118 + 0x81, 0x02, # ..Input (Data,Var,Abs) 120 + 0x09, 0x94, # ..Usage (Vendor Usage 0x94) 122 + 0x75, 0x01, # ..Report Size (1) 124 + 0x25, 0x01, # ..Logical Maximum (1) 126 + 0x81, 0x02, # ..Input (Data,Var,Abs) 128 + 0xc0, # .End Collection 130 + 0x09, 0x21, # .Usage (Vendor Usage 0x21) 131 + 0xa1, 0x02, # .Collection (Logical) 133 + 0x85, 0x0b, # ..Report ID (11) 135 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 137 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 139 + 0x75, 0x08, # ..Report Size (8) 142 + 0x91, 0x02, # ..Output (Data,Var,Abs) 144 + 0x09, 0x53, # ..Usage (Vendor Usage 0x53) 146 + 0x25, 0x0a, # ..Logical Maximum (10) 148 + 0x91, 0x02, # ..Output (Data,Var,Abs) 150 + 0x09, 0x50, # ..Usage (Vendor Usage 0x50) 152 + 0x27, 0xfe, 0xff, 0x00, 0x00, # ..Logical Maximum (65534) 154 + 0x47, 0xfe, 0xff, 0x00, 0x00, # ..Physical Maximum (65534) 159 + 0x75, 0x10, # ..Report Size (16) 164 + 0x55, 0xfd, # ..Unit Exponent (237) 166 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 168 + 0x91, 0x02, # ..Output (Data,Var,Abs) 171 + 0x55, 0x00, # ..Unit Exponent (0) 173 + 0x65, 0x00, # ..Unit (None) 175 + 0x09, 0x54, # ..Usage (Vendor Usage 0x54) 177 + 0x55, 0xfd, # ..Unit Exponent (237) 179 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 181 + 0x91, 0x02, # ..Output (Data,Var,Abs) 184 + 0x55, 0x00, # ..Unit Exponent (0) 186 + 0x65, 0x00, # ..Unit (None) 188 + 0x09, 0xa7, # ..Usage (Vendor Usage 0xa7) 190 + 0x55, 0xfd, # ..Unit Exponent (237) 192 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 194 + 0x91, 0x02, # ..Output (Data,Var,Abs) 197 + 0x55, 0x00, # ..Unit Exponent (0) 199 + 0x65, 0x00, # ..Unit (None) 201 + 0xc0, # .End Collection 203 + 0x09, 0x5a, # .Usage (Vendor Usage 0x5a) 204 + 0xa1, 0x02, # .Collection (Logical) 206 + 0x85, 0x0c, # ..Report ID (12) 208 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 210 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 212 + 0x45, 0x00, # ..Physical Maximum (0) 215 + 0x75, 0x08, # ..Report Size (8) 217 + 0x91, 0x02, # ..Output (Data,Var,Abs) 219 + 0x09, 0x5c, # ..Usage (Vendor Usage 0x5c) 221 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 223 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 226 + 0x75, 0x10, # ..Report Size (16) 229 + 0x55, 0xfd, # ..Unit Exponent (237) 231 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 233 + 0x91, 0x02, # ..Output (Data,Var,Abs) 236 + 0x55, 0x00, # ..Unit Exponent (0) 238 + 0x65, 0x00, # ..Unit (None) 240 + 0x09, 0x5b, # ..Usage (Vendor Usage 0x5b) 242 + 0x25, 0x7f, # ..Logical Maximum (127) 244 + 0x75, 0x08, # ..Report Size (8) 246 + 0x91, 0x02, # ..Output (Data,Var,Abs) 248 + 0x09, 0x5e, # ..Usage (Vendor Usage 0x5e) 250 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 252 + 0x75, 0x10, # ..Report Size (16) 255 + 0x55, 0xfd, # ..Unit Exponent (237) 257 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 259 + 0x91, 0x02, # ..Output (Data,Var,Abs) 262 + 0x55, 0x00, # ..Unit Exponent (0) 264 + 0x65, 0x00, # ..Unit (None) 266 + 0x09, 0x5d, # ..Usage (Vendor Usage 0x5d) 268 + 0x25, 0x7f, # ..Logical Maximum (127) 270 + 0x75, 0x08, # ..Report Size (8) 272 + 0x91, 0x02, # ..Output (Data,Var,Abs) 274 + 0xc0, # .End Collection 276 + 0x09, 0x73, # .Usage (Vendor Usage 0x73) 277 + 0xa1, 0x02, # .Collection (Logical) 279 + 0x85, 0x0d, # ..Report ID (13) 281 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 283 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 285 + 0x45, 0x00, # ..Physical Maximum (0) 288 + 0x91, 0x02, # ..Output (Data,Var,Abs) 290 + 0x09, 0x70, # ..Usage (Vendor Usage 0x70) 292 + 0x15, 0x81, # ..Logical Minimum (-127) 294 + 0x25, 0x7f, # ..Logical Maximum (127) 296 + 0x36, 0xf0, 0xd8, # ..Physical Minimum (-10000) 298 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 301 + 0x91, 0x02, # ..Output (Data,Var,Abs) 304 + 0xc0, # .End Collection 306 + 0x09, 0x6e, # .Usage (Vendor Usage 0x6e) 307 + 0xa1, 0x02, # .Collection (Logical) 309 + 0x85, 0x0e, # ..Report ID (14) 311 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 313 + 0x15, 0x00, # ..Logical Minimum (0) 315 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 317 + 0x35, 0x00, # ..Physical Minimum (0) 320 + 0x45, 0x00, # ..Physical Maximum (0) 322 + 0x91, 0x02, # ..Output (Data,Var,Abs) 324 + 0x09, 0x70, # ..Usage (Vendor Usage 0x70) 326 + 0x25, 0x7f, # ..Logical Maximum (127) 328 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 330 + 0x91, 0x02, # ..Output (Data,Var,Abs) 333 + 0x09, 0x6f, # ..Usage (Vendor Usage 0x6f) 335 + 0x15, 0x81, # ..Logical Minimum (-127) 337 + 0x36, 0xf0, 0xd8, # ..Physical Minimum (-10000) 339 + 0x91, 0x02, # ..Output (Data,Var,Abs) 342 + 0x09, 0x71, # ..Usage (Vendor Usage 0x71) 344 + 0x15, 0x00, # ..Logical Minimum (0) 346 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 348 + 0x35, 0x00, # ..Physical Minimum (0) 351 + 0x46, 0x68, 0x01, # ..Physical Maximum (360) 353 + 0x91, 0x02, # ..Output (Data,Var,Abs) 356 + 0x09, 0x72, # ..Usage (Vendor Usage 0x72) 358 + 0x75, 0x10, # ..Report Size (16) 360 + 0x26, 0x10, 0x27, # ..Logical Maximum (10000) 362 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 365 + 0x55, 0xfd, # ..Unit Exponent (237) 368 + 0x66, 0x01, 0x10, # ..Unit (Seconds,SILinear) 370 + 0x91, 0x02, # ..Output (Data,Var,Abs) 373 + 0x55, 0x00, # ..Unit Exponent (0) 375 + 0x65, 0x00, # ..Unit (None) 377 + 0xc0, # .End Collection 379 + 0x09, 0x77, # .Usage (Vendor Usage 0x77) 380 + 0xa1, 0x02, # .Collection (Logical) 382 + 0x85, 0x51, # ..Report ID (81) 384 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 386 + 0x25, 0x7f, # ..Logical Maximum (127) 388 + 0x45, 0x00, # ..Physical Maximum (0) 390 + 0x75, 0x08, # ..Report Size (8) 392 + 0x91, 0x02, # ..Output (Data,Var,Abs) 394 + 0x09, 0x78, # ..Usage (Vendor Usage 0x78) 396 + 0xa1, 0x02, # ..Collection (Logical) 398 + 0x09, 0x7b, # ...Usage (Vendor Usage 0x7b) 400 + 0x09, 0x79, # ...Usage (Vendor Usage 0x79) 402 + 0x09, 0x7a, # ...Usage (Vendor Usage 0x7a) 404 + 0x15, 0x01, # ...Logical Minimum (1) 406 + 0x25, 0x03, # ...Logical Maximum (3) 408 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 410 + 0xc0, # ..End Collection 412 + 0x09, 0x7c, # ..Usage (Vendor Usage 0x7c) 413 + 0x15, 0x00, # ..Logical Minimum (0) 415 + 0x26, 0xfe, 0x00, # ..Logical Maximum (254) 417 + 0x91, 0x02, # ..Output (Data,Var,Abs) 420 + 0xc0, # .End Collection 422 + 0x09, 0x92, # .Usage (Vendor Usage 0x92) 423 + 0xa1, 0x02, # .Collection (Logical) 425 + 0x85, 0x52, # ..Report ID (82) 427 + 0x09, 0x96, # ..Usage (Vendor Usage 0x96) 429 + 0xa1, 0x02, # ..Collection (Logical) 431 + 0x09, 0x9a, # ...Usage (Vendor Usage 0x9a) 433 + 0x09, 0x99, # ...Usage (Vendor Usage 0x99) 435 + 0x09, 0x97, # ...Usage (Vendor Usage 0x97) 437 + 0x09, 0x98, # ...Usage (Vendor Usage 0x98) 439 + 0x09, 0x9b, # ...Usage (Vendor Usage 0x9b) 441 + 0x09, 0x9c, # ...Usage (Vendor Usage 0x9c) 443 + 0x15, 0x01, # ...Logical Minimum (1) 445 + 0x25, 0x06, # ...Logical Maximum (6) 447 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 449 + 0xc0, # ..End Collection 451 + 0xc0, # .End Collection 452 + 0x05, 0xff, # .Usage Page (Vendor Usage Page 0xff) 453 + 0x0a, 0x01, 0x03, # .Usage (Vendor Usage 0x301) 455 + 0xa1, 0x02, # .Collection (Logical) 458 + 0x85, 0x40, # ..Report ID (64) 460 + 0x0a, 0x02, 0x03, # ..Usage (Vendor Usage 0x302) 462 + 0xa1, 0x02, # ..Collection (Logical) 465 + 0x1a, 0x11, 0x03, # ...Usage Minimum (785) 467 + 0x2a, 0x20, 0x03, # ...Usage Maximum (800) 470 + 0x25, 0x10, # ...Logical Maximum (16) 473 + 0x91, 0x00, # ...Output (Data,Arr,Abs) 475 + 0xc0, # ..End Collection 477 + 0x0a, 0x03, 0x03, # ..Usage (Vendor Usage 0x303) 478 + 0x15, 0x00, # ..Logical Minimum (0) 481 + 0x27, 0xff, 0xff, 0x00, 0x00, # ..Logical Maximum (65535) 483 + 0x75, 0x10, # ..Report Size (16) 488 + 0x91, 0x02, # ..Output (Data,Var,Abs) 490 + 0xc0, # .End Collection 492 + 0x05, 0x0f, # .Usage Page (Vendor Usage Page 0x0f) 493 + 0x09, 0x7d, # .Usage (Vendor Usage 0x7d) 495 + 0xa1, 0x02, # .Collection (Logical) 497 + 0x85, 0x43, # ..Report ID (67) 499 + 0x09, 0x7e, # ..Usage (Vendor Usage 0x7e) 501 + 0x26, 0x80, 0x00, # ..Logical Maximum (128) 503 + 0x46, 0x10, 0x27, # ..Physical Maximum (10000) 506 + 0x75, 0x08, # ..Report Size (8) 509 + 0x91, 0x02, # ..Output (Data,Var,Abs) 511 + 0xc0, # .End Collection 513 + 0x09, 0x7f, # .Usage (Vendor Usage 0x7f) 514 + 0xa1, 0x02, # .Collection (Logical) 516 + 0x85, 0x0b, # ..Report ID (11) 518 + 0x09, 0x80, # ..Usage (Vendor Usage 0x80) 520 + 0x26, 0xff, 0x7f, # ..Logical Maximum (32767) 522 + 0x45, 0x00, # ..Physical Maximum (0) 525 + 0x75, 0x0f, # ..Report Size (15) 527 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 529 + 0x09, 0xa9, # ..Usage (Vendor Usage 0xa9) 531 + 0x25, 0x01, # ..Logical Maximum (1) 533 + 0x75, 0x01, # ..Report Size (1) 535 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 537 + 0x09, 0x83, # ..Usage (Vendor Usage 0x83) 539 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 541 + 0x75, 0x08, # ..Report Size (8) 544 + 0xb1, 0x03, # ..Feature (Cnst,Var,Abs) 546 + 0xc0, # .End Collection 548 + 0x09, 0xab, # .Usage (Vendor Usage 0xab) 549 + 0xa1, 0x03, # .Collection (Report) 551 + 0x85, 0x15, # ..Report ID (21) 553 + 0x09, 0x25, # ..Usage (Vendor Usage 0x25) 555 + 0xa1, 0x02, # ..Collection (Logical) 557 + 0x09, 0x26, # ...Usage (Vendor Usage 0x26) 559 + 0x09, 0x30, # ...Usage (Vendor Usage 0x30) 561 + 0x09, 0x32, # ...Usage (Vendor Usage 0x32) 563 + 0x09, 0x31, # ...Usage (Vendor Usage 0x31) 565 + 0x09, 0x33, # ...Usage (Vendor Usage 0x33) 567 + 0x09, 0x34, # ...Usage (Vendor Usage 0x34) 569 + 0x15, 0x01, # ...Logical Minimum (1) 571 + 0x25, 0x06, # ...Logical Maximum (6) 573 + 0xb1, 0x00, # ...Feature (Data,Arr,Abs) 575 + 0xc0, # ..End Collection 577 + 0xc0, # .End Collection 578 + 0x09, 0x89, # .Usage (Vendor Usage 0x89) 579 + 0xa1, 0x03, # .Collection (Report) 581 + 0x85, 0x16, # ..Report ID (22) 583 + 0x09, 0x8b, # ..Usage (Vendor Usage 0x8b) 585 + 0xa1, 0x02, # ..Collection (Logical) 587 + 0x09, 0x8c, # ...Usage (Vendor Usage 0x8c) 589 + 0x09, 0x8d, # ...Usage (Vendor Usage 0x8d) 591 + 0x09, 0x8e, # ...Usage (Vendor Usage 0x8e) 593 + 0x25, 0x03, # ...Logical Maximum (3) 595 + 0xb1, 0x00, # ...Feature (Data,Arr,Abs) 597 + 0xc0, # ..End Collection 599 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 600 + 0x15, 0x00, # ..Logical Minimum (0) 602 + 0x26, 0xfe, 0x00, # ..Logical Maximum (254) 604 + 0xb1, 0x02, # ..Feature (Data,Var,Abs) 607 + 0xc0, # .End Collection 609 + 0x09, 0x90, # .Usage (Vendor Usage 0x90) 610 + 0xa1, 0x03, # .Collection (Report) 612 + 0x85, 0x50, # ..Report ID (80) 614 + 0x09, 0x22, # ..Usage (Vendor Usage 0x22) 616 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 618 + 0x91, 0x02, # ..Output (Data,Var,Abs) 621 + 0xc0, # .End Collection 623 + 0xc0, # End Collection 624 + ] + # fmt: on + + def __init__(self, rdesc=report_descriptor, name=None): + super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x06A3, 0xFF0D)) + self.buttons = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) + + +class AsusGamepad(BaseGamepad): + # fmt: off + report_descriptor = [ + 0x05, 0x01, # Usage Page (Generic Desktop) 0 + 0x09, 0x05, # Usage (Game Pad) 2 + 0xa1, 0x01, # Collection (Application) 4 + 0x85, 0x01, # .Report ID (1) 6 + 0x05, 0x09, # .Usage Page (Button) 8 + 0x0a, 0x01, 0x00, # .Usage (Vendor Usage 0x01) 10 + 0x0a, 0x02, 0x00, # .Usage (Vendor Usage 0x02) 13 + 0x0a, 0x04, 0x00, # .Usage (Vendor Usage 0x04) 16 + 0x0a, 0x05, 0x00, # .Usage (Vendor Usage 0x05) 19 + 0x0a, 0x07, 0x00, # .Usage (Vendor Usage 0x07) 22 + 0x0a, 0x08, 0x00, # .Usage (Vendor Usage 0x08) 25 + 0x0a, 0x0e, 0x00, # .Usage (Vendor Usage 0x0e) 28 + 0x0a, 0x0f, 0x00, # .Usage (Vendor Usage 0x0f) 31 + 0x0a, 0x0d, 0x00, # .Usage (Vendor Usage 0x0d) 34 + 0x05, 0x0c, # .Usage Page (Consumer Devices) 37 + 0x0a, 0x24, 0x02, # .Usage (AC Back) 39 + 0x0a, 0x23, 0x02, # .Usage (AC Home) 42 + 0x15, 0x00, # .Logical Minimum (0) 45 + 0x25, 0x01, # .Logical Maximum (1) 47 + 0x75, 0x01, # .Report Size (1) 49 + 0x95, 0x0b, # .Report Count (11) 51 + 0x81, 0x02, # .Input (Data,Var,Abs) 53 + 0x75, 0x01, # .Report Size (1) 55 + 0x95, 0x01, # .Report Count (1) 57 + 0x81, 0x03, # .Input (Cnst,Var,Abs) 59 + 0x05, 0x01, # .Usage Page (Generic Desktop) 61 + 0x75, 0x04, # .Report Size (4) 63 + 0x95, 0x01, # .Report Count (1) 65 + 0x25, 0x07, # .Logical Maximum (7) 67 + 0x46, 0x3b, 0x01, # .Physical Maximum (315) 69 + 0x66, 0x14, 0x00, # .Unit (Degrees,EngRotation) 72 + 0x09, 0x39, # .Usage (Hat switch) 75 + 0x81, 0x42, # .Input (Data,Var,Abs,Null) 77 + 0x66, 0x00, 0x00, # .Unit (None) 79 + 0x09, 0x01, # .Usage (Pointer) 82 + 0xa1, 0x00, # .Collection (Physical) 84 + 0x09, 0x30, # ..Usage (X) 86 + 0x09, 0x31, # ..Usage (Y) 88 + 0x09, 0x32, # ..Usage (Z) 90 + 0x09, 0x35, # ..Usage (Rz) 92 + 0x05, 0x02, # ..Usage Page (Simulation Controls) 94 + 0x09, 0xc5, # ..Usage (Brake) 96 + 0x09, 0xc4, # ..Usage (Accelerator) 98 + 0x15, 0x00, # ..Logical Minimum (0) 100 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 102 + 0x35, 0x00, # ..Physical Minimum (0) 105 + 0x46, 0xff, 0x00, # ..Physical Maximum (255) 107 + 0x75, 0x08, # ..Report Size (8) 110 + 0x95, 0x06, # ..Report Count (6) 112 + 0x81, 0x02, # ..Input (Data,Var,Abs) 114 + 0xc0, # .End Collection 116 + 0x85, 0x02, # .Report ID (2) 117 + 0x05, 0x08, # .Usage Page (LEDs) 119 + 0x0a, 0x01, 0x00, # .Usage (Num Lock) 121 + 0x0a, 0x02, 0x00, # .Usage (Caps Lock) 124 + 0x0a, 0x03, 0x00, # .Usage (Scroll Lock) 127 + 0x0a, 0x04, 0x00, # .Usage (Compose) 130 + 0x15, 0x00, # .Logical Minimum (0) 133 + 0x25, 0x01, # .Logical Maximum (1) 135 + 0x75, 0x01, # .Report Size (1) 137 + 0x95, 0x04, # .Report Count (4) 139 + 0x91, 0x02, # .Output (Data,Var,Abs) 141 + 0x75, 0x04, # .Report Size (4) 143 + 0x95, 0x01, # .Report Count (1) 145 + 0x91, 0x03, # .Output (Cnst,Var,Abs) 147 + 0xc0, # End Collection 149 + 0x05, 0x0c, # Usage Page (Consumer Devices) 150 + 0x09, 0x01, # Usage (Consumer Control) 152 + 0xa1, 0x01, # Collection (Application) 154 + 0x85, 0x03, # .Report ID (3) 156 + 0x05, 0x01, # .Usage Page (Generic Desktop) 158 + 0x09, 0x06, # .Usage (Keyboard) 160 + 0xa1, 0x02, # .Collection (Logical) 162 + 0x05, 0x06, # ..Usage Page (Generic Device Controls) 164 + 0x09, 0x20, # ..Usage (Battery Strength) 166 + 0x15, 0x00, # ..Logical Minimum (0) 168 + 0x26, 0xff, 0x00, # ..Logical Maximum (255) 170 + 0x75, 0x08, # ..Report Size (8) 173 + 0x95, 0x01, # ..Report Count (1) 175 + 0x81, 0x02, # ..Input (Data,Var,Abs) 177 + 0x06, 0xbc, 0xff, # ..Usage Page (Vendor Usage Page 0xffbc) 179 + 0x0a, 0xad, 0xbd, # ..Usage (Vendor Usage 0xbdad) 182 + 0x75, 0x08, # ..Report Size (8) 185 + 0x95, 0x06, # ..Report Count (6) 187 + 0x81, 0x02, # ..Input (Data,Var,Abs) 189 + 0xc0, # .End Collection 191 + 0xc0, # End Collection 192 + ] + # fmt: on + + def __init__(self, rdesc=report_descriptor, name=None): + super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x18D1, 0x2C40)) + self.buttons = (1, 2, 4, 5, 7, 8, 14, 15, 13) + + class TestSaitekGamepad(BaseTest.TestGamepad): def create_device(self): return SaitekGamepad() -- cgit v1.2.3 From b22cbfb42c19a378cca5fae3a98395225af05384 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 10 Apr 2024 19:19:38 +0200 Subject: selftests/hid: add tests for the Raptor Mach 2 joystick The only interesting bit is the HAT switch, and we use a BPF program to fix it. So ensure this works correctly. Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-18-a8bf16033ef8@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/test_gamepad.py | 47 ++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/test_gamepad.py b/tools/testing/selftests/hid/tests/test_gamepad.py index bd30dadbeb7d..8d5b5ffdae49 100644 --- a/tools/testing/selftests/hid/tests/test_gamepad.py +++ b/tools/testing/selftests/hid/tests/test_gamepad.py @@ -10,7 +10,7 @@ from . import base import libevdev import pytest -from .base_gamepad import BaseGamepad, JoystickGamepad +from .base_gamepad import BaseGamepad, JoystickGamepad, AxisMapping from hidtools.util import BusType import logging @@ -609,6 +609,40 @@ class AsusGamepad(BaseGamepad): self.buttons = (1, 2, 4, 5, 7, 8, 14, 15, 13) +class RaptorMach2Joystick(JoystickGamepad): + axes_map = { + "left_stick": { + "x": AxisMapping("x"), + "y": AxisMapping("y"), + }, + "right_stick": { + "x": AxisMapping("z"), + "y": AxisMapping("Rz"), + }, + } + + def __init__( + self, + name, + rdesc=None, + application="Joystick", + input_info=(BusType.USB, 0x11C0, 0x5606), + ): + super().__init__(rdesc, application, name, input_info) + self.buttons = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) + self.hat_switch = 240 # null value is 240 as max is 239 + + def event( + self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None + ): + if hat_switch is not None: + hat_switch *= 30 + + return super().event( + left=left, right=right, hat_switch=hat_switch, buttons=buttons + ) + + class TestSaitekGamepad(BaseTest.TestGamepad): def create_device(self): return SaitekGamepad() @@ -617,3 +651,14 @@ class TestSaitekGamepad(BaseTest.TestGamepad): class TestAsusGamepad(BaseTest.TestGamepad): def create_device(self): return AsusGamepad() + + +class TestRaptorMach2Joystick(BaseTest.TestGamepad): + hid_bpfs = [("FR-TEC__Raptor-Mach-2.bpf.o", True)] + + def create_device(self): + return RaptorMach2Joystick( + "uhid test Sanmos Group FR-TEC Raptor MACH 2", + rdesc="05 01 09 04 a1 01 05 01 85 01 05 01 09 30 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 31 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 33 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 00 09 00 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 32 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 35 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 34 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 36 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 09 19 01 2a 1d 00 15 00 25 01 75 01 96 80 00 81 02 05 01 09 39 26 ef 00 46 68 01 65 14 75 10 95 01 81 42 05 01 09 00 75 08 95 1d 81 01 15 00 26 ef 00 85 58 26 ff 00 46 ff 00 75 08 95 3f 09 00 91 02 85 59 75 08 95 80 09 00 b1 02 c0", + input_info=(BusType.USB, 0x11C0, 0x5606), + ) -- cgit v1.2.3 From 89ea968a9d759f71ac7b8d50949a8e5e5bcb1111 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 6 May 2024 16:36:12 +0200 Subject: selftests/hid: skip tests with HID-BPF if udev-hid-bpf is not installed udev-hid-bpf is still not installed everywhere, and we should probably not assume it is installed automatically. Link: https://lore.kernel.org/r/20240506143612.148031-1-bentiss@kernel.org Reviewed-by: Peter Hutterer Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/tests/base.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/hid/tests/base.py b/tools/testing/selftests/hid/tests/base.py index 2d006c0f5fcd..3a465768e507 100644 --- a/tools/testing/selftests/hid/tests/base.py +++ b/tools/testing/selftests/hid/tests/base.py @@ -8,6 +8,7 @@ import libevdev import os import pytest +import shutil import subprocess import time @@ -240,6 +241,10 @@ class BaseTestCase: root_dir = (script_dir / "../../../../..").resolve() bpf_dir = root_dir / "drivers/hid/bpf/progs" + udev_hid_bpf = shutil.which("udev-hid-bpf") + if not udev_hid_bpf: + pytest.skip("udev-hid-bpf not found in $PATH, skipping") + wait = False for _, rdesc_fixup in self.hid_bpfs: if rdesc_fixup: -- cgit v1.2.3 From 651d61bc8b7d8bb622cfc24be2ee92eebb4ed3cc Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 11 Apr 2023 15:44:46 +0930 Subject: KVM: PPC: Fix documentation for ppc mmu caps The documentation mentions KVM_CAP_PPC_RADIX_MMU, but the defines in the kvm headers spell it KVM_CAP_PPC_MMU_RADIX. Similarly with KVM_CAP_PPC_MMU_HASH_V3. Fixes: c92701322711 ("KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU") Signed-off-by: Joel Stanley Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://msgid.link/20230411061446.26324-1-joel@jms.id.au --- Documentation/virt/kvm/api.rst | 8 ++++---- include/uapi/linux/kvm.h | 4 ++-- tools/include/uapi/linux/kvm.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0b5a33ee71ee..636e6794828f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4300,7 +4300,7 @@ operating system that uses the PIT for timing (e.g. Linux 2.4.x). 4.100 KVM_PPC_CONFIGURE_V3_MMU ------------------------------ -:Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +:Capability: KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 :Architectures: ppc :Type: vm ioctl :Parameters: struct kvm_ppc_mmuv3_cfg (in) @@ -4334,7 +4334,7 @@ the Power ISA V3.00, Book III section 5.7.6.1. 4.101 KVM_PPC_GET_RMMU_INFO --------------------------- -:Capability: KVM_CAP_PPC_RADIX_MMU +:Capability: KVM_CAP_PPC_MMU_RADIX :Architectures: ppc :Type: vm ioctl :Parameters: struct kvm_ppc_rmmu_info (out) @@ -8095,7 +8095,7 @@ capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. -8.3 KVM_CAP_PPC_RADIX_MMU +8.3 KVM_CAP_PPC_MMU_RADIX ------------------------- :Architectures: ppc @@ -8105,7 +8105,7 @@ available, means that the kernel can support guests using the radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor). -8.4 KVM_CAP_PPC_HASH_MMU_V3 +8.4 KVM_CAP_PPC_MMU_HASH_V3 --------------------------- :Architectures: ppc diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2190adbe3002..d03842abae57 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1221,9 +1221,9 @@ struct kvm_vfio_spapr_tce { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) -/* Available with KVM_CAP_PPC_RADIX_MMU */ +/* Available with KVM_CAP_PPC_MMU_RADIX */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* Available with KVM_CAP_PPC_GET_CPU_CHAR */ #define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c3308536482b..4b6b635b8bd1 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1502,7 +1502,7 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) -- cgit v1.2.3 From 69fb6eab1969d09187feff14f370e01032054f1f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 7 May 2024 00:04:06 -0300 Subject: perf annotate: Use zfree() to avoid possibly accessing dangling pointers When freeing a->b it is good practice to set a->b to NULL using zfree(&a->b) so that when we have a bug where a reference to a freed 'a' pointer is kept somewhere, we can more quickly cause a segfault if some code tries to use a->b. This is mostly done but some new cases were introduced recently, convert them to zfree(). Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZjmbHHrjIm5YRIBv@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 3 ++- tools/perf/util/annotate-data.c | 17 +++++++++-------- tools/perf/util/annotate.c | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index a4a0f042f201..8d6bf08d371d 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include "ui/browser.h" @@ -130,7 +131,7 @@ static void annotated_data_browser__delete_entries(struct annotated_data_browser list_for_each_entry_safe(pos, tmp, &browser->entries, node) { list_del_init(&pos->node); - free(pos->hists); + zfree(&pos->hists); free(pos); } } diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index faefa444af1e..57e7d4b3550b 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "annotate.h" #include "annotate-data.h" @@ -311,8 +312,8 @@ static void delete_members(struct annotated_member *member) list_for_each_entry_safe(child, tmp, &member->children, node) { list_del(&child->node); delete_members(child); - free(child->type_name); - free(child->var_name); + zfree(&child->type_name); + zfree(&child->var_name); free(child); } } @@ -582,7 +583,7 @@ void global_var_type__tree_delete(struct rb_root *root) rb_erase(node, root); gvar = rb_entry(node, struct global_var_entry, node); - free(gvar->name); + zfree(&gvar->name); free(gvar); } } @@ -1817,16 +1818,16 @@ static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_en err: while (--i >= 0) - free(adt->histograms[i]); - free(adt->histograms); + zfree(&(adt->histograms[i])); + zfree(&adt->histograms); return -ENOMEM; } static void delete_data_type_histograms(struct annotated_data_type *adt) { for (int i = 0; i < adt->nr_histograms; i++) - free(adt->histograms[i]); - free(adt->histograms); + zfree(&(adt->histograms[i])); + zfree(&adt->histograms); } void annotated_data_type__tree_delete(struct rb_root *root) @@ -1840,7 +1841,7 @@ void annotated_data_type__tree_delete(struct rb_root *root) pos = rb_entry(node, struct annotated_data_type, node); delete_members(&pos->self); delete_data_type_histograms(pos); - free(pos->self.type_name); + zfree(&pos->self.type_name); free(pos); } } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index d7d55263fc91..2b178835c1f3 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2618,13 +2618,13 @@ static void delete_basic_blocks(struct basic_block_data *bb_data) list_for_each_entry_safe(link, tmp, &bb_data->queue, node) { list_del(&link->node); - free(link->bb); + zfree(&link->bb); free(link); } list_for_each_entry_safe(link, tmp, &bb_data->visited, node) { list_del(&link->node); - free(link->bb); + zfree(&link->bb); free(link); } } -- cgit v1.2.3 From 54ef362e4daa4a4ecfa2abdc251b21564d27784e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 7 May 2024 00:04:06 -0300 Subject: perf callchain: Use zfree() to avoid possibly accessing dangling pointers When freeing a->b it is good practice to set a->b to NULL using zfree(&a->b) so that when we have a bug where a reference to a freed 'a' pointer is kept somewhere, we can more quickly cause a segfault if some code tries to use a->b. Convert one such case in the callchain code. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZjmcGobQ8E52EyjJ@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 68feed871809..1730b852a947 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -606,7 +606,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) call->brtype_stat = zalloc(sizeof(*call->brtype_stat)); if (!call->brtype_stat) { perror("not enough memory for the code path branch statistics"); - free(call->brtype_stat); + zfree(&call->brtype_stat); return -ENOMEM; } } -- cgit v1.2.3 From 07fde75306667f60b5cee6f10c4115efbc719b96 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 7 May 2024 00:04:06 -0300 Subject: perf kwork: Use zfree() to avoid possibly accessing dangling pointers When freeing a->b it is good practice to set a->b to NULL using zfree(&a->b) so that when we have a bug where a reference to a freed 'a' pointer is kept somewhere, we can more quickly cause a segfault if some code tries to use a->b. Convert one such case in the 'perf kwork' codebase. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Yang Jihong Link: https://lore.kernel.org/lkml/Zjmc5EiN6zmWZj4r@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kwork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 0092b9b39611..56e3f3a5e03a 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -2230,7 +2230,7 @@ static int perf_kwork__top(struct perf_kwork *kwork) perf_kwork__top_report(kwork); out: - free(kwork->top_stat.cpus_runtime); + zfree(&kwork->top_stat.cpus_runtime); return ret; } -- cgit v1.2.3 From 36e8aa90fd6c577f783d5d8b02fbc205bb8e7f86 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 6 May 2024 17:49:00 +0530 Subject: perf annotate: Fix a comment about multi_regs in extract_reg_offset function Fix a comment in function which explains how multi_regs field gets set for an instruction. In the example, "mov %rsi, 8(%rbx,%rcx,4)", the comment mistakenly referred to "dst_multi_regs = 0". Correct it to use "src_multi_regs = 0" Signed-off-by: Athira Rajeev Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/r/20240506121906.76639-4-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 2b178835c1f3..0ffae315ca08 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2092,7 +2092,7 @@ static int extract_reg_offset(struct arch *arch, const char *str, * mov 0x18, %r8 # src_reg1 = -1, src_mem = 0 * # dst_reg1 = r8, dst_mem = 0 * - * mov %rsi, 8(%rbx,%rcx,4) # src_reg1 = rsi, src_mem = 0, dst_multi_regs = 0 + * mov %rsi, 8(%rbx,%rcx,4) # src_reg1 = rsi, src_mem = 0, src_multi_regs = 0 * # dst_reg1 = rbx, dst_reg2 = rcx, dst_mem = 1 * # dst_multi_regs = 1, dst_offset = 8 */ -- cgit v1.2.3 From 0d2e3f251149b758458586df61578827eef7dc8d Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Apr 2024 16:21:46 +0100 Subject: perf cs-etm: Print error for new PERF_RECORD_AUX_OUTPUT_HW_ID versions The likely fix for this is to update perf so print a helpful message. Signed-off-by: James Clark Tested-by: Ganapatrao Kulkarni Acked-by: Anshuman Khandual Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steve Clevenger Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240429152207.479221-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index d65d7485886c..32818bd7cd17 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -335,8 +335,11 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, trace_chan_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); /* check that we can handle this version */ - if (version > CS_AUX_HW_ID_CURR_VERSION) + if (version > CS_AUX_HW_ID_CURR_VERSION) { + pr_err("CS ETM Trace: PERF_RECORD_AUX_OUTPUT_HW_ID version %d not supported. Please update Perf.\n", + version); return -EINVAL; + } /* get access to the etm metadata */ etm = container_of(session->auxtrace, struct cs_etm_auxtrace, auxtrace); -- cgit v1.2.3 From ee73fe99f77b066afc5035727130a6b8016e64b6 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Apr 2024 16:21:47 +0100 Subject: perf auxtrace: Allow number of queues to be specified Currently it's only possible to initialize with the default number of queues and then use auxtrace_queues__add_event() to grow the array. But that's problematic if you don't have a real event to pass into that function yet. The queues hold a void *priv member to store custom state, and for Coresight we want to create decoders upfront before receiving data, so add a new function that allows pre-allocating queues. One reason to do this is because we might need to store metadata (HW_ID events) that effects other queues, but never actually receive auxtrace data on that queue. Reviewed-by: Anshuman Khandual Signed-off-by: James Clark Tested-by: Ganapatrao Kulkarni Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steve Clevenger Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240429152207.479221-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 9 +++++++-- tools/perf/util/auxtrace.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 3fd350de0051..e2f317063eec 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -218,15 +218,20 @@ static struct auxtrace_queue *auxtrace_alloc_queue_array(unsigned int nr_queues) return queue_array; } -int auxtrace_queues__init(struct auxtrace_queues *queues) +int auxtrace_queues__init_nr(struct auxtrace_queues *queues, int nr_queues) { - queues->nr_queues = AUXTRACE_INIT_NR_QUEUES; + queues->nr_queues = nr_queues; queues->queue_array = auxtrace_alloc_queue_array(queues->nr_queues); if (!queues->queue_array) return -ENOMEM; return 0; } +int auxtrace_queues__init(struct auxtrace_queues *queues) +{ + return auxtrace_queues__init_nr(queues, AUXTRACE_INIT_NR_QUEUES); +} + static int auxtrace_queues__grow(struct auxtrace_queues *queues, unsigned int new_nr_queues) { diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 55702215a82d..8a6ec9565835 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -521,6 +521,7 @@ int auxtrace_mmap__read_snapshot(struct mmap *map, struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size); +int auxtrace_queues__init_nr(struct auxtrace_queues *queues, int nr_queues); int auxtrace_queues__init(struct auxtrace_queues *queues); int auxtrace_queues__add_event(struct auxtrace_queues *queues, struct perf_session *session, -- cgit v1.2.3 From b78854e5c008b01265d78c181332a3bb66c0abdf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 7 May 2024 00:04:06 -0300 Subject: perf probe: Use zfree() to avoid possibly accessing dangling pointers When freeing a->b it is good practice to set a->b to NULL using zfree(&a->b) so that when we have a bug where a reference to a freed 'a' pointer is kept somewhere, we can more quickly cause a segfault if some code tries to use a->b. Convert one such case in the 'perf probe' codebase. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZjpBnkL2wO3QJa5W@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 019fef8da6a8..003a3bcebfdf 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -325,7 +325,7 @@ static void cleanup_params(void) for (i = 0; i < params->nevents; i++) clear_perf_probe_event(params->events + i); line_range__clear(¶ms->line_range); - free(params->target); + zfree(¶ms->target); strfilter__delete(params->filter); nsinfo__put(params->nsi); zfree(¶ms); -- cgit v1.2.3 From d9180e23fbfa3875424d3a6b28b71b072862a52a Mon Sep 17 00:00:00 2001 From: He Zhe Date: Tue, 7 May 2024 14:50:26 +0800 Subject: perf bench internals inject-build-id: Fix trap divide when collecting just one DSO 'perf bench internals inject-build-id' suffers from the following error when only one DSO is collected. # perf bench internals inject-build-id -v Collected 1 DSOs traps: internals-injec[2305] trap divide error ip:557566ba6394 sp:7ffd4de97fe0 error:0 in perf[557566b2a000+23d000] Build-id injection benchmark Iteration #1 Floating point exception This patch removes the unnecessary minus one from the divisor which also corrects the randomization range. Signed-off-by: He Zhe Fixes: 0bf02a0d80427f26 ("perf bench: Add build-id injection benchmark") Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240507065026.2652929-1-zhe.he@windriver.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/inject-buildid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c index 49331743c743..a759eb2328be 100644 --- a/tools/perf/bench/inject-buildid.c +++ b/tools/perf/bench/inject-buildid.c @@ -362,7 +362,7 @@ static int inject_build_id(struct bench_data *data, u64 *max_rss) return -1; for (i = 0; i < nr_mmaps; i++) { - int idx = rand() % (nr_dsos - 1); + int idx = rand() % nr_dsos; struct bench_dso *dso = &dsos[idx]; u64 timestamp = rand() % 1000000; -- cgit v1.2.3 From 4bf6a4ebc59201bcd12c932f25edda4c3e36e5df Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 2 May 2024 10:52:58 +0200 Subject: selftests: mm: cow: flag vmsplice() hugetlb tests as XFAIL Patch series "selftests: mm: cow: flag vmsplice() hugetlb tests as XFAIL". The failing hugetlb vmsplice() COW tests keep confusing people, and having tests that have been failing for years and likely will keep failing for years to come because nobody cares enough is rather suboptimal. Let's mark them as XFAIL and document why fixing them is not that easy as it would appear at first sight. More details can be found in [1], especially around how hugetlb pages cannot really be overcommitted, and why we don't particularly care about these vmsplice() leaks for hugetlb -- in contrast to ordinary memory. [1] https://lore.kernel.org/all/8b42a24d-caf0-46ef-9e15-0f88d47d2f21@redhat.com/ This patch (of 2): The vmsplice() hugetlb tests have been failing right from the start, and we documented that in the introducing commit 7dad331be781 ("selftests/vm: anon_cow: hugetlb tests"): Note that some tests cases still fail. This will, for example, be fixed once vmsplice properly uses FOLL_PIN instead of FOLL_GET for pinning. With 2 MiB and 1 GiB hugetlb on x86_64, the expected failures are: Until vmsplice() is changed, these tests will likely keep failing: hugetlb COW reuse logic is harder to change, because using the same COW reuse logic as we use for !hugetlb could harm other (sane) users when running out of free hugetlb pages. More details can be found in [1], especially around how hugetlb pages cannot really be overcommitted, and why we don't particularly care about these vmsplice() leaks for hugetlb -- in contrast to ordinary memory. These (expected) failures keep confusing people, so flag them accordingly. Before: $ ./cow [...] Bail out! 8 out of 778 tests failed # Totals: pass:769 fail:8 xfail:0 xpass:0 skip:1 error:0 $ echo $? 1 After: $ ./cow [...] # Totals: pass:769 fail:0 xfail:8 xpass:0 skip:1 error:0 $ echo $? 0 [1] https://lore.kernel.org/all/8b42a24d-caf0-46ef-9e15-0f88d47d2f21@redhat.com/ Link: https://lkml.kernel.org/r/20240502085259.103784-1-david@redhat.com Link: https://lkml.kernel.org/r/20240502085259.103784-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Muchun Song Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 106 ++++++++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 363bf5f801be..0549672acbd6 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -199,7 +199,7 @@ static int child_vmsplice_memcmp_fn(char *mem, size_t size, typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, - child_fn fn) + child_fn fn, bool xfail) { struct comm_pipes comm_pipes; char buf; @@ -247,33 +247,47 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, else ret = -EINVAL; - ksft_test_result(!ret, "No leak from parent into child\n"); + if (!ret) { + ksft_test_result_pass("No leak from parent into child\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from parent into child\n"); + } else { + ksft_test_result_fail("Leak from parent into child\n"); + } close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_cow_in_parent(char *mem, size_t size) +static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); } -static void test_cow_in_parent_mprotect(char *mem, size_t size) +static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); } -static void test_vmsplice_in_child(char *mem, size_t size) +static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn, + is_hugetlb); } -static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +static void test_vmsplice_in_child_mprotect(char *mem, size_t size, + bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn, + is_hugetlb); } static void do_test_vmsplice_in_parent(char *mem, size_t size, - bool before_fork) + bool before_fork, bool xfail) { struct iovec iov = { .iov_base = mem, @@ -355,8 +369,18 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size, } } - ksft_test_result(!memcmp(old, new, transferred), - "No leak from child into parent\n"); + if (!memcmp(old, new, transferred)) { + ksft_test_result_pass("No leak from child into parent\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from child into parent\n"); + } else { + ksft_test_result_fail("Leak from child into parent\n"); + } close_pipe: close(fds[0]); close(fds[1]); @@ -367,14 +391,14 @@ free: free(new); } -static void test_vmsplice_before_fork(char *mem, size_t size) +static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, true); + do_test_vmsplice_in_parent(mem, size, true, is_hugetlb); } -static void test_vmsplice_after_fork(char *mem, size_t size) +static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, false); + do_test_vmsplice_in_parent(mem, size, false, is_hugetlb); } #ifdef LOCAL_CONFIG_HAVE_LIBURING @@ -529,12 +553,12 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_iouring_ro(char *mem, size_t size) +static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, false); } -static void test_iouring_fork(char *mem, size_t size) +static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, true); } @@ -678,37 +702,41 @@ free_tmp: free(tmp); } -static void test_ro_pin_on_shared(char *mem, size_t size) +static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); } -static void test_ro_fast_pin_on_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); } -static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); } -static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); } -static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); } -static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); } -typedef void (*test_fn)(char *mem, size_t size); +typedef void (*test_fn)(char *mem, size_t size, bool hugetlb); static void do_run_with_base_page(test_fn fn, bool swapout) { @@ -740,7 +768,7 @@ static void do_run_with_base_page(test_fn fn, bool swapout) } } - fn(mem, pagesize); + fn(mem, pagesize, false); munmap: munmap(mem, pagesize); } @@ -904,7 +932,7 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) break; } - fn(mem, size); + fn(mem, size, false); munmap: munmap(mmap_mem, mmap_size); if (mremap_mem != MAP_FAILED) @@ -997,7 +1025,7 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) } munmap(dummy, hugetlbsize); - fn(mem, hugetlbsize); + fn(mem, hugetlbsize, true); munmap: munmap(mem, hugetlbsize); } @@ -1036,7 +1064,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child", - test_vmsplice_in_child + test_vmsplice_in_child, }, /* * vmsplice() test, but do an additional mprotect(PROT_READ)+ @@ -1044,7 +1072,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child with mprotect() optimization", - test_vmsplice_in_child_mprotect + test_vmsplice_in_child_mprotect, }, /* * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after @@ -1322,23 +1350,31 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_anon_thp_collapse_unshared(char *mem, size_t size) +static void test_anon_thp_collapse_unshared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); } -static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); } -static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); } -static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); } -- cgit v1.2.3 From a4c43b8a09805a7b9b39344c1ba304a5641aca77 Mon Sep 17 00:00:00 2001 From: Saurav Shah Date: Thu, 2 May 2024 04:43:17 +0530 Subject: selftests/memfd: fix spelling mistakes Fix spelling mistakes in the comments. Link: https://lkml.kernel.org/r/20240501231317.24648-1-sauravshah.31@gmail.com Signed-off-by: Saurav Shah Cc: Aleksa Sarai Cc: Greg Thelen Cc: Jeff Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/fuse_test.c | 2 +- tools/testing/selftests/memfd/memfd_test.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 93798c8c5d54..dbc171a3806d 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -306,7 +306,7 @@ int main(int argc, char **argv) * then the kernel did a page-replacement or canceled the read() (or * whatever magic it did..). In that case, the memfd object is still * all zero. - * In case the memfd-object was *not* sealed, the read() was successfull + * In case the memfd-object was *not* sealed, the read() was successful * and the memfd object must *not* be all zero. * Note that in real scenarios, there might be a mixture of both, but * in this test-cases, we have explicit 200ms delays which should be diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 18f585684e20..95af2d78fd31 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1528,7 +1528,7 @@ static void test_share_open(char *banner, char *b_suffix) /* * Test sharing via fork() - * Test whether seal-modifications work as expected with forked childs. + * Test whether seal-modifications work as expected with forked children. */ static void test_share_fork(char *banner, char *b_suffix) { -- cgit v1.2.3 From 67f4c91a449a6cc936679e43a916d79ee134464a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 30 Apr 2024 15:15:08 +0200 Subject: selftests: mm: gup_longterm: test unsharing logic when R/O pinning In our FOLL_LONGTERM tests, we prefault the page tables for the GUP-fast test cases to be able to find a PTE and exercise the "longterm pinning allowed" logic on the GUP-fast path where possible. For now, we always prefault the page tables writable, resulting in PTEs that are writable. Let's cover more cases to also test if our unsharing logic works as expected (and is able to make progress when there is nothing to unshare) by mprotect'ing the range R/O when R/O-pinning, so we don't get PTEs that are writable. This change would have found an issue introduced by commit a12083d721d7 ("mm/gup: handle hugepd for follow_page()"), whereby R/O pinning was not able to make progress in all cases, because unsharing logic was not provided with the VMA to decide at some point that long-term R/O pinning a !anon page is fine. Link: https://lkml.kernel.org/r/20240430131508.86924-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_longterm.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index ad168d35b23b..6c6d99c1dc76 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -118,15 +118,22 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) return; } - /* - * Fault in the page writable such that GUP-fast can eventually pin - * it immediately. - */ + /* Fault in the page such that GUP-fast can pin it directly. */ memset(mem, 0, size); switch (type) { case TEST_TYPE_RO: case TEST_TYPE_RO_FAST: + /* + * Cover more cases regarding unsharing decisions when + * long-term R/O pinning by mapping the page R/O. + */ + ret = mprotect(mem, size, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + /* FALLTHROUGH */ case TEST_TYPE_RW: case TEST_TYPE_RW_FAST: { struct pin_longterm_test args; @@ -228,6 +235,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) assert(false); } +munmap: munmap(mem, size); } -- cgit v1.2.3 From 769e6a1e15bdbbaf2b0d2f37c24f2c53268bd21f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:38 -0700 Subject: perf ui browser: Don't save pointer to stack memory ui_browser__show() is capturing the input title that is stack allocated memory in hist_browser__run(). Avoid a use after return by strdup-ing the string. Committer notes: Further explanation from Ian Rogers: My command line using tui is: $ sudo bash -c 'rm /tmp/asan.log*; export ASAN_OPTIONS="log_path=/tmp/asan.log"; /tmp/perf/perf mem record -a sleep 1; /tmp/perf/perf mem report' I then go to the perf annotate view and quit. This triggers the asan error (from the log file): ``` ==1254591==ERROR: AddressSanitizer: stack-use-after-return on address 0x7f2813331920 at pc 0x7f28180 65991 bp 0x7fff0a21c750 sp 0x7fff0a21bf10 READ of size 80 at 0x7f2813331920 thread T0 #0 0x7f2818065990 in __interceptor_strlen ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:461 #1 0x7f2817698251 in SLsmg_write_wrapped_string (/lib/x86_64-linux-gnu/libslang.so.2+0x98251) #2 0x7f28176984b9 in SLsmg_write_nstring (/lib/x86_64-linux-gnu/libslang.so.2+0x984b9) #3 0x55c94045b365 in ui_browser__write_nstring ui/browser.c:60 #4 0x55c94045c558 in __ui_browser__show_title ui/browser.c:266 #5 0x55c94045c776 in ui_browser__show ui/browser.c:288 #6 0x55c94045c06d in ui_browser__handle_resize ui/browser.c:206 #7 0x55c94047979b in do_annotate ui/browsers/hists.c:2458 #8 0x55c94047fb17 in evsel__hists_browse ui/browsers/hists.c:3412 #9 0x55c940480a0c in perf_evsel_menu__run ui/browsers/hists.c:3527 #10 0x55c940481108 in __evlist__tui_browse_hists ui/browsers/hists.c:3613 #11 0x55c9404813f7 in evlist__tui_browse_hists ui/browsers/hists.c:3661 #12 0x55c93ffa253f in report__browse_hists tools/perf/builtin-report.c:671 #13 0x55c93ffa58ca in __cmd_report tools/perf/builtin-report.c:1141 #14 0x55c93ffaf159 in cmd_report tools/perf/builtin-report.c:1805 #15 0x55c94000c05c in report_events tools/perf/builtin-mem.c:374 #16 0x55c94000d96d in cmd_mem tools/perf/builtin-mem.c:516 #17 0x55c9400e44ee in run_builtin tools/perf/perf.c:350 #18 0x55c9400e4a5a in handle_internal_command tools/perf/perf.c:403 #19 0x55c9400e4e22 in run_argv tools/perf/perf.c:447 #20 0x55c9400e53ad in main tools/perf/perf.c:561 #21 0x7f28170456c9 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 #22 0x7f2817045784 in __libc_start_main_impl ../csu/libc-start.c:360 #23 0x55c93ff544c0 in _start (/tmp/perf/perf+0x19a4c0) (BuildId: 84899b0e8c7d3a3eaa67b2eb35e3d8b2f8cd4c93) Address 0x7f2813331920 is located in stack of thread T0 at offset 32 in frame #0 0x55c94046e85e in hist_browser__run ui/browsers/hists.c:746 This frame has 1 object(s): [32, 192) 'title' (line 747) <== Memory access at offset 32 is inside this variable HINT: this may be a false positive if your program uses some custom stack unwind mechanism, swapcontext or vfork ``` hist_browser__run isn't on the stack so the asan error looks legit. There's no clean init/exit on struct ui_browser so I may be trading a use-after-return for a memory leak, but that seems look a good trade anyway. Fixes: 05e8b0804ec4 ("perf ui browser: Stop using 'self'") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: K Prateek Nayak Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browser.c | 4 +++- tools/perf/ui/browser.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index 603d11283cbd..c4cdf2ea69b7 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -287,7 +287,8 @@ int ui_browser__show(struct ui_browser *browser, const char *title, mutex_lock(&ui__lock); __ui_browser__show_title(browser, title); - browser->title = title; + free(browser->title); + browser->title = strdup(title); zfree(&browser->helpline); va_start(ap, helpline); @@ -304,6 +305,7 @@ void ui_browser__hide(struct ui_browser *browser) mutex_lock(&ui__lock); ui_helpline__pop(); zfree(&browser->helpline); + zfree(&browser->title); mutex_unlock(&ui__lock); } diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h index 510ce4554050..6e98d5f8f71c 100644 --- a/tools/perf/ui/browser.h +++ b/tools/perf/ui/browser.h @@ -21,7 +21,7 @@ struct ui_browser { u8 extra_title_lines; int current_color; void *priv; - const char *title; + char *title; char *helpline; const char *no_samples_msg; void (*refresh_dimensions)(struct ui_browser *browser); -- cgit v1.2.3 From a3f7768bcf48281df14d98715f076c5656571527 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:39 -0700 Subject: perf annotate: Fix memory leak in annotated_source Freeing hash map doesn't free the entries added to the hashmap, add the missing free(). Fixes: d3e7cad6f36d9e80 ("perf annotate: Add a hashmap for symbol histogram") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: K Prateek Nayak Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0ffae315ca08..541988cf6e19 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -107,9 +107,15 @@ static struct annotated_source *annotated_source__new(void) static __maybe_unused void annotated_source__delete(struct annotated_source *src) { + struct hashmap_entry *cur; + size_t bkt; + if (src == NULL) return; + hashmap__for_each_entry(src->samples, cur, bkt) + zfree(&cur->pvalue); + hashmap__free(src->samples); zfree(&src->histograms); free(src); -- cgit v1.2.3 From 557b32c343925dfac480ef331494437ea223932d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:40 -0700 Subject: perf block-info: Remove unused refcount block_info__get() has no callers so the refcount is only ever one. As such remove the reference counting logic and turn puts to deletes. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/block-info.c | 22 +++++----------------- tools/perf/util/block-info.h | 15 +-------------- tools/perf/util/hist.c | 4 ++-- 3 files changed, 8 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 895ee8adf3b3..04068d48683f 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -43,26 +43,14 @@ static struct block_header_column { } }; -struct block_info *block_info__get(struct block_info *bi) -{ - if (bi) - refcount_inc(&bi->refcnt); - return bi; -} - -void block_info__put(struct block_info *bi) +struct block_info *block_info__new(void) { - if (bi && refcount_dec_and_test(&bi->refcnt)) - free(bi); + return zalloc(sizeof(struct block_info)); } -struct block_info *block_info__new(void) +void block_info__delete(struct block_info *bi) { - struct block_info *bi = zalloc(sizeof(*bi)); - - if (bi) - refcount_set(&bi->refcnt, 1); - return bi; + free(bi); } int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right) @@ -148,7 +136,7 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, he_block = hists__add_entry_block(&bh->block_hists, &al, bi); if (!he_block) { - block_info__put(bi); + block_info__delete(bi); return -1; } } diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h index 96f53e89795e..0b9e1aad4c55 100644 --- a/tools/perf/util/block-info.h +++ b/tools/perf/util/block-info.h @@ -3,7 +3,6 @@ #define __PERF_BLOCK_H #include -#include #include "hist.h" #include "symbol.h" #include "sort.h" @@ -19,7 +18,6 @@ struct block_info { u64 total_cycles; int num; int num_aggr; - refcount_t refcnt; }; struct block_fmt { @@ -48,19 +46,8 @@ struct block_report { int nr_fmts; }; -struct block_hist; - struct block_info *block_info__new(void); -struct block_info *block_info__get(struct block_info *bi); -void block_info__put(struct block_info *bi); - -static inline void __block_info__zput(struct block_info **bi) -{ - block_info__put(*bi); - *bi = NULL; -} - -#define block_info__zput(bi) __block_info__zput(&bi) +void block_info__delete(struct block_info *bi); int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 55ea6afcc437..b8a508cd0b14 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -631,7 +631,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, */ mem_info__zput(entry->mem_info); - block_info__zput(entry->block_info); + block_info__delete(entry->block_info); kvm_info__zput(entry->kvm_info); @@ -1341,7 +1341,7 @@ void hist_entry__delete(struct hist_entry *he) } if (he->block_info) - block_info__zput(he->block_info); + block_info__delete(he->block_info); if (he->kvm_info) kvm_info__zput(he->kvm_info); -- cgit v1.2.3 From a8cd4766d9128b897af6d4e0d22604f3bdaf2f82 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:41 -0700 Subject: perf cpumap: Remove refcnt from 'struct cpu_aggr_map' It is assigned a value of 1 and never incremented. Remove and replace puts with delete. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 16 +++------------- tools/perf/util/cpumap.c | 2 -- tools/perf/util/cpumap.h | 2 -- 3 files changed, 3 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 65a3dd7ffac3..35f79b48e8dc 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1631,23 +1631,13 @@ static int perf_stat_init_aggr_mode(void) static void cpu_aggr_map__delete(struct cpu_aggr_map *map) { - if (map) { - WARN_ONCE(refcount_read(&map->refcnt) != 0, - "cpu_aggr_map refcnt unbalanced\n"); - free(map); - } -} - -static void cpu_aggr_map__put(struct cpu_aggr_map *map) -{ - if (map && refcount_dec_and_test(&map->refcnt)) - cpu_aggr_map__delete(map); + free(map); } static void perf_stat__exit_aggr_mode(void) { - cpu_aggr_map__put(stat_config.aggr_map); - cpu_aggr_map__put(stat_config.cpus_aggr_map); + cpu_aggr_map__delete(stat_config.aggr_map); + cpu_aggr_map__delete(stat_config.cpus_aggr_map); stat_config.aggr_map = NULL; stat_config.cpus_aggr_map = NULL; } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 6a270d640acb..27094211edd8 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -180,8 +180,6 @@ struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr) cpus->nr = nr; for (i = 0; i < nr; i++) cpus->map[i] = aggr_cpu_id__empty(); - - refcount_set(&cpus->refcnt, 1); } return cpus; diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 26cf76c693f5..ee0f6139b04a 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -5,7 +5,6 @@ #include #include #include -#include /** Identify where counts are aggregated, -1 implies not to aggregate. */ struct aggr_cpu_id { @@ -37,7 +36,6 @@ struct aggr_cpu_id { /** A collection of aggr_cpu_id values, the "built" version is sorted and uniqued. */ struct cpu_aggr_map { - refcount_t refcnt; /** Number of valid entries. */ int nr; /** The entries. */ -- cgit v1.2.3 From 13ca628716c6f2c3c1c477b007582ad9cfb8eba5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:42 -0700 Subject: perf comm: Add reference count checking to 'struct comm_str' Reference count checking of an rbtree is troublesome as each pointer should have a reference, switch to using a sorted array. Remove an indirection by embedding the reference count with the string. Use pthread_once to safely initialize the comm_strs and reader writer mutex. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/comm.c | 196 +++++++++++++++++++++++++++++++------------------ 1 file changed, 126 insertions(+), 70 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c index afb8d4fd2644..1aa9a08e5b03 100644 --- a/tools/perf/util/comm.c +++ b/tools/perf/util/comm.c @@ -1,108 +1,164 @@ // SPDX-License-Identifier: GPL-2.0 #include "comm.h" #include -#include -#include #include +#include #include -#include #include #include "rwsem.h" -struct comm_str { - char *str; - struct rb_node rb_node; +DECLARE_RC_STRUCT(comm_str) { refcount_t refcnt; + char str[]; }; -/* Should perhaps be moved to struct machine */ -static struct rb_root comm_str_root; -static struct rw_semaphore comm_str_lock = {.lock = PTHREAD_RWLOCK_INITIALIZER,}; +static struct comm_strs { + struct rw_semaphore lock; + struct comm_str **strs; + int num_strs; + int capacity; +} _comm_strs; + +static void comm_strs__init(void) +{ + init_rwsem(&_comm_strs.lock); + _comm_strs.capacity = 16; + _comm_strs.num_strs = 0; + _comm_strs.strs = calloc(16, sizeof(*_comm_strs.strs)); +} + +static struct comm_strs *comm_strs__get(void) +{ + static pthread_once_t comm_strs_type_once = PTHREAD_ONCE_INIT; + + pthread_once(&comm_strs_type_once, comm_strs__init); + + return &_comm_strs; +} + +static refcount_t *comm_str__refcnt(struct comm_str *cs) +{ + return &RC_CHK_ACCESS(cs)->refcnt; +} + +static const char *comm_str__str(const struct comm_str *cs) +{ + return &RC_CHK_ACCESS(cs)->str[0]; +} static struct comm_str *comm_str__get(struct comm_str *cs) { - if (cs && refcount_inc_not_zero(&cs->refcnt)) - return cs; + struct comm_str *result; + + if (RC_CHK_GET(result, cs)) + refcount_inc_not_zero(comm_str__refcnt(cs)); - return NULL; + return result; } static void comm_str__put(struct comm_str *cs) { - if (cs && refcount_dec_and_test(&cs->refcnt)) { - down_write(&comm_str_lock); - rb_erase(&cs->rb_node, &comm_str_root); - up_write(&comm_str_lock); - zfree(&cs->str); - free(cs); + if (cs && refcount_dec_and_test(comm_str__refcnt(cs))) { + struct comm_strs *comm_strs = comm_strs__get(); + int i; + + down_write(&comm_strs->lock); + for (i = 0; i < comm_strs->num_strs; i++) { + if (comm_strs->strs[i] == cs) + break; + } + for (; i < comm_strs->num_strs - 1; i++) + comm_strs->strs[i] = comm_strs->strs[i + 1]; + + comm_strs->num_strs--; + up_write(&comm_strs->lock); + RC_CHK_FREE(cs); + } else { + RC_CHK_PUT(cs); } } -static struct comm_str *comm_str__alloc(const char *str) +static struct comm_str *comm_str__new(const char *str) { - struct comm_str *cs; + struct comm_str *result = NULL; + RC_STRUCT(comm_str) *cs; - cs = zalloc(sizeof(*cs)); - if (!cs) - return NULL; - - cs->str = strdup(str); - if (!cs->str) { - free(cs); - return NULL; + cs = malloc(sizeof(*cs) + strlen(str) + 1); + if (ADD_RC_CHK(result, cs)) { + refcount_set(comm_str__refcnt(result), 1); + strcpy(&cs->str[0], str); } + return result; +} - refcount_set(&cs->refcnt, 1); +static int comm_str__cmp(const void *_lhs, const void *_rhs) +{ + const struct comm_str *lhs = *(const struct comm_str * const *)_lhs; + const struct comm_str *rhs = *(const struct comm_str * const *)_rhs; - return cs; + return strcmp(comm_str__str(lhs), comm_str__str(rhs)); } -static -struct comm_str *__comm_str__findnew(const char *str, struct rb_root *root) +static int comm_str__search(const void *_key, const void *_member) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct comm_str *iter, *new; - int cmp; - - while (*p != NULL) { - parent = *p; - iter = rb_entry(parent, struct comm_str, rb_node); - - /* - * If we race with comm_str__put, iter->refcnt is 0 - * and it will be removed within comm_str__put call - * shortly, ignore it in this search. - */ - cmp = strcmp(str, iter->str); - if (!cmp && comm_str__get(iter)) - return iter; - - if (cmp < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } + const char *key = _key; + const struct comm_str *member = *(const struct comm_str * const *)_member; - new = comm_str__alloc(str); - if (!new) - return NULL; + return strcmp(key, comm_str__str(member)); +} + +static struct comm_str *__comm_strs__find(struct comm_strs *comm_strs, const char *str) +{ + struct comm_str **result; - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, root); + result = bsearch(str, comm_strs->strs, comm_strs->num_strs, sizeof(struct comm_str *), + comm_str__search); - return new; + if (!result) + return NULL; + + return comm_str__get(*result); } -static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root) +static struct comm_str *comm_strs__findnew(const char *str) { - struct comm_str *cs; + struct comm_strs *comm_strs = comm_strs__get(); + struct comm_str *result; - down_write(&comm_str_lock); - cs = __comm_str__findnew(str, root); - up_write(&comm_str_lock); + if (!comm_strs) + return NULL; - return cs; + down_read(&comm_strs->lock); + result = __comm_strs__find(comm_strs, str); + up_read(&comm_strs->lock); + if (result) + return result; + + down_write(&comm_strs->lock); + result = __comm_strs__find(comm_strs, str); + if (!result) { + if (comm_strs->num_strs == comm_strs->capacity) { + struct comm_str **tmp; + + tmp = reallocarray(comm_strs->strs, + comm_strs->capacity + 16, + sizeof(*comm_strs->strs)); + if (!tmp) { + up_write(&comm_strs->lock); + return NULL; + } + comm_strs->strs = tmp; + comm_strs->capacity += 16; + } + result = comm_str__new(str); + if (result) { + comm_strs->strs[comm_strs->num_strs++] = result; + qsort(comm_strs->strs, comm_strs->num_strs, sizeof(struct comm_str *), + comm_str__cmp); + } + } + up_write(&comm_strs->lock); + return result; } struct comm *comm__new(const char *str, u64 timestamp, bool exec) @@ -115,7 +171,7 @@ struct comm *comm__new(const char *str, u64 timestamp, bool exec) comm->start = timestamp; comm->exec = exec; - comm->comm_str = comm_str__findnew(str, &comm_str_root); + comm->comm_str = comm_strs__findnew(str); if (!comm->comm_str) { free(comm); return NULL; @@ -128,7 +184,7 @@ int comm__override(struct comm *comm, const char *str, u64 timestamp, bool exec) { struct comm_str *new, *old = comm->comm_str; - new = comm_str__findnew(str, &comm_str_root); + new = comm_strs__findnew(str); if (!new) return -ENOMEM; @@ -149,5 +205,5 @@ void comm__free(struct comm *comm) const char *comm__str(const struct comm *comm) { - return comm->comm_str->str; + return comm_str__str(comm->comm_str); } -- cgit v1.2.3 From ad3003a65a3ce1abf3b30af265bb36e23224a7aa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:43 -0700 Subject: perf mem-info: Move mem-info out of mem-events and symbol Move mem-info to its own header rather than having it split between mem-events and symbol. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 1 + tools/perf/builtin-report.c | 1 + tools/perf/builtin-script.c | 1 + tools/perf/tests/mem.c | 1 + tools/perf/util/Build | 1 + tools/perf/util/hist.c | 1 + tools/perf/util/machine.c | 1 + tools/perf/util/mem-events.c | 16 ++++++------ tools/perf/util/mem-events.h | 29 ++++++++-------------- tools/perf/util/mem-info.c | 28 +++++++++++++++++++++ tools/perf/util/mem-info.h | 28 +++++++++++++++++++++ .../util/scripting-engines/trace-event-python.c | 1 + tools/perf/util/sort.c | 1 + tools/perf/util/symbol.c | 26 +------------------ tools/perf/util/symbol.h | 12 --------- 15 files changed, 85 insertions(+), 63 deletions(-) create mode 100644 tools/perf/util/mem-info.c create mode 100644 tools/perf/util/mem-info.h (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 1f1d17df9b9a..c624414b2285 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -38,6 +38,7 @@ #include "ui/browsers/hists.h" #include "thread.h" #include "mem2node.h" +#include "mem-info.h" #include "symbol.h" #include "ui/ui.h" #include "ui/progress.h" diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index b5525f4f7090..6eb1d90b589f 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -31,6 +31,7 @@ #include "util/evsel.h" #include "util/evswitch.h" #include "util/header.h" +#include "util/mem-info.h" #include "util/session.h" #include "util/srcline.h" #include "util/tool.h" diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index f7c3c3868c3c..58af4f3aa592 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -32,6 +32,7 @@ #include "util/time-utils.h" #include "util/path.h" #include "util/event.h" +#include "util/mem-info.h" #include "ui/ui.h" #include "print_binary.h" #include "print_insn.h" diff --git a/tools/perf/tests/mem.c b/tools/perf/tests/mem.c index 56014ec7d49d..f694a60d8a73 100644 --- a/tools/perf/tests/mem.c +++ b/tools/perf/tests/mem.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "util/map_symbol.h" #include "util/mem-events.h" +#include "util/mem-info.h" #include "util/symbol.h" #include "linux/perf_event.h" #include "util/debug.h" diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 292170a99ab6..da64efd8718f 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -141,6 +141,7 @@ perf-y += term.o perf-y += help-unknown-cmd.o perf-y += dlfilter.o perf-y += mem-events.o +perf-y += mem-info.o perf-y += vsprintf.o perf-y += units.o perf-y += time-utils.o diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index b8a508cd0b14..56453a02cdf4 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -9,6 +9,7 @@ #include "map_symbol.h" #include "branch.h" #include "mem-events.h" +#include "mem-info.h" #include "session.h" #include "namespaces.h" #include "cgroup.h" diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index a3ff2ab154bd..d5a01ef92668 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -16,6 +16,7 @@ #include "map_symbol.h" #include "branch.h" #include "mem-events.h" +#include "mem-info.h" #include "path.h" #include "srcline.h" #include "symbol.h" diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 637cbd4a7bfb..f618a5ccc7af 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -10,7 +10,9 @@ #include #include "map_symbol.h" #include "mem-events.h" +#include "mem-info.h" #include "debug.h" +#include "evsel.h" #include "symbol.h" #include "pmu.h" #include "pmus.h" @@ -281,7 +283,7 @@ static const char * const tlb_access[] = { "Fault", }; -int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__tlb_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { size_t l = 0, i; u64 m = PERF_MEM_TLB_NA; @@ -359,7 +361,7 @@ static const char * const mem_hops[] = { "board", }; -static int perf_mem__op_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +static int perf_mem__op_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { u64 op = PERF_MEM_LOCK_NA; int l; @@ -383,7 +385,7 @@ static int perf_mem__op_scnprintf(char *out, size_t sz, struct mem_info *mem_inf return l; } -int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__lvl_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { union perf_mem_data_src data_src; int printed = 0; @@ -465,7 +467,7 @@ static const char * const snoopx_access[] = { "Peer", }; -int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__snp_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { size_t i, l = 0; u64 m = PERF_MEM_SNOOP_NA; @@ -507,7 +509,7 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } -int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__lck_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { u64 mask = PERF_MEM_LOCK_NA; int l; @@ -525,7 +527,7 @@ int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } -int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__blk_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { size_t l = 0; u64 mask = PERF_MEM_BLK_NA; @@ -548,7 +550,7 @@ int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } -int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_script__meminfo_scnprintf(char *out, size_t sz, const struct mem_info *mem_info) { int i = 0; diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 15d5f0320d27..ca31014d7934 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -3,13 +3,7 @@ #define __PERF_MEM_EVENTS_H #include -#include -#include #include -#include -#include -#include "stat.h" -#include "evsel.h" struct perf_mem_event { bool record; @@ -21,13 +15,6 @@ struct perf_mem_event { const char *event_name; }; -struct mem_info { - struct addr_map_symbol iaddr; - struct addr_map_symbol daddr; - union perf_mem_data_src data_src; - refcount_t refcnt; -}; - enum { PERF_MEM_EVENTS__LOAD, PERF_MEM_EVENTS__STORE, @@ -35,6 +22,10 @@ enum { PERF_MEM_EVENTS__MAX, }; +struct evsel; +struct mem_info; +struct perf_pmu; + extern unsigned int perf_mem_events__loads_ldlat; extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; @@ -49,13 +40,13 @@ bool is_mem_loads_aux_event(struct evsel *leader); void perf_pmu__mem_events_list(struct perf_pmu *pmu); int perf_mem_events__record_args(const char **rec_argv, int *argv_nr); -int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_mem__tlb_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__lvl_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__snp_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__lck_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); +int perf_mem__blk_scnprintf(char *out, size_t sz, const struct mem_info *mem_info); -int perf_script__meminfo_scnprintf(char *bf, size_t size, struct mem_info *mem_info); +int perf_script__meminfo_scnprintf(char *bf, size_t size, const struct mem_info *mem_info); struct c2c_stats { u32 nr_entries; diff --git a/tools/perf/util/mem-info.c b/tools/perf/util/mem-info.c new file mode 100644 index 000000000000..ff0dfdb5369a --- /dev/null +++ b/tools/perf/util/mem-info.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "mem-info.h" + +struct mem_info *mem_info__get(struct mem_info *mi) +{ + if (mi) + refcount_inc(&mi->refcnt); + return mi; +} + +void mem_info__put(struct mem_info *mi) +{ + if (mi && refcount_dec_and_test(&mi->refcnt)) { + addr_map_symbol__exit(&mi->iaddr); + addr_map_symbol__exit(&mi->daddr); + free(mi); + } +} + +struct mem_info *mem_info__new(void) +{ + struct mem_info *mi = zalloc(sizeof(*mi)); + + if (mi) + refcount_set(&mi->refcnt, 1); + return mi; +} diff --git a/tools/perf/util/mem-info.h b/tools/perf/util/mem-info.h new file mode 100644 index 000000000000..bb7d8375d73c --- /dev/null +++ b/tools/perf/util/mem-info.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_MEM_INFO_H +#define __PERF_MEM_INFO_H + +#include +#include +#include "map_symbol.h" + +struct mem_info { + struct addr_map_symbol iaddr; + struct addr_map_symbol daddr; + union perf_mem_data_src data_src; + refcount_t refcnt; +}; + +struct mem_info *mem_info__new(void); +struct mem_info *mem_info__get(struct mem_info *mi); +void mem_info__put(struct mem_info *mi); + +static inline void __mem_info__zput(struct mem_info **mi) +{ + mem_info__put(*mi); + *mi = NULL; +} + +#define mem_info__zput(mi) __mem_info__zput(&mi) + +#endif /* __PERF_MEM_INFO_H */ diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c2caa5720299..c77fe08a0545 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -45,6 +45,7 @@ #include "../thread.h" #include "../comm.h" #include "../machine.h" +#include "../mem-info.h" #include "../db-export.h" #include "../thread-stack.h" #include "../trace-event.h" diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 704664e5b4ea..711ef69306f3 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -23,6 +23,7 @@ #include "strlist.h" #include "strbuf.h" #include "mem-events.h" +#include "mem-info.h" #include "annotate.h" #include "annotate-data.h" #include "event.h" diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 7772a4d3e66c..eb3319baa1b5 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -27,6 +27,7 @@ #include "symbol.h" #include "map_symbol.h" #include "mem-events.h" +#include "mem-info.h" #include "symsrc.h" #include "strlist.h" #include "intlist.h" @@ -2570,31 +2571,6 @@ int symbol__config_symfs(const struct option *opt __maybe_unused, return 0; } -struct mem_info *mem_info__get(struct mem_info *mi) -{ - if (mi) - refcount_inc(&mi->refcnt); - return mi; -} - -void mem_info__put(struct mem_info *mi) -{ - if (mi && refcount_dec_and_test(&mi->refcnt)) { - addr_map_symbol__exit(&mi->iaddr); - addr_map_symbol__exit(&mi->daddr); - free(mi); - } -} - -struct mem_info *mem_info__new(void) -{ - struct mem_info *mi = zalloc(sizeof(*mi)); - - if (mi) - refcount_set(&mi->refcnt, 1); - return mi; -} - /* * Checks that user supplied symbol kernel files are accessible because * the default mechanism for accessing elf files fails silently. i.e. if diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 071837ddce2a..3fb5d146d9b1 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -268,18 +268,6 @@ enum { SDT_NOTE_IDX_REFCTR, }; -struct mem_info *mem_info__new(void); -struct mem_info *mem_info__get(struct mem_info *mi); -void mem_info__put(struct mem_info *mi); - -static inline void __mem_info__zput(struct mem_info **mi) -{ - mem_info__put(*mi); - *mi = NULL; -} - -#define mem_info__zput(mi) __mem_info__zput(&mi) - int symbol__validate_sym_arguments(void); #endif /* __PERF_SYMBOL */ -- cgit v1.2.3 From 1a8c2e0177df250093b482b0c0034b53fdc5409f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:44 -0700 Subject: perf mem-info: Add reference count checking Add reference count checking and switch 'struct mem_info' usage to use accessor functions. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 12 ++-- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-script.c | 11 +++- tools/perf/tests/mem.c | 10 ++-- tools/perf/util/hist.c | 26 +++++---- tools/perf/util/machine.c | 6 +- tools/perf/util/mem-events.c | 20 +++---- tools/perf/util/mem-info.c | 29 +++++---- tools/perf/util/mem-info.h | 28 ++++++++- .../util/scripting-engines/trace-event-python.c | 11 +++- tools/perf/util/sort.c | 68 +++++++++++----------- 11 files changed, 135 insertions(+), 88 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index c624414b2285..c157bd31f2e5 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -530,7 +530,7 @@ static int dcacheline_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = cl_address(he->mem_info->daddr.addr, chk_double_cl); + addr = cl_address(mem_info__daddr(he->mem_info)->addr, chk_double_cl); return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ -568,7 +568,7 @@ static int offset_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = cl_offset(he->mem_info->daddr.al_addr, chk_double_cl); + addr = cl_offset(mem_info__daddr(he->mem_info)->al_addr, chk_double_cl); return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ -580,10 +580,10 @@ offset_cmp(struct perf_hpp_fmt *fmt __maybe_unused, uint64_t l = 0, r = 0; if (left->mem_info) - l = cl_offset(left->mem_info->daddr.addr, chk_double_cl); + l = cl_offset(mem_info__daddr(left->mem_info)->addr, chk_double_cl); if (right->mem_info) - r = cl_offset(right->mem_info->daddr.addr, chk_double_cl); + r = cl_offset(mem_info__daddr(right->mem_info)->addr, chk_double_cl); return (int64_t)(r - l); } @@ -597,7 +597,7 @@ iaddr_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, char buf[20]; if (he->mem_info) - addr = he->mem_info->iaddr.addr; + addr = mem_info__iaddr(he->mem_info)->addr; return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr)); } @@ -2593,7 +2593,7 @@ perf_c2c_cacheline_browser__title(struct hist_browser *browser, he = cl_browser->he; if (he->mem_info) - addr = cl_address(he->mem_info->daddr.addr, chk_double_cl); + addr = cl_address(mem_info__daddr(he->mem_info)->addr, chk_double_cl); scnprintf(bf, size, "Cacheline 0x%lx", addr); return 0; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 6eb1d90b589f..0892b6e3e5e7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -186,7 +186,7 @@ static int hist_iter__report_callback(struct hist_entry_iter *iter, } else if (rep->mem_mode) { mi = he->mem_info; - err = addr_map_symbol__inc_samples(&mi->daddr, sample, evsel); + err = addr_map_symbol__inc_samples(mem_info__daddr(mi), sample, evsel); if (err) goto out; diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 58af4f3aa592..c16224b1fef3 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2051,13 +2051,18 @@ static int evlist__max_name_len(struct evlist *evlist) static int data_src__fprintf(u64 data_src, FILE *fp) { - struct mem_info mi = { .data_src.val = data_src }; + struct mem_info *mi = mem_info__new(); char decode[100]; char out[100]; static int maxlen; int len; - perf_script__meminfo_scnprintf(decode, 100, &mi); + if (!mi) + return -ENOMEM; + + mem_info__data_src(mi)->val = data_src; + perf_script__meminfo_scnprintf(decode, 100, mi); + mem_info__put(mi); len = scnprintf(out, 100, "%16" PRIx64 " %s", data_src, decode); if (maxlen < len) @@ -2498,7 +2503,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, evsel = evlist__last(*pevlist); if (!evsel->priv) { - if (scr->per_event_dump) { + if (scr->per_event_dump) { evsel->priv = evsel_script__new(evsel, scr->session->data); if (!evsel->priv) return -ENOMEM; diff --git a/tools/perf/tests/mem.c b/tools/perf/tests/mem.c index f694a60d8a73..cb3d749e157b 100644 --- a/tools/perf/tests/mem.c +++ b/tools/perf/tests/mem.c @@ -13,12 +13,14 @@ static int check(union perf_mem_data_src data_src, { char out[100]; char failure[100]; - struct mem_info mi = { .data_src = data_src }; - + struct mem_info *mi = mem_info__new(); int n; - n = perf_mem__snp_scnprintf(out, sizeof out, &mi); - n += perf_mem__lvl_scnprintf(out + n, sizeof out - n, &mi); + TEST_ASSERT_VAL("Memory allocation failed", mi); + *mem_info__data_src(mi) = data_src; + n = perf_mem__snp_scnprintf(out, sizeof out, mi); + n += perf_mem__lvl_scnprintf(out + n, sizeof out - n, mi); + mem_info__put(mi); scnprintf(failure, sizeof failure, "unexpected %s", out); TEST_ASSERT_VAL(failure, !strcmp(string, out)); return 0; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 56453a02cdf4..00814d42d5f1 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -154,8 +154,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) } if (h->mem_info) { - if (h->mem_info->daddr.ms.sym) { - symlen = (int)h->mem_info->daddr.ms.sym->namelen + 4 + if (mem_info__daddr(h->mem_info)->ms.sym) { + symlen = (int)mem_info__daddr(h->mem_info)->ms.sym->namelen + 4 + unresolved_col_width + 2; hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen); @@ -169,8 +169,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) symlen); } - if (h->mem_info->iaddr.ms.sym) { - symlen = (int)h->mem_info->iaddr.ms.sym->namelen + 4 + if (mem_info__iaddr(h->mem_info)->ms.sym) { + symlen = (int)mem_info__iaddr(h->mem_info)->ms.sym->namelen + 4 + unresolved_col_width + 2; hists__new_col_len(hists, HISTC_MEM_IADDR_SYMBOL, symlen); @@ -180,8 +180,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) symlen); } - if (h->mem_info->daddr.ms.map) { - symlen = dso__name_len(map__dso(h->mem_info->daddr.ms.map)); + if (mem_info__daddr(h->mem_info)->ms.map) { + symlen = dso__name_len(map__dso(mem_info__daddr(h->mem_info)->ms.map)); hists__new_col_len(hists, HISTC_MEM_DADDR_DSO, symlen); } else { @@ -477,8 +477,10 @@ static int hist_entry__init(struct hist_entry *he, } if (he->mem_info) { - he->mem_info->iaddr.ms.map = map__get(he->mem_info->iaddr.ms.map); - he->mem_info->daddr.ms.map = map__get(he->mem_info->daddr.ms.map); + mem_info__iaddr(he->mem_info)->ms.map = + map__get(mem_info__iaddr(he->mem_info)->ms.map); + mem_info__daddr(he->mem_info)->ms.map = + map__get(mem_info__daddr(he->mem_info)->ms.map); } if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) @@ -526,8 +528,8 @@ err_infos: zfree(&he->branch_info); } if (he->mem_info) { - map_symbol__exit(&he->mem_info->iaddr.ms); - map_symbol__exit(&he->mem_info->daddr.ms); + map_symbol__exit(&mem_info__iaddr(he->mem_info)->ms); + map_symbol__exit(&mem_info__daddr(he->mem_info)->ms); } err: map_symbol__exit(&he->ms); @@ -1336,8 +1338,8 @@ void hist_entry__delete(struct hist_entry *he) } if (he->mem_info) { - map_symbol__exit(&he->mem_info->iaddr.ms); - map_symbol__exit(&he->mem_info->daddr.ms); + map_symbol__exit(&mem_info__iaddr(he->mem_info)->ms); + map_symbol__exit(&mem_info__daddr(he->mem_info)->ms); mem_info__zput(he->mem_info); } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index d5a01ef92668..8477edefc299 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2013,11 +2013,11 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, if (!mi) return NULL; - ip__resolve_ams(al->thread, &mi->iaddr, sample->ip); - ip__resolve_data(al->thread, al->cpumode, &mi->daddr, + ip__resolve_ams(al->thread, mem_info__iaddr(mi), sample->ip); + ip__resolve_data(al->thread, al->cpumode, mem_info__daddr(mi), sample->addr, sample->phys_addr, sample->data_page_size); - mi->data_src.val = sample->data_src; + mem_info__data_src(mi)->val = sample->data_src; return mi; } diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index f618a5ccc7af..6dda47bb774f 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -293,7 +293,7 @@ int perf_mem__tlb_scnprintf(char *out, size_t sz, const struct mem_info *mem_inf out[0] = '\0'; if (mem_info) - m = mem_info->data_src.mem_dtlb; + m = mem_info__const_data_src(mem_info)->mem_dtlb; hit = m & PERF_MEM_TLB_HIT; miss = m & PERF_MEM_TLB_MISS; @@ -367,7 +367,7 @@ static int perf_mem__op_scnprintf(char *out, size_t sz, const struct mem_info *m int l; if (mem_info) - op = mem_info->data_src.mem_op; + op = mem_info__const_data_src(mem_info)->mem_op; if (op & PERF_MEM_OP_NA) l = scnprintf(out, sz, "N/A"); @@ -400,7 +400,7 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, const struct mem_info *mem_inf if (!mem_info) goto na; - data_src = mem_info->data_src; + data_src = *mem_info__const_data_src(mem_info); if (data_src.mem_lvl & PERF_MEM_LVL_HIT) memcpy(hit_miss, "hit", 3); @@ -476,7 +476,7 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, const struct mem_info *mem_inf out[0] = '\0'; if (mem_info) - m = mem_info->data_src.mem_snoop; + m = mem_info__const_data_src(mem_info)->mem_snoop; for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) { if (!(m & 0x1)) @@ -490,7 +490,7 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, const struct mem_info *mem_inf m = 0; if (mem_info) - m = mem_info->data_src.mem_snoopx; + m = mem_info__const_data_src(mem_info)->mem_snoopx; for (i = 0; m && i < ARRAY_SIZE(snoopx_access); i++, m >>= 1) { if (!(m & 0x1)) @@ -515,7 +515,7 @@ int perf_mem__lck_scnprintf(char *out, size_t sz, const struct mem_info *mem_inf int l; if (mem_info) - mask = mem_info->data_src.mem_lock; + mask = mem_info__const_data_src(mem_info)->mem_lock; if (mask & PERF_MEM_LOCK_NA) l = scnprintf(out, sz, "N/A"); @@ -536,7 +536,7 @@ int perf_mem__blk_scnprintf(char *out, size_t sz, const struct mem_info *mem_inf out[0] = '\0'; if (mem_info) - mask = mem_info->data_src.mem_blk; + mask = mem_info__const_data_src(mem_info)->mem_blk; if (!mask || (mask & PERF_MEM_BLK_NA)) { l += scnprintf(out + l, sz - l, " N/A"); @@ -572,8 +572,8 @@ int perf_script__meminfo_scnprintf(char *out, size_t sz, const struct mem_info * int c2c_decode_stats(struct c2c_stats *stats, struct mem_info *mi) { - union perf_mem_data_src *data_src = &mi->data_src; - u64 daddr = mi->daddr.addr; + union perf_mem_data_src *data_src = mem_info__data_src(mi); + u64 daddr = mem_info__daddr(mi)->addr; u64 op = data_src->mem_op; u64 lvl = data_src->mem_lvl; u64 snoop = data_src->mem_snoop; @@ -700,7 +700,7 @@ do { \ return -1; } - if (!mi->daddr.ms.map || !mi->iaddr.ms.map) { + if (!mem_info__daddr(mi)->ms.map || !mem_info__iaddr(mi)->ms.map) { stats->nomap++; return -1; } diff --git a/tools/perf/util/mem-info.c b/tools/perf/util/mem-info.c index ff0dfdb5369a..27d67721a695 100644 --- a/tools/perf/util/mem-info.c +++ b/tools/perf/util/mem-info.c @@ -4,25 +4,32 @@ struct mem_info *mem_info__get(struct mem_info *mi) { - if (mi) - refcount_inc(&mi->refcnt); - return mi; + struct mem_info *result; + + if (RC_CHK_GET(result, mi)) + refcount_inc(mem_info__refcnt(mi)); + + return result; } void mem_info__put(struct mem_info *mi) { - if (mi && refcount_dec_and_test(&mi->refcnt)) { - addr_map_symbol__exit(&mi->iaddr); - addr_map_symbol__exit(&mi->daddr); - free(mi); + if (mi && refcount_dec_and_test(mem_info__refcnt(mi))) { + addr_map_symbol__exit(mem_info__iaddr(mi)); + addr_map_symbol__exit(mem_info__daddr(mi)); + RC_CHK_FREE(mi); + } else { + RC_CHK_PUT(mi); } } struct mem_info *mem_info__new(void) { - struct mem_info *mi = zalloc(sizeof(*mi)); + struct mem_info *result = NULL; + RC_STRUCT(mem_info) *mi = zalloc(sizeof(*mi)); + + if (ADD_RC_CHK(result, mi)) + refcount_set(mem_info__refcnt(result), 1); - if (mi) - refcount_set(&mi->refcnt, 1); - return mi; + return result; } diff --git a/tools/perf/util/mem-info.h b/tools/perf/util/mem-info.h index bb7d8375d73c..0f68e29f311b 100644 --- a/tools/perf/util/mem-info.h +++ b/tools/perf/util/mem-info.h @@ -4,9 +4,10 @@ #include #include +#include #include "map_symbol.h" -struct mem_info { +DECLARE_RC_STRUCT(mem_info) { struct addr_map_symbol iaddr; struct addr_map_symbol daddr; union perf_mem_data_src data_src; @@ -25,4 +26,29 @@ static inline void __mem_info__zput(struct mem_info **mi) #define mem_info__zput(mi) __mem_info__zput(&mi) +static inline struct addr_map_symbol *mem_info__iaddr(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->iaddr; +} + +static inline struct addr_map_symbol *mem_info__daddr(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->daddr; +} + +static inline union perf_mem_data_src *mem_info__data_src(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->data_src; +} + +static inline const union perf_mem_data_src *mem_info__const_data_src(const struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->data_src; +} + +static inline refcount_t *mem_info__refcnt(struct mem_info *mi) +{ + return &RC_CHK_ACCESS(mi)->refcnt; +} + #endif /* __PERF_MEM_INFO_H */ diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c77fe08a0545..fb00f3ad6815 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -721,15 +721,20 @@ static void set_sample_read_in_dict(PyObject *dict_sample, } static void set_sample_datasrc_in_dict(PyObject *dict, - struct perf_sample *sample) + struct perf_sample *sample) { - struct mem_info mi = { .data_src.val = sample->data_src }; + struct mem_info *mi = mem_info__new(); char decode[100]; + if (!mi) + Py_FatalError("couldn't create mem-info"); + pydict_set_item_string_decref(dict, "datasrc", PyLong_FromUnsignedLongLong(sample->data_src)); - perf_script__meminfo_scnprintf(decode, 100, &mi); + mem_info__data_src(mi)->val = sample->data_src; + perf_script__meminfo_scnprintf(decode, 100, mi); + mem_info__put(mi); pydict_set_item_string_decref(dict, "datasrc_decode", _PyUnicode_FromString(decode)); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 711ef69306f3..cd39ea972193 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1365,9 +1365,9 @@ sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->daddr.addr; + l = mem_info__daddr(left->mem_info)->addr; if (right->mem_info) - r = right->mem_info->daddr.addr; + r = mem_info__daddr(right->mem_info)->addr; return (int64_t)(r - l); } @@ -1379,8 +1379,8 @@ static int hist_entry__daddr_snprintf(struct hist_entry *he, char *bf, struct map_symbol *ms = NULL; if (he->mem_info) { - addr = he->mem_info->daddr.addr; - ms = &he->mem_info->daddr.ms; + addr = mem_info__daddr(he->mem_info)->addr; + ms = &mem_info__daddr(he->mem_info)->ms; } return _hist_entry__sym_snprintf(ms, addr, he->level, bf, size, width); } @@ -1391,9 +1391,9 @@ sort__iaddr_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->iaddr.addr; + l = mem_info__iaddr(left->mem_info)->addr; if (right->mem_info) - r = right->mem_info->iaddr.addr; + r = mem_info__iaddr(right->mem_info)->addr; return (int64_t)(r - l); } @@ -1405,8 +1405,8 @@ static int hist_entry__iaddr_snprintf(struct hist_entry *he, char *bf, struct map_symbol *ms = NULL; if (he->mem_info) { - addr = he->mem_info->iaddr.addr; - ms = &he->mem_info->iaddr.ms; + addr = mem_info__iaddr(he->mem_info)->addr; + ms = &mem_info__iaddr(he->mem_info)->ms; } return _hist_entry__sym_snprintf(ms, addr, he->level, bf, size, width); } @@ -1418,9 +1418,9 @@ sort__dso_daddr_cmp(struct hist_entry *left, struct hist_entry *right) struct map *map_r = NULL; if (left->mem_info) - map_l = left->mem_info->daddr.ms.map; + map_l = mem_info__daddr(left->mem_info)->ms.map; if (right->mem_info) - map_r = right->mem_info->daddr.ms.map; + map_r = mem_info__daddr(right->mem_info)->ms.map; return _sort__dso_cmp(map_l, map_r); } @@ -1431,7 +1431,7 @@ static int hist_entry__dso_daddr_snprintf(struct hist_entry *he, char *bf, struct map *map = NULL; if (he->mem_info) - map = he->mem_info->daddr.ms.map; + map = mem_info__daddr(he->mem_info)->ms.map; return _hist_entry__dso_snprintf(map, bf, size, width); } @@ -1443,12 +1443,12 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_lock = PERF_MEM_LOCK_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_lock = PERF_MEM_LOCK_NA; @@ -1471,12 +1471,12 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_dtlb = PERF_MEM_TLB_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_dtlb = PERF_MEM_TLB_NA; @@ -1499,12 +1499,12 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_lvl = PERF_MEM_LVL_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_lvl = PERF_MEM_LVL_NA; @@ -1527,12 +1527,12 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_snoop = PERF_MEM_SNOOP_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_snoop = PERF_MEM_SNOOP_NA; @@ -1563,8 +1563,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) if (left->cpumode > right->cpumode) return -1; if (left->cpumode < right->cpumode) return 1; - l_map = left->mem_info->daddr.ms.map; - r_map = right->mem_info->daddr.ms.map; + l_map = mem_info__daddr(left->mem_info)->ms.map; + r_map = mem_info__daddr(right->mem_info)->ms.map; /* if both are NULL, jump to sort on al_addr instead */ if (!l_map && !r_map) @@ -1599,8 +1599,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) addr: /* al_addr does all the right addr - start + offset calculations */ - l = cl_address(left->mem_info->daddr.al_addr, chk_double_cl); - r = cl_address(right->mem_info->daddr.al_addr, chk_double_cl); + l = cl_address(mem_info__daddr(left->mem_info)->al_addr, chk_double_cl); + r = cl_address(mem_info__daddr(right->mem_info)->al_addr, chk_double_cl); if (l > r) return -1; if (l < r) return 1; @@ -1617,11 +1617,11 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf, char level = he->level; if (he->mem_info) { - struct map *map = he->mem_info->daddr.ms.map; + struct map *map = mem_info__daddr(he->mem_info)->ms.map; struct dso *dso = map ? map__dso(map) : NULL; - addr = cl_address(he->mem_info->daddr.al_addr, chk_double_cl); - ms = &he->mem_info->daddr.ms; + addr = cl_address(mem_info__daddr(he->mem_info)->al_addr, chk_double_cl); + ms = &mem_info__daddr(he->mem_info)->ms; /* print [s] for shared data mmaps */ if ((he->cpumode != PERF_RECORD_MISC_KERNEL) && @@ -1806,12 +1806,12 @@ sort__blocked_cmp(struct hist_entry *left, struct hist_entry *right) union perf_mem_data_src data_src_r; if (left->mem_info) - data_src_l = left->mem_info->data_src; + data_src_l = *mem_info__data_src(left->mem_info); else data_src_l.mem_blk = PERF_MEM_BLK_NA; if (right->mem_info) - data_src_r = right->mem_info->data_src; + data_src_r = *mem_info__data_src(right->mem_info); else data_src_r.mem_blk = PERF_MEM_BLK_NA; @@ -1840,9 +1840,9 @@ sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->daddr.phys_addr; + l = mem_info__daddr(left->mem_info)->phys_addr; if (right->mem_info) - r = right->mem_info->daddr.phys_addr; + r = mem_info__daddr(right->mem_info)->phys_addr; return (int64_t)(r - l); } @@ -1854,7 +1854,7 @@ static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf, size_t ret = 0; size_t len = BITS_PER_LONG / 4; - addr = he->mem_info->daddr.phys_addr; + addr = mem_info__daddr(he->mem_info)->phys_addr; ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", he->level); @@ -1881,9 +1881,9 @@ sort__data_page_size_cmp(struct hist_entry *left, struct hist_entry *right) uint64_t l = 0, r = 0; if (left->mem_info) - l = left->mem_info->daddr.data_page_size; + l = mem_info__daddr(left->mem_info)->data_page_size; if (right->mem_info) - r = right->mem_info->daddr.data_page_size; + r = mem_info__daddr(right->mem_info)->data_page_size; return (int64_t)(r - l); } @@ -1894,7 +1894,7 @@ static int hist_entry__data_page_size_snprintf(struct hist_entry *he, char *bf, char str[PAGE_SIZE_NAME_LEN]; return repsep_snprintf(bf, size, "%-*s", width, - get_page_size_name(he->mem_info->daddr.data_page_size, str)); + get_page_size_name(mem_info__daddr(he->mem_info)->data_page_size, str)); } struct sort_entry sort_mem_data_page_size = { -- cgit v1.2.3 From d561e170bd07db62c9b8dbe3d0f59085447ca77a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 11:35:45 -0700 Subject: perf hist: Avoid 'struct hist_entry_iter' mem_info memory leak 'struct mem_info' is reference counted while 'struct branch_info' and he_cache (struct hist_entry **) are not. Break apart the priv field in 'struct hist_entry_iter' so that we can know which values are owned by the iter and do the appropriate free or put. Move hide_unresolved to marginally shrink the size of the now grown struct. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Ben Gainey Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sun Haiyong Cc: Tim Chen Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240507183545.1236093-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 39 ++++++++++++++------------------------- tools/perf/util/hist.h | 8 +++++--- 2 files changed, 19 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 00814d42d5f1..2e9e193179dd 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -476,13 +476,6 @@ static int hist_entry__init(struct hist_entry *he, he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map); } - if (he->mem_info) { - mem_info__iaddr(he->mem_info)->ms.map = - map__get(mem_info__iaddr(he->mem_info)->ms.map); - mem_info__daddr(he->mem_info)->ms.map = - map__get(mem_info__daddr(he->mem_info)->ms.map); - } - if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) callchain_init(he->callchain); @@ -574,7 +567,6 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template, he = NULL; } } - return he; } @@ -747,7 +739,7 @@ __hists__add_entry(struct hists *hists, .filtered = symbol__parent_filter(sym_parent) | al->filtered, .hists = hists, .branch_info = bi, - .mem_info = mi, + .mem_info = mem_info__get(mi), .kvm_info = ki, .block_info = block_info, .transaction = sample->transaction, @@ -836,7 +828,7 @@ iter_prepare_mem_entry(struct hist_entry_iter *iter, struct addr_location *al) if (mi == NULL) return -ENOMEM; - iter->priv = mi; + iter->mi = mi; return 0; } @@ -844,7 +836,7 @@ static int iter_add_single_mem_entry(struct hist_entry_iter *iter, struct addr_location *al) { u64 cost; - struct mem_info *mi = iter->priv; + struct mem_info *mi = iter->mi; struct hists *hists = evsel__hists(iter->evsel); struct perf_sample *sample = iter->sample; struct hist_entry *he; @@ -891,12 +883,7 @@ iter_finish_mem_entry(struct hist_entry_iter *iter, err = hist_entry__append_callchain(he, iter->sample); out: - /* - * We don't need to free iter->priv (mem_info) here since the mem info - * was either already freed in hists__findnew_entry() or passed to a - * new hist entry by hist_entry__new(). - */ - iter->priv = NULL; + mem_info__zput(iter->mi); iter->he = NULL; return err; @@ -915,7 +902,7 @@ iter_prepare_branch_entry(struct hist_entry_iter *iter, struct addr_location *al iter->curr = 0; iter->total = sample->branch_stack->nr; - iter->priv = bi; + iter->bi = bi; return 0; } @@ -929,7 +916,7 @@ iter_add_single_branch_entry(struct hist_entry_iter *iter __maybe_unused, static int iter_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *al) { - struct branch_info *bi = iter->priv; + struct branch_info *bi = iter->bi; int i = iter->curr; if (bi == NULL) @@ -958,7 +945,7 @@ iter_add_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *a int i = iter->curr; int err = 0; - bi = iter->priv; + bi = iter->bi; if (iter->hide_unresolved && !(bi[i].from.ms.sym && bi[i].to.ms.sym)) goto out; @@ -987,7 +974,7 @@ static int iter_finish_branch_entry(struct hist_entry_iter *iter, struct addr_location *al __maybe_unused) { - zfree(&iter->priv); + zfree(&iter->bi); iter->he = NULL; return iter->curr >= iter->total ? 0 : -1; @@ -1055,7 +1042,7 @@ iter_prepare_cumulative_entry(struct hist_entry_iter *iter, if (he_cache == NULL) return -ENOMEM; - iter->priv = he_cache; + iter->he_cache = he_cache; iter->curr = 0; return 0; @@ -1068,7 +1055,7 @@ iter_add_single_cumulative_entry(struct hist_entry_iter *iter, struct evsel *evsel = iter->evsel; struct hists *hists = evsel__hists(evsel); struct perf_sample *sample = iter->sample; - struct hist_entry **he_cache = iter->priv; + struct hist_entry **he_cache = iter->he_cache; struct hist_entry *he; int err = 0; @@ -1126,7 +1113,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, { struct evsel *evsel = iter->evsel; struct perf_sample *sample = iter->sample; - struct hist_entry **he_cache = iter->priv; + struct hist_entry **he_cache = iter->he_cache; struct hist_entry *he; struct hist_entry he_tmp = { .hists = evsel__hists(evsel), @@ -1192,7 +1179,9 @@ static int iter_finish_cumulative_entry(struct hist_entry_iter *iter, struct addr_location *al __maybe_unused) { - zfree(&iter->priv); + mem_info__zput(iter->mi); + zfree(&iter->bi); + zfree(&iter->he_cache); iter->he = NULL; return 0; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 5260822b9773..8fb3bdd29188 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -132,18 +132,20 @@ struct hist_entry_iter { int total; int curr; - bool hide_unresolved; - struct evsel *evsel; struct perf_sample *sample; struct hist_entry *he; struct symbol *parent; - void *priv; + + struct mem_info *mi; + struct branch_info *bi; + struct hist_entry **he_cache; const struct hist_iter_ops *ops; /* user-defined callback function (optional) */ int (*add_entry_cb)(struct hist_entry_iter *iter, struct addr_location *al, bool single, void *arg); + bool hide_unresolved; }; extern const struct hist_iter_ops hist_iter_normal; -- cgit v1.2.3 From 2ce987e1650216638b2b5f44948c6efea67038ae Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 09:42:26 +0200 Subject: bpf: Avoid __hidden__ attribute in static object An object defined as `static' defaults to hidden visibility. If additionally the visibility(__weak__) compiler attribute is applied to the declaration of the object, GCC warns that the attribute gets ignored. This patch removes the only instance of this problem among the BPF selftests. Tested in bpf-next master. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507074227.4523-2-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/progs/cpumask_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index c705d8112a35..b979e91f55f0 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -9,7 +9,7 @@ int err; -#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) +#define private(name) SEC(".bss." #name) __attribute__((aligned(8))) private(MASK) static struct bpf_cpumask __kptr * global_mask; struct __cpumask_map_value { -- cgit v1.2.3 From b0fbdf759da05a35b67fd27b8859738b79af25d6 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 09:42:27 +0200 Subject: bpf: Disable some `attribute ignored' warnings in GCC This patch modifies selftests/bpf/Makefile to pass -Wno-attributes to GCC. This is because of the following attributes which are ignored: - btf_decl_tag - btf_type_tag There are many of these. At the moment none of these are recognized/handled by gcc-bpf. We are aware that btf_decl_tag is necessary for some of the selftest harness to communicate test failure/success. Support for it is in progress in GCC upstream: https://gcc.gnu.org/pipermail/gcc-patches/2024-May/650482.html However, the GCC master branch is not yet open, so the series above (currently under review upstream) wont be able to make it there until 14.1 gets released, probably mid next week. As for btf_type_tag, more extensive work will be needed in GCC upstream to support it in both BTF and DWARF. We have a WIP big patch for that, but that is not needed to compile/build the selftests. - used There are SEC macros defined in the selftests as: #define SEC(N) __attribute__((section(N),used)) The SEC macro is used for both functions and global variables. According to the GCC documentation `used' attribute is really only meaningful for functions, and it warns when the attribute is used for other global objects, like for example ctl_array in test_xdp_noinline.c. Ignoring this is benign. - align_value In progs/test_cls_redirect.c:127 there is: typedef uint8_t *net_ptr __attribute__((align_value(8))); GCC warns that it is ignoring this attribute, because it is not implemented by GCC. I think ignoring this attribute in GCC is benign, because according to the clang documentation [1] its purpose seems to be merely declarative and doesn't seem to translate into extra checks at run-time, only to perhaps better optimized code ("runtime behavior is undefined if the pointed memory object is not aligned to the specified alignment"). [1] https://clang.llvm.org/docs/AttributeReference.html#align-value Tested in bpf-next master. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507074227.4523-3-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ba28d42b74db..5d9c906bc3cb 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -431,7 +431,7 @@ endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -Wno-attributes -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c -- cgit v1.2.3 From 675b4e24bc50f4600b6bf3527fdbaa1f73498334 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 11:50:11 +0200 Subject: bpf: Temporarily define BPF_NO_PRESEVE_ACCESS_INDEX for GCC The vmlinux.h file generated by bpftool makes use of compiler pragmas in order to install the CO-RE preserve_access_index in all the struct types derived from the BTF info: #ifndef __VMLINUX_H__ #define __VMLINUX_H__ #ifndef BPF_NO_PRESERVE_ACCESS_INDEX #pragma clang attribute push (__attribute__((preserve_access_index)), apply_t = record #endif [... type definitions generated from kernel BTF ... ] #ifndef BPF_NO_PRESERVE_ACCESS_INDEX #pragma clang attribute pop #endif The `clang attribute push/pop' pragmas are specific to clang/llvm and are not supported by GCC. At the moment the BTF dumping services in libbpf do not support dicriminating between types dumped because they are directly referred and types dumped because they are dependencies. A suitable API is being worked now. See [1] and [2]. In the interim, this patch changes the selftests/bpf Makefile so it passes -DBPF_NO_PRESERVE_ACCESS_INDEX to GCC when it builds the selftests. This workaround is temporary, and may have an impact on the results of the GCC-built tests. [1] https://lore.kernel.org/bpf/20240503111836.25275-1-jose.marchesi@oracle.com/T/#u [2] https://lore.kernel.org/bpf/20240504205510.24785-1-jose.marchesi@oracle.com/T/#u Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240507095011.15867-1-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 5d9c906bc3cb..f0c429cf4424 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -431,7 +431,7 @@ endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 -Wno-attributes -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c -- cgit v1.2.3 From 207cf6e649ee551ab3bdb1cfe1b2848e6a4337a5 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 7 May 2024 13:22:19 +0100 Subject: selftests/bpf: Add CFLAGS per source file and runner This patch adds support to specify CFLAGS per source file and per test runner. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507122220.207820-2-cupertino.miranda@oracle.com --- tools/testing/selftests/bpf/Makefile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f0c429cf4424..511ea84139a3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -81,11 +81,11 @@ TEST_INST_SUBDIRS += bpf_gcc # The following tests contain C code that, although technically legal, # triggers GCC warnings that cannot be disabled: declaration of # anonymous struct types in function parameter lists. -progs/btf_dump_test_case_bitfields.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_namespacing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_packing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_padding.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_syntax.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_bitfields.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_namespacing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error endif ifneq ($(CLANG_CPUV4),) @@ -470,7 +470,7 @@ LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(ske # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2 @@ -498,7 +498,7 @@ endef # Using TRUNNER_XXX variables, provided by callers of DEFINE_TEST_RUNNER and # set up by DEFINE_TEST_RUNNER itself, create test runner build rules with: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER_RULES ifeq ($($(TRUNNER_OUTPUT)-dir),) @@ -521,7 +521,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ - $$($$<-CFLAGS)) + $$($$<-CFLAGS) \ + $$($$<-$2-CFLAGS)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) -- cgit v1.2.3 From b2e086cb28aa358f7b5564888304908aff735827 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 7 May 2024 13:22:20 +0100 Subject: selftests/bpf: Change functions definitions to support GCC The test_xdp_noinline.c contains 2 functions that use more then 5 arguments. This patch collapses the 2 last arguments in an array. Also in GCC and ipa_sra optimization increases the number of arguments used in function encap_v4. This pass disables the optimization for that particular file. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507122220.207820-3-cupertino.miranda@oracle.com --- .../selftests/bpf/progs/test_xdp_noinline.c | 27 +++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 5c7e4758a0ca..fad94e41cef9 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -318,6 +318,14 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC push_options +/* GCC optimization collapses functions and increases the number of arguments + * beyond the compatible amount supported by BPF. + */ +#pragma GCC optimize("-fno-ipa-sra") +#endif + static __attribute__ ((noinline)) bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, struct packet_description *pckt, @@ -372,6 +380,10 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC pop_options +#endif + static __attribute__ ((noinline)) int swap_mac_and_send(void *data, void *data_end) { @@ -588,12 +600,13 @@ static void connection_table_lookup(struct real_definition **real, __attribute__ ((noinline)) static int process_l3_headers_v6(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct ipv6hdr *ip6h; __u64 iph_len; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; ip6h = data + off; if (ip6h + 1 > data_end) @@ -619,11 +632,12 @@ static int process_l3_headers_v6(struct packet_description *pckt, __attribute__ ((noinline)) static int process_l3_headers_v4(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct iphdr *iph; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; iph = data + off; if (iph + 1 > data_end) @@ -666,13 +680,14 @@ static int process_packet(void *data, __u64 off, void *data_end, __u8 protocol; __u32 vip_num; int action; + void *extra_args[2] = { data, data_end }; if (is_ipv6) action = process_l3_headers_v6(&pckt, &protocol, off, - &pkt_bytes, data, data_end); + &pkt_bytes, extra_args); else action = process_l3_headers_v4(&pckt, &protocol, off, - &pkt_bytes, data, data_end); + &pkt_bytes, extra_args); if (action >= 0) return action; protocol = pckt.flow.proto; -- cgit v1.2.3 From 8374b56b1df5566d19d645e49da2bf31b660bcfd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:29 -0700 Subject: libbpf: remove unnecessary struct_ops prog validity check libbpf ensures that BPF program references set in map->st_ops->progs[i] during open phase are always valid STRUCT_OPS programs. This is done in bpf_object__collect_st_ops_relos(). So there is no need to double-check that in bpf_map__init_kern_struct_ops(). Simplify the code by removing unnecessary check. Also, we avoid using local prog variable to keep code similar to the upcoming fix, which adds similar logic in another part of bpf_map__init_kern_struct_ops(). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-2-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a3566a456bc8..7d77a1b158ce 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1152,22 +1152,15 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) return -ENOTSUP; } - prog = st_ops->progs[i]; - if (prog) { + if (st_ops->progs[i]) { /* If we had declaratively set struct_ops callback, we need to - * first validate that it's actually a struct_ops program. - * And then force its autoload to false, because it doesn't have + * force its autoload to false, because it doesn't have * a chance of succeeding from POV of the current struct_ops map. * If this program is still referenced somewhere else, though, * then bpf_object_adjust_struct_ops_autoload() will update its * autoload accordingly. */ - if (!is_valid_st_ops_program(obj, prog)) { - pr_warn("struct_ops init_kern %s: member %s is declaratively assigned a non-struct_ops program\n", - map->name, mname); - return -EINVAL; - } - prog->autoload = false; + st_ops->progs[i]->autoload = false; st_ops->progs[i] = NULL; } -- cgit v1.2.3 From e18e2e70dbd1ee3099049557060067b6ec703efa Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:30 -0700 Subject: libbpf: handle yet another corner case of nulling out struct_ops program There is yet another corner case where user can set STRUCT_OPS program reference in STRUCT_OPS map to NULL, but libbpf will fail to disable autoload for such BPF program. This time it's the case of "new" kernel which has type information about callback field, but user explicitly nulled-out program reference from user-space after opening BPF object. Fix, hopefully, the last remaining unhandled case. Fixes: 0737df6de946 ("libbpf: better fix for handling nulled-out struct_ops program") Fixes: f973fccd43d3 ("libbpf: handle nulled-out program in struct_ops correctly") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-3-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7d77a1b158ce..04de4fb81785 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1193,11 +1193,19 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) } if (btf_is_ptr(mtype)) { - /* Update the value from the shadow type */ prog = *(void **)mdata; + /* just like for !kern_member case above, reset declaratively + * set (at compile time) program's autload to false, + * if user replaced it with another program or NULL + */ + if (st_ops->progs[i] && st_ops->progs[i] != prog) + st_ops->progs[i]->autoload = false; + + /* Update the value from the shadow type */ st_ops->progs[i] = prog; if (!prog) continue; + if (!is_valid_st_ops_program(obj, prog)) { pr_warn("struct_ops init_kern %s: member %s is not a struct_ops program\n", map->name, mname); -- cgit v1.2.3 From 9d66d60e968d85742569d025a2fb509cb57333bb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:31 -0700 Subject: selftests/bpf: add another struct_ops callback use case test Add a test which tests the case that was just fixed. Kernel has full type information about callback, but user explicitly nulls out the reference to declaratively set BPF program reference. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-4-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/test_struct_ops_module.c | 27 ++++++++++++++++++++++ .../selftests/bpf/progs/struct_ops_nulled_out_cb.c | 22 ++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index bd39586abd5a..f3c61ebad323 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -4,6 +4,7 @@ #include #include "struct_ops_module.skel.h" +#include "struct_ops_nulled_out_cb.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -174,6 +175,30 @@ cleanup: struct_ops_module__destroy(skel); } +/* validate that it's ok to "turn off" callback that kernel supports */ +static void test_struct_ops_nulled_out_cb(void) +{ + struct struct_ops_nulled_out_cb *skel; + int err; + + skel = struct_ops_nulled_out_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + /* kernel knows about test_1, but we still null it out */ + skel->struct_ops.ops->test_1 = NULL; + + err = struct_ops_nulled_out_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + ASSERT_FALSE(bpf_program__autoload(skel->progs.test_1_turn_off), "prog_autoload"); + ASSERT_LT(bpf_program__fd(skel->progs.test_1_turn_off), 0, "prog_fd"); + +cleanup: + struct_ops_nulled_out_cb__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) @@ -182,5 +207,7 @@ void serial_test_struct_ops_module(void) test_struct_ops_not_zeroed(); if (test__start_subtest("test_struct_ops_incompatible")) test_struct_ops_incompatible(); + if (test__start_subtest("test_struct_ops_null_out_cb")) + test_struct_ops_nulled_out_cb(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c new file mode 100644 index 000000000000..fa2021388485 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +int rand; +int arr[1]; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_turn_off) +{ + return arr[rand]; /* potentially way out of range access */ +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + .test_1 = (void *)test_1_turn_off, +}; + -- cgit v1.2.3 From 548c2ede0dc81cb8c86f3a72c1c63fe1c179cbfe Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:32 -0700 Subject: libbpf: fix libbpf_strerror_r() handling unknown errors strerror_r(), used from libbpf-specific libbpf_strerror_r() wrapper is documented to return error in two different ways, depending on glibc version. Take that into account when handling strerror_r()'s own errors, which happens when we pass some non-standard (internal) kernel error to it. Before this patch we'd have "ERROR: strerror_r(524)=22", which is quite confusing. Now for the same situation we'll see a bit less visually scary "unknown error (-524)". At least we won't confuse user with irrelevant EINVAL (22). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-5-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/str_error.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c index 146da01979c7..5e6a1e27ddf9 100644 --- a/tools/lib/bpf/str_error.c +++ b/tools/lib/bpf/str_error.c @@ -2,6 +2,7 @@ #undef _GNU_SOURCE #include #include +#include #include "str_error.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ @@ -15,7 +16,18 @@ char *libbpf_strerror_r(int err, char *dst, int len) { int ret = strerror_r(err < 0 ? -err : err, dst, len); - if (ret) - snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + /* on glibc <2.13, ret == -1 and errno is set, if strerror_r() can't + * handle the error, on glibc >=2.13 *positive* (errno-like) error + * code is returned directly + */ + if (ret == -1) + ret = errno; + if (ret) { + if (ret == EINVAL) + /* strerror_r() doesn't recognize this specific error */ + snprintf(dst, len, "unknown error (%d)", err < 0 ? err : -err); + else + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + } return dst; } -- cgit v1.2.3 From c78420bafe7cf9ce14fa7ceb40ce62e1372e661d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:33 -0700 Subject: libbpf: improve early detection of doomed-to-fail BPF program loading Extend libbpf's pre-load checks for BPF programs, detecting more typical conditions that are destinated to cause BPF program failure. This is an opportunity to provide more helpful and actionable error message to users, instead of potentially very confusing BPF verifier log and/or error. In this case, we detect struct_ops BPF program that was not referenced anywhere, but still attempted to be loaded (according to libbpf logic). Suggest that the program might need to be used in some struct_ops variable. User will get a message of the following kind: libbpf: prog 'test_1_forgotten': SEC("struct_ops") program isn't referenced anywhere, did you forget to use it? Suggested-by: Tejun Heo Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-6-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 04de4fb81785..5401f2df463d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7372,7 +7372,11 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog __u32 log_level = prog->log_level; int ret, err; - if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* Be more helpful by rejecting programs that can't be validated early + * with more meaningful and actionable error message. + */ + switch (prog->type) { + case BPF_PROG_TYPE_UNSPEC: /* * The program type must be set. Most likely we couldn't find a proper * section definition at load time, and thus we didn't infer the type. @@ -7380,6 +7384,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", prog->name, prog->sec_name); return -EINVAL; + case BPF_PROG_TYPE_STRUCT_OPS: + if (prog->attach_btf_id == 0) { + pr_warn("prog '%s': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?\n", + prog->name); + return -EINVAL; + } + break; + default: + break; } if (!insns || !insns_cnt) -- cgit v1.2.3 From 41df0733ea414a49094258adab4d600db0420731 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:34 -0700 Subject: selftests/bpf: validate struct_ops early failure detection logic Add a simple test that validates that libbpf will reject isolated struct_ops program early with helpful warning message. Also validate that explicit use of such BPF program through BPF skeleton after BPF object is open won't trigger any warnings. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-7-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/test_struct_ops_module.c | 45 ++++++++++++++++++++++ .../selftests/bpf/progs/struct_ops_forgotten_cb.c | 19 +++++++++ 2 files changed, 64 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index f3c61ebad323..3785b648c8ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -5,6 +5,7 @@ #include "struct_ops_module.skel.h" #include "struct_ops_nulled_out_cb.skel.h" +#include "struct_ops_forgotten_cb.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -199,6 +200,48 @@ cleanup: struct_ops_nulled_out_cb__destroy(skel); } +/* validate that libbpf generates reasonable error message if struct_ops is + * not referenced in any struct_ops map + */ +static void test_struct_ops_forgotten_cb(void) +{ + struct struct_ops_forgotten_cb *skel; + char *log; + int err; + + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + start_libbpf_log_capture(); + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_ERR(err, "skel_load")) + goto cleanup; + + log = stop_libbpf_log_capture(); + ASSERT_HAS_SUBSTR(log, + "prog 'test_1_forgotten': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?", + "libbpf_log"); + free(log); + + struct_ops_forgotten_cb__destroy(skel); + + /* now let's programmatically use it, we should be fine now */ + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->struct_ops.ops->test_1 = skel->progs.test_1_forgotten; /* not anymore */ + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + +cleanup: + struct_ops_forgotten_cb__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) @@ -209,5 +252,7 @@ void serial_test_struct_ops_module(void) test_struct_ops_incompatible(); if (test__start_subtest("test_struct_ops_null_out_cb")) test_struct_ops_nulled_out_cb(); + if (test__start_subtest("struct_ops_forgotten_cb")) + test_struct_ops_forgotten_cb(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c new file mode 100644 index 000000000000..3c822103bd40 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_forgotten) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + /* we forgot to reference test_1_forgotten above, oops */ +}; + -- cgit v1.2.3 From 7b9959b8cdbc40b31b4c66bb900ec8d5e5b305bd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:35 -0700 Subject: selftests/bpf: shorten subtest names for struct_ops_module test Drive-by clean up, we shouldn't use meaningless "test_" prefix for subtest names. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-8-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index 3785b648c8ad..29e183a80f49 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -244,13 +244,13 @@ cleanup: void serial_test_struct_ops_module(void) { - if (test__start_subtest("test_struct_ops_load")) + if (test__start_subtest("struct_ops_load")) test_struct_ops_load(); - if (test__start_subtest("test_struct_ops_not_zeroed")) + if (test__start_subtest("struct_ops_not_zeroed")) test_struct_ops_not_zeroed(); - if (test__start_subtest("test_struct_ops_incompatible")) + if (test__start_subtest("struct_ops_incompatible")) test_struct_ops_incompatible(); - if (test__start_subtest("test_struct_ops_null_out_cb")) + if (test__start_subtest("struct_ops_null_out_cb")) test_struct_ops_nulled_out_cb(); if (test__start_subtest("struct_ops_forgotten_cb")) test_struct_ops_forgotten_cb(); -- cgit v1.2.3 From 76508154d7da82c139416eee6d318fde4feae143 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 6 May 2024 13:43:16 +0200 Subject: selftests: netfilter: conntrack_tcp_unreplied.sh: wait for initial connection attempt Netdev CI reports occasional failures with this test ("ERROR: ns2-dX6bUE did not pick up tcp connection from peer"). Add explicit busywait call until the initial connection attempt shows up in conntrack rather than a one-shot 'must exist' check. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240506114320.12178-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../net/netfilter/conntrack_tcp_unreplied.sh | 25 ++++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh index 1f862c089028..121ea93c0178 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh @@ -106,6 +106,23 @@ ip netns exec "$ns1" bash -c 'for i in $(seq 1 $BUSYWAIT_TIMEOUT) ; do sleep 0.1 done' & +wait_for_attempt() +{ + count=$(ip netns exec "$ns2" conntrack -L -p tcp --dport 80 2>/dev/null | wc -l) + if [ "$count" -gt 0 ]; then + return 0 + fi + + return 1 +} + +# wait for conntrack to pick the new connection request up before loading +# the nat redirect rule. +if ! busywait "$BUSYWAIT_TIMEOUT" wait_for_attempt; then + echo "ERROR: $ns2 did not pick up tcp connection from peer" + exit 1 +fi + ip netns exec "$ns2" nft -f - </dev/null | wc -l) -if [ "$count" -eq 0 ]; then - echo "ERROR: $ns2 did not pick up tcp connection from peer" - exit 1 -fi - wait_for_redirect() { count=$(ip netns exec "$ns2" conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l) @@ -136,7 +147,7 @@ wait_for_redirect() } echo "INFO: NAT redirect added in ns $ns2, waiting for $BUSYWAIT_TIMEOUT ms for nat to take effect" -busywait $BUSYWAIT_TIMEOUT wait_for_redirect +busywait "$BUSYWAIT_TIMEOUT" wait_for_redirect ret=$? expect="packets 1 bytes 60" -- cgit v1.2.3 From eb709b5f6536636dfb87b85ded0b2af9bb6cd9e6 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 6 May 2024 12:02:04 -0700 Subject: selftests/net: fix uninitialized variables When building with clang, via: make LLVM=1 -C tools/testing/selftest ...clang warns about three variables that are not initialized in all cases: 1) The opt_ipproto_off variable is used uninitialized if "testname" is not "ip". Willem de Bruijn pointed out that this is an actual bug, and suggested the fix that I'm using here (thanks!). 2) The addr_len is used uninitialized, but only in the assert case, which bails out, so this is harmless. 3) The family variable in add_listener() is only used uninitialized in the error case (neither IPv4 nor IPv6 is specified), so it's also harmless. Fix by initializing each variable. Signed-off-by: John Hubbard Reviewed-by: Willem de Bruijn Acked-by: Mat Martineau Link: https://lore.kernel.org/r/20240506190204.28497-1-jhubbard@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/gro.c | 3 +++ tools/testing/selftests/net/ip_local_port_range.c | 2 +- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index 353e1e867fbb..6038b96ecee8 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -119,6 +119,9 @@ static void setup_sock_filter(int fd) next_off = offsetof(struct ipv6hdr, nexthdr); ipproto_off = ETH_HLEN + next_off; + /* Overridden later if exthdrs are used: */ + opt_ipproto_off = ipproto_off; + if (strcmp(testname, "ip") == 0) { if (proto == PF_INET) optlen = sizeof(struct ip_timestamp); diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c index 193b82745fd8..29451d2244b7 100644 --- a/tools/testing/selftests/net/ip_local_port_range.c +++ b/tools/testing/selftests/net/ip_local_port_range.c @@ -359,7 +359,7 @@ TEST_F(ip_local_port_range, late_bind) struct sockaddr_in v4; struct sockaddr_in6 v6; } addr; - socklen_t addr_len; + socklen_t addr_len = 0; const int one = 1; int fd, err; __u32 range; diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7426a2cbd4a0..7ad5a59adff2 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -1276,7 +1276,7 @@ int add_listener(int argc, char *argv[]) struct sockaddr_storage addr; struct sockaddr_in6 *a6; struct sockaddr_in *a4; - u_int16_t family; + u_int16_t family = AF_UNSPEC; int enable = 1; int sock; int err; -- cgit v1.2.3 From 187c219b57eaf3e1b7a3cab2c6a8b7909bdbf4a9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 6 May 2024 21:13:38 -0700 Subject: perf dwarf-aux: Print array type name with "[]" It's confusing both pointers and arrays are printed as *. Let's print array types with [] so that we can identify them easily. Although it's interchangable, sometimes it can cause confusion with size like in the below example. Note that it is not the same with C syntax where it goes to the variable names, but we want to have it in the type names (like in Go language). Before: mov [20] 0x68(reg5) -> reg0 type='struct page**' size=0x80 (die:0x4e61d32) After: mov [20] 0x68(reg5) -> reg0 type='struct page*[]' size=0x80 (die:0x4e61d32) Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240507041338.2081775-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index c0a492e65388..ec988f294497 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1110,8 +1110,10 @@ int die_get_typename_from_type(Dwarf_Die *type_die, struct strbuf *buf) const char *tmp = ""; tag = dwarf_tag(type_die); - if (tag == DW_TAG_array_type || tag == DW_TAG_pointer_type) + if (tag == DW_TAG_pointer_type) tmp = "*"; + else if (tag == DW_TAG_array_type) + tmp = "[]"; else if (tag == DW_TAG_subroutine_type) { /* Function pointer */ return strbuf_add(buf, "(function_type)", 15); -- cgit v1.2.3 From cbc7afffc5ec581d3781c49fe9c8e8c661e5217b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 3 May 2024 15:13:51 +0200 Subject: selftests: microchip: add test for QoS support on KSZ9477 switch family Add tests covering following functionality on KSZ9477 switch family: - default port priority - global DSCP to Internal Priority Mapping - apptrust configuration This script was tested on KSZ9893R Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- .../selftests/drivers/net/microchip/ksz9477_qos.sh | 668 +++++++++++++++++++++ 1 file changed, 668 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh new file mode 100755 index 000000000000..82be5d013330 --- /dev/null +++ b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh @@ -0,0 +1,668 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2024 Pengutronix, Oleksij Rempel + +# The script is adopted to work with the Microchip KSZ switch driver. + +ETH_FCS_LEN=4 + +WAIT_TIME=1 +NUM_NETIFS=4 +REQUIRE_JQ="yes" +REQUIRE_MZ="yes" +STABLE_MAC_ADDRS=yes +NETIF_CREATE=no +lib_dir=$(dirname $0)/../../../net/forwarding +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh + +require_command dcb + +h1=${NETIFS[p1]} +swp1=${NETIFS[p2]} +swp2=${NETIFS[p3]} +h2=${NETIFS[p4]} + +H1_IPV4="192.0.2.1" +H2_IPV4="192.0.2.2" +H1_IPV6="2001:db8:1::1" +H2_IPV6="2001:db8:1::2" + +# On h1_ and h2_create do not set IP addresses to avoid interaction with the +# system, to keep packet counters clean. +h1_create() +{ + simple_if_init $h1 + sysctl_set net.ipv6.conf.${h1}.disable_ipv6 1 + # Get the MAC address of the interface to use it with mausezahn + h1_mac=$(ip -j link show dev ${h1} | jq -e '.[].address') +} + +h1_destroy() +{ + sysctl_restore net.ipv6.conf.${h1}.disable_ipv6 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + sysctl_set net.ipv6.conf.${h2}.disable_ipv6 1 + h2_mac=$(ip -j link show dev ${h2} | jq -e '.[].address') +} + +h2_destroy() +{ + sysctl_restore net.ipv6.conf.${h2}.disable_ipv6 + simple_if_fini $h2 +} + +switch_create() +{ + ip link set ${swp1} up + ip link set ${swp2} up + sysctl_set net.ipv6.conf.${swp1}.disable_ipv6 1 + sysctl_set net.ipv6.conf.${swp2}.disable_ipv6 1 + + # Ports should trust VLAN PCP even with vlan_filtering=0 + ip link add br0 type bridge + ip link set ${swp1} master br0 + ip link set ${swp2} master br0 + ip link set br0 up + sysctl_set net.ipv6.conf.br0.disable_ipv6 1 +} + +switch_destroy() +{ + sysctl_restore net.ipv6.conf.${swp2}.disable_ipv6 + sysctl_restore net.ipv6.conf.${swp1}.disable_ipv6 + + ip link del br0 +} + +setup_prepare() +{ + vrf_prepare + + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + switch_destroy + + vrf_cleanup +} + +set_apptrust_order() +{ + local if_name=$1 + local order=$2 + + dcb apptrust set dev ${if_name} order ${order} +} + +# Function to extract a specified field from a given JSON stats string +extract_network_stat() { + local stats_json=$1 + local field_name=$2 + + echo $(echo "$stats_json" | jq -r "$field_name") +} + +run_test() +{ + local test_name=$1; + local apptrust_order=$2; + local port_prio=$3; + local dscp_ipv=$4; + local dscp=$5; + local have_vlan=$6; + local pcp_ipv=$7; + local vlan_pcp=$8; + local ip_v6=$9 + + local rx_ipv + local tx_ipv + + RET=0 + + # Send some packet to populate the switch MAC table + $MZ ${h2} -a ${h2_mac} -b ${h1_mac} -p 64 -t icmp echores -c 1 + + # Based on the apptrust order, set the expected Internal Priority values + # for the RX and TX paths. + if [ "${apptrust_order}" == "" ]; then + echo "Apptrust order not set." + rx_ipv=${port_prio} + tx_ipv=${port_prio} + elif [ "${apptrust_order}" == "dscp" ]; then + echo "Apptrust order is DSCP." + rx_ipv=${dscp_ipv} + tx_ipv=${dscp_ipv} + elif [ "${apptrust_order}" == "pcp" ]; then + echo "Apptrust order is PCP." + rx_ipv=${pcp_ipv} + tx_ipv=${pcp_ipv} + elif [ "${apptrust_order}" == "pcp dscp" ]; then + echo "Apptrust order is PCP DSCP." + if [ ${have_vlan} -eq 1 ]; then + rx_ipv=$((dscp_ipv > pcp_ipv ? dscp_ipv : pcp_ipv)) + tx_ipv=${pcp_ipv} + else + rx_ipv=${dscp_ipv} + tx_ipv=${dscp_ipv} + fi + else + RET=1 + echo "Error: Unknown apptrust order ${apptrust_order}" + log_test "${test_name}" + return + fi + + # Most/all? of the KSZ switches do not provide per-TC counters. There + # are only tx_hi and rx_hi counters, which are used to count packets + # which are considered as high priority and most likely not assigned + # to the queue 0. + # On the ingress path, packets seem to get high priority status + # independently of the DSCP or PCP global mapping. On the egress path, + # the high priority status is assigned based on the DSCP or PCP global + # map configuration. + # The thresholds for the high priority status are not documented, but + # it seems that the switch considers packets as high priority on the + # ingress path if detected Internal Priority is greater than 0. On the + # egress path, the switch considers packets as high priority if + # detected Internal Priority is greater than 1. + if [ ${rx_ipv} -ge 1 ]; then + local expect_rx_high_prio=1 + else + local expect_rx_high_prio=0 + fi + + if [ ${tx_ipv} -ge 2 ]; then + local expect_tx_high_prio=1 + else + local expect_tx_high_prio=0 + fi + + # Use ip tool to get the current switch packet counters. ethool stats + # need to be recalculated to get the correct values. + local swp1_stats=$(ip -s -j link show dev ${swp1}) + local swp2_stats=$(ip -s -j link show dev ${swp2}) + local swp1_rx_packets_before=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.packets') + local swp1_rx_bytes_before=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.bytes') + local swp2_tx_packets_before=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.packets') + local swp2_tx_bytes_before=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.bytes') + local swp1_rx_hi_before=$(ethtool_stats_get ${swp1} "rx_hi") + local swp2_tx_hi_before=$(ethtool_stats_get ${swp2} "tx_hi") + + # Assamble the mausezahn command based on the test parameters + # For the testis with ipv4 or ipv6, use icmp response packets, + # to avoid interaction with the system, to keep packet counters + # clean. + if [ ${ip_v6} -eq 0 ]; then + local ip="-a ${h1_mac} -b ${h2_mac} -A ${H1_IPV4} \ + -B ${H2_IPV4} -t icmp unreach,code=1,dscp=${dscp}" + else + local ip="-6 -a ${h1_mac} -b ${h2_mac} -A ${H1_IPV6} \ + -B ${H2_IPV6} -t icmp6 type=1,code=0,dscp=${dscp}" + fi + + if [ ${have_vlan} -eq 1 ]; then + local vlan_pcp_opt="-Q ${vlan_pcp}:0" + else + local vlan_pcp_opt="" + fi + $MZ ${h1} ${ip} -c ${PING_COUNT} -d 10msec ${vlan_pcp_opt} + + # Wait until the switch packet counters are updated + sleep 6 + + local swp1_stats=$(ip -s -j link show dev ${swp1}) + local swp2_stats=$(ip -s -j link show dev ${swp2}) + + local swp1_rx_packets_after=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.packets') + local swp1_rx_bytes_after=$(extract_network_stat "$swp1_stats" \ + '.[0].stats64.rx.bytes') + local swp2_tx_packets_after=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.packets') + local swp2_tx_bytes_after=$(extract_network_stat "$swp2_stats" \ + '.[0].stats64.tx.bytes') + + local swp1_rx_packets_diff=$((${swp1_rx_packets_after} - \ + ${swp1_rx_packets_before})) + local swp2_tx_packets_diff=$((${swp2_tx_packets_after} - \ + ${swp2_tx_packets_before})) + + local swp1_rx_hi_after=$(ethtool_stats_get ${swp1} "rx_hi") + local swp2_tx_hi_after=$(ethtool_stats_get ${swp2} "tx_hi") + + # Test if any packets were received on swp1, we will rx before and after + if [ ${swp1_rx_packets_diff} -lt ${PING_COUNT} ]; then + echo "Not expected amount of received packets on ${swp1}" + echo "before ${swp1_rx_packets_before} after ${swp1_rx_packets_after}" + RET=1 + fi + + # Test if any packets were transmitted on swp2, we will tx before and after + if [ ${swp2_tx_packets_diff} -lt ${PING_COUNT} ]; then + echo "Not expected amount of transmitted packets on ${swp2}" + echo "before ${swp2_tx_packets_before} after ${swp2_tx_packets_after}" + RET=1 + fi + + # tx/rx_hi counted in bytes. So, we need to compare the difference in bytes + local swp1_rx_bytes_diff=$(($swp1_rx_bytes_after - $swp1_rx_bytes_before)) + local swp2_tx_bytes_diff=$(($swp2_tx_bytes_after - $swp2_tx_bytes_before)) + local swp1_rx_hi_diff=$(($swp1_rx_hi_after - $swp1_rx_hi_before)) + local swp2_tx_hi_diff=$(($swp2_tx_hi_after - $swp2_tx_hi_before)) + + if [ ${expect_rx_high_prio} -eq 1 ]; then + swp1_rx_hi_diff=$((${swp1_rx_hi_diff} - \ + ${swp1_rx_packets_diff} * ${ETH_FCS_LEN})) + if [ ${swp1_rx_hi_diff} -ne ${swp1_rx_bytes_diff} ]; then + echo "Not expected amount of high priority packets received on ${swp1}" + echo "RX hi diff: ${swp1_rx_hi_diff}, expected RX bytes diff: ${swp1_rx_bytes_diff}" + RET=1 + fi + else + if [ ${swp1_rx_hi_diff} -ne 0 ]; then + echo "Unexpected amount of high priority packets received on ${swp1}" + echo "RX hi diff: ${swp1_rx_hi_diff}, expected 0" + RET=1 + fi + fi + + if [ ${expect_tx_high_prio} -eq 1 ]; then + swp2_tx_hi_diff=$((${swp2_tx_hi_diff} - \ + ${swp2_tx_packets_diff} * ${ETH_FCS_LEN})) + if [ ${swp2_tx_hi_diff} -ne ${swp2_tx_bytes_diff} ]; then + echo "Not expected amount of high priority packets transmitted on ${swp2}" + echo "TX hi diff: ${swp2_tx_hi_diff}, expected TX bytes diff: ${swp2_tx_bytes_diff}" + RET=1 + fi + else + if [ ${swp2_tx_hi_diff} -ne 0 ]; then + echo "Unexpected amount of high priority packets transmitted on ${swp2}" + echo "TX hi diff: ${swp2_tx_hi_diff}, expected 0" + RET=1 + fi + fi + + log_test "${test_name}" +} + +run_test_dscp() +{ + # IPv4 test + run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 0 + # IPv6 test + run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 1 +} + +run_test_dscp_pcp() +{ + # IPv4 test + run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 0 + # IPv6 test + run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 1 +} + +port_default_prio_get() +{ + local if_name=$1 + local prio + + prio="$(dcb -j app show dev ${if_name} default-prio | \ + jq '.default_prio[]')" + if [ -z "${prio}" ]; then + prio=0 + fi + + echo ${prio} +} + +test_port_default() +{ + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_prio=$(port_default_prio_get ${swp1}) + local apptrust_order="" + + RET=0 + + # Make sure no other priority sources will interfere with the test + set_apptrust_order ${swp1} "${apptrust_order}" + + for val in $(seq 0 7); do + dcb app replace dev ${swp1} default-prio ${val} + if [ $val -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + break + fi + + run_test_dscp "Port-default QoS classification, prio: ${val}" \ + "${apptrust_order}" ${val} 0 0 + done + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_prio} + if [ $orig_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "Port-default QoS classification" +} + +port_get_default_apptrust() +{ + local if_name=$1 + + dcb -j apptrust show dev ${if_name} | jq -r '.order[]' | \ + tr '\n' ' ' | xargs +} + +test_port_apptrust() +{ + local original_dscp_prios_swp1=$(get_dscp_prios ${swp1}) + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_port_prio=$(port_default_prio_get ${swp1}) + local order_variants=("pcp dscp" "dscp" "pcp") + local apptrust_order + local port_prio + local dscp_prio + local pcp_prio + local dscp + local pcp + + RET=0 + + # First, test if apptrust configuration as taken by the kernel + for order in "${order_variants[@]}"; do + set_apptrust_order ${swp1} "${order}" + if [[ "$order" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + break + fi + done + + log_test "Apptrust, supported variants" + + # To test if the apptrust configuration is working as expected, we need + # to set DSCP priorities for the switch port. + init_dscp_prios "${swp1}" "${original_dscp_prios_swp1}" + + # Start with a simple test where all apptrust sources are disabled + # default port priority is 0, DSCP priority is mapped to 7. + # No high priority packets should be received or transmitted. + port_prio=0 + dscp_prio=7 + dscp=4 + + dcb app replace dev ${swp1} default-prio ${port_prio} + dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio} + + apptrust_order="" + set_apptrust_order ${swp1} "${apptrust_order}" + # Test with apptrust sources disabled, Packets should get port default + # priority which is 0 + run_test_dscp "Apptrust, all disabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="pcp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If PCP is enabled, packets should get PCP priority, which is not + # set in this test (no VLAN tags are present in the packet). No high + # priority packets should be received or transmitted. + run_test_dscp "Apptrust, PCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="dscp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If DSCP is enabled, packets should get DSCP priority which is set to 7 + # in this test. High priority packets should be received and transmitted. + run_test_dscp "Apptrust, DSCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + apptrust_order="pcp dscp" + set_apptrust_order ${swp1} "${apptrust_order}" + # If PCP and DSCP are enabled, PCP would have higher apptrust priority + # so packets should get PCP priority. But in this test VLAN PCP is not + # set, so it should get DSCP priority which is set to 7. High priority + # packets should be received and transmitted. + run_test_dscp "Apptrust, PCP and DSCP are enabled. DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + + # If VLAN PCP is set, it should have higher apptrust priority than DSCP + # so packets should get VLAN PCP priority. Send packets with VLAN PCP + # set to 0, DSCP set to 7. Packets should get VLAN PCP priority. + # No high priority packets should be transmitted. Due to nature of the + # switch, high priority packets will be received. + pcp_prio=0 + pcp=0 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + + # If VLAN PCP is set to 7, it should have higher apptrust priority than + # DSCP so packets should get VLAN PCP priority. Send packets with VLAN + # PCP set to 7, DSCP set to 7. Packets should get VLAN PCP priority. + # High priority packets should be received and transmitted. + pcp_prio=7 + pcp=7 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + # Now make sure that the switch is able to handle the case where DSCP + # priority is set to 0 and PCP priority is set to 7. Packets should get + # PCP priority. High priority packets should be received and transmitted. + dscp_prio=0 + dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio} + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + # If both VLAN PCP and DSCP are set to 0, packets should get 0 priority. + # No high priority packets should be received or transmitted. + pcp_prio=0 + pcp=0 + run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp} + + # Restore original priorities + if ! restore_priorities "${swp1}" "${original_dscp_prios_swp1}"; then + RET=1 + fi + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_port_prio} + if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "Apptrust, restore original settings" +} + +# Function to get current DSCP priorities +get_dscp_prios() { + local if_name=$1 + dcb -j app show dev ${if_name} | jq -c '.dscp_prio' +} + +# Function to set a specific DSCP priority on a device +replace_dscp_prio() { + local if_name=$1 + local dscp=$2 + local prio=$3 + dcb app replace dev ${if_name} dscp-prio ${dscp}:${prio} +} + +# Function to compare DSCP maps +compare_dscp_maps() { + local old_json=$1 + local new_json=$2 + local dscp=$3 + local prio=$4 + + # Create a modified old_json with the expected change for comparison + local modified_old_json=$(echo "$old_json" | + jq --argjson dscp $dscp --argjson prio $prio \ + 'map(if .[0] == $dscp then [$dscp, $prio] else . end)' | + tr -d " \n") + + # Compare new_json with the modified_old_json + if [[ "$modified_old_json" == "$new_json" ]]; then + return 0 + else + return 1 + fi +} + +# Function to set DSCP priorities +set_and_verify_dscp() { + local port=$1 + local dscp=$2 + local new_prio=$3 + + local old_prios=$(get_dscp_prios $port) + + replace_dscp_prio "$port" $dscp $new_prio + + # Fetch current settings and compare + local current_prios=$(get_dscp_prios $port) + if ! compare_dscp_maps "$old_prios" "$current_prios" $dscp $new_prio; then + echo "Error: Unintended changes detected in DSCP map for $port after setting DSCP $dscp to $new_prio." + return 1 + fi + return 0 +} + +# Function to restore original priorities +restore_priorities() { + local port=$1 + local original_prios=$2 + + echo "Removing test artifacts for $port" + local current_prios=$(get_dscp_prios $port) + local prio_str=$(echo "$current_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app del dev $port dscp-prio $prio_str + + echo "Restoring original DSCP priorities for $port" + local restore_str=$(echo "$original_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app add dev $port dscp-prio $restore_str + + local current_prios=$(get_dscp_prios $port) + if [[ "$original_prios" != "$current_prios" ]]; then + echo "Error: Failed to restore original DSCP priorities for $port" + return 1 + fi + return 0 +} + +# Initialize DSCP priorities. Set them to predictable values for testing. +init_dscp_prios() { + local port=$1 + local original_prios=$2 + + echo "Removing any existing DSCP priority mappins for $port" + local prio_str=$(echo "$original_prios" | + jq -r 'map("\(.[0]):\(.[1])") | join(" ")') + dcb app del dev $port dscp-prio $prio_str + + # Initialize DSCP priorities list + local dscp_prios="" + for dscp in {0..63}; do + dscp_prios+=("$dscp:0") + done + + echo "Setting initial DSCP priorities map to 0 for $port" + dcb app add dev $port dscp-prio ${dscp_prios[@]} +} + +# Main function to test global DSCP map across specified ports +test_global_dscp_map() { + local ports=("$swp1" "$swp2") + local original_dscp_prios_port0=$(get_dscp_prios ${ports[0]}) + local orig_apptrust=$(port_get_default_apptrust ${swp1}) + local orig_port_prio=$(port_default_prio_get ${swp1}) + local apptrust_order="dscp" + local port_prio=0 + local dscp_prio + local dscp + + RET=0 + + set_apptrust_order ${swp1} "${apptrust_order}" + dcb app replace dev ${swp1} default-prio ${port_prio} + + # Initialize DSCP priorities + init_dscp_prios "${ports[0]}" "$original_dscp_prios_port0" + + # Loop over each DSCP index + for dscp in {0..63}; do + # and test each Internal Priority value + for dscp_prio in {0..7}; do + # do it for each port. This is to test if the global DSCP map + # is accessible from all ports. + for port in "${ports[@]}"; do + if ! set_and_verify_dscp "$port" $dscp $dscp_prio; then + RET=1 + fi + done + + # Test if the DSCP priority is correctly applied to the packets + run_test_dscp "DSCP (${dscp}) QoS classification, prio: ${dscp_prio}" \ + "${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} + if [ ${RET} -eq 1 ]; then + break + fi + done + done + + # Restore original priorities + if ! restore_priorities "${ports[0]}" "${original_dscp_prios_port0}"; then + RET=1 + fi + + set_apptrust_order ${swp1} "${orig_apptrust}" + if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then + RET=1 + fi + + dcb app replace dev ${swp1} default-prio ${orig_port_prio} + if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then + RET=1 + fi + + log_test "DSCP global map" +} + +trap cleanup EXIT + +ALL_TESTS=" + test_port_default + test_port_apptrust + test_global_dscp_map +" + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 252aa6d53931381bd774acd06866ed0fb1976ead Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 7 May 2024 11:11:55 +0200 Subject: test: hsr: Call cleanup_all_ns when hsr_redbox.sh script exits Without this change the created netns instances are not cleared after this script execution. To fix this problem the cleanup_all_ns function from ../lib.sh is called. Signed-off-by: Lukasz Majewski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/hsr/hsr_redbox.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/hsr/hsr_redbox.sh b/tools/testing/selftests/net/hsr/hsr_redbox.sh index 52e0412c32e6..db69be95ecb3 100755 --- a/tools/testing/selftests/net/hsr/hsr_redbox.sh +++ b/tools/testing/selftests/net/hsr/hsr_redbox.sh @@ -86,6 +86,8 @@ setup_hsr_interfaces() check_prerequisites setup_ns ns1 ns2 ns3 +trap cleanup_all_ns EXIT + setup_hsr_interfaces 1 do_complete_ping_test -- cgit v1.2.3 From 98ec6d38ee57a734123c6f5d42640804034024ef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 8 May 2024 09:41:17 +0100 Subject: selftests/powerpc/dexcr: Fix spelling mistake "predicition" -> "prediction" There is a spelling mistake in the help message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Michael Ellerman Link: https://msgid.link/20240508084117.2869261-1-colin.i.king@gmail.com --- tools/testing/selftests/powerpc/dexcr/chdexcr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/dexcr/chdexcr.c b/tools/testing/selftests/powerpc/dexcr/chdexcr.c index bda44630cada..c548d7a5bb9b 100644 --- a/tools/testing/selftests/powerpc/dexcr/chdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/chdexcr.c @@ -26,7 +26,7 @@ static void help(void) "\n" "The normal option sets the aspect in the DEXCR. The --no- variant\n" "clears that aspect. For example, --ibrtpd sets the IBRTPD aspect bit,\n" - "so indirect branch predicition will be disabled in the provided program.\n" + "so indirect branch prediction will be disabled in the provided program.\n" "Conversely, --no-ibrtpd clears the aspect bit, so indirect branch\n" "prediction may occur.\n" "\n" -- cgit v1.2.3 From 9a169c267e946b0f47f67e8ccc70134708ccf3d4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 7 May 2024 14:30:33 +0300 Subject: selftests: test_bridge_neigh_suppress.sh: Fix failures due to duplicate MAC When creating the topology for the test, three veth pairs are created in the initial network namespace before being moved to one of the network namespaces created by the test. On systems where systemd-udev uses MACAddressPolicy=persistent (default since systemd version 242), this will result in some net devices having the same MAC address since they were created with the same name in the initial network namespace. In turn, this leads to arping / ndisc6 failing since packets are dropped by the bridge's loopback filter. Fix by creating each net device in the correct network namespace instead of moving it there from the initial network namespace. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240426074015.251854d4@kernel.org/ Fixes: 7648ac72dcd7 ("selftests: net: Add bridge neighbor suppression test") Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20240507113033.1732534-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/test_bridge_neigh_suppress.sh | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh index 8533393a4f18..02b986c9c247 100755 --- a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh +++ b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh @@ -154,17 +154,9 @@ setup_topo() setup_topo_ns $ns done - ip link add name veth0 type veth peer name veth1 - ip link set dev veth0 netns $h1 name eth0 - ip link set dev veth1 netns $sw1 name swp1 - - ip link add name veth0 type veth peer name veth1 - ip link set dev veth0 netns $sw1 name veth0 - ip link set dev veth1 netns $sw2 name veth0 - - ip link add name veth0 type veth peer name veth1 - ip link set dev veth0 netns $h2 name eth0 - ip link set dev veth1 netns $sw2 name swp1 + ip -n $h1 link add name eth0 type veth peer name swp1 netns $sw1 + ip -n $sw1 link add name veth0 type veth peer name veth0 netns $sw2 + ip -n $h2 link add name eth0 type veth peer name swp1 netns $sw2 } setup_host_common() -- cgit v1.2.3 From e612b5c1d3ee325aff991b4078b4999bf6bac096 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 26 Apr 2024 16:11:16 +0000 Subject: bpf, arm64: Add support for lse atomics in bpf_arena When LSE atomics are available, BPF atomic instructions are implemented as single ARM64 atomic instructions, therefore it is easy to enable these in bpf_arena using the currently available exception handling setup. LL_SC atomics use loops and therefore would need more work to enable in bpf_arena. Enable LSE atomics based instructions in bpf_arena and use the bpf_jit_supports_insn() callback to reject atomics in bpf_arena if LSE atomics are not available. All atomics and arena_atomics selftests are passing: [root@ip-172-31-2-216 bpf]# ./test_progs -a atomics,arena_atomics #3/1 arena_atomics/add:OK #3/2 arena_atomics/sub:OK #3/3 arena_atomics/and:OK #3/4 arena_atomics/or:OK #3/5 arena_atomics/xor:OK #3/6 arena_atomics/cmpxchg:OK #3/7 arena_atomics/xchg:OK #3 arena_atomics:OK #10/1 atomics/add:OK #10/2 atomics/sub:OK #10/3 atomics/and:OK #10/4 atomics/or:OK #10/5 atomics/xor:OK #10/6 atomics/cmpxchg:OK #10/7 atomics/xchg:OK #10 atomics:OK Summary: 2/14 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240426161116.441-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 48 +++++++++++++++++++++++----- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 - 2 files changed, 40 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 76b91f36c729..53347d4217f4 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -494,20 +494,26 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) static int emit_lse_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) { const u8 code = insn->code; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const u8 dst = bpf2a64[insn->dst_reg]; const u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; const bool isdw = BPF_SIZE(code) == BPF_DW; + const bool arena = BPF_MODE(code) == BPF_PROBE_ATOMIC; const s16 off = insn->off; - u8 reg; + u8 reg = dst; - if (!off) { - reg = dst; - } else { - emit_a64_mov_i(1, tmp, off, ctx); - emit(A64_ADD(1, tmp, tmp, dst), ctx); - reg = tmp; + if (off || arena) { + if (off) { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_ADD(1, tmp, tmp, dst), ctx); + reg = tmp; + } + if (arena) { + emit(A64_ADD(1, tmp, reg, arena_vm_base), ctx); + reg = tmp; + } } switch (insn->imm) { @@ -576,6 +582,12 @@ static int emit_ll_sc_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) u8 reg; s32 jmp_offset; + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + /* ll_sc based atomics don't support unsafe pointers yet. */ + pr_err_once("unknown atomic opcode %02x\n", code); + return -EINVAL; + } + if (!off) { reg = dst; } else { @@ -777,7 +789,8 @@ static int add_exception_handler(const struct bpf_insn *insn, if (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32) + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; if (!ctx->prog->aux->extable || @@ -1474,12 +1487,18 @@ emit_cond_jmp: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) ret = emit_lse_atomic(insn, ctx); else ret = emit_ll_sc_atomic(insn, ctx); if (ret) return ret; + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; default: @@ -2527,6 +2546,19 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (!cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) + return false; + } + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index cf657fc35619..0445ac38bc07 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,4 +10,3 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) -arena_atomics -- cgit v1.2.3 From bbed8b9ffed16572357c5a348c7172846d106cfe Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Apr 2024 17:27:57 -0300 Subject: tools lib rbtree: pick some improvements from the kernel rbtree code The tools/lib/rbtree.c code came from the kernel. Remove the EXPORT_SYMBOL() that make sense only there. Unfortunately it is not being checked with tools/perf/check_headers.sh. Will try to remedy this. Until then pick the improvements from: b0687c1119b4e8c8 ("lib/rbtree: use '+' instead of '|' for setting color.") That I noticed by doing: diff -u tools/lib/rbtree.c lib/rbtree.c diff -u tools/include/linux/rbtree_augmented.h include/linux/rbtree_augmented.h There is one other cases, but lets pick it in separate patches. Link: https://lkml.kernel.org/r/ZigZzeFoukzRKG1Q@x1 Signed-off-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Noah Goldstein Signed-off-by: Andrew Morton --- tools/include/linux/rbtree_augmented.h | 4 ++-- tools/lib/rbtree.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 570bb9794421..95483c7d81df 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -158,13 +158,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; + rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { - rb->__rb_parent_color = (unsigned long)p | color; + rb->__rb_parent_color = (unsigned long)p + color; } static inline void diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 727396de6be5..9e7307186b7f 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -58,7 +58,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color |= RB_BLACK; + rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) -- cgit v1.2.3 From b9112b17950c955071abfd4331d4daa162d6ec4d Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Mon, 6 May 2024 09:54:19 +0200 Subject: selftests/alsa: make dump_config_tree() as void function dump_config_tree() is declared to return an int, but the compiler cannot prove that it always returns any value at all. This leads to a clang warning, when building via: make LLVM=1 -C tools/testing/selftests Suggested-by: John Hubbard Cc: Mark Brown Signed-off-by: Jaroslav Kysela Reviewed-by: Mark Brown Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240506075419.301780-1-perex@perex.cz --- tools/testing/selftests/alsa/conf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/alsa/conf.c b/tools/testing/selftests/alsa/conf.c index 89e3656a042d..e2b3a5810f47 100644 --- a/tools/testing/selftests/alsa/conf.c +++ b/tools/testing/selftests/alsa/conf.c @@ -105,7 +105,7 @@ static struct card_cfg_data *conf_data_by_card(int card, bool msg) return NULL; } -static int dump_config_tree(snd_config_t *top) +static void dump_config_tree(snd_config_t *top) { snd_output_t *out; int err; -- cgit v1.2.3 From cd3fc3b9782130a5bc1dc3dfccffbc1657637a93 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 20:47:56 +0200 Subject: bpf: avoid uninitialized warnings in verifier_global_subprogs.c [Changes from V1: - The warning to disable is -Wmaybe-uninitialized, not -Wuninitialized. - This warning is only supported in GCC.] The BPF selftest verifier_global_subprogs.c contains code that purposedly performs out of bounds access to memory, to check whether the kernel verifier is able to catch them. For example: __noinline int global_unsupp(const int *mem) { if (!mem) return 0; return mem[100]; /* BOOM */ } With -O1 and higher and no inlining, GCC notices this fact and emits a "maybe uninitialized" warning. This is by design. Note that the emission of these warnings is highly dependent on the precise optimizations that are performed. This patch adds a compiler pragma to verifier_global_subprogs.c to ignore these warnings. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240507184756.1772-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_global_subprogs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index baff5ffe9405..a9fc30ed4d73 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -8,6 +8,13 @@ #include "xdp_metadata.h" #include "bpf_kfuncs.h" +/* The compiler may be able to detect the access to uninitialized + memory in the routines performing out of bound memory accesses and + emit warnings about it. This is the case of GCC. */ +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + int arr[1]; int unkn_idx; const volatile bool call_dead_subprog = false; -- cgit v1.2.3 From 1209a523f6914404e8941d9e04caa42be7cab8d5 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Wed, 8 May 2024 12:35:51 +0200 Subject: bpf: avoid UB in usages of the __imm_insn macro [Changes from V2: - no-strict-aliasing is only applied when building with GCC. - cpumask_failure.c is excluded, as it doesn't use __imm_insn.] The __imm_insn macro is defined in bpf_misc.h as: #define __imm_insn(name, expr) [name]"i"(*(long *)&(expr)) This may lead to type-punning and strict aliasing rules violations in it's typical usage where the address of a struct bpf_insn is passed as expr, like in: __imm_insn(st_mem, BPF_ST_MEM(BPF_W, BPF_REG_1, offsetof(struct __sk_buff, mark), 42)) Where: #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) In all the actual instances of this in the BPF selftests the value is fed to a volatile asm statement as soon as it gets read from memory, and thus it is unlikely anti-aliasing rules breakage may lead to misguided optimizations. However, GCC detects the potential problem (indirectly) by issuing a warning stating that a temporary is used uninitialized, where the temporary corresponds to the memory read by *(long *). This patch adds -fno-strict-aliasing to the compilation flags of the particular selftests that do type punning via __imm_insn, only for GCC. Tested in master bpf-next. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240508103551.14955-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 511ea84139a3..e962a0bd8a78 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -86,6 +86,19 @@ progs/btf_dump_test_case_namespacing.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +# The following tests do type-punning, via the __imm_insn macro, from +# `struct bpf_insn' to long and then uses the value. This triggers an +# "is used uninitialized" warning in GCC due to strict-aliasing +# rules. +progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing endif ifneq ($(CLANG_CPUV4),) -- cgit v1.2.3 From 911edc69c832161b62a8ad10a6972290157a7bd3 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Wed, 8 May 2024 13:03:32 +0200 Subject: bpf: guard BPF_NO_PRESERVE_ACCESS_INDEX in skb_pkt_end.c This little patch is a follow-up to: https://lore.kernel.org/bpf/20240507095011.15867-1-jose.marchesi@oracle.com/T/#u The temporary workaround of passing -DBPF_NO_PRESERVE_ACCESS_INDEX when building with GCC triggers a redefinition preprocessor error when building progs/skb_pkt_end.c. This patch adds a guard to avoid redefinition. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Cc: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240508110332.17332-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/skb_pkt_end.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c index 992b7861003a..db4abd2682fc 100644 --- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX #define BPF_NO_PRESERVE_ACCESS_INDEX +#endif #include #include #include -- cgit v1.2.3 From 009367099eb61a4fc2af44d4eb06b6b4de7de6db Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Wed, 8 May 2024 12:13:13 +0200 Subject: bpf: Avoid uninitialized value in BPF_CORE_READ_BITFIELD [Changes from V1: - Use a default branch in the switch statement to initialize `val'.] GCC warns that `val' may be used uninitialized in the BPF_CRE_READ_BITFIELD macro, defined in bpf_core_read.h as: [...] unsigned long long val; \ [...] \ switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ case 1: val = *(const unsigned char *)p; break; \ case 2: val = *(const unsigned short *)p; break; \ case 4: val = *(const unsigned int *)p; break; \ case 8: val = *(const unsigned long long *)p; break; \ } \ [...] val; \ } \ This patch adds a default entry in the switch statement that sets `val' to zero in order to avoid the warning, and random values to be used in case __builtin_preserve_field_info returns unexpected values for BPF_FIELD_BYTE_SIZE. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240508101313.16662-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_core_read.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index b5c7ce5c243a..c0e13cdf9660 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -104,6 +104,7 @@ enum bpf_enum_value_kind { case 2: val = *(const unsigned short *)p; break; \ case 4: val = *(const unsigned int *)p; break; \ case 8: val = *(const unsigned long long *)p; break; \ + default: val = 0; break; \ } \ val <<= __CORE_RELO(s, field, LSHIFT_U64); \ if (__CORE_RELO(s, field, SIGNED)) \ -- cgit v1.2.3 From 17909476d631979927109187cc7e89e4577ff5be Mon Sep 17 00:00:00 2001 From: Lu Dai Date: Sat, 4 May 2024 18:01:06 +0300 Subject: selftests: kselftest_deps: fix l5_test() empty variable In the function l5_test(), variable $tests is empty when there is no .mk file in the subsystem to be tested. It causes the following grep operation get stuck. This fix check the variable $tests, return when it is empty. Signed-off-by: Lu Dai Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_deps.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index de59cc8f03c3..487e49fdf2a6 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -244,6 +244,7 @@ l4_test() l5_test() { tests=$(find $(dirname "$test") -type f -name "*.mk") + [[ -z "${tests// }" ]] && return test_libs=$(grep "^IOURING_EXTRA_LIBS +\?=" $tests | \ cut -d "=" -f 2) -- cgit v1.2.3 From 051f2226a545a07b5d21b5c9d8626ddd483c68a5 Mon Sep 17 00:00:00 2001 From: Amer Al Shanawany Date: Mon, 22 Apr 2024 15:16:59 +0200 Subject: selftests: filesystems: add missing stddef header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix compiler warning and errors when compiling statmount test. gcc 12.3 (Ubuntu 12.3.0-1ubuntu1~22.04) statmount_test.c:572:24: warning: implicit declaration of function ‘offsetof’ [-Wimplicit-function-declaration] 572 | #define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t)) | ^~~~~~~~ statmount_test.c:598:51: note: in expansion of macro ‘str_off’ 598 | test_statmount_string(STATMOUNT_MNT_ROOT, str_off(mnt_root), "mount root"); | ^~~~~~~ statmount_test.c:18:1: note: ‘offsetof’ is defined in header ‘’; did you forget to ‘#include ’? 17 | #include "../../kselftest.h" +++ |+#include Signed-off-by: Amer Al Shanawany Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/statmount/statmount_test.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 3eafd7da58e2..e6d7c4f1c85b 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include +#include #include #include #include -- cgit v1.2.3 From b0df30628459a0fb349e3de09cf0fd6548241044 Mon Sep 17 00:00:00 2001 From: Amer Al Shanawany Date: Sat, 4 May 2024 19:09:16 +0200 Subject: selftests/capabilities: fix warn_unused_result build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warnings by adding return check and error handling. test_execve.c: In function ‘do_tests’: test_execve.c:100:17: warning: ignoring return value of ‘capng_get_caps_process’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 100 | capng_get_caps_process(); | ^~~~~~~~~~~~~~~~~~~~~~~~ validate_cap.c: In function ‘main’: validate_cap.c:47:9: warning: ignoring return value of ‘capng_get_caps_process’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 47 | capng_get_caps_process(); | ^~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Amer Al Shanawany Reviewed-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/capabilities/test_execve.c | 12 +++++++++--- tools/testing/selftests/capabilities/validate_cap.c | 7 ++++++- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c index 7cde07a5df78..47bad7ddc5bc 100644 --- a/tools/testing/selftests/capabilities/test_execve.c +++ b/tools/testing/selftests/capabilities/test_execve.c @@ -82,7 +82,7 @@ static bool create_and_enter_ns(uid_t inner_uid) { uid_t outer_uid; gid_t outer_gid; - int i; + int i, ret; bool have_outer_privilege; outer_uid = getuid(); @@ -97,7 +97,10 @@ static bool create_and_enter_ns(uid_t inner_uid) ksft_exit_fail_msg("setresuid - %s\n", strerror(errno)); // Re-enable effective caps - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) + ksft_exit_fail_msg("capng_get_caps_process failed\n"); + for (i = 0; i < CAP_LAST_CAP; i++) if (capng_have_capability(CAPNG_PERMITTED, i)) capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i); @@ -207,6 +210,7 @@ static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient) static int do_tests(int uid, const char *our_path) { + int ret; bool have_outer_privilege = create_and_enter_ns(uid); int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY); @@ -250,7 +254,9 @@ static int do_tests(int uid, const char *our_path) ksft_exit_fail_msg("chmod - %s\n", strerror(errno)); } - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) + ksft_exit_fail_msg("capng_get_caps_process failed\n"); /* Make sure that i starts out clear */ capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c index 60b4e7b716a7..65f2a1c89239 100644 --- a/tools/testing/selftests/capabilities/validate_cap.c +++ b/tools/testing/selftests/capabilities/validate_cap.c @@ -28,6 +28,7 @@ static bool bool_arg(char **argv, int i) int main(int argc, char **argv) { const char *atsec = ""; + int ret; /* * Be careful just in case a setgid or setcapped copy of this @@ -44,7 +45,11 @@ int main(int argc, char **argv) atsec = " (AT_SECURE is not set)"; #endif - capng_get_caps_process(); + ret = capng_get_caps_process(); + if (ret == -1) { + ksft_print_msg("capng_get_caps_process failed\n"); + return 1; + } if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) { ksft_print_msg("Wrong effective state%s\n", atsec); -- cgit v1.2.3 From 2fd3ef1b9265eda7f53b9506f1ebfb67eb6435a2 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 4 May 2024 09:33:10 +0900 Subject: selftests/ftrace: Fix BTFARG testcase to check fprobe is enabled correctly Since the dynevent/add_remove_btfarg.tc test case forgets to ensure that fprobe is enabled for some structure field access tests which uses the fprobe, it fails if CONFIG_FPROBE=n or CONFIG_FPROBE_EVENTS=n. Fixes it to ensure the fprobe events are supported. Fixes: d892d3d3d885 ("selftests/ftrace: Add BTF fields access testcases") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc index b9c21a81d248..c0cdad4c400e 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc @@ -53,7 +53,7 @@ fi echo > dynamic_events -if [ "$FIELDS" ] ; then +if [ "$FIELDS" -a "$FPROBES" ] ; then echo "t:tpevent ${TP2} obj_size=s->object_size" >> dynamic_events echo "f:fpevent ${TP3}%return path=\$retval->name:string" >> dynamic_events echo "t:tpevent2 ${TP4} p->se.group_node.next->prev" >> dynamic_events -- cgit v1.2.3 From b07b7e2fd51840c7dfffa98c4344ab36195bb8dc Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 4 May 2024 09:33:19 +0900 Subject: selftests/ftrace: Fix checkbashisms errors Fix the below checkbashisms errors. Because of these errors, these tests will fail on dash shell. possible bashism in test.d/kprobe/kretprobe_entry_arg.tc line 14 ('function' is useless): function streq() { possible bashism in test.d/dynevent/fprobe_entry_arg.tc line 14 ('function' is useless): function streq() { Fixes: f6e2253a617c ("selftests/ftrace: Add test cases for entry args at function exit") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc | 2 +- tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc index d183b8a8ecf8..1e251ce2998e 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc @@ -11,7 +11,7 @@ echo 1 > events/tests/enable echo > trace cat trace > /dev/null -function streq() { +streq() { test $1 = $2 } diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc index 53b82f36a1d0..e50470b53164 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc @@ -11,7 +11,7 @@ echo 1 > events/kprobes/enable echo > trace cat trace > /dev/null -function streq() { +streq() { test $1 = $2 } -- cgit v1.2.3 From 14d28ec6f821622211aa65b4da156399c9a4a9c6 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 8 May 2024 13:41:01 -0700 Subject: selftests/resctrl: fix clang build warnings related to abs(), labs() calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with clang, via: make LLVM=1 -C tools/testing/selftests ...two types of warnings occur: warning: absolute value function 'abs' given an argument of type 'long' but has parameter of type 'int' which may cause truncation of value warning: taking the absolute value of unsigned type 'unsigned long' has no effect Fix these by: a) using labs() in place of abs(), when long integers are involved, and b) Change to use signed integer data types, in places where subtraction is used (and could end up with negative values). c) Remove a duplicate abs() call in cmt_test.c. Cc: Ilpo Järvinen Reviewed-by: Reinette Chatre Signed-off-by: John Hubbard Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cmt_test.c | 4 ++-- tools/testing/selftests/resctrl/mba_test.c | 2 +- tools/testing/selftests/resctrl/mbm_test.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index a44e6fcd37b7..0105afec6188 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -40,11 +40,11 @@ static int show_results_info(unsigned long sum_llc_val, int no_of_bits, int ret; avg_llc_val = sum_llc_val / num_of_runs; - avg_diff = (long)abs(cache_span - avg_llc_val); + avg_diff = (long)(cache_span - avg_llc_val); diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100; ret = platform && abs((int)diff_percent) > max_diff_percent && - abs(avg_diff) > max_diff; + labs(avg_diff) > max_diff; ksft_print_msg("%s Check cache miss rate within %lu%%\n", ret ? "Fail:" : "Pass:", max_diff_percent); diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 5d6af9e8afed..a6ad39aae162 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -60,8 +60,8 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) /* Memory bandwidth from 100% down to 10% */ for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; allocation++) { - unsigned long avg_bw_imc, avg_bw_resc; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_bw_imc, avg_bw_resc; int avg_diff_per; float avg_diff; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 3059ccc51a5a..6fec51e1ff46 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -17,8 +17,8 @@ static int show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, size_t span) { - unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_bw_imc = 0, avg_bw_resc = 0; int runs, ret, avg_diff_per; float avg_diff = 0; -- cgit v1.2.3 From daef47b89efd0b745e8478d69a3ad724bd8b4dc6 Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Tue, 7 May 2024 21:38:26 +0000 Subject: selftests: Compile kselftest headers with -D_GNU_SOURCE Add the -D_GNU_SOURCE flag to KHDR_INCLUDES so that it is defined in a central location. Commit 809216233555 ("selftests/harness: remove use of LINE_MAX") introduced asprintf into kselftest_harness.h, which is a GNU extension and needs _GNU_SOURCE to either be defined prior to including headers or with the -D_GNU_SOURCE flag passed to the compiler. Fixed up commit log: Shuah Khan Fixes: 809216233555 ("selftests/harness: remove use of LINE_MAX") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404301040.3bea5782-oliver.sang@intel.com Signed-off-by: Edward Liaw Reviewed-by: Muhammad Usama Anjum Reviewed-by: Mark Brown Reviewed-by: John Hubbard Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 4 ++-- tools/testing/selftests/kselftest_harness.h | 2 +- tools/testing/selftests/lib.mk | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index e1504833654d..ed012a7f0786 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -161,11 +161,11 @@ ifneq ($(KBUILD_OUTPUT),) # $(realpath ...) resolves symlinks abs_objtree := $(realpath $(abs_objtree)) BUILD := $(abs_objtree)/kselftest - KHDR_INCLUDES := -isystem ${abs_objtree}/usr/include + KHDR_INCLUDES := -D_GNU_SOURCE -isystem ${abs_objtree}/usr/include else BUILD := $(CURDIR) abs_srctree := $(shell cd $(top_srcdir) && pwd) - KHDR_INCLUDES := -isystem ${abs_srctree}/usr/include + KHDR_INCLUDES := -D_GNU_SOURCE -isystem ${abs_srctree}/usr/include DEFAULT_INSTALL_HDR_PATH := 1 endif diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index d98702b6955d..b2a1b6343896 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -51,7 +51,7 @@ #define __KSELFTEST_HARNESS_H #ifndef _GNU_SOURCE -#define _GNU_SOURCE +static_assert(0, "kselftest harness requires _GNU_SOURCE to be defined"); #endif #include #include diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 8ae203d8ed7f..7fa4a96e26ed 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -53,7 +53,7 @@ selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST)))) top_srcdir = $(selfdir)/../../.. ifeq ($(KHDR_INCLUDES),) -KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include +KHDR_INCLUDES := -D_GNU_SOURCE -isystem $(top_srcdir)/usr/include endif # The following are built by lib.mk common compile rules. -- cgit v1.2.3 From 2c3b8f8f37c6c0c926d584cf4158db95e62b960c Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Tue, 7 May 2024 21:38:27 +0000 Subject: selftests/sgx: Include KHDR_INCLUDES in Makefile Add KHDR_INCLUDES to the CFLAGS to pull in the kselftest harness dependencies (-D_GNU_SOURCE). Also, remove redefinitions of _GNU_SOURCE in the source code. Fixes: 809216233555 ("selftests/harness: remove use of LINE_MAX") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404301040.3bea5782-oliver.sang@intel.com Signed-off-by: Edward Liaw Reviewed-by: Muhammad Usama Anjum Acked-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Reviewed-by: John Hubbard Signed-off-by: Shuah Khan --- tools/testing/selftests/sgx/Makefile | 2 +- tools/testing/selftests/sgx/sigstruct.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index 867f88ce2570..26ea30fae23c 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -12,7 +12,7 @@ OBJCOPY := $(CROSS_COMPILE)objcopy endif INCLUDES := -I$(top_srcdir)/tools/include -HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC +HOST_CFLAGS := -Wall -Werror $(KHDR_INCLUDES) -g $(INCLUDES) -fPIC HOST_LDFLAGS := -z noexecstack -lcrypto ENCL_CFLAGS += -Wall -Werror -static-pie -nostdlib -ffreestanding -fPIE \ -fno-stack-protector -mrdrnd $(INCLUDES) diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index d73b29becf5b..200034a0fee5 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ -#define _GNU_SOURCE #include #include #include -- cgit v1.2.3 From 1d0dc857b5d8711ff0f037bea9a0c13049c1763c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 7 May 2024 11:40:58 -0400 Subject: selftests: drv-net: add checksum tests Run tools/testing/selftest/net/csum.c as part of drv-net. This binary covers multiple scenarios, based on arguments given, for both IPv4 and IPv6: - Accept UDP correct checksum - Detect UDP invalid checksum - Accept TCP correct checksum - Detect TCP invalid checksum - Transmit UDP: basic checksum offload - Transmit UDP: zero checksum conversion The test direction is reversed between receive and transmit tests, so that the NIC under test is always the local machine. In total this adds up to 12 testcases, with more to follow. For conciseness, I replaced individual functions with a function factory. Also detect hardware offload feature availability using Ethtool netlink and skip tests when either feature is off. This need may be common for offload feature tests and eventually deserving of a thin wrapper in lib.py. Missing are the PF_PACKET based send tests ('-P'). These use virtio_net_hdr to program hardware checksum offload. Which requires looking up the local MAC address and (harder) the MAC of the next hop. I'll have to give it some though how to do that robustly and where that code would belong. Tested: make -C tools/testing/selftests/ \ TARGETS="drivers/net drivers/net/hw" \ install INSTALL_PATH=/tmp/ksft cd /tmp/ksft sudo NETIF=ens4 REMOTE_TYPE=ssh \ REMOTE_ARGS="root@10.40.0.2" \ LOCAL_V4="10.40.0.1" \ REMOTE_V4="10.40.0.2" \ ./run_kselftest.sh -t drivers/net/hw:csum.py Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240507154216.501111-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/Makefile | 1 + tools/testing/selftests/drivers/net/hw/csum.py | 122 +++ tools/testing/selftests/net/.gitignore | 1 - tools/testing/selftests/net/Makefile | 1 - tools/testing/selftests/net/csum.c | 1000 ----------------------- tools/testing/selftests/net/lib/.gitignore | 2 + tools/testing/selftests/net/lib/Makefile | 7 + tools/testing/selftests/net/lib/csum.c | 1000 +++++++++++++++++++++++ 8 files changed, 1132 insertions(+), 1002 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/hw/csum.py delete mode 100644 tools/testing/selftests/net/csum.c create mode 100644 tools/testing/selftests/net/lib/.gitignore create mode 100644 tools/testing/selftests/net/lib/csum.c (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 1dd732855d76..4933d045ab66 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0+ OR MIT TEST_PROGS = \ + csum.py \ devlink_port_split.py \ ethtool.sh \ ethtool_extended_state.sh \ diff --git a/tools/testing/selftests/drivers/net/hw/csum.py b/tools/testing/selftests/drivers/net/hw/csum.py new file mode 100755 index 000000000000..cb40497faee4 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/csum.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Run the tools/testing/selftests/net/csum testsuite.""" + +from os import path + +from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import EthtoolFamily, NetDrvEpEnv +from lib.py import bkg, cmd, wait_port_listen + +def test_receive(cfg, ipv4=False, extra_args=None): + """Test local nic checksum receive. Remote host sends crafted packets.""" + if not cfg.have_rx_csum: + raise KsftSkipEx(f"Test requires rx checksum offload on {cfg.ifname}") + + if ipv4: + ip_args = f"-4 -S {cfg.remote_v4} -D {cfg.v4}" + else: + ip_args = f"-6 -S {cfg.remote_v6} -D {cfg.v6}" + + rx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -n 100 {ip_args} -r 1 -R {extra_args}" + tx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -n 100 {ip_args} -r 1 -T {extra_args}" + + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(34000, proto="udp") + cmd(tx_cmd, host=cfg.remote) + + +def test_transmit(cfg, ipv4=False, extra_args=None): + """Test local nic checksum transmit. Remote host verifies packets.""" + if (not cfg.have_tx_csum_generic and + not (cfg.have_tx_csum_ipv4 and ipv4) and + not (cfg.have_tx_csum_ipv6 and not ipv4)): + raise KsftSkipEx(f"Test requires tx checksum offload on {cfg.ifname}") + + if ipv4: + ip_args = f"-4 -S {cfg.v4} -D {cfg.remote_v4}" + else: + ip_args = f"-6 -S {cfg.v6} -D {cfg.remote_v6}" + + # Cannot randomize input when calculating zero checksum + if extra_args != "-U -Z": + extra_args += " -r 1" + + rx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -L 1 -n 100 {ip_args} -R {extra_args}" + tx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -L 1 -n 100 {ip_args} -T {extra_args}" + + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(34000, proto="udp", host=cfg.remote) + cmd(tx_cmd) + + +def test_builder(name, cfg, ipv4=False, tx=False, extra_args=""): + """Construct specific tests from the common template. + + Most tests follow the same basic pattern, differing only in + Direction of the test and optional flags passed to csum.""" + def f(cfg): + if ipv4: + cfg.require_v4() + else: + cfg.require_v6() + + if tx: + test_transmit(cfg, ipv4, extra_args) + else: + test_receive(cfg, ipv4, extra_args) + + if ipv4: + f.__name__ = "ipv4_" + name + else: + f.__name__ = "ipv6_" + name + return f + + +def check_nic_features(cfg) -> None: + """Test whether Tx and Rx checksum offload are enabled. + + If the device under test has either off, then skip the relevant tests.""" + cfg.have_tx_csum_generic = False + cfg.have_tx_csum_ipv4 = False + cfg.have_tx_csum_ipv6 = False + cfg.have_rx_csum = False + + ethnl = EthtoolFamily() + features = ethnl.features_get({"header": {"dev-index": cfg.ifindex}}) + for f in features["active"]["bits"]["bit"]: + if f["name"] == "tx-checksum-ip-generic": + cfg.have_tx_csum_generic = True + elif f["name"] == "tx-checksum-ipv4": + cfg.have_tx_csum_ipv4 = True + elif f["name"] == "tx-checksum-ipv6": + cfg.have_tx_csum_ipv6 = True + elif f["name"] == "rx-checksum": + cfg.have_rx_csum = True + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + check_nic_features(cfg) + + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../net/lib/csum") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + cases = [] + for ipv4 in [True, False]: + cases.append(test_builder("rx_tcp", cfg, ipv4, False, "-t")) + cases.append(test_builder("rx_tcp_invalid", cfg, ipv4, False, "-t -E")) + + cases.append(test_builder("rx_udp", cfg, ipv4, False, "")) + cases.append(test_builder("rx_udp_invalid", cfg, ipv4, False, "-E")) + + cases.append(test_builder("tx_udp_csum_offload", cfg, ipv4, True, "-U")) + cases.append(test_builder("tx_udp_zero_checksum", cfg, ipv4, True, "-U -Z")) + + ksft_run(cases=cases, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index d996a0ab0765..74ae1068229c 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -2,7 +2,6 @@ bind_bhash bind_timewait bind_wildcard -csum cmsg_sender diag_uid fin_ack_lat diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 5befca249452..052c21438dc4 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -81,7 +81,6 @@ TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello -TEST_GEN_FILES += csum TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh diff --git a/tools/testing/selftests/net/csum.c b/tools/testing/selftests/net/csum.c deleted file mode 100644 index b9f3fc3c3426..000000000000 --- a/tools/testing/selftests/net/csum.c +++ /dev/null @@ -1,1000 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* Test hardware checksum offload: Rx + Tx, IPv4 + IPv6, TCP + UDP. - * - * The test runs on two machines to exercise the NIC. For this reason it - * is not integrated in kselftests. - * - * CMD=$((./csum -[46] -[tu] -S $SADDR -D $DADDR -[RT] -r 1 $EXTRA_ARGS)) - * - * Rx: - * - * The sender sends packets with a known checksum field using PF_INET(6) - * SOCK_RAW sockets. - * - * good packet: $CMD [-t] - * bad packet: $CMD [-t] -E - * - * The receiver reads UDP packets with a UDP socket. This is not an - * option for TCP packets ('-t'). Optionally insert an iptables filter - * to avoid these entering the real protocol stack. - * - * The receiver also reads all packets with a PF_PACKET socket, to - * observe whether both good and bad packets arrive on the host. And to - * read the optional TP_STATUS_CSUM_VALID bit. This requires setting - * option PACKET_AUXDATA, and works only for CHECKSUM_UNNECESSARY. - * - * Tx: - * - * The sender needs to build CHECKSUM_PARTIAL packets to exercise tx - * checksum offload. - * - * The sender can sends packets with a UDP socket. - * - * Optionally crafts a packet that sums up to zero to verify that the - * device writes negative zero 0xFFFF in this case to distinguish from - * 0x0000 (checksum disabled), as required by RFC 768. Hit this case - * by choosing a specific source port. - * - * good packet: $CMD -U - * zero csum: $CMD -U -Z - * - * The sender can also build packets with PF_PACKET with PACKET_VNET_HDR, - * to cover more protocols. PF_PACKET requires passing src and dst mac - * addresses. - * - * good packet: $CMD -s $smac -d $dmac -p [-t] - * - * Argument '-z' sends UDP packets with a 0x000 checksum disabled field, - * to verify that the NIC passes these packets unmodified. - * - * Argument '-e' adds a transport mode encapsulation header between - * network and transport header. This will fail for devices that parse - * headers. Should work on devices that implement protocol agnostic tx - * checksum offload (NETIF_F_HW_CSUM). - * - * Argument '-r $SEED' optionally randomizes header, payload and length - * to increase coverage between packets sent. SEED 1 further chooses a - * different seed for each run (and logs this for reproducibility). It - * is advised to enable this for extra coverage in continuous testing. - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kselftest.h" - -static bool cfg_bad_csum; -static int cfg_family = PF_INET6; -static int cfg_num_pkt = 4; -static bool cfg_do_rx = true; -static bool cfg_do_tx = true; -static bool cfg_encap; -static char *cfg_ifname = "eth0"; -static char *cfg_mac_dst; -static char *cfg_mac_src; -static int cfg_proto = IPPROTO_UDP; -static int cfg_payload_char = 'a'; -static int cfg_payload_len = 100; -static uint16_t cfg_port_dst = 34000; -static uint16_t cfg_port_src = 33000; -static uint16_t cfg_port_src_encap = 33001; -static unsigned int cfg_random_seed; -static int cfg_rcvbuf = 1 << 22; /* be able to queue large cfg_num_pkt */ -static bool cfg_send_pfpacket; -static bool cfg_send_udp; -static int cfg_timeout_ms = 2000; -static bool cfg_zero_disable; /* skip checksum: set to zero (udp only) */ -static bool cfg_zero_sum; /* create packet that adds up to zero */ - -static struct sockaddr_in cfg_daddr4 = {.sin_family = AF_INET}; -static struct sockaddr_in cfg_saddr4 = {.sin_family = AF_INET}; -static struct sockaddr_in6 cfg_daddr6 = {.sin6_family = AF_INET6}; -static struct sockaddr_in6 cfg_saddr6 = {.sin6_family = AF_INET6}; - -#define ENC_HEADER_LEN (sizeof(struct udphdr) + sizeof(struct udp_encap_hdr)) -#define MAX_HEADER_LEN (sizeof(struct ipv6hdr) + ENC_HEADER_LEN + sizeof(struct tcphdr)) -#define MAX_PAYLOAD_LEN 1024 - -/* Trivial demo encap. Stand-in for transport layer protocols like ESP or PSP */ -struct udp_encap_hdr { - uint8_t nexthdr; - uint8_t padding[3]; -}; - -/* Ipaddrs, for pseudo csum. Global var is ugly, pass through funcs was worse */ -static void *iph_addr_p; - -static unsigned long gettimeofday_ms(void) -{ - struct timeval tv; - - gettimeofday(&tv, NULL); - return (tv.tv_sec * 1000UL) + (tv.tv_usec / 1000UL); -} - -static uint32_t checksum_nofold(char *data, size_t len, uint32_t sum) -{ - uint16_t *words = (uint16_t *)data; - int i; - - for (i = 0; i < len / 2; i++) - sum += words[i]; - - if (len & 1) - sum += ((unsigned char *)data)[len - 1]; - - return sum; -} - -static uint16_t checksum_fold(void *data, size_t len, uint32_t sum) -{ - sum = checksum_nofold(data, len, sum); - - while (sum > 0xFFFF) - sum = (sum & 0xFFFF) + (sum >> 16); - - return ~sum; -} - -static uint16_t checksum(void *th, uint16_t proto, size_t len) -{ - uint32_t sum; - int alen; - - alen = cfg_family == PF_INET6 ? 32 : 8; - - sum = checksum_nofold(iph_addr_p, alen, 0); - sum += htons(proto); - sum += htons(len); - - /* With CHECKSUM_PARTIAL kernel expects non-inverted pseudo csum */ - if (cfg_do_tx && cfg_send_pfpacket) - return ~checksum_fold(NULL, 0, sum); - else - return checksum_fold(th, len, sum); -} - -static void *build_packet_ipv4(void *_iph, uint8_t proto, unsigned int len) -{ - struct iphdr *iph = _iph; - - memset(iph, 0, sizeof(*iph)); - - iph->version = 4; - iph->ihl = 5; - iph->ttl = 8; - iph->protocol = proto; - iph->saddr = cfg_saddr4.sin_addr.s_addr; - iph->daddr = cfg_daddr4.sin_addr.s_addr; - iph->tot_len = htons(sizeof(*iph) + len); - iph->check = checksum_fold(iph, sizeof(*iph), 0); - - iph_addr_p = &iph->saddr; - - return iph + 1; -} - -static void *build_packet_ipv6(void *_ip6h, uint8_t proto, unsigned int len) -{ - struct ipv6hdr *ip6h = _ip6h; - - memset(ip6h, 0, sizeof(*ip6h)); - - ip6h->version = 6; - ip6h->payload_len = htons(len); - ip6h->nexthdr = proto; - ip6h->hop_limit = 64; - ip6h->saddr = cfg_saddr6.sin6_addr; - ip6h->daddr = cfg_daddr6.sin6_addr; - - iph_addr_p = &ip6h->saddr; - - return ip6h + 1; -} - -static void *build_packet_udp(void *_uh) -{ - struct udphdr *uh = _uh; - - uh->source = htons(cfg_port_src); - uh->dest = htons(cfg_port_dst); - uh->len = htons(sizeof(*uh) + cfg_payload_len); - uh->check = 0; - - /* choose source port so that uh->check adds up to zero */ - if (cfg_zero_sum) { - uh->source = 0; - uh->source = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); - - fprintf(stderr, "tx: changing sport: %hu -> %hu\n", - cfg_port_src, ntohs(uh->source)); - cfg_port_src = ntohs(uh->source); - } - - if (cfg_zero_disable) - uh->check = 0; - else - uh->check = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); - - if (cfg_bad_csum) - uh->check = ~uh->check; - - fprintf(stderr, "tx: sending checksum: 0x%x\n", uh->check); - return uh + 1; -} - -static void *build_packet_tcp(void *_th) -{ - struct tcphdr *th = _th; - - th->source = htons(cfg_port_src); - th->dest = htons(cfg_port_dst); - th->doff = 5; - th->check = 0; - - th->check = checksum(th, IPPROTO_TCP, sizeof(*th) + cfg_payload_len); - - if (cfg_bad_csum) - th->check = ~th->check; - - fprintf(stderr, "tx: sending checksum: 0x%x\n", th->check); - return th + 1; -} - -static char *build_packet_udp_encap(void *_uh) -{ - struct udphdr *uh = _uh; - struct udp_encap_hdr *eh = _uh + sizeof(*uh); - - /* outer dst == inner dst, to simplify BPF filter - * outer src != inner src, to demultiplex on recv - */ - uh->dest = htons(cfg_port_dst); - uh->source = htons(cfg_port_src_encap); - uh->check = 0; - uh->len = htons(sizeof(*uh) + - sizeof(*eh) + - sizeof(struct tcphdr) + - cfg_payload_len); - - eh->nexthdr = IPPROTO_TCP; - - return build_packet_tcp(eh + 1); -} - -static char *build_packet(char *buf, int max_len, int *len) -{ - uint8_t proto; - char *off; - int tlen; - - if (cfg_random_seed) { - int *buf32 = (void *)buf; - int i; - - for (i = 0; i < (max_len / sizeof(int)); i++) - buf32[i] = rand(); - } else { - memset(buf, cfg_payload_char, max_len); - } - - if (cfg_proto == IPPROTO_UDP) - tlen = sizeof(struct udphdr) + cfg_payload_len; - else - tlen = sizeof(struct tcphdr) + cfg_payload_len; - - if (cfg_encap) { - proto = IPPROTO_UDP; - tlen += ENC_HEADER_LEN; - } else { - proto = cfg_proto; - } - - if (cfg_family == PF_INET) - off = build_packet_ipv4(buf, proto, tlen); - else - off = build_packet_ipv6(buf, proto, tlen); - - if (cfg_encap) - off = build_packet_udp_encap(off); - else if (cfg_proto == IPPROTO_UDP) - off = build_packet_udp(off); - else - off = build_packet_tcp(off); - - /* only pass the payload, but still compute headers for cfg_zero_sum */ - if (cfg_send_udp) { - *len = cfg_payload_len; - return off; - } - - *len = off - buf + cfg_payload_len; - return buf; -} - -static int open_inet(int ipproto, int protocol) -{ - int fd; - - fd = socket(cfg_family, ipproto, protocol); - if (fd == -1) - error(1, errno, "socket inet"); - - if (cfg_family == PF_INET6) { - /* may have been updated by cfg_zero_sum */ - cfg_saddr6.sin6_port = htons(cfg_port_src); - - if (bind(fd, (void *)&cfg_saddr6, sizeof(cfg_saddr6))) - error(1, errno, "bind dgram 6"); - if (connect(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) - error(1, errno, "connect dgram 6"); - } else { - /* may have been updated by cfg_zero_sum */ - cfg_saddr4.sin_port = htons(cfg_port_src); - - if (bind(fd, (void *)&cfg_saddr4, sizeof(cfg_saddr4))) - error(1, errno, "bind dgram 4"); - if (connect(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) - error(1, errno, "connect dgram 4"); - } - - return fd; -} - -static int open_packet(void) -{ - int fd, one = 1; - - fd = socket(PF_PACKET, SOCK_RAW, 0); - if (fd == -1) - error(1, errno, "socket packet"); - - if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one))) - error(1, errno, "setsockopt packet_vnet_ndr"); - - return fd; -} - -static void send_inet(int fd, const char *buf, int len) -{ - int ret; - - ret = write(fd, buf, len); - if (ret == -1) - error(1, errno, "write"); - if (ret != len) - error(1, 0, "write: %d", ret); -} - -static void eth_str_to_addr(const char *str, unsigned char *eth) -{ - if (sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", - ð[0], ð[1], ð[2], ð[3], ð[4], ð[5]) != 6) - error(1, 0, "cannot parse mac addr %s", str); -} - -static void send_packet(int fd, const char *buf, int len) -{ - struct virtio_net_hdr vh = {0}; - struct sockaddr_ll addr = {0}; - struct msghdr msg = {0}; - struct ethhdr eth; - struct iovec iov[3]; - int ret; - - addr.sll_family = AF_PACKET; - addr.sll_halen = ETH_ALEN; - addr.sll_ifindex = if_nametoindex(cfg_ifname); - if (!addr.sll_ifindex) - error(1, errno, "if_nametoindex %s", cfg_ifname); - - vh.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - if (cfg_family == PF_INET6) { - vh.csum_start = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); - addr.sll_protocol = htons(ETH_P_IPV6); - } else { - vh.csum_start = sizeof(struct ethhdr) + sizeof(struct iphdr); - addr.sll_protocol = htons(ETH_P_IP); - } - - if (cfg_encap) - vh.csum_start += ENC_HEADER_LEN; - - if (cfg_proto == IPPROTO_TCP) { - vh.csum_offset = __builtin_offsetof(struct tcphdr, check); - vh.hdr_len = vh.csum_start + sizeof(struct tcphdr); - } else { - vh.csum_offset = __builtin_offsetof(struct udphdr, check); - vh.hdr_len = vh.csum_start + sizeof(struct udphdr); - } - - eth_str_to_addr(cfg_mac_src, eth.h_source); - eth_str_to_addr(cfg_mac_dst, eth.h_dest); - eth.h_proto = addr.sll_protocol; - - iov[0].iov_base = &vh; - iov[0].iov_len = sizeof(vh); - - iov[1].iov_base = ð - iov[1].iov_len = sizeof(eth); - - iov[2].iov_base = (void *)buf; - iov[2].iov_len = len; - - msg.msg_iov = iov; - msg.msg_iovlen = ARRAY_SIZE(iov); - - msg.msg_name = &addr; - msg.msg_namelen = sizeof(addr); - - ret = sendmsg(fd, &msg, 0); - if (ret == -1) - error(1, errno, "sendmsg packet"); - if (ret != sizeof(vh) + sizeof(eth) + len) - error(1, errno, "sendmsg packet: %u", ret); -} - -static int recv_prepare_udp(void) -{ - int fd; - - fd = socket(cfg_family, SOCK_DGRAM, 0); - if (fd == -1) - error(1, errno, "socket r"); - - if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, - &cfg_rcvbuf, sizeof(cfg_rcvbuf))) - error(1, errno, "setsockopt SO_RCVBUF r"); - - if (cfg_family == PF_INET6) { - if (bind(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) - error(1, errno, "bind r"); - } else { - if (bind(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) - error(1, errno, "bind r"); - } - - return fd; -} - -/* Filter out all traffic that is not cfg_proto with our destination port. - * - * Otherwise background noise may cause PF_PACKET receive queue overflow, - * dropping the expected packets and failing the test. - */ -static void __recv_prepare_packet_filter(int fd, int off_nexthdr, int off_dport) -{ - struct sock_filter filter[] = { - BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), - BPF_STMT(BPF_LD + BPF_B + BPF_ABS, off_nexthdr), - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_encap ? IPPROTO_UDP : cfg_proto, 0, 2), - BPF_STMT(BPF_LD + BPF_H + BPF_ABS, off_dport), - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_port_dst, 1, 0), - BPF_STMT(BPF_RET + BPF_K, 0), - BPF_STMT(BPF_RET + BPF_K, 0xFFFF), - }; - struct sock_fprog prog = {}; - - prog.filter = filter; - prog.len = ARRAY_SIZE(filter); - if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) - error(1, errno, "setsockopt filter"); -} - -static void recv_prepare_packet_filter(int fd) -{ - const int off_dport = offsetof(struct tcphdr, dest); /* same for udp */ - - if (cfg_family == AF_INET) - __recv_prepare_packet_filter(fd, offsetof(struct iphdr, protocol), - sizeof(struct iphdr) + off_dport); - else - __recv_prepare_packet_filter(fd, offsetof(struct ipv6hdr, nexthdr), - sizeof(struct ipv6hdr) + off_dport); -} - -static void recv_prepare_packet_bind(int fd) -{ - struct sockaddr_ll laddr = {0}; - - laddr.sll_family = AF_PACKET; - - if (cfg_family == PF_INET) - laddr.sll_protocol = htons(ETH_P_IP); - else - laddr.sll_protocol = htons(ETH_P_IPV6); - - laddr.sll_ifindex = if_nametoindex(cfg_ifname); - if (!laddr.sll_ifindex) - error(1, 0, "if_nametoindex %s", cfg_ifname); - - if (bind(fd, (void *)&laddr, sizeof(laddr))) - error(1, errno, "bind pf_packet"); -} - -static int recv_prepare_packet(void) -{ - int fd, one = 1; - - fd = socket(PF_PACKET, SOCK_DGRAM, 0); - if (fd == -1) - error(1, errno, "socket p"); - - if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, - &cfg_rcvbuf, sizeof(cfg_rcvbuf))) - error(1, errno, "setsockopt SO_RCVBUF p"); - - /* enable auxdata to recv checksum status (valid vs unknown) */ - if (setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one))) - error(1, errno, "setsockopt auxdata"); - - /* install filter to restrict packet flow to match */ - recv_prepare_packet_filter(fd); - - /* bind to address family to start packet flow */ - recv_prepare_packet_bind(fd); - - return fd; -} - -static int recv_udp(int fd) -{ - static char buf[MAX_PAYLOAD_LEN]; - int ret, count = 0; - - while (1) { - ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); - if (ret == -1 && errno == EAGAIN) - break; - if (ret == -1) - error(1, errno, "recv r"); - - fprintf(stderr, "rx: udp: len=%u\n", ret); - count++; - } - - return count; -} - -static int recv_verify_csum(void *th, int len, uint16_t sport, uint16_t csum_field) -{ - uint16_t csum; - - csum = checksum(th, cfg_proto, len); - - fprintf(stderr, "rx: pkt: sport=%hu len=%u csum=0x%hx verify=0x%hx\n", - sport, len, csum_field, csum); - - /* csum must be zero unless cfg_bad_csum indicates bad csum */ - if (csum && !cfg_bad_csum) { - fprintf(stderr, "pkt: bad csum\n"); - return 1; - } else if (cfg_bad_csum && !csum) { - fprintf(stderr, "pkt: good csum, while bad expected\n"); - return 1; - } - - if (cfg_zero_sum && csum_field != 0xFFFF) { - fprintf(stderr, "pkt: zero csum: field should be 0xFFFF, is 0x%hx\n", csum_field); - return 1; - } - - return 0; -} - -static int recv_verify_packet_tcp(void *th, int len) -{ - struct tcphdr *tcph = th; - - if (len < sizeof(*tcph) || tcph->dest != htons(cfg_port_dst)) - return -1; - - return recv_verify_csum(th, len, ntohs(tcph->source), tcph->check); -} - -static int recv_verify_packet_udp_encap(void *th, int len) -{ - struct udp_encap_hdr *eh = th; - - if (len < sizeof(*eh) || eh->nexthdr != IPPROTO_TCP) - return -1; - - return recv_verify_packet_tcp(eh + 1, len - sizeof(*eh)); -} - -static int recv_verify_packet_udp(void *th, int len) -{ - struct udphdr *udph = th; - - if (len < sizeof(*udph)) - return -1; - - if (udph->dest != htons(cfg_port_dst)) - return -1; - - if (udph->source == htons(cfg_port_src_encap)) - return recv_verify_packet_udp_encap(udph + 1, - len - sizeof(*udph)); - - return recv_verify_csum(th, len, ntohs(udph->source), udph->check); -} - -static int recv_verify_packet_ipv4(void *nh, int len) -{ - struct iphdr *iph = nh; - uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; - - if (len < sizeof(*iph) || iph->protocol != proto) - return -1; - - iph_addr_p = &iph->saddr; - if (proto == IPPROTO_TCP) - return recv_verify_packet_tcp(iph + 1, len - sizeof(*iph)); - else - return recv_verify_packet_udp(iph + 1, len - sizeof(*iph)); -} - -static int recv_verify_packet_ipv6(void *nh, int len) -{ - struct ipv6hdr *ip6h = nh; - uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; - - if (len < sizeof(*ip6h) || ip6h->nexthdr != proto) - return -1; - - iph_addr_p = &ip6h->saddr; - - if (proto == IPPROTO_TCP) - return recv_verify_packet_tcp(ip6h + 1, len - sizeof(*ip6h)); - else - return recv_verify_packet_udp(ip6h + 1, len - sizeof(*ip6h)); -} - -/* return whether auxdata includes TP_STATUS_CSUM_VALID */ -static uint32_t recv_get_packet_csum_status(struct msghdr *msg) -{ - struct tpacket_auxdata *aux = NULL; - struct cmsghdr *cm; - - if (msg->msg_flags & MSG_CTRUNC) - error(1, 0, "cmsg: truncated"); - - for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) { - if (cm->cmsg_level != SOL_PACKET || - cm->cmsg_type != PACKET_AUXDATA) - error(1, 0, "cmsg: level=%d type=%d\n", - cm->cmsg_level, cm->cmsg_type); - - if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata))) - error(1, 0, "cmsg: len=%lu expected=%lu", - cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); - - aux = (void *)CMSG_DATA(cm); - } - - if (!aux) - error(1, 0, "cmsg: no auxdata"); - - return aux->tp_status; -} - -static int recv_packet(int fd) -{ - static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; - unsigned long total = 0, bad_csums = 0, bad_validations = 0; - char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; - struct pkt *buf = (void *)_buf; - struct msghdr msg = {0}; - uint32_t tp_status; - struct iovec iov; - int len, ret; - - iov.iov_base = _buf; - iov.iov_len = sizeof(_buf); - - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - msg.msg_control = ctrl; - msg.msg_controllen = sizeof(ctrl); - - while (1) { - msg.msg_flags = 0; - - len = recvmsg(fd, &msg, MSG_DONTWAIT); - if (len == -1 && errno == EAGAIN) - break; - if (len == -1) - error(1, errno, "recv p"); - - tp_status = recv_get_packet_csum_status(&msg); - - /* GRO might coalesce randomized packets. Such GSO packets are - * then reinitialized for csum offload (CHECKSUM_PARTIAL), with - * a pseudo csum. Do not try to validate these checksums. - */ - if (tp_status & TP_STATUS_CSUMNOTREADY) { - fprintf(stderr, "cmsg: GSO packet has partial csum: skip\n"); - continue; - } - - if (cfg_family == PF_INET6) - ret = recv_verify_packet_ipv6(buf, len); - else - ret = recv_verify_packet_ipv4(buf, len); - - if (ret == -1 /* skip: non-matching */) - continue; - - total++; - if (ret == 1) - bad_csums++; - - /* Fail if kernel returns valid for known bad csum. - * Do not fail if kernel does not validate a good csum: - * Absence of validation does not imply invalid. - */ - if (tp_status & TP_STATUS_CSUM_VALID && cfg_bad_csum) { - fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n"); - bad_validations++; - } - } - - if (bad_csums || bad_validations) - error(1, 0, "rx: errors at pf_packet: total=%lu bad_csums=%lu bad_valids=%lu\n", - total, bad_csums, bad_validations); - - return total; -} - -static void parse_args(int argc, char *const argv[]) -{ - const char *daddr = NULL, *saddr = NULL; - int c; - - while ((c = getopt(argc, argv, "46d:D:eEi:l:L:n:r:PRs:S:tTuUzZ")) != -1) { - switch (c) { - case '4': - cfg_family = PF_INET; - break; - case '6': - cfg_family = PF_INET6; - break; - case 'd': - cfg_mac_dst = optarg; - break; - case 'D': - daddr = optarg; - break; - case 'e': - cfg_encap = true; - break; - case 'E': - cfg_bad_csum = true; - break; - case 'i': - cfg_ifname = optarg; - break; - case 'l': - cfg_payload_len = strtol(optarg, NULL, 0); - break; - case 'L': - cfg_timeout_ms = strtol(optarg, NULL, 0) * 1000; - break; - case 'n': - cfg_num_pkt = strtol(optarg, NULL, 0); - break; - case 'r': - cfg_random_seed = strtol(optarg, NULL, 0); - break; - case 'P': - cfg_send_pfpacket = true; - break; - case 'R': - /* only Rx: used with two machine tests */ - cfg_do_tx = false; - break; - case 's': - cfg_mac_src = optarg; - break; - case 'S': - saddr = optarg; - break; - case 't': - cfg_proto = IPPROTO_TCP; - break; - case 'T': - /* only Tx: used with two machine tests */ - cfg_do_rx = false; - break; - case 'u': - cfg_proto = IPPROTO_UDP; - break; - case 'U': - /* send using real udp socket, - * to exercise tx checksum offload - */ - cfg_send_udp = true; - break; - case 'z': - cfg_zero_disable = true; - break; - case 'Z': - cfg_zero_sum = true; - break; - default: - error(1, 0, "unknown arg %c", c); - } - } - - if (!daddr || !saddr) - error(1, 0, "Must pass -D and -S "); - - if (cfg_do_tx && cfg_send_pfpacket && (!cfg_mac_src || !cfg_mac_dst)) - error(1, 0, "Transmit with pf_packet requires mac addresses"); - - if (cfg_payload_len > MAX_PAYLOAD_LEN) - error(1, 0, "Payload length exceeds max"); - - if (cfg_proto != IPPROTO_UDP && (cfg_zero_sum || cfg_zero_disable)) - error(1, 0, "Only UDP supports zero csum"); - - if (cfg_zero_sum && !cfg_send_udp) - error(1, 0, "Zero checksum conversion requires -U for tx csum offload"); - if (cfg_zero_sum && cfg_bad_csum) - error(1, 0, "Cannot combine zero checksum conversion and invalid checksum"); - if (cfg_zero_sum && cfg_random_seed) - error(1, 0, "Cannot combine zero checksum conversion with randomization"); - - if (cfg_family == PF_INET6) { - cfg_saddr6.sin6_port = htons(cfg_port_src); - cfg_daddr6.sin6_port = htons(cfg_port_dst); - - if (inet_pton(cfg_family, daddr, &cfg_daddr6.sin6_addr) != 1) - error(1, errno, "Cannot parse ipv6 -D"); - if (inet_pton(cfg_family, saddr, &cfg_saddr6.sin6_addr) != 1) - error(1, errno, "Cannot parse ipv6 -S"); - } else { - cfg_saddr4.sin_port = htons(cfg_port_src); - cfg_daddr4.sin_port = htons(cfg_port_dst); - - if (inet_pton(cfg_family, daddr, &cfg_daddr4.sin_addr) != 1) - error(1, errno, "Cannot parse ipv4 -D"); - if (inet_pton(cfg_family, saddr, &cfg_saddr4.sin_addr) != 1) - error(1, errno, "Cannot parse ipv4 -S"); - } - - if (cfg_do_tx && cfg_random_seed) { - /* special case: time-based seed */ - if (cfg_random_seed == 1) - cfg_random_seed = (unsigned int)gettimeofday_ms(); - srand(cfg_random_seed); - fprintf(stderr, "randomization seed: %u\n", cfg_random_seed); - } -} - -static void do_tx(void) -{ - static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; - char *buf; - int fd, len, i; - - buf = build_packet(_buf, sizeof(_buf), &len); - - if (cfg_send_pfpacket) - fd = open_packet(); - else if (cfg_send_udp) - fd = open_inet(SOCK_DGRAM, 0); - else - fd = open_inet(SOCK_RAW, IPPROTO_RAW); - - for (i = 0; i < cfg_num_pkt; i++) { - if (cfg_send_pfpacket) - send_packet(fd, buf, len); - else - send_inet(fd, buf, len); - - /* randomize each packet individually to increase coverage */ - if (cfg_random_seed) { - cfg_payload_len = rand() % MAX_PAYLOAD_LEN; - buf = build_packet(_buf, sizeof(_buf), &len); - } - } - - if (close(fd)) - error(1, errno, "close tx"); -} - -static void do_rx(int fdp, int fdr) -{ - unsigned long count_udp = 0, count_pkt = 0; - long tleft, tstop; - struct pollfd pfd; - - tstop = gettimeofday_ms() + cfg_timeout_ms; - tleft = cfg_timeout_ms; - - do { - pfd.events = POLLIN; - pfd.fd = fdp; - if (poll(&pfd, 1, tleft) == -1) - error(1, errno, "poll"); - - if (pfd.revents & POLLIN) - count_pkt += recv_packet(fdp); - - if (cfg_proto == IPPROTO_UDP) - count_udp += recv_udp(fdr); - - tleft = tstop - gettimeofday_ms(); - } while (tleft > 0); - - if (close(fdr)) - error(1, errno, "close r"); - if (close(fdp)) - error(1, errno, "close p"); - - if (count_pkt < cfg_num_pkt) - error(1, 0, "rx: missing packets at pf_packet: %lu < %u", - count_pkt, cfg_num_pkt); - - if (cfg_proto == IPPROTO_UDP) { - if (cfg_bad_csum && count_udp) - error(1, 0, "rx: unexpected packets at udp"); - if (!cfg_bad_csum && !count_udp) - error(1, 0, "rx: missing packets at udp"); - } -} - -int main(int argc, char *const argv[]) -{ - int fdp = -1, fdr = -1; /* -1 to silence -Wmaybe-uninitialized */ - - parse_args(argc, argv); - - /* open receive sockets before transmitting */ - if (cfg_do_rx) { - fdp = recv_prepare_packet(); - fdr = recv_prepare_udp(); - } - - if (cfg_do_tx) - do_tx(); - - if (cfg_do_rx) - do_rx(fdp, fdr); - - fprintf(stderr, "OK\n"); - return 0; -} diff --git a/tools/testing/selftests/net/lib/.gitignore b/tools/testing/selftests/net/lib/.gitignore new file mode 100644 index 000000000000..1ebc6187f421 --- /dev/null +++ b/tools/testing/selftests/net/lib/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +csum diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index 48557e6250dd..82c3264b115e 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ b/tools/testing/selftests/net/lib/Makefile @@ -1,8 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS += -I../../../../../usr/include/ $(KHDR_INCLUDES) +# Additional include paths needed by kselftest.h +CFLAGS += -I../../ + TEST_FILES := ../../../../../Documentation/netlink/specs TEST_FILES += ../../../../net/ynl +TEST_GEN_FILES += csum + TEST_INCLUDES := $(wildcard py/*.py) include ../../lib.mk diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c new file mode 100644 index 000000000000..b9f3fc3c3426 --- /dev/null +++ b/tools/testing/selftests/net/lib/csum.c @@ -0,0 +1,1000 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Test hardware checksum offload: Rx + Tx, IPv4 + IPv6, TCP + UDP. + * + * The test runs on two machines to exercise the NIC. For this reason it + * is not integrated in kselftests. + * + * CMD=$((./csum -[46] -[tu] -S $SADDR -D $DADDR -[RT] -r 1 $EXTRA_ARGS)) + * + * Rx: + * + * The sender sends packets with a known checksum field using PF_INET(6) + * SOCK_RAW sockets. + * + * good packet: $CMD [-t] + * bad packet: $CMD [-t] -E + * + * The receiver reads UDP packets with a UDP socket. This is not an + * option for TCP packets ('-t'). Optionally insert an iptables filter + * to avoid these entering the real protocol stack. + * + * The receiver also reads all packets with a PF_PACKET socket, to + * observe whether both good and bad packets arrive on the host. And to + * read the optional TP_STATUS_CSUM_VALID bit. This requires setting + * option PACKET_AUXDATA, and works only for CHECKSUM_UNNECESSARY. + * + * Tx: + * + * The sender needs to build CHECKSUM_PARTIAL packets to exercise tx + * checksum offload. + * + * The sender can sends packets with a UDP socket. + * + * Optionally crafts a packet that sums up to zero to verify that the + * device writes negative zero 0xFFFF in this case to distinguish from + * 0x0000 (checksum disabled), as required by RFC 768. Hit this case + * by choosing a specific source port. + * + * good packet: $CMD -U + * zero csum: $CMD -U -Z + * + * The sender can also build packets with PF_PACKET with PACKET_VNET_HDR, + * to cover more protocols. PF_PACKET requires passing src and dst mac + * addresses. + * + * good packet: $CMD -s $smac -d $dmac -p [-t] + * + * Argument '-z' sends UDP packets with a 0x000 checksum disabled field, + * to verify that the NIC passes these packets unmodified. + * + * Argument '-e' adds a transport mode encapsulation header between + * network and transport header. This will fail for devices that parse + * headers. Should work on devices that implement protocol agnostic tx + * checksum offload (NETIF_F_HW_CSUM). + * + * Argument '-r $SEED' optionally randomizes header, payload and length + * to increase coverage between packets sent. SEED 1 further chooses a + * different seed for each run (and logs this for reproducibility). It + * is advised to enable this for extra coverage in continuous testing. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" + +static bool cfg_bad_csum; +static int cfg_family = PF_INET6; +static int cfg_num_pkt = 4; +static bool cfg_do_rx = true; +static bool cfg_do_tx = true; +static bool cfg_encap; +static char *cfg_ifname = "eth0"; +static char *cfg_mac_dst; +static char *cfg_mac_src; +static int cfg_proto = IPPROTO_UDP; +static int cfg_payload_char = 'a'; +static int cfg_payload_len = 100; +static uint16_t cfg_port_dst = 34000; +static uint16_t cfg_port_src = 33000; +static uint16_t cfg_port_src_encap = 33001; +static unsigned int cfg_random_seed; +static int cfg_rcvbuf = 1 << 22; /* be able to queue large cfg_num_pkt */ +static bool cfg_send_pfpacket; +static bool cfg_send_udp; +static int cfg_timeout_ms = 2000; +static bool cfg_zero_disable; /* skip checksum: set to zero (udp only) */ +static bool cfg_zero_sum; /* create packet that adds up to zero */ + +static struct sockaddr_in cfg_daddr4 = {.sin_family = AF_INET}; +static struct sockaddr_in cfg_saddr4 = {.sin_family = AF_INET}; +static struct sockaddr_in6 cfg_daddr6 = {.sin6_family = AF_INET6}; +static struct sockaddr_in6 cfg_saddr6 = {.sin6_family = AF_INET6}; + +#define ENC_HEADER_LEN (sizeof(struct udphdr) + sizeof(struct udp_encap_hdr)) +#define MAX_HEADER_LEN (sizeof(struct ipv6hdr) + ENC_HEADER_LEN + sizeof(struct tcphdr)) +#define MAX_PAYLOAD_LEN 1024 + +/* Trivial demo encap. Stand-in for transport layer protocols like ESP or PSP */ +struct udp_encap_hdr { + uint8_t nexthdr; + uint8_t padding[3]; +}; + +/* Ipaddrs, for pseudo csum. Global var is ugly, pass through funcs was worse */ +static void *iph_addr_p; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000UL) + (tv.tv_usec / 1000UL); +} + +static uint32_t checksum_nofold(char *data, size_t len, uint32_t sum) +{ + uint16_t *words = (uint16_t *)data; + int i; + + for (i = 0; i < len / 2; i++) + sum += words[i]; + + if (len & 1) + sum += ((unsigned char *)data)[len - 1]; + + return sum; +} + +static uint16_t checksum_fold(void *data, size_t len, uint32_t sum) +{ + sum = checksum_nofold(data, len, sum); + + while (sum > 0xFFFF) + sum = (sum & 0xFFFF) + (sum >> 16); + + return ~sum; +} + +static uint16_t checksum(void *th, uint16_t proto, size_t len) +{ + uint32_t sum; + int alen; + + alen = cfg_family == PF_INET6 ? 32 : 8; + + sum = checksum_nofold(iph_addr_p, alen, 0); + sum += htons(proto); + sum += htons(len); + + /* With CHECKSUM_PARTIAL kernel expects non-inverted pseudo csum */ + if (cfg_do_tx && cfg_send_pfpacket) + return ~checksum_fold(NULL, 0, sum); + else + return checksum_fold(th, len, sum); +} + +static void *build_packet_ipv4(void *_iph, uint8_t proto, unsigned int len) +{ + struct iphdr *iph = _iph; + + memset(iph, 0, sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->ttl = 8; + iph->protocol = proto; + iph->saddr = cfg_saddr4.sin_addr.s_addr; + iph->daddr = cfg_daddr4.sin_addr.s_addr; + iph->tot_len = htons(sizeof(*iph) + len); + iph->check = checksum_fold(iph, sizeof(*iph), 0); + + iph_addr_p = &iph->saddr; + + return iph + 1; +} + +static void *build_packet_ipv6(void *_ip6h, uint8_t proto, unsigned int len) +{ + struct ipv6hdr *ip6h = _ip6h; + + memset(ip6h, 0, sizeof(*ip6h)); + + ip6h->version = 6; + ip6h->payload_len = htons(len); + ip6h->nexthdr = proto; + ip6h->hop_limit = 64; + ip6h->saddr = cfg_saddr6.sin6_addr; + ip6h->daddr = cfg_daddr6.sin6_addr; + + iph_addr_p = &ip6h->saddr; + + return ip6h + 1; +} + +static void *build_packet_udp(void *_uh) +{ + struct udphdr *uh = _uh; + + uh->source = htons(cfg_port_src); + uh->dest = htons(cfg_port_dst); + uh->len = htons(sizeof(*uh) + cfg_payload_len); + uh->check = 0; + + /* choose source port so that uh->check adds up to zero */ + if (cfg_zero_sum) { + uh->source = 0; + uh->source = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); + + fprintf(stderr, "tx: changing sport: %hu -> %hu\n", + cfg_port_src, ntohs(uh->source)); + cfg_port_src = ntohs(uh->source); + } + + if (cfg_zero_disable) + uh->check = 0; + else + uh->check = checksum(uh, IPPROTO_UDP, sizeof(*uh) + cfg_payload_len); + + if (cfg_bad_csum) + uh->check = ~uh->check; + + fprintf(stderr, "tx: sending checksum: 0x%x\n", uh->check); + return uh + 1; +} + +static void *build_packet_tcp(void *_th) +{ + struct tcphdr *th = _th; + + th->source = htons(cfg_port_src); + th->dest = htons(cfg_port_dst); + th->doff = 5; + th->check = 0; + + th->check = checksum(th, IPPROTO_TCP, sizeof(*th) + cfg_payload_len); + + if (cfg_bad_csum) + th->check = ~th->check; + + fprintf(stderr, "tx: sending checksum: 0x%x\n", th->check); + return th + 1; +} + +static char *build_packet_udp_encap(void *_uh) +{ + struct udphdr *uh = _uh; + struct udp_encap_hdr *eh = _uh + sizeof(*uh); + + /* outer dst == inner dst, to simplify BPF filter + * outer src != inner src, to demultiplex on recv + */ + uh->dest = htons(cfg_port_dst); + uh->source = htons(cfg_port_src_encap); + uh->check = 0; + uh->len = htons(sizeof(*uh) + + sizeof(*eh) + + sizeof(struct tcphdr) + + cfg_payload_len); + + eh->nexthdr = IPPROTO_TCP; + + return build_packet_tcp(eh + 1); +} + +static char *build_packet(char *buf, int max_len, int *len) +{ + uint8_t proto; + char *off; + int tlen; + + if (cfg_random_seed) { + int *buf32 = (void *)buf; + int i; + + for (i = 0; i < (max_len / sizeof(int)); i++) + buf32[i] = rand(); + } else { + memset(buf, cfg_payload_char, max_len); + } + + if (cfg_proto == IPPROTO_UDP) + tlen = sizeof(struct udphdr) + cfg_payload_len; + else + tlen = sizeof(struct tcphdr) + cfg_payload_len; + + if (cfg_encap) { + proto = IPPROTO_UDP; + tlen += ENC_HEADER_LEN; + } else { + proto = cfg_proto; + } + + if (cfg_family == PF_INET) + off = build_packet_ipv4(buf, proto, tlen); + else + off = build_packet_ipv6(buf, proto, tlen); + + if (cfg_encap) + off = build_packet_udp_encap(off); + else if (cfg_proto == IPPROTO_UDP) + off = build_packet_udp(off); + else + off = build_packet_tcp(off); + + /* only pass the payload, but still compute headers for cfg_zero_sum */ + if (cfg_send_udp) { + *len = cfg_payload_len; + return off; + } + + *len = off - buf + cfg_payload_len; + return buf; +} + +static int open_inet(int ipproto, int protocol) +{ + int fd; + + fd = socket(cfg_family, ipproto, protocol); + if (fd == -1) + error(1, errno, "socket inet"); + + if (cfg_family == PF_INET6) { + /* may have been updated by cfg_zero_sum */ + cfg_saddr6.sin6_port = htons(cfg_port_src); + + if (bind(fd, (void *)&cfg_saddr6, sizeof(cfg_saddr6))) + error(1, errno, "bind dgram 6"); + if (connect(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) + error(1, errno, "connect dgram 6"); + } else { + /* may have been updated by cfg_zero_sum */ + cfg_saddr4.sin_port = htons(cfg_port_src); + + if (bind(fd, (void *)&cfg_saddr4, sizeof(cfg_saddr4))) + error(1, errno, "bind dgram 4"); + if (connect(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) + error(1, errno, "connect dgram 4"); + } + + return fd; +} + +static int open_packet(void) +{ + int fd, one = 1; + + fd = socket(PF_PACKET, SOCK_RAW, 0); + if (fd == -1) + error(1, errno, "socket packet"); + + if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one))) + error(1, errno, "setsockopt packet_vnet_ndr"); + + return fd; +} + +static void send_inet(int fd, const char *buf, int len) +{ + int ret; + + ret = write(fd, buf, len); + if (ret == -1) + error(1, errno, "write"); + if (ret != len) + error(1, 0, "write: %d", ret); +} + +static void eth_str_to_addr(const char *str, unsigned char *eth) +{ + if (sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + ð[0], ð[1], ð[2], ð[3], ð[4], ð[5]) != 6) + error(1, 0, "cannot parse mac addr %s", str); +} + +static void send_packet(int fd, const char *buf, int len) +{ + struct virtio_net_hdr vh = {0}; + struct sockaddr_ll addr = {0}; + struct msghdr msg = {0}; + struct ethhdr eth; + struct iovec iov[3]; + int ret; + + addr.sll_family = AF_PACKET; + addr.sll_halen = ETH_ALEN; + addr.sll_ifindex = if_nametoindex(cfg_ifname); + if (!addr.sll_ifindex) + error(1, errno, "if_nametoindex %s", cfg_ifname); + + vh.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + if (cfg_family == PF_INET6) { + vh.csum_start = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); + addr.sll_protocol = htons(ETH_P_IPV6); + } else { + vh.csum_start = sizeof(struct ethhdr) + sizeof(struct iphdr); + addr.sll_protocol = htons(ETH_P_IP); + } + + if (cfg_encap) + vh.csum_start += ENC_HEADER_LEN; + + if (cfg_proto == IPPROTO_TCP) { + vh.csum_offset = __builtin_offsetof(struct tcphdr, check); + vh.hdr_len = vh.csum_start + sizeof(struct tcphdr); + } else { + vh.csum_offset = __builtin_offsetof(struct udphdr, check); + vh.hdr_len = vh.csum_start + sizeof(struct udphdr); + } + + eth_str_to_addr(cfg_mac_src, eth.h_source); + eth_str_to_addr(cfg_mac_dst, eth.h_dest); + eth.h_proto = addr.sll_protocol; + + iov[0].iov_base = &vh; + iov[0].iov_len = sizeof(vh); + + iov[1].iov_base = ð + iov[1].iov_len = sizeof(eth); + + iov[2].iov_base = (void *)buf; + iov[2].iov_len = len; + + msg.msg_iov = iov; + msg.msg_iovlen = ARRAY_SIZE(iov); + + msg.msg_name = &addr; + msg.msg_namelen = sizeof(addr); + + ret = sendmsg(fd, &msg, 0); + if (ret == -1) + error(1, errno, "sendmsg packet"); + if (ret != sizeof(vh) + sizeof(eth) + len) + error(1, errno, "sendmsg packet: %u", ret); +} + +static int recv_prepare_udp(void) +{ + int fd; + + fd = socket(cfg_family, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket r"); + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &cfg_rcvbuf, sizeof(cfg_rcvbuf))) + error(1, errno, "setsockopt SO_RCVBUF r"); + + if (cfg_family == PF_INET6) { + if (bind(fd, (void *)&cfg_daddr6, sizeof(cfg_daddr6))) + error(1, errno, "bind r"); + } else { + if (bind(fd, (void *)&cfg_daddr4, sizeof(cfg_daddr4))) + error(1, errno, "bind r"); + } + + return fd; +} + +/* Filter out all traffic that is not cfg_proto with our destination port. + * + * Otherwise background noise may cause PF_PACKET receive queue overflow, + * dropping the expected packets and failing the test. + */ +static void __recv_prepare_packet_filter(int fd, int off_nexthdr, int off_dport) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, off_nexthdr), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_encap ? IPPROTO_UDP : cfg_proto, 0, 2), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, off_dport), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_port_dst, 1, 0), + BPF_STMT(BPF_RET + BPF_K, 0), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = ARRAY_SIZE(filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +static void recv_prepare_packet_filter(int fd) +{ + const int off_dport = offsetof(struct tcphdr, dest); /* same for udp */ + + if (cfg_family == AF_INET) + __recv_prepare_packet_filter(fd, offsetof(struct iphdr, protocol), + sizeof(struct iphdr) + off_dport); + else + __recv_prepare_packet_filter(fd, offsetof(struct ipv6hdr, nexthdr), + sizeof(struct ipv6hdr) + off_dport); +} + +static void recv_prepare_packet_bind(int fd) +{ + struct sockaddr_ll laddr = {0}; + + laddr.sll_family = AF_PACKET; + + if (cfg_family == PF_INET) + laddr.sll_protocol = htons(ETH_P_IP); + else + laddr.sll_protocol = htons(ETH_P_IPV6); + + laddr.sll_ifindex = if_nametoindex(cfg_ifname); + if (!laddr.sll_ifindex) + error(1, 0, "if_nametoindex %s", cfg_ifname); + + if (bind(fd, (void *)&laddr, sizeof(laddr))) + error(1, errno, "bind pf_packet"); +} + +static int recv_prepare_packet(void) +{ + int fd, one = 1; + + fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket p"); + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &cfg_rcvbuf, sizeof(cfg_rcvbuf))) + error(1, errno, "setsockopt SO_RCVBUF p"); + + /* enable auxdata to recv checksum status (valid vs unknown) */ + if (setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one))) + error(1, errno, "setsockopt auxdata"); + + /* install filter to restrict packet flow to match */ + recv_prepare_packet_filter(fd); + + /* bind to address family to start packet flow */ + recv_prepare_packet_bind(fd); + + return fd; +} + +static int recv_udp(int fd) +{ + static char buf[MAX_PAYLOAD_LEN]; + int ret, count = 0; + + while (1) { + ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); + if (ret == -1 && errno == EAGAIN) + break; + if (ret == -1) + error(1, errno, "recv r"); + + fprintf(stderr, "rx: udp: len=%u\n", ret); + count++; + } + + return count; +} + +static int recv_verify_csum(void *th, int len, uint16_t sport, uint16_t csum_field) +{ + uint16_t csum; + + csum = checksum(th, cfg_proto, len); + + fprintf(stderr, "rx: pkt: sport=%hu len=%u csum=0x%hx verify=0x%hx\n", + sport, len, csum_field, csum); + + /* csum must be zero unless cfg_bad_csum indicates bad csum */ + if (csum && !cfg_bad_csum) { + fprintf(stderr, "pkt: bad csum\n"); + return 1; + } else if (cfg_bad_csum && !csum) { + fprintf(stderr, "pkt: good csum, while bad expected\n"); + return 1; + } + + if (cfg_zero_sum && csum_field != 0xFFFF) { + fprintf(stderr, "pkt: zero csum: field should be 0xFFFF, is 0x%hx\n", csum_field); + return 1; + } + + return 0; +} + +static int recv_verify_packet_tcp(void *th, int len) +{ + struct tcphdr *tcph = th; + + if (len < sizeof(*tcph) || tcph->dest != htons(cfg_port_dst)) + return -1; + + return recv_verify_csum(th, len, ntohs(tcph->source), tcph->check); +} + +static int recv_verify_packet_udp_encap(void *th, int len) +{ + struct udp_encap_hdr *eh = th; + + if (len < sizeof(*eh) || eh->nexthdr != IPPROTO_TCP) + return -1; + + return recv_verify_packet_tcp(eh + 1, len - sizeof(*eh)); +} + +static int recv_verify_packet_udp(void *th, int len) +{ + struct udphdr *udph = th; + + if (len < sizeof(*udph)) + return -1; + + if (udph->dest != htons(cfg_port_dst)) + return -1; + + if (udph->source == htons(cfg_port_src_encap)) + return recv_verify_packet_udp_encap(udph + 1, + len - sizeof(*udph)); + + return recv_verify_csum(th, len, ntohs(udph->source), udph->check); +} + +static int recv_verify_packet_ipv4(void *nh, int len) +{ + struct iphdr *iph = nh; + uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; + + if (len < sizeof(*iph) || iph->protocol != proto) + return -1; + + iph_addr_p = &iph->saddr; + if (proto == IPPROTO_TCP) + return recv_verify_packet_tcp(iph + 1, len - sizeof(*iph)); + else + return recv_verify_packet_udp(iph + 1, len - sizeof(*iph)); +} + +static int recv_verify_packet_ipv6(void *nh, int len) +{ + struct ipv6hdr *ip6h = nh; + uint16_t proto = cfg_encap ? IPPROTO_UDP : cfg_proto; + + if (len < sizeof(*ip6h) || ip6h->nexthdr != proto) + return -1; + + iph_addr_p = &ip6h->saddr; + + if (proto == IPPROTO_TCP) + return recv_verify_packet_tcp(ip6h + 1, len - sizeof(*ip6h)); + else + return recv_verify_packet_udp(ip6h + 1, len - sizeof(*ip6h)); +} + +/* return whether auxdata includes TP_STATUS_CSUM_VALID */ +static uint32_t recv_get_packet_csum_status(struct msghdr *msg) +{ + struct tpacket_auxdata *aux = NULL; + struct cmsghdr *cm; + + if (msg->msg_flags & MSG_CTRUNC) + error(1, 0, "cmsg: truncated"); + + for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) { + if (cm->cmsg_level != SOL_PACKET || + cm->cmsg_type != PACKET_AUXDATA) + error(1, 0, "cmsg: level=%d type=%d\n", + cm->cmsg_level, cm->cmsg_type); + + if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata))) + error(1, 0, "cmsg: len=%lu expected=%lu", + cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); + + aux = (void *)CMSG_DATA(cm); + } + + if (!aux) + error(1, 0, "cmsg: no auxdata"); + + return aux->tp_status; +} + +static int recv_packet(int fd) +{ + static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; + unsigned long total = 0, bad_csums = 0, bad_validations = 0; + char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; + struct pkt *buf = (void *)_buf; + struct msghdr msg = {0}; + uint32_t tp_status; + struct iovec iov; + int len, ret; + + iov.iov_base = _buf; + iov.iov_len = sizeof(_buf); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + while (1) { + msg.msg_flags = 0; + + len = recvmsg(fd, &msg, MSG_DONTWAIT); + if (len == -1 && errno == EAGAIN) + break; + if (len == -1) + error(1, errno, "recv p"); + + tp_status = recv_get_packet_csum_status(&msg); + + /* GRO might coalesce randomized packets. Such GSO packets are + * then reinitialized for csum offload (CHECKSUM_PARTIAL), with + * a pseudo csum. Do not try to validate these checksums. + */ + if (tp_status & TP_STATUS_CSUMNOTREADY) { + fprintf(stderr, "cmsg: GSO packet has partial csum: skip\n"); + continue; + } + + if (cfg_family == PF_INET6) + ret = recv_verify_packet_ipv6(buf, len); + else + ret = recv_verify_packet_ipv4(buf, len); + + if (ret == -1 /* skip: non-matching */) + continue; + + total++; + if (ret == 1) + bad_csums++; + + /* Fail if kernel returns valid for known bad csum. + * Do not fail if kernel does not validate a good csum: + * Absence of validation does not imply invalid. + */ + if (tp_status & TP_STATUS_CSUM_VALID && cfg_bad_csum) { + fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n"); + bad_validations++; + } + } + + if (bad_csums || bad_validations) + error(1, 0, "rx: errors at pf_packet: total=%lu bad_csums=%lu bad_valids=%lu\n", + total, bad_csums, bad_validations); + + return total; +} + +static void parse_args(int argc, char *const argv[]) +{ + const char *daddr = NULL, *saddr = NULL; + int c; + + while ((c = getopt(argc, argv, "46d:D:eEi:l:L:n:r:PRs:S:tTuUzZ")) != -1) { + switch (c) { + case '4': + cfg_family = PF_INET; + break; + case '6': + cfg_family = PF_INET6; + break; + case 'd': + cfg_mac_dst = optarg; + break; + case 'D': + daddr = optarg; + break; + case 'e': + cfg_encap = true; + break; + case 'E': + cfg_bad_csum = true; + break; + case 'i': + cfg_ifname = optarg; + break; + case 'l': + cfg_payload_len = strtol(optarg, NULL, 0); + break; + case 'L': + cfg_timeout_ms = strtol(optarg, NULL, 0) * 1000; + break; + case 'n': + cfg_num_pkt = strtol(optarg, NULL, 0); + break; + case 'r': + cfg_random_seed = strtol(optarg, NULL, 0); + break; + case 'P': + cfg_send_pfpacket = true; + break; + case 'R': + /* only Rx: used with two machine tests */ + cfg_do_tx = false; + break; + case 's': + cfg_mac_src = optarg; + break; + case 'S': + saddr = optarg; + break; + case 't': + cfg_proto = IPPROTO_TCP; + break; + case 'T': + /* only Tx: used with two machine tests */ + cfg_do_rx = false; + break; + case 'u': + cfg_proto = IPPROTO_UDP; + break; + case 'U': + /* send using real udp socket, + * to exercise tx checksum offload + */ + cfg_send_udp = true; + break; + case 'z': + cfg_zero_disable = true; + break; + case 'Z': + cfg_zero_sum = true; + break; + default: + error(1, 0, "unknown arg %c", c); + } + } + + if (!daddr || !saddr) + error(1, 0, "Must pass -D and -S "); + + if (cfg_do_tx && cfg_send_pfpacket && (!cfg_mac_src || !cfg_mac_dst)) + error(1, 0, "Transmit with pf_packet requires mac addresses"); + + if (cfg_payload_len > MAX_PAYLOAD_LEN) + error(1, 0, "Payload length exceeds max"); + + if (cfg_proto != IPPROTO_UDP && (cfg_zero_sum || cfg_zero_disable)) + error(1, 0, "Only UDP supports zero csum"); + + if (cfg_zero_sum && !cfg_send_udp) + error(1, 0, "Zero checksum conversion requires -U for tx csum offload"); + if (cfg_zero_sum && cfg_bad_csum) + error(1, 0, "Cannot combine zero checksum conversion and invalid checksum"); + if (cfg_zero_sum && cfg_random_seed) + error(1, 0, "Cannot combine zero checksum conversion with randomization"); + + if (cfg_family == PF_INET6) { + cfg_saddr6.sin6_port = htons(cfg_port_src); + cfg_daddr6.sin6_port = htons(cfg_port_dst); + + if (inet_pton(cfg_family, daddr, &cfg_daddr6.sin6_addr) != 1) + error(1, errno, "Cannot parse ipv6 -D"); + if (inet_pton(cfg_family, saddr, &cfg_saddr6.sin6_addr) != 1) + error(1, errno, "Cannot parse ipv6 -S"); + } else { + cfg_saddr4.sin_port = htons(cfg_port_src); + cfg_daddr4.sin_port = htons(cfg_port_dst); + + if (inet_pton(cfg_family, daddr, &cfg_daddr4.sin_addr) != 1) + error(1, errno, "Cannot parse ipv4 -D"); + if (inet_pton(cfg_family, saddr, &cfg_saddr4.sin_addr) != 1) + error(1, errno, "Cannot parse ipv4 -S"); + } + + if (cfg_do_tx && cfg_random_seed) { + /* special case: time-based seed */ + if (cfg_random_seed == 1) + cfg_random_seed = (unsigned int)gettimeofday_ms(); + srand(cfg_random_seed); + fprintf(stderr, "randomization seed: %u\n", cfg_random_seed); + } +} + +static void do_tx(void) +{ + static char _buf[MAX_HEADER_LEN + MAX_PAYLOAD_LEN]; + char *buf; + int fd, len, i; + + buf = build_packet(_buf, sizeof(_buf), &len); + + if (cfg_send_pfpacket) + fd = open_packet(); + else if (cfg_send_udp) + fd = open_inet(SOCK_DGRAM, 0); + else + fd = open_inet(SOCK_RAW, IPPROTO_RAW); + + for (i = 0; i < cfg_num_pkt; i++) { + if (cfg_send_pfpacket) + send_packet(fd, buf, len); + else + send_inet(fd, buf, len); + + /* randomize each packet individually to increase coverage */ + if (cfg_random_seed) { + cfg_payload_len = rand() % MAX_PAYLOAD_LEN; + buf = build_packet(_buf, sizeof(_buf), &len); + } + } + + if (close(fd)) + error(1, errno, "close tx"); +} + +static void do_rx(int fdp, int fdr) +{ + unsigned long count_udp = 0, count_pkt = 0; + long tleft, tstop; + struct pollfd pfd; + + tstop = gettimeofday_ms() + cfg_timeout_ms; + tleft = cfg_timeout_ms; + + do { + pfd.events = POLLIN; + pfd.fd = fdp; + if (poll(&pfd, 1, tleft) == -1) + error(1, errno, "poll"); + + if (pfd.revents & POLLIN) + count_pkt += recv_packet(fdp); + + if (cfg_proto == IPPROTO_UDP) + count_udp += recv_udp(fdr); + + tleft = tstop - gettimeofday_ms(); + } while (tleft > 0); + + if (close(fdr)) + error(1, errno, "close r"); + if (close(fdp)) + error(1, errno, "close p"); + + if (count_pkt < cfg_num_pkt) + error(1, 0, "rx: missing packets at pf_packet: %lu < %u", + count_pkt, cfg_num_pkt); + + if (cfg_proto == IPPROTO_UDP) { + if (cfg_bad_csum && count_udp) + error(1, 0, "rx: unexpected packets at udp"); + if (!cfg_bad_csum && !count_udp) + error(1, 0, "rx: missing packets at udp"); + } +} + +int main(int argc, char *const argv[]) +{ + int fdp = -1, fdr = -1; /* -1 to silence -Wmaybe-uninitialized */ + + parse_args(argc, argv); + + /* open receive sockets before transmitting */ + if (cfg_do_rx) { + fdp = recv_prepare_packet(); + fdr = recv_prepare_udp(); + } + + if (cfg_do_tx) + do_tx(); + + if (cfg_do_rx) + do_rx(fdp, fdr); + + fprintf(stderr, "OK\n"); + return 0; +} -- cgit v1.2.3 From 1cf2704242180351d156fb48c334b319ae6b0759 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 7 May 2024 09:32:28 -0700 Subject: net: selftest: add test for netdev netlink queue-get API Add a selftest for netdev generic netlink. For now there is only a single test that exercises the `queue-get` API. The test works with netdevsim by default or with a real device by setting NETIF. Add a timeout param to cmd() since ethtool -L can take a long time on real devices. Signed-off-by: David Wei Link: https://lore.kernel.org/r/20240507163228.2066817-3-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/Makefile | 1 + tools/testing/selftests/drivers/net/lib/py/env.py | 6 ++- tools/testing/selftests/drivers/net/queues.py | 66 +++++++++++++++++++++++ tools/testing/selftests/net/lib/py/nsim.py | 4 +- tools/testing/selftests/net/lib/py/utils.py | 8 +-- 5 files changed, 77 insertions(+), 8 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/queues.py (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 754ec643768a..e54f382bcb02 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -4,6 +4,7 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) TEST_PROGS := \ ping.py \ + queues.py \ stats.py \ # end of TEST_PROGS diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 5c8f695b2536..edcedd7bffab 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -36,7 +36,7 @@ class NetDrvEnv: """ Class for a single NIC / host env, with no remote end """ - def __init__(self, src_path): + def __init__(self, src_path, **kwargs): self._ns = None self.env = _load_env_file(src_path) @@ -44,11 +44,13 @@ class NetDrvEnv: if 'NETIF' in self.env: self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] else: - self._ns = NetdevSimDev() + self._ns = NetdevSimDev(**kwargs) self.dev = self._ns.nsims[0].dev self.ifindex = self.dev['ifindex'] def __enter__(self): + ip(f"link set dev {self.dev['ifname']} up") + return self def __exit__(self, ex_type, ex_value, ex_tb): diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py new file mode 100755 index 000000000000..30f29096e27c --- /dev/null +++ b/tools/testing/selftests/drivers/net/queues.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily +from lib.py import NetDrvEnv +from lib.py import cmd +import glob + + +def sys_get_queues(ifname) -> int: + folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*') + return len(folders) + + +def nl_get_queues(cfg, nl): + queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) + if queues: + return len([q for q in queues if q['type'] == 'rx']) + return None + + +def get_queues(cfg, nl) -> None: + queues = nl_get_queues(cfg, nl) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + expected = sys_get_queues(cfg.dev['ifname']) + ksft_eq(queues, expected) + + +def addremove_queues(cfg, nl) -> None: + queues = nl_get_queues(cfg, nl) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + curr_queues = sys_get_queues(cfg.dev['ifname']) + if curr_queues == 1: + raise KsftSkipEx('cannot decrement queue: already at 1') + + netnl = EthtoolFamily() + channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + if channels['combined-count'] == 0: + rx_type = 'rx' + else: + rx_type = 'combined' + + expected = curr_queues - 1 + cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) + queues = nl_get_queues(cfg, nl) + ksft_eq(queues, expected) + + expected = curr_queues + cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) + queues = nl_get_queues(cfg, nl) + ksft_eq(queues, expected) + + +def main() -> None: + with NetDrvEnv(__file__, queue_count=3) as cfg: + ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py index 06896cdf7c18..f571a8b3139b 100644 --- a/tools/testing/selftests/net/lib/py/nsim.py +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -49,7 +49,7 @@ class NetdevSimDev: with open(fullpath, "w") as f: f.write(val) - def __init__(self, port_count=1, ns=None): + def __init__(self, port_count=1, queue_count=1, ns=None): # nsim will spawn in init_net, we'll set to actual ns once we switch it there self.ns = None @@ -59,7 +59,7 @@ class NetdevSimDev: addr = random.randrange(1 << 15) while True: try: - self.ctrl_write("new_device", "%u %u" % (addr, port_count)) + self.ctrl_write("new_device", "%u %u %u" % (addr, port_count, queue_count)) except OSError as e: if e.errno == errno.ENOSPC: addr = random.randrange(1 << 15) diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index ec8b086b4fcb..0540ea24921d 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -8,7 +8,7 @@ import time class cmd: - def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None): + def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None, timeout=5): if ns: comm = f'ip netns exec {ns} ' + comm @@ -23,15 +23,15 @@ class cmd: self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if not background: - self.process(terminate=False, fail=fail) + self.process(terminate=False, fail=fail, timeout=timeout) - def process(self, terminate=True, fail=None): + def process(self, terminate=True, fail=None, timeout=5): if fail is None: fail = not terminate if terminate: self.proc.terminate() - stdout, stderr = self.proc.communicate(timeout=5) + stdout, stderr = self.proc.communicate(timeout) self.stdout = stdout.decode("utf-8") self.stderr = stderr.decode("utf-8") self.proc.stdout.close() -- cgit v1.2.3 From 41ee9b33e94a2457e936f0cc7423005902f36b67 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 2 May 2024 23:35:26 +0000 Subject: KVM: selftests: arm64: Rename helper in set_id_regs to imply VM scope Prepare for a later change that'll cram in per-vCPU feature ID test cases by renaming the current test case. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240502233529.1958459-5-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 16e2338686c1..3d0ce49f9b78 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -374,7 +374,7 @@ static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, TEST_ASSERT_EQ(val, old_val); } -static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) +static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) { uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; struct reg_mask_range range = { @@ -476,7 +476,7 @@ int main(void) ksft_set_plan(ftr_cnt); - test_user_set_reg(vcpu, aarch64_only); + test_vm_ftr_id_regs(vcpu, aarch64_only); test_guest_reg_read(vcpu); kvm_vm_free(vm); -- cgit v1.2.3 From 46247a317f403e52d51928f0e1b675cffbd1046c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 2 May 2024 23:35:27 +0000 Subject: KVM: selftests: arm64: Store expected register value in set_id_regs Rather than comparing against what is returned by the ioctl, store expected values for the feature ID registers in a table and compare with that instead. This will prove useful for subsequent tests involving vCPU reset. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240502233529.1958459-6-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 27 +++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 3d0ce49f9b78..7c8d33fa2ae6 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -327,8 +327,8 @@ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) return ftr; } -static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, - const struct reg_ftr_bits *ftr_bits) +static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) { uint8_t shift = ftr_bits->shift; uint64_t mask = ftr_bits->mask; @@ -346,6 +346,8 @@ static void test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, vcpu_set_reg(vcpu, reg, val); vcpu_get_reg(vcpu, reg, &new_val); TEST_ASSERT_EQ(new_val, val); + + return new_val; } static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, @@ -374,6 +376,14 @@ static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, TEST_ASSERT_EQ(val, old_val); } +static uint64_t test_reg_vals[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + +#define encoding_to_range_idx(encoding) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(encoding), sys_reg_Op1(encoding), \ + sys_reg_CRn(encoding), sys_reg_CRm(encoding), \ + sys_reg_Op2(encoding)) + + static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) { uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; @@ -398,9 +408,7 @@ static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) int idx; /* Get the index to masks array for the idreg */ - idx = KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(reg_id), sys_reg_Op1(reg_id), - sys_reg_CRn(reg_id), sys_reg_CRm(reg_id), - sys_reg_Op2(reg_id)); + idx = encoding_to_range_idx(reg_id); for (int j = 0; ftr_bits[j].type != FTR_END; j++) { /* Skip aarch32 reg on aarch64 only system, since they are RAZ/WI. */ @@ -414,7 +422,9 @@ static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) TEST_ASSERT_EQ(masks[idx] & ftr_bits[j].mask, ftr_bits[j].mask); test_reg_set_fail(vcpu, reg, &ftr_bits[j]); - test_reg_set_success(vcpu, reg, &ftr_bits[j]); + + test_reg_vals[idx] = test_reg_set_success(vcpu, reg, + &ftr_bits[j]); ksft_test_result_pass("%s\n", ftr_bits[j].name); } @@ -425,7 +435,6 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) { bool done = false; struct ucall uc; - uint64_t val; while (!done) { vcpu_run(vcpu); @@ -436,8 +445,8 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) break; case UCALL_SYNC: /* Make sure the written values are seen by guest */ - vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(uc.args[2]), &val); - TEST_ASSERT_EQ(val, uc.args[3]); + TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], + uc.args[3]); break; case UCALL_DONE: done = true; -- cgit v1.2.3 From 07eabd8a528f511f6bbef3b5cbe5d9f90c5bb4ea Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 2 May 2024 23:35:28 +0000 Subject: KVM: selftests: arm64: Test that feature ID regs survive a reset One of the expectations with feature ID registers is that their values survive a vCPU reset. Start testing that. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240502233529.1958459-7-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 41 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 7c8d33fa2ae6..24b248c78f5d 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -457,13 +457,36 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) } } +static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) +{ + size_t idx = encoding_to_range_idx(encoding); + uint64_t observed; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding), &observed); + TEST_ASSERT_EQ(test_reg_vals[idx], observed); +} + +static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) +{ + /* + * Calls KVM_ARM_VCPU_INIT behind the scenes, which will do an + * architectural reset of the vCPU. + */ + aarch64_vcpu_setup(vcpu, NULL); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) + test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); + + ksft_test_result_pass("%s\n", __func__); +} + int main(void) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; bool aarch64_only; uint64_t val, el0; - int ftr_cnt; + int test_cnt; TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); @@ -476,18 +499,20 @@ int main(void) ksft_print_header(); - ftr_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + - ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + - ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - - ARRAY_SIZE(test_regs); + test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - + ARRAY_SIZE(test_regs) + 1; - ksft_set_plan(ftr_cnt); + ksft_set_plan(test_cnt); test_vm_ftr_id_regs(vcpu, aarch64_only); test_guest_reg_read(vcpu); + test_reset_preserves_id_regs(vcpu); + kvm_vm_free(vm); ksft_finished(); -- cgit v1.2.3 From 606af8293cd8b962ad7cc51326bfd974c2fa1f91 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 2 May 2024 23:35:29 +0000 Subject: KVM: selftests: arm64: Test vCPU-scoped feature ID registers Test that CLIDR_EL1 and MPIDR_EL1 are modifiable from userspace and that the values are preserved across a vCPU reset like the other feature ID registers. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240502233529.1958459-8-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 53 ++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 24b248c78f5d..a7de39fa2a0a 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -457,6 +457,53 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) } } +/* Politely lifted from arch/arm64/include/asm/cache.h */ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +static void test_clidr(struct kvm_vcpu *vcpu) +{ + uint64_t clidr; + int level; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), &clidr); + + /* find the first empty level in the cache hierarchy */ + for (level = 1; level < 7; level++) { + if (!CLIDR_CTYPE(clidr, level)) + break; + } + + /* + * If you have a mind-boggling 7 levels of cache, congratulations, you + * get to fix this. + */ + TEST_ASSERT(level <= 7, "can't find an empty level in cache hierarchy"); + + /* stick in a unified cache level */ + clidr |= BIT(2) << CLIDR_CTYPE_SHIFT(level); + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), clidr); + test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + u64 val; + + test_clidr(vcpu); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + + test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; + ksft_test_result_pass("%s\n", __func__); +} + static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) { size_t idx = encoding_to_range_idx(encoding); @@ -477,6 +524,8 @@ static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) for (int i = 0; i < ARRAY_SIZE(test_regs); i++) test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); + test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); + ksft_test_result_pass("%s\n", __func__); } @@ -504,11 +553,13 @@ int main(void) ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - - ARRAY_SIZE(test_regs) + 1; + ARRAY_SIZE(test_regs) + 2; ksft_set_plan(test_cnt); test_vm_ftr_id_regs(vcpu, aarch64_only); + test_vcpu_ftr_id_regs(vcpu); + test_guest_reg_read(vcpu); test_reset_preserves_id_regs(vcpu); -- cgit v1.2.3 From c0338e609e6e8aff8a7052c90cff83a6bc792ebc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:17 -0700 Subject: selftests/bpf: Remove bpf_tracing_net.h usages from two networking tests This patch removes the bpf_tracing_net.h usage from the networking tests, fib_lookup and test_lwt_redirect. Instead of using the (copied) macro TC_ACT_SHOT and ETH_HLEN from bpf_tracing_net.h, they can directly use the ones defined in the network header files under linux/. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/fib_lookup.c | 2 +- tools/testing/selftests/bpf/progs/test_lwt_redirect.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/fib_lookup.c b/tools/testing/selftests/bpf/progs/fib_lookup.c index c4514dd58c62..7b5dd2214ff4 100644 --- a/tools/testing/selftests/bpf/progs/fib_lookup.c +++ b/tools/testing/selftests/bpf/progs/fib_lookup.c @@ -3,8 +3,8 @@ #include #include +#include #include -#include "bpf_tracing_net.h" struct bpf_fib_lookup fib_params = {}; int fib_lookup_ret = 0; diff --git a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c index 8c895122f293..83439b87b766 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c @@ -3,7 +3,7 @@ #include #include #include -#include "bpf_tracing_net.h" +#include /* We don't care about whether the packet can be received by network stack. * Just care if the packet is sent to the correct device at correct direction -- cgit v1.2.3 From cbaec46df6c08a2fab6be03d093d3d6ce74adc9a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:18 -0700 Subject: selftests/bpf: Add a few tcp helper functions and macros to bpf_tracing_net.h This patch adds a few tcp related helper functions to bpf_tracing_net.h. They will be useful for both tcp-cc and network tracing related bpf progs. They have already been in the bpf_tcp_helpers.h. This change is needed to retire the bpf_tcp_helpers.h and consolidate all tests to vmlinux.h (i.e. bpf_tracing_net.h). Some of the helpers (tcp_sk and inet_csk) are also defined in bpf_cc_cubic.c and they are removed. While at it, remove the vmlinux.h from bpf_cc_cubic.c. bpf_tracing_net.h (which has vmlinux.h after this patch) is enough and will be consistent with the other tcp-cc tests in the later patches. The other TCP_* macro additions will be needed for the bpf_dctcp changes in the later patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_cc_cubic.c | 14 +------- .../testing/selftests/bpf/progs/bpf_tracing_net.h | 41 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 5c7697c70e2a..2004be380683 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -13,11 +13,9 @@ * kernel functions. */ -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include -#include "bpf_tracing_net.h" #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ @@ -40,16 +38,6 @@ extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -static struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} - -static struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - static bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index f9ec630dfcd5..ba4ca0334586 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -2,6 +2,9 @@ #ifndef __BPF_TRACING_NET_H__ #define __BPF_TRACING_NET_H__ +#include +#include + #define AF_INET 2 #define AF_INET6 10 @@ -46,6 +49,13 @@ #define TCP_CA_NAME_MAX 16 #define TCP_NAGLE_OFF 1 +#define TCP_ECN_OK 1 +#define TCP_ECN_QUEUE_CWR 2 +#define TCP_ECN_DEMAND_CWR 4 +#define TCP_ECN_SEEN 8 + +#define TCP_CONG_NEEDS_ECN 0x2 + #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 #define ICSK_TIME_LOSS_PROBE 5 @@ -129,4 +139,35 @@ #define tcp_jiffies32 ((__u32)bpf_jiffies64()) +static inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static inline void *inet_csk_ca(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ca_priv; +} + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd < tp->snd_ssthresh; +} + +static inline bool tcp_is_cwnd_limited(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ + if (tcp_in_slow_start(tp)) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); +} + #endif -- cgit v1.2.3 From cc5b18ce1714160be3e0e3b9440a6306dc87e5c4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:19 -0700 Subject: selftests/bpf: Reuse the tcp_sk() from the bpf_tracing_net.h This patch removes the individual tcp_sk implementations from the tcp-cc tests. The tcp_sk() implementation from the bpf_tracing_net.h is reused instead. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c | 8 +------- tools/testing/selftests/bpf/progs/tcp_ca_update.c | 8 +------- tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c | 8 +------- 3 files changed, 3 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c index 7bb872fb22dd..d6467fcb1deb 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -1,17 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include char _license[] SEC("license") = "GPL"; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - SEC("struct_ops/incompl_cong_ops_ssthresh") __u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c index b93a0ed33057..8581cad321b6 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_update.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include @@ -10,11 +9,6 @@ char _license[] SEC("license") = "GPL"; int ca1_cnt = 0; int ca2_cnt = 0; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - SEC("struct_ops/ca_update_1_init") void BPF_PROG(ca_update_1_init, struct sock *sk) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index 0724a79cec78..4a369439335e 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include @@ -11,11 +10,6 @@ char _license[] SEC("license") = "GPL"; #define min(a, b) ((a) < (b) ? (a) : (b)) -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; -- cgit v1.2.3 From 7d3851a31832bf8dc776a78494b788518734ad0f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:20 -0700 Subject: selftests/bpf: Sanitize the SEC and inline usages in the bpf-tcp-cc tests It is needed to remove the BPF_STRUCT_OPS usages from the tcp-cc tests because it is defined in bpf_tcp_helpers.h which is going to be retired. While at it, this patch consolidates all tcp-cc struct_ops programs to use the SEC("struct_ops") + BPF_PROG(). It also removes the unnecessary __always_inline usages from the tcp-cc tests. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_cc_cubic.c | 28 ++++++++------- tools/testing/selftests/bpf/progs/bpf_cubic.c | 42 +++++++++++----------- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 26 +++++++------- .../selftests/bpf/progs/bpf_dctcp_release.c | 3 +- tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c | 3 +- .../selftests/bpf/progs/tcp_ca_incompl_cong_ops.c | 4 +-- tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 22 ++++++------ .../selftests/bpf/progs/tcp_ca_unsupp_cong_op.c | 2 +- tools/testing/selftests/bpf/progs/tcp_ca_update.c | 10 +++--- .../selftests/bpf/progs/tcp_ca_write_sk_pacing.c | 12 +++---- 10 files changed, 77 insertions(+), 75 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 2004be380683..1654a530aa3d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -17,10 +17,6 @@ #include #include -#define BPF_STRUCT_OPS(name, args...) \ -SEC("struct_ops/"#name) \ -BPF_PROG(name, args) - #define USEC_PER_SEC 1000000UL #define TCP_PACING_SS_RATIO (200) #define TCP_PACING_CA_RATIO (120) @@ -114,18 +110,21 @@ static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) return flag & FLAG_DATA_ACKED; } -void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_init, struct sock *sk) { cubictcp_init(sk); } -void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { cubictcp_cwnd_event(sk, event); } -void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, - const struct rate_sample *rs) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, + const struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); @@ -151,23 +150,26 @@ void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag tcp_update_pacing_rate(sk); } -__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { return cubictcp_recalc_ssthresh(sk); } -void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) { cubictcp_state(sk, new_state); } -void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, - const struct ack_sample *sample) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { cubictcp_acked(sk, sample); } -__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) { return tcp_reno_undo_cwnd(sk); } diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index c997e3e3d3fb..53a98b609e5f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -91,7 +91,7 @@ struct bictcp { __u32 curr_rtt; /* the minimum rtt of current round */ }; -static inline void bictcp_reset(struct bictcp *ca) +static void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; @@ -112,7 +112,7 @@ extern unsigned long CONFIG_HZ __kconfig; #define USEC_PER_SEC 1000000UL #define USEC_PER_JIFFY (USEC_PER_SEC / HZ) -static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) +static __u64 div64_u64(__u64 dividend, __u64 divisor) { return dividend / divisor; } @@ -120,7 +120,7 @@ static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) #define div64_ul div64_u64 #define BITS_PER_U64 (sizeof(__u64) * 8) -static __always_inline int fls64(__u64 x) +static int fls64(__u64 x) { int num = BITS_PER_U64 - 1; @@ -153,12 +153,12 @@ static __always_inline int fls64(__u64 x) return num + 1; } -static __always_inline __u32 bictcp_clock_us(const struct sock *sk) +static __u32 bictcp_clock_us(const struct sock *sk) { return tcp_sk(sk)->tcp_mstamp; } -static __always_inline void bictcp_hystart_reset(struct sock *sk) +static void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -169,8 +169,7 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -/* "struct_ops/" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_init") +SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -184,8 +183,7 @@ void BPF_PROG(bpf_cubic_init, struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* "struct_ops" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_cwnd_event") +SEC("struct_ops") void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { @@ -230,7 +228,7 @@ static const __u8 v[] = { * Newton-Raphson iteration. * Avg err ~= 0.195% */ -static __always_inline __u32 cubic_root(__u64 a) +static __u32 cubic_root(__u64 a) { __u32 x, b, shift; @@ -263,8 +261,7 @@ static __always_inline __u32 cubic_root(__u64 a) /* * Compute congestion window to use. */ -static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd, - __u32 acked) +static void bictcp_update(struct bictcp *ca, __u32 cwnd, __u32 acked) { __u32 delta, bic_target, max_cnt; __u64 offs, t; @@ -377,8 +374,8 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ -void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -397,7 +394,8 @@ void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acke tcp_cong_avoid_ai(tp, ca->cnt, acked); } -__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -414,7 +412,8 @@ __u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -433,7 +432,7 @@ void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms. */ -static __always_inline __u32 hystart_ack_delay(struct sock *sk) +static __u32 hystart_ack_delay(struct sock *sk) { unsigned long rate; @@ -444,7 +443,7 @@ static __always_inline __u32 hystart_ack_delay(struct sock *sk) div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); } -static __always_inline void hystart_update(struct sock *sk, __u32 delay) +static void hystart_update(struct sock *sk, __u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -492,8 +491,8 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) int bpf_cubic_acked_called = 0; -void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, - const struct ack_sample *sample) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -524,7 +523,8 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; -__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) { return tcp_reno_undo_cwnd(sk); } diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 460682759aed..b74dbb121384 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -48,8 +48,7 @@ struct dctcp { static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */ static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA; -static __always_inline void dctcp_reset(const struct tcp_sock *tp, - struct dctcp *ca) +static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) { ca->next_seq = tp->snd_nxt; @@ -57,7 +56,7 @@ static __always_inline void dctcp_reset(const struct tcp_sock *tp, ca->old_delivered_ce = tp->delivered_ce; } -SEC("struct_ops/dctcp_init") +SEC("struct_ops") void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -104,7 +103,7 @@ void BPF_PROG(dctcp_init, struct sock *sk) dctcp_reset(tp, ca); } -SEC("struct_ops/dctcp_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); @@ -114,7 +113,7 @@ __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } -SEC("struct_ops/dctcp_update_alpha") +SEC("struct_ops") void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); @@ -144,7 +143,7 @@ void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) } } -static __always_inline void dctcp_react_to_loss(struct sock *sk) +static void dctcp_react_to_loss(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -153,7 +152,7 @@ static __always_inline void dctcp_react_to_loss(struct sock *sk) tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); } -SEC("struct_ops/dctcp_state") +SEC("struct_ops") void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Recovery && @@ -164,7 +163,7 @@ void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) */ } -static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) +static void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) { struct tcp_sock *tp = tcp_sk(sk); @@ -179,9 +178,8 @@ static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) * S: 0 <- last pkt was non-CE * 1 <- last pkt was CE */ -static __always_inline -void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, - __u32 *prior_rcv_nxt, __u32 *ce_state) +static void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + __u32 *prior_rcv_nxt, __u32 *ce_state) { __u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; @@ -201,7 +199,7 @@ void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, dctcp_ece_ack_cwr(sk, new_ce_state); } -SEC("struct_ops/dctcp_cwnd_event") +SEC("struct_ops") void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) { struct dctcp *ca = inet_csk_ca(sk); @@ -220,7 +218,7 @@ void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) } } -SEC("struct_ops/dctcp_cwnd_undo") +SEC("struct_ops") __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) { const struct dctcp *ca = inet_csk_ca(sk); @@ -230,7 +228,7 @@ __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -SEC("struct_ops/dctcp_reno_cong_avoid") +SEC("struct_ops") void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { tcp_reno_cong_avoid(sk, ack, acked); diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c index d836f7c372f0..a946b070bb06 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -13,7 +13,8 @@ char _license[] SEC("license") = "GPL"; const char cubic[] = "cubic"; -void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(dctcp_nouse_release, struct sock *sk) { bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, (void *)cubic, sizeof(cubic)); diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c index 2ecd833dcd41..633164e704dd 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c +++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -8,7 +8,8 @@ char _license[] SEC("license") = "X"; -void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(nogpltcp_init, struct sock *sk) { } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c index d6467fcb1deb..0016c90e9c13 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -6,13 +6,13 @@ char _license[] SEC("license") = "GPL"; -SEC("struct_ops/incompl_cong_ops_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/incompl_cong_ops_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c index 52b610357309..f95862f570b7 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -27,7 +27,7 @@ extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; -SEC("struct_ops/init") +SEC("struct_ops") void BPF_PROG(init, struct sock *sk) { bbr_init(sk); @@ -35,38 +35,38 @@ void BPF_PROG(init, struct sock *sk) cubictcp_init(sk); } -SEC("struct_ops/in_ack_event") +SEC("struct_ops") void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) { dctcp_update_alpha(sk, flags); } -SEC("struct_ops/cong_control") +SEC("struct_ops") void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { bbr_main(sk, ack, flag, rs); } -SEC("struct_ops/cong_avoid") +SEC("struct_ops") void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) { cubictcp_cong_avoid(sk, ack, acked); } -SEC("struct_ops/sndbuf_expand") +SEC("struct_ops") u32 BPF_PROG(sndbuf_expand, struct sock *sk) { return bbr_sndbuf_expand(sk); } -SEC("struct_ops/undo_cwnd") +SEC("struct_ops") u32 BPF_PROG(undo_cwnd, struct sock *sk) { bbr_undo_cwnd(sk); return dctcp_cwnd_undo(sk); } -SEC("struct_ops/cwnd_event") +SEC("struct_ops") void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) { bbr_cwnd_event(sk, event); @@ -74,7 +74,7 @@ void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) cubictcp_cwnd_event(sk, event); } -SEC("struct_ops/ssthresh") +SEC("struct_ops") u32 BPF_PROG(ssthresh, struct sock *sk) { bbr_ssthresh(sk); @@ -82,13 +82,13 @@ u32 BPF_PROG(ssthresh, struct sock *sk) return cubictcp_recalc_ssthresh(sk); } -SEC("struct_ops/min_tso_segs") +SEC("struct_ops") u32 BPF_PROG(min_tso_segs, struct sock *sk) { return bbr_min_tso_segs(sk); } -SEC("struct_ops/set_state") +SEC("struct_ops") void BPF_PROG(set_state, struct sock *sk, u8 new_state) { bbr_set_state(sk, new_state); @@ -96,7 +96,7 @@ void BPF_PROG(set_state, struct sock *sk, u8 new_state) cubictcp_state(sk, new_state); } -SEC("struct_ops/pkts_acked") +SEC("struct_ops") void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) { cubictcp_acked(sk, sample); diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c index c06f4a41c21a..54f916a931c6 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c @@ -7,7 +7,7 @@ char _license[] SEC("license") = "GPL"; -SEC("struct_ops/unsupp_cong_op_get_info") +SEC("struct_ops") size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c index 8581cad321b6..e4bd82bc0d01 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_update.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c @@ -9,31 +9,31 @@ char _license[] SEC("license") = "GPL"; int ca1_cnt = 0; int ca2_cnt = 0; -SEC("struct_ops/ca_update_1_init") +SEC("struct_ops") void BPF_PROG(ca_update_1_init, struct sock *sk) { ca1_cnt++; } -SEC("struct_ops/ca_update_2_init") +SEC("struct_ops") void BPF_PROG(ca_update_2_init, struct sock *sk) { ca2_cnt++; } -SEC("struct_ops/ca_update_cong_control") +SEC("struct_ops") void BPF_PROG(ca_update_cong_control, struct sock *sk, const struct rate_sample *rs) { } -SEC("struct_ops/ca_update_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(ca_update_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/ca_update_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index 4a369439335e..a58b5194fc89 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -10,17 +10,17 @@ char _license[] SEC("license") = "GPL"; #define min(a, b) ((a) < (b) ? (a) : (b)) -static inline unsigned int tcp_left_out(const struct tcp_sock *tp) +static unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } -static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) +static unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } -SEC("struct_ops/write_sk_pacing_init") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_init, struct sock *sk) { #ifdef ENABLE_ATOMICS_TESTS @@ -31,7 +31,7 @@ void BPF_PROG(write_sk_pacing_init, struct sock *sk) #endif } -SEC("struct_ops/write_sk_pacing_cong_control") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, const struct rate_sample *rs) { @@ -43,13 +43,13 @@ void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, tp->app_limited = (tp->delivered + tcp_packets_in_flight(tp)) ?: 1; } -SEC("struct_ops/write_sk_pacing_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/write_sk_pacing_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; -- cgit v1.2.3 From b1d87ae9b0d3d91767d85183e40c96f4229a6c21 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:21 -0700 Subject: selftests/bpf: Rename tcp-cc private struct in bpf_cubic and bpf_dctcp The "struct bictcp" and "struct dctcp" are private to the bpf prog and they are stored in the private buffer in inet_csk(sk)->icsk_ca_priv. Hence, there is no bpf CO-RE required. The same struct name exists in the vmlinux.h. To reuse vmlinux.h, they need to be renamed such that the bpf prog logic will be immuned from the kernel tcp-cc changes. This patch adds a "bpf_" prefix to them. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_cubic.c | 20 ++++++++++---------- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 16 ++++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 53a98b609e5f..53872e2d2c52 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -70,7 +70,7 @@ static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ)) / (bic_scale * 10); /* BIC TCP Parameters */ -struct bictcp { +struct bpf_bictcp { __u32 cnt; /* increase cwnd by 1 after ACKs */ __u32 last_max_cwnd; /* last maximum snd_cwnd */ __u32 last_cwnd; /* the last snd_cwnd */ @@ -91,7 +91,7 @@ struct bictcp { __u32 curr_rtt; /* the minimum rtt of current round */ }; -static void bictcp_reset(struct bictcp *ca) +static void bictcp_reset(struct bpf_bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; @@ -161,7 +161,7 @@ static __u32 bictcp_clock_us(const struct sock *sk) static void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; @@ -172,7 +172,7 @@ static void bictcp_hystart_reset(struct sock *sk) SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); @@ -187,7 +187,7 @@ SEC("struct_ops") void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 now = tcp_jiffies32; __s32 delta; @@ -261,7 +261,7 @@ static __u32 cubic_root(__u64 a) /* * Compute congestion window to use. */ -static void bictcp_update(struct bictcp *ca, __u32 cwnd, __u32 acked) +static void bictcp_update(struct bpf_bictcp *ca, __u32 cwnd, __u32 acked) { __u32 delta, bic_target, max_cnt; __u64 offs, t; @@ -378,7 +378,7 @@ SEC("struct_ops") void BPF_PROG(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; @@ -398,7 +398,7 @@ SEC("struct_ops") __u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ @@ -446,7 +446,7 @@ static __u32 hystart_ack_delay(struct sock *sk) static void hystart_update(struct sock *sk, __u32 delay) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { @@ -495,7 +495,7 @@ SEC("struct_ops") void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 delay; bpf_cubic_acked_called = 1; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index b74dbb121384..a8673b7ff35d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -35,7 +35,7 @@ struct { #define DCTCP_MAX_ALPHA 1024U -struct dctcp { +struct bpf_dctcp { __u32 old_delivered; __u32 old_delivered_ce; __u32 prior_rcv_nxt; @@ -48,7 +48,7 @@ struct dctcp { static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */ static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA; -static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) +static void dctcp_reset(const struct tcp_sock *tp, struct bpf_dctcp *ca) { ca->next_seq = tp->snd_nxt; @@ -60,7 +60,7 @@ SEC("struct_ops") void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); int *stg; if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { @@ -106,7 +106,7 @@ void BPF_PROG(dctcp_init, struct sock *sk) SEC("struct_ops") __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; @@ -117,7 +117,7 @@ SEC("struct_ops") void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { @@ -145,7 +145,7 @@ void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) static void dctcp_react_to_loss(struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; @@ -202,7 +202,7 @@ static void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, SEC("struct_ops") void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); switch (ev) { case CA_EVENT_ECN_IS_CE: @@ -221,7 +221,7 @@ void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) SEC("struct_ops") __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + const struct bpf_dctcp *ca = inet_csk_ca(sk); return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } -- cgit v1.2.3 From a824c9a8a4d9a654d62674a8425c0f1abc9c3d33 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:22 -0700 Subject: selftests/bpf: Use bpf_tracing_net.h in bpf_cubic This patch uses bpf_tracing_net.h (i.e. vmlinux.h) in bpf_cubic. This will allow to retire the bpf_tcp_helpers.h and consolidate tcp-cc tests to vmlinux.h. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_cubic.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 53872e2d2c52..d665b8a15cc4 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -14,14 +14,22 @@ * "ca->ack_cnt / delta" operation. */ -#include -#include -#include -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" +#include char _license[] SEC("license") = "GPL"; #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} +#define after(seq2, seq1) before(seq1, seq2) + +extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; +extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta -- cgit v1.2.3 From 6ad4e6e94697e960630594907666bc09e78a3b8a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:23 -0700 Subject: selftests/bpf: Use bpf_tracing_net.h in bpf_dctcp This patch uses bpf_tracing_net.h (i.e. vmlinux.h) in bpf_dctcp. This will allow to retire the bpf_tcp_helpers.h and consolidate tcp-cc tests to vmlinux.h. It will have a dup on min/max macros with the bpf_cubic. It could be further refactored in the future. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index a8673b7ff35d..3c9ffe340312 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -6,15 +6,23 @@ * the kernel BPF logic. */ -#include -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" + +#ifndef EBUSY +#define EBUSY 16 +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6eee55aa769c241182da73a391980f51edba27dc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:24 -0700 Subject: selftests/bpf: Remove bpf_tcp_helpers.h usages from other misc bpf tcp-cc tests This patch removed the final few bpf_tcp_helpers.h usages in some misc bpf tcp-cc tests and replace it with bpf_tracing_net.h (i.e. vmlinux.h) Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-9-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_dctcp_release.c | 7 +------ tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c | 5 +---- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c index a946b070bb06..c91763f248b2 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -1,14 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; const char cubic[] = "cubic"; diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c index 633164e704dd..8a7a4c1b54e8 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c +++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -1,10 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include +#include "bpf_tracing_net.h" #include -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "X"; -- cgit v1.2.3 From c075c9c4af289bb5956b0164283a85cf9c293c8e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:25 -0700 Subject: selftests/bpf: Remove the bpf_tcp_helpers.h usages from other non tcp-cc tests The patch removes the remaining bpf_tcp_helpers.h usages in the non tcp-cc networking tests. It either replaces it with bpf_tracing_net.h or just removed it because the test is not actually using any kernel sockets. For the later, the missing macro (mainly SOL_TCP) is defined locally. An exception is the test_sock_fields which is testing the "struct bpf_sock" type instead of the kernel sock type. Whenever "vmlinux.h" is used instead, it hits a verifier error on doing arithmetic on the sock_common pointer: ; return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1); @ test_sock_fields.c:54 21: (61) r2 = *(u32 *)(r1 +28) ; R1_w=sock_common() R2_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 22: (56) if w2 != 0x0 goto pc-6 ; R2_w=0 23: (b7) r3 = 28 ; R3_w=28 24: (bf) r2 = r1 ; R1_w=sock_common() R2_w=sock_common() 25: (0f) r2 += r3 R2 pointer arithmetic on sock_common prohibited Hence, instead of including bpf_tracing_net.h, the test_sock_fields test defines a tcp_sock with one lsndtime field in it. Another highlight is, in sockopt_qos_to_cc.c, the tcp_cc_eq() is replaced by bpf_strncmp(). tcp_cc_eq() was a workaround in bpf_tcp_helpers.h before bpf_strncmp had been added. The SOL_IPV6 addition to bpf_tracing_net.h is needed by the test_tcpbpf_kern test. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_tracing_net.h | 1 + tools/testing/selftests/bpf/progs/connect4_prog.c | 6 ++++-- tools/testing/selftests/bpf/progs/mptcp_sock.c | 4 ++-- tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c | 16 ++++++---------- .../selftests/bpf/progs/test_btf_skc_cls_ingress.c | 16 +++++----------- tools/testing/selftests/bpf/progs/test_sock_fields.c | 5 ++++- tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c | 13 +------------ 7 files changed, 23 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index ba4ca0334586..59843b430f76 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -25,6 +25,7 @@ #define IP_TOS 1 +#define SOL_IPV6 41 #define IPV6_TCLASS 67 #define IPV6_AUTOFLOWLABEL 70 diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 7ef49ec04838..bec529da7c9d 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -14,8 +14,6 @@ #include #include -#include "bpf_tcp_helpers.h" - #define SRC_REWRITE_IP4 0x7f000004U #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 @@ -32,6 +30,10 @@ #define IFNAMSIZ 16 #endif +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + __attribute__ ((noinline)) __weak int do_bind(struct bpf_sock_addr *ctx) { diff --git a/tools/testing/selftests/bpf/progs/mptcp_sock.c b/tools/testing/selftests/bpf/progs/mptcp_sock.c index 91a0d7eff2ac..f3acb90588c7 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_sock.c +++ b/tools/testing/selftests/bpf/progs/mptcp_sock.c @@ -2,9 +2,9 @@ /* Copyright (c) 2020, Tessares SA. */ /* Copyright (c) 2022, SUSE. */ -#include +#include "bpf_tracing_net.h" #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; __u32 token = 0; diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c index 83753b00a556..5c3614333b01 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c +++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c @@ -1,24 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include -#include -#include -#include -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; __s32 page_size = 0; +const char cc_reno[TCP_CA_NAME_MAX] = "reno"; +const char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; + SEC("cgroup/setsockopt") int sockopt_qos_to_cc(struct bpf_sockopt *ctx) { void *optval_end = ctx->optval_end; int *optval = ctx->optval; char buf[TCP_CA_NAME_MAX]; - char cc_reno[TCP_CA_NAME_MAX] = "reno"; - char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS) goto out; @@ -29,11 +25,11 @@ int sockopt_qos_to_cc(struct bpf_sockopt *ctx) if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) return 0; - if (!tcp_cc_eq(buf, cc_cubic)) + if (bpf_strncmp(buf, sizeof(buf), cc_cubic)) return 0; if (*optval == 0x2d) { - if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno, + if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, (void *)&cc_reno, sizeof(cc_reno))) return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c index e2bea4da194b..f0759efff6ef 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c @@ -1,19 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" + +#ifndef ENOENT +#define ENOENT 2 +#endif struct sockaddr_in6 srv_sa6 = {}; __u16 listen_tp_sport = 0; diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c index f75e531bf36f..196844be349c 100644 --- a/tools/testing/selftests/bpf/progs/test_sock_fields.c +++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c @@ -7,7 +7,6 @@ #include #include -#include "bpf_tcp_helpers.h" enum bpf_linum_array_idx { EGRESS_LINUM_IDX, @@ -42,6 +41,10 @@ struct { __type(value, struct bpf_spinlock_cnt); } sk_pkt_out_cnt10 SEC(".maps"); +struct tcp_sock { + __u32 lsndtime; +} __attribute__((preserve_access_index)); + struct bpf_tcp_sock listen_tp = {}; struct sockaddr_in6 srv_sa6 = {}; struct bpf_tcp_sock cli_tp = {}; diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index a3f3f43fc195..6935f32eeb8f 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -1,18 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" #include "test_tcpbpf.h" struct tcpbpf_globals global = {}; -- cgit v1.2.3 From 6a650816b098a15c4690a22e3889858264d01aa8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:26 -0700 Subject: selftests/bpf: Retire bpf_tcp_helpers.h The previous patches have consolidated the tests to use bpf_tracing_net.h (i.e. vmlinux.h) instead of bpf_tcp_helpers.h. This patch can finally retire the bpf_tcp_helpers.h from the repository. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-11-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_tcp_helpers.h | 241 -------------------------- 1 file changed, 241 deletions(-) delete mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h deleted file mode 100644 index 82a7c9de95f9..000000000000 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BPF_TCP_HELPERS_H -#define __BPF_TCP_HELPERS_H - -#include -#include -#include -#include -#include - -#define BPF_STRUCT_OPS(name, args...) \ -SEC("struct_ops/"#name) \ -BPF_PROG(name, args) - -#ifndef SOL_TCP -#define SOL_TCP 6 -#endif - -#ifndef TCP_CA_NAME_MAX -#define TCP_CA_NAME_MAX 16 -#endif - -#define tcp_jiffies32 ((__u32)bpf_jiffies64()) - -struct sock_common { - unsigned char skc_state; - __u16 skc_num; -} __attribute__((preserve_access_index)); - -enum sk_pacing { - SK_PACING_NONE = 0, - SK_PACING_NEEDED = 1, - SK_PACING_FQ = 2, -}; - -struct sock { - struct sock_common __sk_common; -#define sk_state __sk_common.skc_state - unsigned long sk_pacing_rate; - __u32 sk_pacing_status; /* see enum sk_pacing */ -} __attribute__((preserve_access_index)); - -struct inet_sock { - struct sock sk; -} __attribute__((preserve_access_index)); - -struct inet_connection_sock { - struct inet_sock icsk_inet; - __u8 icsk_ca_state:6, - icsk_ca_setsockopt:1, - icsk_ca_dst_locked:1; - struct { - __u8 pending; - } icsk_ack; - __u64 icsk_ca_priv[104 / sizeof(__u64)]; -} __attribute__((preserve_access_index)); - -struct request_sock { - struct sock_common __req_common; -} __attribute__((preserve_access_index)); - -struct tcp_sock { - struct inet_connection_sock inet_conn; - - __u32 rcv_nxt; - __u32 snd_nxt; - __u32 snd_una; - __u32 window_clamp; - __u8 ecn_flags; - __u32 delivered; - __u32 delivered_ce; - __u32 snd_cwnd; - __u32 snd_cwnd_cnt; - __u32 snd_cwnd_clamp; - __u32 snd_ssthresh; - __u8 syn_data:1, /* SYN includes data */ - syn_fastopen:1, /* SYN includes Fast Open option */ - syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ - syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ - __u32 max_packets_out; - __u32 lsndtime; - __u32 prior_cwnd; - __u64 tcp_mstamp; /* most recent packet received/sent */ - bool is_mptcp; -} __attribute__((preserve_access_index)); - -static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} - -static __always_inline void *inet_csk_ca(const struct sock *sk) -{ - return (void *)inet_csk(sk)->icsk_ca_priv; -} - -static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -static __always_inline bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} -#define after(seq2, seq1) before(seq1, seq2) - -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 - -enum inet_csk_ack_state_t { - ICSK_ACK_SCHED = 1, - ICSK_ACK_TIMER = 2, - ICSK_ACK_PUSHED = 4, - ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ -}; - -enum tcp_ca_event { - CA_EVENT_TX_START = 0, - CA_EVENT_CWND_RESTART = 1, - CA_EVENT_COMPLETE_CWR = 2, - CA_EVENT_LOSS = 3, - CA_EVENT_ECN_NO_CE = 4, - CA_EVENT_ECN_IS_CE = 5, -}; - -struct ack_sample { - __u32 pkts_acked; - __s32 rtt_us; - __u32 in_flight; -} __attribute__((preserve_access_index)); - -struct rate_sample { - __u64 prior_mstamp; /* starting timestamp for interval */ - __u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ - __s32 delivered; /* number of packets delivered over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - __u32 snd_interval_us; /* snd interval for delivered packets */ - __u32 rcv_interval_us; /* rcv interval for delivered packets */ - long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ - int losses; /* number of packets marked lost upon ACK */ - __u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ - __u32 prior_in_flight; /* in flight before this ACK */ - bool is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ - bool is_ack_delayed; /* is this (likely) a delayed ACK? */ -} __attribute__((preserve_access_index)); - -#define TCP_CA_NAME_MAX 16 -#define TCP_CONG_NEEDS_ECN 0x2 - -struct tcp_congestion_ops { - char name[TCP_CA_NAME_MAX]; - __u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); - - /* return slow start threshold (required) */ - __u32 (*ssthresh)(struct sock *sk); - /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked); - /* call before changing ca_state (optional) */ - void (*set_state)(struct sock *sk, __u8 new_state); - /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); - /* call when ack arrives (optional) */ - void (*in_ack_event)(struct sock *sk, __u32 flags); - /* new value of cwnd after loss (required) */ - __u32 (*undo_cwnd)(struct sock *sk); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ - __u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - __u32 (*sndbuf_expand)(struct sock *sk); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. (optional) - */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); - void *owner; -}; - -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) -{ - return tp->snd_cwnd < tp->snd_ssthresh; -} - -static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; - - return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); -} - -static __always_inline bool tcp_cc_eq(const char *a, const char *b) -{ - int i; - - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (a[i] != b[i]) - return false; - if (!a[i]) - break; - } - - return true; -} - -extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; -extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; - -struct mptcp_sock { - struct inet_connection_sock sk; - - __u32 token; - struct sock *first; - char ca_name[TCP_CA_NAME_MAX]; -} __attribute__((preserve_access_index)); - -#endif -- cgit v1.2.3 From 60e0f986e89f10f2de874ff3ce8e2230701c9706 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 May 2024 18:40:04 +0000 Subject: selftest: epoll_busy_poll: epoll busy poll tests Add a simple test for the epoll busy poll ioctls, using the kernel selftest harness. This test ensures that the ioctls have the expected return codes and that the kernel properly gets and sets epoll busy poll parameters. The test can be expanded in the future to do real busy polling (provided another machine to act as the client is available). Signed-off-by: Joe Damato Link: https://lore.kernel.org/r/20240508184008.48264-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 3 +- tools/testing/selftests/net/epoll_busy_poll.c | 320 ++++++++++++++++++++++++++ 3 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/epoll_busy_poll.c (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 74ae1068229c..49a56eb5d036 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -4,6 +4,7 @@ bind_timewait bind_wildcard cmsg_sender diag_uid +epoll_busy_poll fin_ack_lat gro hwtstamp_config diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 052c21438dc4..bd01e4a0be2c 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -67,7 +67,7 @@ TEST_GEN_FILES += ipsec TEST_GEN_FILES += ioam6_parser TEST_GEN_FILES += gro TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap +TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_poll TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen @@ -101,6 +101,7 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk +$(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto $(OUTPUT)/tcp_inq: LDLIBS += -lpthread diff --git a/tools/testing/selftests/net/epoll_busy_poll.c b/tools/testing/selftests/net/epoll_busy_poll.c new file mode 100644 index 000000000000..9dd2830fd67c --- /dev/null +++ b/tools/testing/selftests/net/epoll_busy_poll.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* Basic per-epoll context busy poll test. + * + * Only tests the ioctls, but should be expanded to test two connected hosts in + * the future + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "../kselftest_harness.h" + +/* if the headers haven't been updated, we need to define some things */ +#if !defined(EPOLL_IOC_TYPE) +struct epoll_params { + uint32_t busy_poll_usecs; + uint16_t busy_poll_budget; + uint8_t prefer_busy_poll; + + /* pad the struct to a multiple of 64bits */ + uint8_t __pad; +}; + +#define EPOLL_IOC_TYPE 0x8A +#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params) +#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params) +#endif + +FIXTURE(invalid_fd) +{ + int invalid_fd; + struct epoll_params params; +}; + +FIXTURE_SETUP(invalid_fd) +{ + int ret; + + ret = socket(AF_UNIX, SOCK_DGRAM, 0); + EXPECT_NE(-1, ret) + TH_LOG("error creating unix socket"); + + self->invalid_fd = ret; +} + +FIXTURE_TEARDOWN(invalid_fd) +{ + int ret; + + ret = close(self->invalid_fd); + EXPECT_EQ(0, ret); +} + +TEST_F(invalid_fd, test_invalid_fd) +{ + int ret; + + ret = ioctl(self->invalid_fd, EPIOCGPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCGPARAMS on invalid epoll FD should error"); + + EXPECT_EQ(ENOTTY, errno) + TH_LOG("EPIOCGPARAMS on invalid epoll FD should set errno to ENOTTY"); + + memset(&self->params, 0, sizeof(struct epoll_params)); + + ret = ioctl(self->invalid_fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS on invalid epoll FD should error"); + + EXPECT_EQ(ENOTTY, errno) + TH_LOG("EPIOCSPARAMS on invalid epoll FD should set errno to ENOTTY"); +} + +FIXTURE(epoll_busy_poll) +{ + int fd; + struct epoll_params params; + struct epoll_params *invalid_params; + cap_t caps; +}; + +FIXTURE_SETUP(epoll_busy_poll) +{ + int ret; + + ret = epoll_create1(0); + EXPECT_NE(-1, ret) + TH_LOG("epoll_create1 failed?"); + + self->fd = ret; + + self->caps = cap_get_proc(); + EXPECT_NE(NULL, self->caps); +} + +FIXTURE_TEARDOWN(epoll_busy_poll) +{ + int ret; + + ret = close(self->fd); + EXPECT_EQ(0, ret); + + ret = cap_free(self->caps); + EXPECT_NE(-1, ret) + TH_LOG("unable to free capabilities"); +} + +TEST_F(epoll_busy_poll, test_get_params) +{ + /* begin by getting the epoll params from the kernel + * + * the default should be default and all fields should be zero'd by the + * kernel, so set params fields to garbage to test this. + */ + int ret = 0; + + self->params.busy_poll_usecs = 0xff; + self->params.busy_poll_budget = 0xff; + self->params.prefer_busy_poll = 1; + self->params.__pad = 0xf; + + ret = ioctl(self->fd, EPIOCGPARAMS, &self->params); + EXPECT_EQ(0, ret) + TH_LOG("ioctl EPIOCGPARAMS should succeed"); + + EXPECT_EQ(0, self->params.busy_poll_usecs) + TH_LOG("EPIOCGPARAMS busy_poll_usecs should have been 0"); + + EXPECT_EQ(0, self->params.busy_poll_budget) + TH_LOG("EPIOCGPARAMS busy_poll_budget should have been 0"); + + EXPECT_EQ(0, self->params.prefer_busy_poll) + TH_LOG("EPIOCGPARAMS prefer_busy_poll should have been 0"); + + EXPECT_EQ(0, self->params.__pad) + TH_LOG("EPIOCGPARAMS __pad should have been 0"); + + self->invalid_params = (struct epoll_params *)0xdeadbeef; + ret = ioctl(self->fd, EPIOCGPARAMS, self->invalid_params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCGPARAMS should error with invalid params"); + + EXPECT_EQ(EFAULT, errno) + TH_LOG("EPIOCGPARAMS with invalid params should set errno to EFAULT"); +} + +TEST_F(epoll_busy_poll, test_set_invalid) +{ + int ret; + + memset(&self->params, 0, sizeof(struct epoll_params)); + + self->params.__pad = 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS non-zero __pad should error"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS non-zero __pad errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = (uint32_t)INT_MAX + 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error busy_poll_usecs > S32_MAX"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS busy_poll_usecs > S32_MAX errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = 32; + self->params.prefer_busy_poll = 2; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error prefer_busy_poll > 1"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("EPIOCSPARAMS prefer_busy_poll > 1 errno should be EINVAL"); + + self->params.__pad = 0; + self->params.busy_poll_usecs = 32; + self->params.prefer_busy_poll = 1; + + /* set budget well above kernel's NAPI_POLL_WEIGHT of 64 */ + self->params.busy_poll_budget = UINT16_MAX; + + /* test harness should run with CAP_NET_ADMIN, but let's make sure */ + cap_flag_value_t tmp; + + ret = cap_get_flag(self->caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &tmp); + EXPECT_EQ(0, ret) + TH_LOG("unable to get CAP_NET_ADMIN cap flag"); + + EXPECT_EQ(CAP_SET, tmp) + TH_LOG("expecting CAP_NET_ADMIN to be set for the test harness"); + + /* at this point we know CAP_NET_ADMIN is available, so setting the + * params with a busy_poll_budget > NAPI_POLL_WEIGHT should succeed + */ + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCSPARAMS should allow busy_poll_budget > NAPI_POLL_WEIGHT"); + + /* remove CAP_NET_ADMIN from our effective set */ + cap_value_t net_admin[] = { CAP_NET_ADMIN }; + + ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_CLEAR); + EXPECT_EQ(0, ret) + TH_LOG("couldnt clear CAP_NET_ADMIN"); + + ret = cap_set_proc(self->caps); + EXPECT_EQ(0, ret) + TH_LOG("cap_set_proc should drop CAP_NET_ADMIN"); + + /* this is now expected to fail */ + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error busy_poll_budget > NAPI_POLL_WEIGHT"); + + EXPECT_EQ(EPERM, errno) + TH_LOG("EPIOCSPARAMS errno should be EPERM busy_poll_budget > NAPI_POLL_WEIGHT"); + + /* restore CAP_NET_ADMIN to our effective set */ + ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_SET); + EXPECT_EQ(0, ret) + TH_LOG("couldn't restore CAP_NET_ADMIN"); + + ret = cap_set_proc(self->caps); + EXPECT_EQ(0, ret) + TH_LOG("cap_set_proc should set CAP_NET_ADMIN"); + + self->invalid_params = (struct epoll_params *)0xdeadbeef; + ret = ioctl(self->fd, EPIOCSPARAMS, self->invalid_params); + + EXPECT_EQ(-1, ret) + TH_LOG("EPIOCSPARAMS should error when epoll_params is invalid"); + + EXPECT_EQ(EFAULT, errno) + TH_LOG("EPIOCSPARAMS should set errno to EFAULT when epoll_params is invalid"); +} + +TEST_F(epoll_busy_poll, test_set_and_get_valid) +{ + int ret; + + memset(&self->params, 0, sizeof(struct epoll_params)); + + self->params.busy_poll_usecs = 25; + self->params.busy_poll_budget = 16; + self->params.prefer_busy_poll = 1; + + ret = ioctl(self->fd, EPIOCSPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCSPARAMS with valid params should not error"); + + /* check that the kernel returns the same values back */ + + memset(&self->params, 0, sizeof(struct epoll_params)); + + ret = ioctl(self->fd, EPIOCGPARAMS, &self->params); + + EXPECT_EQ(0, ret) + TH_LOG("EPIOCGPARAMS should not error"); + + EXPECT_EQ(25, self->params.busy_poll_usecs) + TH_LOG("params.busy_poll_usecs incorrect"); + + EXPECT_EQ(16, self->params.busy_poll_budget) + TH_LOG("params.busy_poll_budget incorrect"); + + EXPECT_EQ(1, self->params.prefer_busy_poll) + TH_LOG("params.prefer_busy_poll incorrect"); + + EXPECT_EQ(0, self->params.__pad) + TH_LOG("params.__pad was not 0"); +} + +TEST_F(epoll_busy_poll, test_invalid_ioctl) +{ + int invalid_ioctl = EPIOCGPARAMS + 10; + int ret; + + ret = ioctl(self->fd, invalid_ioctl, &self->params); + + EXPECT_EQ(-1, ret) + TH_LOG("invalid ioctl should return error"); + + EXPECT_EQ(EINVAL, errno) + TH_LOG("invalid ioctl should set errno to EINVAL"); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From 20434d2d896f85b38fa1fe91b8739afcd9cde3b3 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:08 +0800 Subject: selftests/bpf: Add post_socket_cb for network_helper_opts __start_server() sets SO_REUSPORT through setsockopt() when the parameter 'reuseport' is set. This patch makes it more flexible by adding a function pointer post_socket_cb into struct network_helper_opts. The 'const struct post_socket_opts *cb_opts' args in the post_socket_cb is for the future extension. The 'reuseport' parameter can be dropped. Now the original start_reuseport_server() can be implemented by setting a newly defined reuseport_cb() function pointer to post_socket_cb filed of struct network_helper_opts. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/470cb82f209f055fc7fb39c66c6b090b5b7ed2b2.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 24 +++++++++++++++--------- tools/testing/selftests/bpf/network_helpers.h | 3 +++ 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 054d26e383e0..35250e6cde7f 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -81,9 +81,8 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, - bool reuseport, const struct network_helper_opts *opts) + const struct network_helper_opts *opts) { - int on = 1; int fd; fd = socket(addr->sa_family, type, opts->proto); @@ -95,9 +94,8 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl if (settimeo(fd, opts->timeout_ms)) goto error_close; - if (reuseport && - setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) { - log_err("Failed to set SO_REUSEPORT"); + if (opts->post_socket_cb && opts->post_socket_cb(fd, NULL)) { + log_err("Failed to call post_socket_cb"); goto error_close; } @@ -132,7 +130,14 @@ int start_server(int family, int type, const char *addr_str, __u16 port, if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, (struct sockaddr *)&addr, addrlen, false, &opts); + return __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); +} + +static int reuseport_cb(int fd, const struct post_socket_opts *opts) +{ + int on = 1; + + return setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)); } int *start_reuseport_server(int family, int type, const char *addr_str, @@ -140,6 +145,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, { struct network_helper_opts opts = { .timeout_ms = timeout_ms, + .post_socket_cb = reuseport_cb, }; struct sockaddr_storage addr; unsigned int nr_fds = 0; @@ -156,7 +162,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -165,7 +171,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -183,7 +189,7 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t l if (!opts) opts = &default_opts; - return __start_server(type, (struct sockaddr *)addr, len, 0, opts); + return __start_server(type, (struct sockaddr *)addr, len, opts); } void free_fds(int *fds, unsigned int nr_close_fds) diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index c62b54daa914..883c7ea9d8d5 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -21,6 +21,8 @@ typedef __u16 __sum16; #define VIP_NUM 5 #define MAGIC_BYTES 123 +struct post_socket_opts {}; + struct network_helper_opts { const char *cc; int timeout_ms; @@ -28,6 +30,7 @@ struct network_helper_opts { bool noconnect; int type; int proto; + int (*post_socket_cb)(int fd, const struct post_socket_opts *opts); }; /* ipv4 test vector */ -- cgit v1.2.3 From 5166b3e3e30a8eb93f7182283ed4db719bdfde1a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:09 +0800 Subject: selftests/bpf: Use start_server_addr in sockopt_inherit Include network_helpers.h in prog_tests/sockopt_inherit.c, use public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. Add a helper custom_cb() to set SOL_CUSTOM sockopt looply, set it to post_socket_cb pointer of struct network_helper_opts, and pass it to start_server_addr(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/687af66f743a0bf15cdba372c5f71fe64863219e.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockopt_inherit.c | 33 ++++++++-------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 917f486db826..51901359b603 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include "cgroup_helpers.h" +#include "network_helpers.h" #include "sockopt_inherit.skel.h" @@ -98,47 +99,36 @@ static void *server_thread(void *arg) return (void *)(long)err; } -static int start_server(void) +static int custom_cb(int fd, const struct post_socket_opts *opts) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; char buf; int err; - int fd; int i; - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create server socket"); - return -1; - } - for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) { buf = 0x01; err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1); if (err) { log_err("Failed to call setsockopt(%d)", i); - close(fd); return -1; } } - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { - log_err("Failed to bind socket"); - close(fd); - return -1; - } - - return fd; + return 0; } static void run_test(int cgroup_fd) { struct bpf_link *link_getsockopt = NULL; struct bpf_link *link_setsockopt = NULL; + struct network_helper_opts opts = { + .post_socket_cb = custom_cb, + }; int server_fd = -1, client_fd; + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; struct sockopt_inherit *obj; void *server_err; pthread_t tid; @@ -160,7 +150,8 @@ static void run_test(int cgroup_fd) if (!ASSERT_OK_PTR(link_setsockopt, "cg-attach-setsockopt")) goto close_bpf_object; - server_fd = start_server(); + server_fd = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr, + sizeof(addr), &opts); if (!ASSERT_GE(server_fd, 0, "start_server")) goto close_bpf_object; -- cgit v1.2.3 From 49e1fa8dbd81340f610057be3f3909f24c232807 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:10 +0800 Subject: selftests/bpf: Use start_server_addr in test_tcp_check_syncookie Include network_helpers.h in test_tcp_check_syncookie_user.c, use public helper start_server_addr() in it instead of the local defined function start_server(). This can avoid duplicate code. Add two helpers v6only_true() and v6only_false() to set IPV6_V6ONLY sockopt to true or false, set them to post_socket_cb pointer of struct network_helper_opts, and pass it to start_server_setsockopt(). In order to use functions defined in network_helpers.c, Makefile needs to be updated too. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/e0c5324f5da84f453f47543536e70f126eaa8678.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 1 + .../selftests/bpf/test_tcp_check_syncookie_user.c | 68 ++++++++-------------- 2 files changed, 25 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e962a0bd8a78..135023a357b3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -310,6 +310,7 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) $(OUTPUT)/xsk.o: $(BPFOBJ) +$(OUTPUT)/test_tcp_check_syncookie_user: $(NETWORK_HELPERS) BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index 32df93747095..bf60bc267cbc 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -16,44 +16,7 @@ #include #include "cgroup_helpers.h" - -static int start_server(const struct sockaddr *addr, socklen_t len, bool dual) -{ - int mode = !dual; - int fd; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (addr->sa_family == AF_INET6) { - if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&mode, - sizeof(mode)) == -1) { - log_err("Failed to set the dual-stack mode"); - goto close_out; - } - } - - if (bind(fd, addr, len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} +#include "network_helpers.h" static int connect_to_server(const struct sockaddr *addr, socklen_t len) { @@ -216,8 +179,23 @@ static bool get_port(int server_fd, in_port_t *port) return true; } +static int v6only_true(int fd, const struct post_socket_opts *opts) +{ + int mode = true; + + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); +} + +static int v6only_false(int fd, const struct post_socket_opts *opts) +{ + int mode = false; + + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); +} + int main(int argc, char **argv) { + struct network_helper_opts opts = { 0 }; struct sockaddr_in addr4; struct sockaddr_in6 addr6; struct sockaddr_in addr4dual; @@ -259,18 +237,20 @@ int main(int argc, char **argv) addr6dual.sin6_addr = in6addr_any; addr6dual.sin6_port = 0; - server = start_server((const struct sockaddr *)&addr4, sizeof(addr4), - false); + server = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr4, + sizeof(addr4), NULL); if (server == -1 || !get_port(server, &addr4.sin_port)) goto err; - server_v6 = start_server((const struct sockaddr *)&addr6, - sizeof(addr6), false); + opts.post_socket_cb = v6only_true; + server_v6 = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6, + sizeof(addr6), &opts); if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port)) goto err; - server_dual = start_server((const struct sockaddr *)&addr6dual, - sizeof(addr6dual), true); + opts.post_socket_cb = v6only_false; + server_dual = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6dual, + sizeof(addr6dual), &opts); if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) goto err; -- cgit v1.2.3 From 5059c73eca67e686dea42af079c41857cb00a5a6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:11 +0800 Subject: selftests/bpf: Use connect_to_fd in sockopt_inherit This patch uses public helper connect_to_fd() exported in network_helpers.h instead of the local defined function connect_to_server() in prog_tests/sockopt_inherit.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/71db79127cc160b0643fd9a12c70ae019ae076a1.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockopt_inherit.c | 31 +--------------------- 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 51901359b603..1d3a20f01b60 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -10,35 +10,6 @@ #define CUSTOM_INHERIT2 1 #define CUSTOM_LISTENER 2 -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create client socket"); - return -1; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server"); - goto out; - } - - return fd; - -out: - close(fd); - return -1; -} - static int verify_sockopt(int fd, int optname, const char *msg, char expected) { socklen_t optlen = 1; @@ -164,7 +135,7 @@ static void run_test(int cgroup_fd) pthread_cond_wait(&server_started, &server_started_mtx); pthread_mutex_unlock(&server_started_mtx); - client_fd = connect_to_server(server_fd); + client_fd = connect_to_fd(server_fd, 0); if (!ASSERT_GE(client_fd, 0, "connect_to_server")) goto close_server_fd; -- cgit v1.2.3 From 65a3f0df44dd3db0f77e6ccff0a126969abc0da4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:12 +0800 Subject: selftests/bpf: Use connect_to_fd in test_tcp_check_syncookie This patch uses public helper connect_to_fd() exported in network_helpers.h instead of the local defined function connect_to_server() in test_tcp_check_syncookie_user.c. This can avoid duplicate code. Then the arguments "addr" and "len" of run_test() become useless, drop them too. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/e0ae6b790ac0abc7193aadfb2660c8c9eb0fe1f0.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/test_tcp_check_syncookie_user.c | 38 +++------------------- 1 file changed, 5 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index bf60bc267cbc..b928474f3bf9 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -18,30 +18,6 @@ #include "cgroup_helpers.h" #include "network_helpers.h" -static int connect_to_server(const struct sockaddr *addr, socklen_t len) -{ - int fd = -1; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)addr, len) == -1) { - log_err("Fail to connect to server"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static int get_map_fd_by_prog_id(int prog_id, bool *xdp) { struct bpf_prog_info info = {}; @@ -80,8 +56,7 @@ err: return map_fd; } -static int run_test(int server_fd, int results_fd, bool xdp, - const struct sockaddr *addr, socklen_t len) +static int run_test(int server_fd, int results_fd, bool xdp) { int client = -1, srv_client = -1; int ret = 0; @@ -107,7 +82,7 @@ static int run_test(int server_fd, int results_fd, bool xdp, goto err; } - client = connect_to_server(addr, len); + client = connect_to_fd(server_fd, 0); if (client == -1) goto err; @@ -254,16 +229,13 @@ int main(int argc, char **argv) if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) goto err; - if (run_test(server, results, xdp, - (const struct sockaddr *)&addr4, sizeof(addr4))) + if (run_test(server, results, xdp)) goto err; - if (run_test(server_v6, results, xdp, - (const struct sockaddr *)&addr6, sizeof(addr6))) + if (run_test(server_v6, results, xdp)) goto err; - if (run_test(server_dual, results, xdp, - (const struct sockaddr *)&addr4dual, sizeof(addr4dual))) + if (run_test(server_dual, results, xdp)) goto err; printf("ok\n"); -- cgit v1.2.3 From 7abbf38cd8edb92bc72fe3405f8a0bf19f7761c2 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:13 +0800 Subject: selftests/bpf: Drop get_port in test_tcp_check_syncookie The arguments "addr" and "len" of run_test() have dropped. This makes function get_port() useless. Drop it from test_tcp_check_syncookie_user.c. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/a9b5c8064ab4cbf0f68886fe0e4706428b8d0d47.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/test_tcp_check_syncookie_user.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index b928474f3bf9..7b5fc98838cd 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -139,21 +139,6 @@ out: return ret; } -static bool get_port(int server_fd, in_port_t *port) -{ - struct sockaddr_in addr; - socklen_t len = sizeof(addr); - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - return false; - } - - /* sin_port and sin6_port are located at the same offset. */ - *port = addr.sin_port; - return true; -} - static int v6only_true(int fd, const struct post_socket_opts *opts) { int mode = true; @@ -214,19 +199,19 @@ int main(int argc, char **argv) server = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr4, sizeof(addr4), NULL); - if (server == -1 || !get_port(server, &addr4.sin_port)) + if (server == -1) goto err; opts.post_socket_cb = v6only_true; server_v6 = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6, sizeof(addr6), &opts); - if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port)) + if (server_v6 == -1) goto err; opts.post_socket_cb = v6only_false; server_dual = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6dual, sizeof(addr6dual), &opts); - if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) + if (server_dual == -1) goto err; if (run_test(server, results, xdp)) -- cgit v1.2.3 From 90f01afb0dfafbc9b094bb61e61a4ac297d9d0d2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 20:52:58 -0700 Subject: perf ui browser: Avoid SEGV on title If the title is NULL then it can lead to a SEGV. Fixes: 769e6a1e15bdbbaf ("perf ui browser: Don't save pointer to stack memory") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240508035301.1554434-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index c4cdf2ea69b7..19503e838738 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -203,7 +203,7 @@ void ui_browser__refresh_dimensions(struct ui_browser *browser) void ui_browser__handle_resize(struct ui_browser *browser) { ui__refresh_dimensions(false); - ui_browser__show(browser, browser->title, ui_helpline__current); + ui_browser__show(browser, browser->title ?: "", ui_helpline__current); ui_browser__refresh(browser); } -- cgit v1.2.3 From de6a908384fb1a8327ba46a2baec67d1dfe9a3e1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 20:52:59 -0700 Subject: perf comm: Fix comm_str__put() for reference count checking Searching for the entry in the array needs to avoid the intermediate pointer with reference count checking. Refactor the array removal to binary search for the entry. Change the array to hold an entry with a reference count (so the intermediate pointer can work) and remove from the array when the reference count on a comm_str falls to 1. Fixes: 13ca628716c6f2c3 ("perf comm: Add reference count checking to 'struct comm_str'") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240508035301.1554434-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/comm.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c index 1aa9a08e5b03..233f2b6edf52 100644 --- a/tools/perf/util/comm.c +++ b/tools/perf/util/comm.c @@ -19,6 +19,8 @@ static struct comm_strs { int capacity; } _comm_strs; +static void comm_strs__remove_if_last(struct comm_str *cs); + static void comm_strs__init(void) { init_rwsem(&_comm_strs.lock); @@ -58,22 +60,15 @@ static struct comm_str *comm_str__get(struct comm_str *cs) static void comm_str__put(struct comm_str *cs) { - if (cs && refcount_dec_and_test(comm_str__refcnt(cs))) { - struct comm_strs *comm_strs = comm_strs__get(); - int i; + if (!cs) + return; - down_write(&comm_strs->lock); - for (i = 0; i < comm_strs->num_strs; i++) { - if (comm_strs->strs[i] == cs) - break; - } - for (; i < comm_strs->num_strs - 1; i++) - comm_strs->strs[i] = comm_strs->strs[i + 1]; - - comm_strs->num_strs--; - up_write(&comm_strs->lock); + if (refcount_dec_and_test(comm_str__refcnt(cs))) { RC_CHK_FREE(cs); } else { + if (refcount_read(comm_str__refcnt(cs)) == 1) + comm_strs__remove_if_last(cs); + RC_CHK_PUT(cs); } } @@ -107,6 +102,28 @@ static int comm_str__search(const void *_key, const void *_member) return strcmp(key, comm_str__str(member)); } +static void comm_strs__remove_if_last(struct comm_str *cs) +{ + struct comm_strs *comm_strs = comm_strs__get(); + + down_write(&comm_strs->lock); + /* + * Are there only references from the array, if so remove the array + * reference under the write lock so that we don't race with findnew. + */ + if (refcount_read(comm_str__refcnt(cs)) == 1) { + struct comm_str **entry; + + entry = bsearch(comm_str__str(cs), comm_strs->strs, comm_strs->num_strs, + sizeof(struct comm_str *), comm_str__search); + comm_str__put(*entry); + for (int i = entry - comm_strs->strs; i < comm_strs->num_strs - 1; i++) + comm_strs->strs[i] = comm_strs->strs[i + 1]; + comm_strs->num_strs--; + } + up_write(&comm_strs->lock); +} + static struct comm_str *__comm_strs__find(struct comm_strs *comm_strs, const char *str) { struct comm_str **result; @@ -158,7 +175,7 @@ static struct comm_str *comm_strs__findnew(const char *str) } } up_write(&comm_strs->lock); - return result; + return comm_str__get(result); } struct comm *comm__new(const char *str, u64 timestamp, bool exec) -- cgit v1.2.3 From 45b4f402a6b782352c4bafcff682bfb01da9ca05 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 20:53:00 -0700 Subject: perf report: Avoid SEGV in report__setup_sample_type() In some cases evsel->name is lazily initialized in evsel__name(). If not initialized passing NULL to strstr() leads to a SEGV. Fixes: ccb17caecfbd542f ("perf report: Set PERF_SAMPLE_DATA_SRC bit for Arm SPE event") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240508035301.1554434-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 0892b6e3e5e7..69618fb0110b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -429,7 +429,7 @@ static int report__setup_sample_type(struct report *rep) * compatibility, set the bit if it's an old perf data file. */ evlist__for_each_entry(session->evlist, evsel) { - if (strstr(evsel->name, "arm_spe") && + if (strstr(evsel__name(evsel), "arm_spe") && !(sample_type & PERF_SAMPLE_DATA_SRC)) { evsel->core.attr.sample_type |= PERF_SAMPLE_DATA_SRC; sample_type |= PERF_SAMPLE_DATA_SRC; -- cgit v1.2.3 From 3536c2575e88a890cf696b4ccd3da36bc937853b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 7 May 2024 20:53:01 -0700 Subject: perf thread: Fixes to thread__new() related to initializing comm Freeing the thread on failure won't work with reference count checking, use thread__delete(). Don't allocate the comm_str, use a stack allocation instead. Fixes: f6005cafebab72f8 ("perf thread: Add reference count checking") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240508035301.1554434-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 0a473112f881..87c59aa9fe38 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -39,12 +39,13 @@ int thread__init_maps(struct thread *thread, struct machine *machine) struct thread *thread__new(pid_t pid, pid_t tid) { - char *comm_str; - struct comm *comm; RC_STRUCT(thread) *_thread = zalloc(sizeof(*_thread)); struct thread *thread; if (ADD_RC_CHK(thread, _thread) != NULL) { + struct comm *comm; + char comm_str[32]; + thread__set_pid(thread, pid); thread__set_tid(thread, tid); thread__set_ppid(thread, -1); @@ -56,13 +57,8 @@ struct thread *thread__new(pid_t pid, pid_t tid) init_rwsem(thread__namespaces_lock(thread)); init_rwsem(thread__comm_lock(thread)); - comm_str = malloc(32); - if (!comm_str) - goto err_thread; - - snprintf(comm_str, 32, ":%d", tid); + snprintf(comm_str, sizeof(comm_str), ":%d", tid); comm = comm__new(comm_str, 0, false); - free(comm_str); if (!comm) goto err_thread; @@ -76,7 +72,7 @@ struct thread *thread__new(pid_t pid, pid_t tid) return thread; err_thread: - free(thread); + thread__delete(thread); return NULL; } -- cgit v1.2.3 From c9d492378faec5c1fb8ea1534c620f7320bbb23a Mon Sep 17 00:00:00 2001 From: James Clark Date: Wed, 8 May 2024 15:14:57 +0100 Subject: perf dwarf-aux: Fix build with HAVE_DWARF_CFI_SUPPORT check_allowed_ops() is used from both HAVE_DWARF_GETLOCATIONS_SUPPORT and HAVE_DWARF_CFI_SUPPORT sections, so move it into the right place so that it's available when either are defined. This shows up when doing a static cross compile for arm64: $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- LDFLAGS="-static" \ EXTRA_PERFLIBS="-lexpat" util/dwarf-aux.c:1723:6: error: implicit declaration of function 'check_allowed_ops' Fixes: 55442cc2f22d0727 ("perf dwarf-aux: Check allowed DWARF Ops") Reviewed-by: Ian Rogers Signed-off-by: James Clark Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240508141458.439017-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 56 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index ec988f294497..44ef968a7ad3 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1217,6 +1217,34 @@ static int offset_from_dwarf_op(Dwarf_Op *op) } return -1; } + +static bool check_allowed_ops(Dwarf_Op *ops, size_t nops) +{ + /* The first op is checked separately */ + ops++; + nops--; + + /* + * It needs to make sure if the location expression matches to the given + * register and offset exactly. Thus it rejects any complex expressions + * and only allows a few of selected operators that doesn't change the + * location. + */ + while (nops) { + switch (ops->atom) { + case DW_OP_stack_value: + case DW_OP_deref_size: + case DW_OP_deref: + case DW_OP_piece: + break; + default: + return false; + } + ops++; + nops--; + } + return true; +} #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT || HAVE_DWARF_CFI_SUPPORT */ #ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT @@ -1397,34 +1425,6 @@ static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data, return true; } -static bool check_allowed_ops(Dwarf_Op *ops, size_t nops) -{ - /* The first op is checked separately */ - ops++; - nops--; - - /* - * It needs to make sure if the location expression matches to the given - * register and offset exactly. Thus it rejects any complex expressions - * and only allows a few of selected operators that doesn't change the - * location. - */ - while (nops) { - switch (ops->atom) { - case DW_OP_stack_value: - case DW_OP_deref_size: - case DW_OP_deref: - case DW_OP_piece: - break; - default: - return false; - } - ops++; - nops--; - } - return true; -} - /* Only checks direct child DIEs in the given scope. */ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) { -- cgit v1.2.3 From d790ead8a60c0f687446f7d8faa96943dc158912 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 9 May 2024 08:32:45 -0700 Subject: perf tracepoint: Don't scan all tracepoints to test if one exists In is_valid_tracepoint, rather than scanning "/sys/kernel/tracing/events/*/*" skipping any path where "/sys/kernel/tracing/events/*/*/id" doesn't exist, and then testing if "*:*" matches the tracepoint name, just use the given tracepoint name replace the ':' with '/' and see if the id file exists. This turns a nested directory search into a single file available test. Rather than return 1 for valid and 0 for invalid, return true and false. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240509153245.1990426-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/tracepoint.c | 56 +++++++++++++++++--------------------------- tools/perf/util/tracepoint.h | 3 ++- 2 files changed, 24 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/tracepoint.c b/tools/perf/util/tracepoint.c index 92dd8b455b90..95377ed5d87b 100644 --- a/tools/perf/util/tracepoint.c +++ b/tools/perf/util/tracepoint.c @@ -4,10 +4,12 @@ #include #include #include +#include #include #include #include +#include "fncache.h" int tp_event_has_id(const char *dir_path, struct dirent *evt_dir) { @@ -26,39 +28,25 @@ int tp_event_has_id(const char *dir_path, struct dirent *evt_dir) /* * Check whether event is in /tracing/events */ -int is_valid_tracepoint(const char *event_string) +bool is_valid_tracepoint(const char *event_string) { - DIR *sys_dir, *evt_dir; - struct dirent *sys_dirent, *evt_dirent; - char evt_path[MAXPATHLEN]; - char *dir_path; - - sys_dir = tracing_events__opendir(); - if (!sys_dir) - return 0; - - for_each_subsystem(sys_dir, sys_dirent) { - dir_path = get_events_file(sys_dirent->d_name); - if (!dir_path) - continue; - evt_dir = opendir(dir_path); - if (!evt_dir) - goto next; - - for_each_event(dir_path, evt_dir, evt_dirent) { - snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent->d_name, evt_dirent->d_name); - if (!strcmp(evt_path, event_string)) { - closedir(evt_dir); - put_events_file(dir_path); - closedir(sys_dir); - return 1; - } - } - closedir(evt_dir); -next: - put_events_file(dir_path); - } - closedir(sys_dir); - return 0; + char *dst, *path = malloc(strlen(event_string) + 4); /* Space for "/id\0". */ + bool have_file = false; /* Conservatively return false if memory allocation failed. */ + const char *src; + + if (!path) + return false; + + /* Copy event_string replacing the ':' with '/'. */ + for (src = event_string, dst = path; *src; src++, dst++) + *dst = (*src == ':') ? '/' : *src; + /* Add "/id\0". */ + memcpy(dst, "/id", 4); + + dst = get_events_file(path); + if (dst) + have_file = file_available(dst); + free(dst); + free(path); + return have_file; } diff --git a/tools/perf/util/tracepoint.h b/tools/perf/util/tracepoint.h index c4a110fe87d7..65ccb01fc312 100644 --- a/tools/perf/util/tracepoint.h +++ b/tools/perf/util/tracepoint.h @@ -4,6 +4,7 @@ #include #include +#include int tp_event_has_id(const char *dir_path, struct dirent *evt_dir); @@ -20,6 +21,6 @@ int tp_event_has_id(const char *dir_path, struct dirent *evt_dir); (strcmp(sys_dirent->d_name, ".")) && \ (strcmp(sys_dirent->d_name, ".."))) -int is_valid_tracepoint(const char *event_string); +bool is_valid_tracepoint(const char *event_string); #endif /* __PERF_TRACEPOINT_H */ -- cgit v1.2.3 From 9fe410a7ef483a9aca08bf620d8ddfd35ac99bc7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 7 May 2024 15:12:05 +0100 Subject: perf symbols: Remove map from list before updating addresses Make the order of operations remove, update, add. Updating addresses before the map is removed causes the ordering check to fail when the map is removed. This can be reproduced when running Perf on an Arm system with a static kernel and Perf uses kcore rather than other sources: $ perf record -- ls $ perf report util/maps.c:96: check_invariants: Assertion `map__end(prev) <= map__start(map) || map__start(prev) == map__start(map)' failed Fixes: 659ad3492b913c90 ("perf maps: Switch from rbtree to lazily sorted array for addresses") Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240507141210.195939-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index eb3319baa1b5..0d4de786358d 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1378,13 +1378,15 @@ static int dso__load_kcore(struct dso *dso, struct map *map, if (RC_CHK_EQUAL(new_map, replacement_map)) { struct map *map_ref; - map__set_start(map, map__start(new_map)); - map__set_end(map, map__end(new_map)); - map__set_pgoff(map, map__pgoff(new_map)); - map__set_mapping_type(map, map__mapping_type(new_map)); /* Ensure maps are correctly ordered */ map_ref = map__get(map); maps__remove(kmaps, map_ref); + + map__set_start(map_ref, map__start(new_map)); + map__set_end(map_ref, map__end(new_map)); + map__set_pgoff(map_ref, map__pgoff(new_map)); + map__set_mapping_type(map_ref, map__mapping_type(new_map)); + err = maps__insert(kmaps, map_ref); map__put(map_ref); map__put(new_map); -- cgit v1.2.3 From fd81f52e311f11f3eba842439948f989ffa8ccc2 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 7 May 2024 15:12:06 +0100 Subject: perf maps: Re-use __maps__free_maps_by_name() maps__merge_in() hard codes the steps to free the maps_by_name list. It seems to not map__put() each element before freeing, and it sets maps_by_name_sorted to true after freeing, which may be harmless but is inconsistent with maps__init() and other functions. maps__maps_by_name_addr() is also quite hard to read because we already have maps__maps_by_name() and maps__maps_by_address(), but the function is only used in that place so delete it. Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240507141210.195939-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 61eb742d91e3..16b39db594f4 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -124,11 +124,6 @@ static void maps__set_maps_by_address(struct maps *maps, struct map **new) } -static struct map ***maps__maps_by_name_addr(struct maps *maps) -{ - return &RC_CHK_ACCESS(maps)->maps_by_name; -} - static void maps__set_nr_maps_allocated(struct maps *maps, unsigned int nr_maps_allocated) { RC_CHK_ACCESS(maps)->nr_maps_allocated = nr_maps_allocated; @@ -284,6 +279,9 @@ void maps__put(struct maps *maps) static void __maps__free_maps_by_name(struct maps *maps) { + if (!maps__maps_by_name(maps)) + return; + /* * Free everything to try to do it from the rbtree in the next search */ @@ -291,6 +289,9 @@ static void __maps__free_maps_by_name(struct maps *maps) map__put(maps__maps_by_name(maps)[i]); zfree(&RC_CHK_ACCESS(maps)->maps_by_name); + + /* Consistent with maps__init(). When maps_by_name == NULL, maps_by_name_sorted == false */ + maps__set_maps_by_name_sorted(maps, false); } static int map__start_cmp(const void *a, const void *b) @@ -1167,8 +1168,7 @@ int maps__merge_in(struct maps *kmaps, struct map *new_map) } maps__set_maps_by_address(kmaps, merged_maps_by_address); maps__set_maps_by_address_sorted(kmaps, true); - zfree(maps__maps_by_name_addr(kmaps)); - maps__set_maps_by_name_sorted(kmaps, true); + __maps__free_maps_by_name(kmaps); maps__set_nr_maps_allocated(kmaps, merged_nr_maps_allocated); /* Copy entries before the new_map that can't overlap. */ -- cgit v1.2.3 From f30232b20fadea8c0f2f43f764bc06e51e8cfcdf Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 7 May 2024 15:12:07 +0100 Subject: perf symbols: Update kcore map before merging in remaining symbols When loading kcore, the main vmlinux map is updated in the same loop that merges the remaining maps. If a map that overlaps is merged in before kcore, the list can become unsortable when the main map addresses are updated. This will later trigger the check_invariants() assert: $ perf record $ perf report util/maps.c:96: check_invariants: Assertion `map__end(prev) <= map__start(map) || map__start(prev) == map__start(map)' failed. Aborted Fix it by moving the main map update prior to the loop so that maps__merge_in() can split it if necessary. Fixes: 659ad3492b913c90 ("perf maps: Switch from rbtree to lazily sorted array for addresses") Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240507141210.195939-4-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 0d4de786358d..f21fe5772bd8 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1290,7 +1290,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map, { struct maps *kmaps = map__kmaps(map); struct kcore_mapfn_data md; - struct map *replacement_map = NULL; + struct map *map_ref, *replacement_map = NULL; struct machine *machine; bool is_64_bit; int err, fd; @@ -1368,6 +1368,24 @@ static int dso__load_kcore(struct dso *dso, struct map *map, if (!replacement_map) replacement_map = list_entry(md.maps.next, struct map_list_node, node)->map; + /* + * Update addresses of vmlinux map. Re-insert it to ensure maps are + * correctly ordered. Do this before using maps__merge_in() for the + * remaining maps so vmlinux gets split if necessary. + */ + map_ref = map__get(map); + maps__remove(kmaps, map_ref); + + map__set_start(map_ref, map__start(replacement_map)); + map__set_end(map_ref, map__end(replacement_map)); + map__set_pgoff(map_ref, map__pgoff(replacement_map)); + map__set_mapping_type(map_ref, map__mapping_type(replacement_map)); + + err = maps__insert(kmaps, map_ref); + map__put(map_ref); + if (err) + goto out_err; + /* Add new maps */ while (!list_empty(&md.maps)) { struct map_list_node *new_node = list_entry(md.maps.next, struct map_list_node, node); @@ -1375,24 +1393,8 @@ static int dso__load_kcore(struct dso *dso, struct map *map, list_del_init(&new_node->node); - if (RC_CHK_EQUAL(new_map, replacement_map)) { - struct map *map_ref; - - /* Ensure maps are correctly ordered */ - map_ref = map__get(map); - maps__remove(kmaps, map_ref); - - map__set_start(map_ref, map__start(new_map)); - map__set_end(map_ref, map__end(new_map)); - map__set_pgoff(map_ref, map__pgoff(new_map)); - map__set_mapping_type(map_ref, map__mapping_type(new_map)); - - err = maps__insert(kmaps, map_ref); - map__put(map_ref); - map__put(new_map); - if (err) - goto out_err; - } else { + /* skip if replacement_map, already inserted above */ + if (!RC_CHK_EQUAL(new_map, replacement_map)) { /* * Merge kcore map into existing maps, * and ensure that current maps (eBPF) -- cgit v1.2.3 From 25626e19ae6df34f336f235b6b3dbd1b566d2738 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 7 May 2024 15:12:08 +0100 Subject: perf symbols: Fix ownership of string in dso__load_vmlinux() The linked commit updated dso__load_vmlinux() to call dso__set_long_name() before loading the symbols. Loading the symbols may not succeed but dso__set_long_name() takes ownership of the string. The two callers of this function free the string themselves on failure cases, resulting in the following error: $ perf record -- ls $ perf report free(): double free detected in tcache 2 Fix it by always taking ownership of the string, even on failure. This means the string is either freed at the very first early exit condition, or later when the dso is deleted or the long name is replaced. Now no special return value is needed to signify that the caller needs to free the string. Fixes: e59fea47f83e8a9a ("perf symbols: Fix DSO kernel load and symbol process to correctly map DSO to its long_name, type and adjust_symbols") Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240507141210.195939-5-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index f21fe5772bd8..9e5940b5bc59 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1978,6 +1978,10 @@ out: return ret; } +/* + * Always takes ownership of vmlinux when vmlinux_allocated == true, even if + * it returns an error. + */ int dso__load_vmlinux(struct dso *dso, struct map *map, const char *vmlinux, bool vmlinux_allocated) { @@ -1996,8 +2000,11 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, else symtab_type = DSO_BINARY_TYPE__VMLINUX; - if (symsrc__init(&ss, dso, symfs_vmlinux, symtab_type)) + if (symsrc__init(&ss, dso, symfs_vmlinux, symtab_type)) { + if (vmlinux_allocated) + free((char *) vmlinux); return -1; + } /* * dso__load_sym() may copy 'dso' which will result in the copies having @@ -2040,7 +2047,6 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map) err = dso__load_vmlinux(dso, map, filename, true); if (err > 0) goto out; - free(filename); } out: return err; @@ -2192,7 +2198,6 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map) err = dso__load_vmlinux(dso, map, filename, true); if (err > 0) return err; - free(filename); } if (!symbol_conf.ignore_vmlinux && vmlinux_path != NULL) { -- cgit v1.2.3 From b7bd96ec1b709f5079fd203b680261dabc0050aa Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 4 May 2024 09:36:56 +0900 Subject: selftests/ftrace: Fix required features for VFS type test case Since the VFS type argument test case uses fprobe events, it must check the availablity of dynamic_events file and fprobe events syntax in README. Without this fix, the test fails if CONFIG_FPROBE_EVENTS=n. Link: https://lore.kernel.org/all/171478301645.110267.464634740467398506.stgit@devnote2/ Fixes: ee97e5e135c6 ("selftests/ftrace: add fprobe test cases for VFS type "%pd" and "%pD"") Signed-off-by: Masami Hiramatsu (Google) --- tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc index 49a833bf334c..c6a9d2466a71 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -1,7 +1,8 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Fprobe event VFS type argument -# requires: kprobe_events "%pd/%pD":README +# requires: dynamic_events "%pd/%pD":README "f[:[/][]] [%return] []":README + : "Test argument %pd with name for fprobe" echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events -- cgit v1.2.3 From 4810ce7c91993f5d6e7c20fa8da7cb474ee72ca7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 9 May 2024 13:29:18 +0200 Subject: selftests: add F_DUPDFD_QUERY selftests Add simple selftests for the new F_DUPFD_QUERY fcntl(). Signed-off-by: Christian Brauner --- tools/testing/selftests/core/close_range_test.c | 55 ++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index c59e4adb905d..991c473e3859 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -17,6 +17,15 @@ #include "../kselftest_harness.h" #include "../clone3/clone3_selftests.h" + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_DUPFD_QUERY +#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -45,6 +54,15 @@ TEST(core_close_range) SKIP(return, "close_range() syscall not supported"); } + for (i = 0; i < 100; i++) { + ret = fcntl(open_fds[i], F_DUPFD_QUERY, open_fds[i + 1]); + if (ret < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(ret, 0); + } + } + EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], 0)); for (i = 0; i <= 50; i++) @@ -358,7 +376,7 @@ TEST(close_range_cloexec_unshare) */ TEST(close_range_cloexec_syzbot) { - int fd1, fd2, fd3, flags, ret, status; + int fd1, fd2, fd3, fd4, flags, ret, status; pid_t pid; struct __clone_args args = { .flags = CLONE_FILES, @@ -372,6 +390,13 @@ TEST(close_range_cloexec_syzbot) fd2 = dup2(fd1, 1000); EXPECT_GT(fd2, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd2); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); @@ -396,6 +421,15 @@ TEST(close_range_cloexec_syzbot) fd3 = dup2(fd1, 42); EXPECT_GT(fd3, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd3); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + + + /* * Duplicating the file descriptor must remove the * FD_CLOEXEC flag. @@ -426,6 +460,24 @@ TEST(close_range_cloexec_syzbot) fd3 = dup2(fd1, 42); EXPECT_GT(fd3, 0); + flags = fcntl(fd1, F_DUPFD_QUERY, fd3); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 1); + } + + fd4 = open("/dev/null", O_RDWR); + EXPECT_GT(fd4, 0); + + /* Same inode, different file pointers. */ + flags = fcntl(fd1, F_DUPFD_QUERY, fd4); + if (flags < 0) { + EXPECT_EQ(errno, EINVAL); + } else { + EXPECT_EQ(flags, 0); + } + flags = fcntl(fd3, F_GETFD); EXPECT_GT(flags, -1); EXPECT_EQ(flags & FD_CLOEXEC, 0); @@ -433,6 +485,7 @@ TEST(close_range_cloexec_syzbot) EXPECT_EQ(close(fd1), 0); EXPECT_EQ(close(fd2), 0); EXPECT_EQ(close(fd3), 0); + EXPECT_EQ(close(fd4), 0); } /* -- cgit v1.2.3 From a8a388c2aae490c08d59a6c15d15a968fea5089a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 May 2024 13:02:10 +0200 Subject: selftests: netfilter: add packetdrill based conntrack tests Add a new test script that uses packetdrill tool to exercise conntrack state machine. Needs ip/ip6tables and conntrack tool (to check if we have an entry in the expected state). Test cases added here cover following scenarios: 1. already-acked (retransmitted) packets are not tagged as INVALID 2. RST packet coming when conntrack is already closing (FIN/CLOSE_WAIT) transitions conntrack to CLOSE even if the RST is not an exact match 3. RST packets with out-of-window sequence numbers are marked as INVALID 4. SYN+Challenge ACK: check that challenge ack is allowed to pass 5. Old SYN/ACK: check conntrack handles the case where SYN is answered with SYN/ACK for an old, previous connection attempt 6. Check SYN reception while in ESTABLISHED state generates a challenge ack, RST response clears 'outdated' state + next SYN retransmit gets us into 'SYN_RECV' conntrack state. Tests get run twice, once with ipv4 and once with ipv6. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/netfilter/Makefile | 2 + tools/testing/selftests/net/netfilter/config | 1 + .../net/netfilter/nf_conntrack_packetdrill.sh | 71 +++++++++++++ .../selftests/net/netfilter/packetdrill/common.sh | 33 ++++++ .../packetdrill/conntrack_ack_loss_stall.pkt | 118 +++++++++++++++++++++ .../packetdrill/conntrack_inexact_rst.pkt | 62 +++++++++++ .../packetdrill/conntrack_rst_invalid.pkt | 59 +++++++++++ .../packetdrill/conntrack_syn_challenge_ack.pkt | 44 ++++++++ .../netfilter/packetdrill/conntrack_synack_old.pkt | 51 +++++++++ .../packetdrill/conntrack_synack_reuse.pkt | 34 ++++++ 10 files changed, 475 insertions(+) create mode 100755 tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh create mode 100755 tools/testing/selftests/net/netfilter/packetdrill/common.sh create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt create mode 100644 tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index e9a6c702b8c9..47945b2b3f92 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -13,6 +13,7 @@ TEST_PROGS += conntrack_tcp_unreplied.sh TEST_PROGS += conntrack_sctp_collision.sh TEST_PROGS += conntrack_vrf.sh TEST_PROGS += ipvs.sh +TEST_PROGS += nf_conntrack_packetdrill.sh TEST_PROGS += nf_nat_edemux.sh TEST_PROGS += nft_audit.sh TEST_PROGS += nft_concat_range.sh @@ -45,6 +46,7 @@ $(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS) $(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS) TEST_FILES := lib.sh +TEST_FILES += packetdrill TEST_INCLUDES := \ ../lib.sh diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 5b5b764f6cd0..63ef80ef47a4 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -86,3 +86,4 @@ CONFIG_VLAN_8021Q=m CONFIG_XFRM_USER=m CONFIG_XFRM_STATISTICS=y CONFIG_NET_PKTGEN=m +CONFIG_TUN=m diff --git a/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh new file mode 100755 index 000000000000..c6fdd2079f4d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +checktool "conntrack --version" "run test without conntrack" +checktool "iptables --version" "run test without iptables" +checktool "ip6tables --version" "run test without ip6tables" + +modprobe -q tun +modprobe -q nf_conntrack +# echo 1 > /proc/sys/net/netfilter/nf_log_all_netns + +PDRILL_TIMEOUT=10 + +files=" +conntrack_ack_loss_stall.pkt +conntrack_inexact_rst.pkt +conntrack_syn_challenge_ack.pkt +conntrack_synack_old.pkt +conntrack_synack_reuse.pkt +conntrack_rst_invalid.pkt +" + +if ! packetdrill --dry_run --verbose "packetdrill/conntrack_ack_loss_stall.pkt";then + echo "SKIP: packetdrill not installed" + exit ${ksft_skip} +fi + +ret=0 + +run_packetdrill() +{ + filename="$1" + ipver="$2" + local mtu=1500 + + export NFCT_IP_VERSION="$ipver" + + if [ "$ipver" = "ipv4" ];then + export xtables="iptables" + elif [ "$ipver" = "ipv6" ];then + export xtables="ip6tables" + mtu=1520 + fi + + timeout "$PDRILL_TIMEOUT" unshare -n packetdrill --ip_version="$ipver" --mtu=$mtu \ + --tolerance_usecs=1000000 --non_fatal packet "$filename" +} + +run_one_test_file() +{ + filename="$1" + + for v in ipv4 ipv6;do + printf "%-50s(%s)%-20s" "$filename" "$v" "" + if run_packetdrill packetdrill/"$f" "$v";then + echo OK + else + echo FAIL + ret=1 + fi + done +} + +echo "Replaying packetdrill test cases:" +for f in $files;do + run_one_test_file packetdrill/"$f" +done + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/packetdrill/common.sh b/tools/testing/selftests/net/netfilter/packetdrill/common.sh new file mode 100755 index 000000000000..ed36d535196d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/common.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# for debugging set net.netfilter.nf_log_all_netns=1 in init_net +# or do not use net namespaces. +modprobe -q nf_conntrack +sysctl -q net.netfilter.nf_conntrack_log_invalid=6 + +# Flush old cached data (fastopen cookies). +ip tcp_metrics flush all > /dev/null 2>&1 + +# TCP min, default, and max receive and send buffer sizes. +sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))" +sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304" + +# TCP congestion control. +sysctl -q net.ipv4.tcp_congestion_control=cubic + +# TCP slow start after idle. +sysctl -q net.ipv4.tcp_slow_start_after_idle=0 + +# TCP Explicit Congestion Notification (ECN) +sysctl -q net.ipv4.tcp_ecn=0 + +sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1 + +# Override the default qdisc on the tun device. +# Many tests fail with timing errors if the default +# is FQ and that paces their flows. +tc qdisc add dev tun0 root pfifo + +# Enable conntrack +$xtables -A INPUT -m conntrack --ctstate NEW -p tcp --syn diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt new file mode 100644 index 000000000000..d755bd64c54f --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt @@ -0,0 +1,118 @@ +// check that already-acked (retransmitted) packet is let through rather +// than tagged as INVALID. + +`packetdrill/common.sh` + +// should set -P DROP but it disconnects VM w.o. extra netns ++0 `$xtables -A INPUT -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 ++0 > S. 0:0(0) ack 1 ++.01 < . 1:1(0) ack 1 win 65535 ++0 accept(3, ..., ...) = 4 + ++0.0001 < P. 1:1461(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 1461 win 65535 ++0.0001 < P. 1461:2921(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 2921 win 65535 ++0.0001 < P. 2921:4381(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 4381 win 65535 ++0.0001 < P. 4381:5841(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 5841 win 65535 ++0.0001 < P. 5841:7301(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 7301 win 65535 ++0.0001 < P. 7301:8761(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 8761 win 65535 ++0.0001 < P. 8761:10221(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 10221 win 65535 ++0.0001 < P. 10221:11681(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 11681 win 65535 ++0.0001 < P. 11681:13141(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 13141 win 65535 ++0.0001 < P. 13141:14601(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 14601 win 65535 ++0.0001 < P. 14601:16061(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 16061 win 65535 ++0.0001 < P. 16061:17521(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 17521 win 65535 ++0.0001 < P. 17521:18981(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 18981 win 65535 ++0.0001 < P. 18981:20441(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 20441 win 65535 ++0.0001 < P. 20441:21901(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 21901 win 65535 ++0.0001 < P. 21901:23361(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 23361 win 65535 ++0.0001 < P. 23361:24821(1460) ack 1 win 257 +0.055 > . 1:1(0) ack 24821 win 65535 ++0.0001 < P. 24821:26281(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 26281 win 65535 ++0.0001 < P. 26281:27741(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 27741 win 65535 ++0.0001 < P. 27741:29201(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 29201 win 65535 ++0.0001 < P. 29201:30661(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 30661 win 65535 ++0.0001 < P. 30661:32121(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 32121 win 65535 ++0.0001 < P. 32121:33581(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 33581 win 65535 ++0.0001 < P. 33581:35041(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 35041 win 65535 ++0.0001 < P. 35041:36501(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 36501 win 65535 ++0.0001 < P. 36501:37961(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 37961 win 65535 ++0.0001 < P. 37961:39421(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 39421 win 65535 ++0.0001 < P. 39421:40881(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 40881 win 65535 ++0.0001 < P. 40881:42341(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 42341 win 65535 ++0.0001 < P. 42341:43801(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 43801 win 65535 ++0.0001 < P. 43801:45261(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 45261 win 65535 ++0.0001 < P. 45261:46721(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 46721 win 65535 ++0.0001 < P. 46721:48181(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 48181 win 65535 ++0.0001 < P. 48181:49641(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 49641 win 65535 ++0.0001 < P. 49641:51101(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 51101 win 65535 ++0.0001 < P. 51101:52561(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 52561 win 65535 ++0.0001 < P. 52561:54021(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 54021 win 65535 ++0.0001 < P. 54021:55481(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 55481 win 65535 ++0.0001 < P. 55481:56941(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 56941 win 65535 ++0.0001 < P. 56941:58401(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 58401 win 65535 ++0.0001 < P. 58401:59861(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 59861 win 65535 ++0.0001 < P. 59861:61321(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 61321 win 65535 ++0.0001 < P. 61321:62781(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 62781 win 65535 ++0.0001 < P. 62781:64241(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 64241 win 65535 ++0.0001 < P. 64241:65701(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 65701 win 65535 ++0.0001 < P. 65701:67161(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 67161 win 65535 + +// nf_ct_proto_6: SEQ is under the lower bound (already ACKed data retransmitted) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.24.72 LEN=1500 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=34375 DPT=8080 SEQ=1 ACK=4162510439 WINDOW=257 RES=0x00 ACK PSH URGP=0 ++0.0001 < P. 1:1461(1460) ack 1 win 257 + +// only sent if above packet isn't flagged as invalid ++.0 > . 1:1(0) ack 67161 win 65535 + ++0 `$xtables -D INPUT -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt new file mode 100644 index 000000000000..dccdd4c009c6 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt @@ -0,0 +1,62 @@ +// check RST packet that doesn't exactly match expected next sequence +// number still transitions conntrack state to CLOSE iff its already in +// FIN/CLOSE_WAIT. + +`packetdrill/common.sh` + +// 5.771921 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.771994 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.772212 client_ip > server_ip TCP 66 45020 > 443 [ACK] Seq=1905874048 Ack=781810658 Win=36352 Len=0 TSval=3317842872 TSecr=675936334 +// 5.787924 server_ip > client_ip TLSv1.2 1300 [Packet size limited during capture] +// 5.788126 server_ip > client_ip TLSv1.2 90 Application Data +// 5.788207 server_ip > client_ip TCP 66 443 > 45020 [FIN, ACK] Seq=781811916 Ack=1905874048 Win=31104 Len=0 TSval=675936350 TSecr=3317842872 +// 5.788447 client_ip > server_ip TLSv1.2 90 Application Data +// 5.788479 client_ip > server_ip TCP 66 45020 > 443 [RST, ACK] Seq=1905874072 Ack=781811917 Win=39040 Len=0 TSval=3317842889 TSecr=675936350 +// 5.788581 server_ip > client_ip TCP 54 8443 > 45020 [RST] Seq=781811892 Win=0 Len=0 + ++0 `iptables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `iptables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 + ++0.1 < S. 1:1(0) ack 1 win 65535 + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 + +// Conntrack should move to FIN_WAIT, then CLOSE_WAIT. ++0 < F. 3001:3001(0) ack 1001 win 65535 ++0 > . 1001:1001(0) ack 3002 win 65535 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE_WAIT` + ++1 close(3) = 0 +// RST: unread data. FIN was seen, hence ack + 1 ++0 > R. 1001:1001(0) ack 3002 win 65535 +// ... and then, CLOSE. ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` + +// Spurious RST from peer -- no sk state. Should NOT get +// marked INVALID, because conntrack is already closing. ++0.1 < R 2001:2001(0) win 0 + +// No packets should have been marked INVALID ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `iptables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt new file mode 100644 index 000000000000..686f18a3d9ef --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt @@ -0,0 +1,59 @@ +// check that out of window resets are marked as INVALID and conntrack remains +// in ESTABLISHED state. + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 + ++0.1 < S. 1:1(0) ack 1 win 65535 + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + +// out of window ++0.0 < R 0:0(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// out of window ++0.0 < R 1000000:1000000(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// in-window but not exact match ++0.0 < R 42:42(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0 < . 3001:3001(0) ack 1001 win 65535 + ++0.0 < R. 3000:3000(0) ack 1001 win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// exact next sequence ++0.0 < R. 3001:3001(0) ack 1001 win 0 +// Conntrack should move to CLOSE + +// Expect four invalid RSTs ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 4 "` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt new file mode 100644 index 000000000000..3442cd29bc93 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt @@ -0,0 +1,44 @@ +// Check connection re-use, i.e. peer that receives the SYN answers with +// a challenge-ACK. +// Check that conntrack lets all packets pass, including the challenge ack, +// and that a new connection is established. + +`packetdrill/common.sh` + +// S > +// . < (challnge-ack) +// R. > +// S > +// S. < +// Expected outcome: established connection. + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 + +// Challenge ACK, old incarnation. +0.1 < . 145824453:145824453(0) ack 643160523 win 240 + ++0.01 > R 643160523:643160523(0) win 0 + ++0.01 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// Must go through. ++0.01 > S 0:0(0) win 65535 + +// correct synack ++0.1 < S. 0:0(0) ack 1 win 250 + +// 3whs completes. ++0.01 > . 1:1(0) ack 1 win 256 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + +// No packets should have been marked INVALID ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt new file mode 100644 index 000000000000..3047160c4bf3 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt @@ -0,0 +1,51 @@ +// Check conntrack copes with syn/ack reply for a previous, old incarnation. + +// tcpdump with buggy sequence +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083107423 ecr 0,nop,wscale 7], length 0 +// OLD synack, for old/previous S +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 145824453, ack 643160523, win 65535, options [mss 8952,nop,wscale 5,TS val 3215437785 ecr 2082921663,nop,nop], length 0 +// This reset never makes it to the endpoint, elided in the packetdrill script +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [R.], seq 1, ack 1, win 65535, options [mss 8952,nop,wscale 5,TS val 3215443451 ecr 2082921663,nop,nop], length 0 +// Syn retransmit, no change +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083115583 ecr 0,nop,wscale 7], length 0 +// CORRECT synack, should be accepted, but conntrack classified this as INVALID: +// SEQ is over the upper bound (over the window of the receiver) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.37.78 LEN=40 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=8080 DPT=34500 SEQ=162602411 ACK=2124350315 .. +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 162602410, ack 2375731742, win 65535, options [mss 8952,nop,wscale 5,TS val 3215445754 ecr 2083115583,nop,nop], length 0 + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 + +// bogus/outdated synack, invalid ack value +0.1 < S. 145824453:145824453(0) ack 643160523 win 240 + +// syn retransmitted +1.01 > S 0:0(0) win 65535 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// correct synack ++0 < S. 145758918:145758918(0) ack 1 win 250 ++0 write(3, ..., 1) = 1 + +// with buggy conntrack above packet is dropped, so SYN rtx is seen: +// script packet: 1.054007 . 1:1(0) ack 16777958 win 256 +// actual packet: 3.010000 S 0:0(0) win 65535 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + ++0 > P. 1:2(1) ack 4294901762 win 256 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ASSURED | grep -q ESTABLISHED` + +// No packets should have been marked INVALID in OUTPUT direction, 1 in INPUT ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 1 "` + ++0 `$xtables -D INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -D OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt new file mode 100644 index 000000000000..21e1bb6395e4 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt @@ -0,0 +1,34 @@ +// Check reception of another SYN while we have an established conntrack state. +// Challenge ACK is supposed to pass through, RST reply should clear conntrack +// state and SYN retransmit should give us new 'SYN_RECV' connection state. + +`packetdrill/common.sh` + +// should show a match if bug is present: ++0 `iptables -A INPUT -m conntrack --ctstate INVALID -p tcp --tcp-flags SYN,ACK SYN,ACK` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 ++0 > S. 0:0(0) ack 1 ++.01 < . 1:1(0) ack 1 win 257 ++0 accept(3, ..., ...) = 4 + ++0 < P. 1:101(100) ack 1 win 257 ++.001 > . 1:1(0) ack 101 win 256 ++0 read(4, ..., 101) = 100 + +1.0 < S 2000:2000(0) win 32792 +// Won't expect this: challenge ack. + ++0 > . 1:1(0) ack 101 win 256 ++0 < R. 101:101(0) ack 1 win 257 ++0 close(4) = 0 + +1.5 < S 2000:2000(0) win 32792 + ++0 `conntrack -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV` ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` -- cgit v1.2.3 From 11a42964850b5b6f866f64b0157e6e99b4d7ab9d Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 10 May 2024 07:24:30 +0900 Subject: perf parse-events: pass parse_state to add_tracepoint The next patch will add another flag to parse_state that we will want to pass to evsel__newtp_idx(), so pass the whole parse_state all the way down instead of giving only the index Originally-by: Jiri Olsa Reviewed-by: Ian Rogers Signed-off-by: Dominique Martinet Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240510-perf_digit-v4-1-db1553f3233b@codewreck.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 31 ++++++++++++++++++------------- tools/perf/util/parse-events.h | 3 ++- tools/perf/util/parse-events.y | 2 +- 3 files changed, 21 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2b9ede311c31..e6a2a80b02df 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -543,13 +543,14 @@ static void tracepoint_error(struct parse_events_error *e, int err, parse_events_error__handle(e, column, strdup(str), strdup(help)); } -static int add_tracepoint(struct list_head *list, int *idx, +static int add_tracepoint(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, void *loc_) { YYLTYPE *loc = loc_; - struct evsel *evsel = evsel__newtp_idx(sys_name, evt_name, (*idx)++); + struct evsel *evsel = evsel__newtp_idx(sys_name, evt_name, parse_state->idx++); if (IS_ERR(evsel)) { tracepoint_error(err, PTR_ERR(evsel), sys_name, evt_name, loc->first_column); @@ -568,7 +569,8 @@ static int add_tracepoint(struct list_head *list, int *idx, return 0; } -static int add_tracepoint_multi_event(struct list_head *list, int *idx, +static int add_tracepoint_multi_event(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, YYLTYPE *loc) @@ -602,7 +604,7 @@ static int add_tracepoint_multi_event(struct list_head *list, int *idx, found++; - ret = add_tracepoint(list, idx, sys_name, evt_ent->d_name, + ret = add_tracepoint(parse_state, list, sys_name, evt_ent->d_name, err, head_config, loc); } @@ -616,19 +618,21 @@ static int add_tracepoint_multi_event(struct list_head *list, int *idx, return ret; } -static int add_tracepoint_event(struct list_head *list, int *idx, +static int add_tracepoint_event(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, YYLTYPE *loc) { return strpbrk(evt_name, "*?") ? - add_tracepoint_multi_event(list, idx, sys_name, evt_name, + add_tracepoint_multi_event(parse_state, list, sys_name, evt_name, err, head_config, loc) : - add_tracepoint(list, idx, sys_name, evt_name, + add_tracepoint(parse_state, list, sys_name, evt_name, err, head_config, loc); } -static int add_tracepoint_multi_sys(struct list_head *list, int *idx, +static int add_tracepoint_multi_sys(struct parse_events_state *parse_state, + struct list_head *list, const char *sys_name, const char *evt_name, struct parse_events_error *err, struct parse_events_terms *head_config, YYLTYPE *loc) @@ -654,7 +658,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx, if (!strglobmatch(events_ent->d_name, sys_name)) continue; - ret = add_tracepoint_event(list, idx, events_ent->d_name, + ret = add_tracepoint_event(parse_state, list, events_ent->d_name, evt_name, err, head_config, loc); } @@ -1291,7 +1295,8 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head return 0; } -int parse_events_add_tracepoint(struct list_head *list, int *idx, +int parse_events_add_tracepoint(struct parse_events_state *parse_state, + struct list_head *list, const char *sys, const char *event, struct parse_events_error *err, struct parse_events_terms *head_config, void *loc_) @@ -1307,14 +1312,14 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, } if (strpbrk(sys, "*?")) - return add_tracepoint_multi_sys(list, idx, sys, event, + return add_tracepoint_multi_sys(parse_state, list, sys, event, err, head_config, loc); else - return add_tracepoint_event(list, idx, sys, event, + return add_tracepoint_event(parse_state, list, sys, event, err, head_config, loc); #else + (void)parse_state; (void)list; - (void)idx; (void)sys; (void)event; (void)head_config; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e7ac1f13376d..c06984bd3af8 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -208,7 +208,8 @@ int parse_events__modifier_event(struct parse_events_state *parse_state, void *l int parse_events__modifier_group(struct parse_events_state *parse_state, void *loc, struct list_head *list, struct parse_events_modifier mod); int parse_events__set_default_name(struct list_head *list, char *name); -int parse_events_add_tracepoint(struct list_head *list, int *idx, +int parse_events_add_tracepoint(struct parse_events_state *parse_state, + struct list_head *list, const char *sys, const char *event, struct parse_events_error *error, struct parse_events_terms *head_config, void *loc); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 68b3b06c7ff0..c94a3994177e 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -497,7 +497,7 @@ tracepoint_name opt_event_config if (!list) YYNOMEM; - err = parse_events_add_tracepoint(list, &parse_state->idx, $1.sys, $1.event, + err = parse_events_add_tracepoint(parse_state, list, $1.sys, $1.event, error, $2, &@1); parse_events_terms__delete($2); -- cgit v1.2.3 From a2a6604e1c5836ff4121f170af1328c6a141cca4 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 10 May 2024 07:24:31 +0900 Subject: perf parse-events: Add new 'fake_tp' parameter for tests The next commit will allow tracepoints starting with digits, but most systems do not have any available by default so tests should skip the actual "check if it exists in /sys/kernel/debug/tracing" step. In order to do that, add a new boolean flag specifying if we should actually "format" the probe or not. Originally-by: Jiri Olsa Reviewed-by: Ian Rogers Signed-off-by: Dominique Martinet Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240510-perf_digit-v4-2-db1553f3233b@codewreck.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 6 ++++-- tools/perf/tests/pmu-events.c | 2 +- tools/perf/util/evlist.c | 3 ++- tools/perf/util/evsel.c | 20 +++++++++++++------- tools/perf/util/evsel.h | 4 ++-- tools/perf/util/metricgroup.c | 3 ++- tools/perf/util/parse-events.c | 9 ++++++--- tools/perf/util/parse-events.h | 6 ++++-- 8 files changed, 34 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 993e482f094c..c3b77570bb57 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -2493,7 +2493,8 @@ static int test_event(const struct evlist_test *e) return TEST_FAIL; } parse_events_error__init(&err); - ret = parse_events(evlist, e->name, &err); + ret = __parse_events(evlist, e->name, /*pmu_filter=*/NULL, &err, /*fake_pmu=*/NULL, + /*warn_if_reordered=*/true, /*fake_tp=*/true); if (ret) { pr_debug("failed to parse event '%s', err %d\n", e->name, ret); parse_events_error__print(&err, e->name); @@ -2521,7 +2522,8 @@ static int test_event_fake_pmu(const char *str) parse_events_error__init(&err); ret = __parse_events(evlist, str, /*pmu_filter=*/NULL, &err, - &perf_pmu__fake, /*warn_if_reordered=*/true); + &perf_pmu__fake, /*warn_if_reordered=*/true, + /*fake_tp=*/true); if (ret) { pr_debug("failed to parse event '%s', err %d\n", str, ret); diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index f39aadacd7a6..ff3e7bc0a77f 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -842,7 +842,7 @@ static int check_parse_id(const char *id, struct parse_events_error *error, *cur = '/'; ret = __parse_events(evlist, dup, /*pmu_filter=*/NULL, error, fake_pmu, - /*warn_if_reordered=*/true); + /*warn_if_reordered=*/true, /*fake_tp=*/false); free(dup); evlist__delete(evlist); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 55a300a0977b..3a719edafc7a 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -298,7 +298,8 @@ struct evsel *evlist__add_aux_dummy(struct evlist *evlist, bool system_wide) #ifdef HAVE_LIBTRACEEVENT struct evsel *evlist__add_sched_switch(struct evlist *evlist, bool system_wide) { - struct evsel *evsel = evsel__newtp_idx("sched", "sched_switch", 0); + struct evsel *evsel = evsel__newtp_idx("sched", "sched_switch", 0, + /*format=*/true); if (IS_ERR(evsel)) return evsel; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 3536404e9447..4f818ab6b662 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -452,7 +452,7 @@ out_err: * Returns pointer with encoded error via interface. */ #ifdef HAVE_LIBTRACEEVENT -struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx) +struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx, bool format) { struct evsel *evsel = zalloc(perf_evsel__object.size); int err = -ENOMEM; @@ -469,14 +469,20 @@ struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx) if (asprintf(&evsel->name, "%s:%s", sys, name) < 0) goto out_free; - evsel->tp_format = trace_event__tp_format(sys, name); - if (IS_ERR(evsel->tp_format)) { - err = PTR_ERR(evsel->tp_format); - goto out_free; + event_attr_init(&attr); + + if (format) { + evsel->tp_format = trace_event__tp_format(sys, name); + if (IS_ERR(evsel->tp_format)) { + err = PTR_ERR(evsel->tp_format); + goto out_free; + } + attr.config = evsel->tp_format->id; + } else { + attr.config = (__u64) -1; } - event_attr_init(&attr); - attr.config = evsel->tp_format->id; + attr.sample_period = 1; evsel__init(evsel, &attr, idx); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 517cff431de2..375a38e15cd9 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -234,14 +234,14 @@ void free_config_terms(struct list_head *config_terms); #ifdef HAVE_LIBTRACEEVENT -struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx); +struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx, bool format); /* * Returns pointer with encoded error via interface. */ static inline struct evsel *evsel__newtp(const char *sys, const char *name) { - return evsel__newtp_idx(sys, name, 0); + return evsel__newtp_idx(sys, name, 0, true); } #endif diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 9be406524617..69f6a46402c3 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1502,7 +1502,8 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu, pr_debug("Parsing metric events '%s'\n", events.buf); parse_events_error__init(&parse_error); ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL, - &parse_error, fake_pmu, /*warn_if_reordered=*/false); + &parse_error, fake_pmu, /*warn_if_reordered=*/false, + /*fake_tp=*/false); if (ret) { parse_events_error__print(&parse_error, events.buf); goto err_out; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index e6a2a80b02df..30f958069076 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -550,7 +550,8 @@ static int add_tracepoint(struct parse_events_state *parse_state, struct parse_events_terms *head_config, void *loc_) { YYLTYPE *loc = loc_; - struct evsel *evsel = evsel__newtp_idx(sys_name, evt_name, parse_state->idx++); + struct evsel *evsel = evsel__newtp_idx(sys_name, evt_name, parse_state->idx++, + !parse_state->fake_tp); if (IS_ERR(evsel)) { tracepoint_error(err, PTR_ERR(evsel), sys_name, evt_name, loc->first_column); @@ -2135,7 +2136,7 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filter, struct parse_events_error *err, struct perf_pmu *fake_pmu, - bool warn_if_reordered) + bool warn_if_reordered, bool fake_tp) { struct parse_events_state parse_state = { .list = LIST_HEAD_INIT(parse_state.list), @@ -2143,6 +2144,7 @@ int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filte .error = err, .stoken = PE_START_EVENTS, .fake_pmu = fake_pmu, + .fake_tp = fake_tp, .pmu_filter = pmu_filter, .match_legacy_cache_terms = true, }; @@ -2352,7 +2354,8 @@ int parse_events_option(const struct option *opt, const char *str, parse_events_error__init(&err); ret = __parse_events(*args->evlistp, str, args->pmu_filter, &err, - /*fake_pmu=*/NULL, /*warn_if_reordered=*/true); + /*fake_pmu=*/NULL, /*warn_if_reordered=*/true, + /*fake_tp=*/false); if (ret) { parse_events_error__print(&err, str); diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index c06984bd3af8..f2baa69fff98 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -32,14 +32,14 @@ int parse_events_option_new_evlist(const struct option *opt, const char *str, in __attribute__((nonnull(1, 2, 4))) int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filter, struct parse_events_error *error, struct perf_pmu *fake_pmu, - bool warn_if_reordered); + bool warn_if_reordered, bool fake_tp); __attribute__((nonnull(1, 2, 3))) static inline int parse_events(struct evlist *evlist, const char *str, struct parse_events_error *err) { return __parse_events(evlist, str, /*pmu_filter=*/NULL, err, /*fake_pmu=*/NULL, - /*warn_if_reordered=*/true); + /*warn_if_reordered=*/true, /*fake_tp=*/false); } int parse_event(struct evlist *evlist, const char *str); @@ -152,6 +152,8 @@ struct parse_events_state { int stoken; /* Special fake PMU marker for testing. */ struct perf_pmu *fake_pmu; + /* Skip actual tracepoint processing for testing. */ + bool fake_tp; /* If non-null, when wildcard matching only match the given PMU. */ const char *pmu_filter; /* Should PE_LEGACY_NAME tokens be generated for config terms? */ -- cgit v1.2.3 From 5ceb57990bf41684e9bc186128a07025adb896bd Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 10 May 2024 07:24:32 +0900 Subject: perf parse: Allow tracepoint names to start with digits Tracepoints can start with digits, although we don't have many of these: $ rg -g '*.h' '\bTRACE_EVENT\([0-9]' net/mac802154/trace.h 53:TRACE_EVENT(802154_drv_return_int, ... net/ieee802154/trace.h 66:TRACE_EVENT(802154_rdev_add_virtual_intf, ... include/trace/events/9p.h 124:TRACE_EVENT(9p_client_req, ... Just allow names to start with digits too so e.g. "perf trace -e '9p:*'" works Reviewed-by: Ian Rogers Signed-off-by: Dominique Martinet Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240510-perf_digit-v4-3-db1553f3233b@codewreck.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 7 +++++++ tools/perf/util/parse-events.l | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index c3b77570bb57..edc2adcf1bae 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -2269,6 +2269,13 @@ static const struct evlist_test test__events[] = { .check = test__checkevent_breakpoint_2_events, /* 3 */ }, +#ifdef HAVE_LIBTRACEEVENT + { + .name = "9p:9p_client_req", + .check = test__checkevent_tracepoint, + /* 4 */ + }, +#endif }; static const struct evlist_test test__events_pmu[] = { diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 08ea2d845dc3..99d585d272e0 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -242,8 +242,8 @@ event [^,{}/]+ num_dec [0-9]+ num_hex 0x[a-fA-F0-9]{1,16} num_raw_hex [a-fA-F0-9]{1,16} -name [a-zA-Z_*?\[\]][a-zA-Z0-9_*?.\[\]!\-]* -name_tag [\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] +name [a-zA-Z0-9_*?\[\]][a-zA-Z0-9_*?.\[\]!\-]* +name_tag [\'][a-zA-Z0-9_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? /* -- cgit v1.2.3 From e2eeef290c4adad7a0f95c4a41e1a992326a7829 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 May 2024 11:22:35 -0700 Subject: perf tools: Ignore deleted cgroups On large systems, cgroups can be created and deleted often. That means there's a race between perf tools and cgroups when it gets the cgroup name and opens the cgroup. I got a report that 'perf stat' with many cgroups failed quite often due to the missing cgroups on such a large machine. I think we can ignore such cgroups when expanding events and use id 0 if it fails to read the cgroup id. IIUC 0 is not a vaild cgroup id so it won't update event counts for the failed cgroups. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240509182235.2319599-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter_cgroup.c | 5 ++--- tools/perf/util/cgroup.c | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index 1c82377ed78b..ea29c372f339 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++ b/tools/perf/util/bpf_counter_cgroup.c @@ -136,9 +136,8 @@ static int bperf_load_program(struct evlist *evlist) cgrp = evsel->cgrp; if (read_cgroup_id(cgrp) < 0) { - pr_err("Failed to get cgroup id\n"); - err = -1; - goto out; + pr_debug("Failed to get cgroup id for %s\n", cgrp->name); + cgrp->id = 0; } map_fd = bpf_map__fd(skel->maps.cgrp_idx); diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index fcb509058499..0f759dd96db7 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -465,9 +465,11 @@ int evlist__expand_cgroup(struct evlist *evlist, const char *str, name = cn->name + prefix_len; if (name[0] == '/' && name[1]) name++; + + /* the cgroup can go away in the meantime */ cgrp = cgroup__new(name, open_cgroup); if (cgrp == NULL) - goto out_err; + continue; leader = NULL; evlist__for_each_entry(orig_list, pos) { -- cgit v1.2.3 From 6fe61cb4aebb07ae1c6e8e136e39e76ca3363107 Mon Sep 17 00:00:00 2001 From: Madadi Vineeth Reddy Date: Thu, 28 Mar 2024 14:30:05 +0530 Subject: perf sched: Rename 'switches' column header to 'count' and add usage description, options for latency Rename 'Switches' to 'Count' and document metrics shown for perf sched latency output. Also add options possible with perf sched latency. Initially, after seeing the output of 'perf sched latency', the term 'Switches' seemed like it's the number of context switches-in for a particular task, but upon going through the code, it was observed that it's actually keeping track of number of times a delay was calculated so that it is used in calculation of the average delay. Actually, the switches here is a subset of number of context switches-in because there are some cases where the count is not incremented in switch-in handler 'add_sched_in_event'. For example when a task is switched-in while it's state is not ready to run(!= THREAD_WAIT_CPU). commit d9340c1db3f52460 ("perf sched: Display time in milliseconds, reorganize output") changed it from the original count to switches. So, renamed switches to count to make things a bit more clearer and added the metrics description of latency in the document. Reviewed-by: Aditya Gupta Signed-off-by: Madadi Vineeth Reddy Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240328090005.8321-1-vineethr@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 36 +++++++++++++++++++++++++++++++++ tools/perf/builtin-sched.c | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 5fbe42bd599b..a216d2991b19 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -20,6 +20,26 @@ There are several variants of 'perf sched': 'perf sched latency' to report the per task scheduling latencies and other scheduling properties of the workload. + Example usage: + perf sched record -- sleep 1 + perf sched latency + + ------------------------------------------------------------------------------------------------------------------------------------------- + Task | Runtime ms | Count | Avg delay ms | Max delay ms | Max delay start | Max delay end | + ------------------------------------------------------------------------------------------------------------------------------------------- + perf:(2) | 2.804 ms | 66 | avg: 0.524 ms | max: 1.069 ms | max start: 254752.314960 s | max end: 254752.316029 s + NetworkManager:1343 | 0.372 ms | 13 | avg: 0.008 ms | max: 0.013 ms | max start: 254751.551153 s | max end: 254751.551166 s + kworker/1:2-xfs:4649 | 0.012 ms | 1 | avg: 0.008 ms | max: 0.008 ms | max start: 254751.519807 s | max end: 254751.519815 s + kworker/3:1-xfs:388 | 0.011 ms | 1 | avg: 0.006 ms | max: 0.006 ms | max start: 254751.519809 s | max end: 254751.519815 s + sleep:147736 | 0.938 ms | 3 | avg: 0.006 ms | max: 0.007 ms | max start: 254751.313817 s | max end: 254751.313824 s + + It shows Runtime(time that a task spent actually running on the CPU), + Count(number of times a delay was calculated) and delay(time that a + task was ready to run but was kept waiting). + + Tasks with the same command name are merged and the merge count is + given within (), However if -p option is used, pid is mentioned. + 'perf sched script' to see a detailed trace of the workload that was recorded (aliased to 'perf script' for now). @@ -78,6 +98,22 @@ OPTIONS --force:: Don't complain, do it. +OPTIONS for 'perf sched latency' +------------------------------- + +-C:: +--CPU :: + CPU to profile on. + +-p:: +--pids:: + latency stats per pid instead of per command name. + +-s:: +--sort :: + sort by key(s): runtime, switch, avg, max + by default it's sorted by "avg ,max ,switch ,runtime". + OPTIONS for 'perf sched map' ---------------------------- diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 0fce7d8986c0..5977c49ae2c7 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3213,7 +3213,7 @@ static int perf_sched__lat(struct perf_sched *sched) perf_sched__sort_lat(sched); printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Max delay start | Max delay end |\n"); + printf(" Task | Runtime ms | Count | Avg delay ms | Max delay ms | Max delay start | Max delay end |\n"); printf(" -------------------------------------------------------------------------------------------------------------------------------------------\n"); next = rb_first_cached(&sched->sorted_atom_root); -- cgit v1.2.3 From 5ecab785396055c3d6f2207bf80be9feacdd7ddc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 8 May 2024 22:31:23 -0700 Subject: perf lock: Avoid memory leaks from strdup() Leak sanitizer complains about the strdup-ed arguments not being freed and given cmd_record doesn't modify the given strings, remove the strdups. Original discussion in this patch: https://lore.kernel.org/lkml/20240430184156.1824083-1-irogers@google.com/ Suggested-by: Namhyung Kim Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: zhaimingbing Link: https://lore.kernel.org/r/20240509053123.1918093-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 230461280e45..7007d26fe654 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2275,23 +2275,13 @@ setup_args: return -ENOMEM; for (i = 0; i < ARRAY_SIZE(record_args); i++) - rec_argv[i] = strdup(record_args[i]); + rec_argv[i] = record_args[i]; for (j = 0; j < nr_tracepoints; j++) { - const char *ev_name; - - if (has_lock_stat) - ev_name = strdup(lock_tracepoints[j].name); - else - ev_name = strdup(contention_tracepoints[j].name); - - if (!ev_name) { - free(rec_argv); - return -ENOMEM; - } - rec_argv[i++] = "-e"; - rec_argv[i++] = ev_name; + rec_argv[i++] = has_lock_stat + ? lock_tracepoints[j].name + : contention_tracepoints[j].name; } for (j = 0; j < nr_callgraph_args; j++, i++) -- cgit v1.2.3 From 230a7a71f92212e723fa435d4ca5922de33ec88a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 8 May 2024 22:20:15 -0700 Subject: libsubcmd: Fix parse-options memory leak If a usage string is built in parse_options_subcommand, also free it. Fixes: 901421a5bdf605d2 ("perf tools: Remove subcmd dependencies on strbuf") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240509052015.1914670-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/parse-options.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index 9fa75943f2ed..d943d78b787e 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -633,11 +633,10 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o const char *const subcommands[], const char *usagestr[], int flags) { struct parse_opt_ctx_t ctx; + char *buf = NULL; /* build usage string if it's not provided */ if (subcommands && !usagestr[0]) { - char *buf = NULL; - astrcatf(&buf, "%s %s [] {", subcmd_config.exec_name, argv[0]); for (int i = 0; subcommands[i]; i++) { @@ -679,7 +678,10 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o astrcatf(&error_buf, "unknown switch `%c'", *ctx.opt); usage_with_options(usagestr, options); } - + if (buf) { + usagestr[0] = NULL; + free(buf); + } return parse_options_end(&ctx); } -- cgit v1.2.3 From 09541603462c399c7408d50295db99b4b8042eaa Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Thu, 9 May 2024 17:34:24 -0700 Subject: perf daemon: Fix file leak in daemon_session__control The open() function returns -1 on error. The 'control' and 'ack' file descriptors are both initialized with open() and further validated with 'if' statement. 'if (!control)' would evaluate to 'true' if returned value on error were '0' but it is actually '-1'. Fixes: edcaa47958c7438b ("perf daemon: Add 'ping' command") Signed-off-by: Samasth Norway Ananda Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240510003424.2016914-1-samasth.norway.ananda@oracle.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 83954af36753..de76bbc50bfb 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -523,7 +523,7 @@ static int daemon_session__control(struct daemon_session *session, session->base, SESSION_CONTROL); control = open(control_path, O_WRONLY|O_NONBLOCK); - if (!control) + if (control < 0) return -1; if (do_ack) { @@ -532,7 +532,7 @@ static int daemon_session__control(struct daemon_session *session, session->base, SESSION_ACK); ack = open(ack_path, O_RDONLY, O_NONBLOCK); - if (!ack) { + if (ack < 0) { close(control); return -1; } -- cgit v1.2.3 From 7e6423441b36e3a03907e2df84b73c414c9c3763 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 6 May 2024 21:58:25 +1000 Subject: selftests/mm: fix powerpc ARCH check In commit 0518dbe97fe6 ("selftests/mm: fix cross compilation with LLVM") the logic to detect the machine architecture in the Makefile was changed to use ARCH, and only fallback to uname -m if ARCH is unset. However the tests of ARCH were not updated to account for the fact that ARCH is "powerpc" for powerpc builds, not "ppc64". Fix it by changing the checks to look for "powerpc", and change the uname -m logic to convert "ppc64.*" into "powerpc". With that fixed the following tests now build for powerpc again: * protection_keys * va_high_addr_switch * virtual_address_range * write_to_hugetlbfs Link: https://lkml.kernel.org/r/20240506115825.66415-1-mpe@ellerman.id.au Fixes: 0518dbe97fe6 ("selftests/mm: fix cross compilation with LLVM") Signed-off-by: Michael Ellerman Cc: Mark Brown Cc: [6.4+] Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eb5f39a2668b..410495e0a611 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -12,7 +12,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) else uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') endif -ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/powerpc/') endif # Without this, failed build products remain, with up-to-date timestamps, @@ -98,13 +98,13 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),ppc64)) +ifneq (,$(findstring $(ARCH),powerpc)) TEST_GEN_FILES += protection_keys endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs -- cgit v1.2.3 From c499fe96d3f75a5cf50de6089dd8f1cddd1301a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 May 2024 09:19:19 -0700 Subject: selftests: net: add missing config for amt.sh Test needs IPv6 multicast. smcroute currently crashes when trying to install a route in a kernel without IPv6 multicast. Fixes: c08e8baea78e ("selftests: add amt interface selftest script") Link: https://lore.kernel.org/r/20240509161919.3939966-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/config | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 5e4390cac17e..04de7a6ba6f3 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -30,6 +30,7 @@ CONFIG_IP_GRE=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NF_CONNTRACK=m +CONFIG_IPV6_MROUTE=y CONFIG_IPV6_SIT=y CONFIG_IP_DCCP=m CONFIG_NF_NAT=m -- cgit v1.2.3 From 4c639b6a7b9db236c0907aca8e92d1537076f2cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 May 2024 09:19:52 -0700 Subject: selftests: net: move amt to socat for better compatibility The test seems to expect that nc will exit after the first received message. This is not the case with Ncat 7.94. There are multiple versions of nc out there, switch to socat for better compatibility. Tell socat to exit after 128 bytes and pad the message. Since the test sets -e make sure we don't set exit code (|| true) and print the pass / fail rather then silently moving over the test and just setting non-zero exit code with no output indicating what failed. Fixes: c08e8baea78e ("selftests: add amt interface selftest script") Acked-by: Paolo Abeni Tested-by: Taehee Yoo Link: https://lore.kernel.org/r/20240509161952.3940476-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/amt.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh index 75528788cb95..5175a42cbe8a 100755 --- a/tools/testing/selftests/net/amt.sh +++ b/tools/testing/selftests/net/amt.sh @@ -210,8 +210,8 @@ check_features() test_ipv4_forward() { - RESULT4=$(ip netns exec "${LISTENER}" nc -w 1 -l -u 239.0.0.1 4000) - if [ "$RESULT4" == "172.17.0.2" ]; then + RESULT4=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP4-LISTEN:4000,readbytes=128 || true) + if echo "$RESULT4" | grep -q "172.17.0.2"; then printf "TEST: %-60s [ OK ]\n" "IPv4 amt multicast forwarding" exit 0 else @@ -222,8 +222,8 @@ test_ipv4_forward() test_ipv6_forward() { - RESULT6=$(ip netns exec "${LISTENER}" nc -w 1 -l -u ff0e::5:6 6000) - if [ "$RESULT6" == "2001:db8:3::2" ]; then + RESULT6=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP6-LISTEN:6000,readbytes=128 || true) + if echo "$RESULT6" | grep -q "2001:db8:3::2"; then printf "TEST: %-60s [ OK ]\n" "IPv6 amt multicast forwarding" exit 0 else @@ -236,14 +236,14 @@ send_mcast4() { sleep 2 ip netns exec "${SOURCE}" bash -c \ - 'echo 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' & + 'printf "%s %128s" 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' & } send_mcast6() { sleep 2 ip netns exec "${SOURCE}" bash -c \ - 'echo 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' & + 'printf "%s %128s" 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' & } check_features -- cgit v1.2.3 From 2d3b8dfd82d76b1295167c6453d683ab99e50794 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 May 2024 17:57:04 -0700 Subject: selftests: net: fix timestamp not arriving in cmsg_time.sh On slow machines the SND timestamp sometimes doesn't arrive before we quit. The test only waits as long as the packet delay, so it's easy for a race condition to happen. Double the wait but do a bit of polling, once the SND timestamp arrives there's no point to wait any longer. This fixes the "TXTIME abs" failures on debug kernels, like: Case ICMPv4 - TXTIME abs returned '', expected 'OK' Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240510005705.43069-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/cmsg_sender.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index c79e65581dc3..161db24e3c40 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -333,16 +333,17 @@ static const char *cs_ts_info2str(unsigned int info) return "unknown"; } -static void +static unsigned long cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) { struct sock_extended_err *see; struct scm_timestamping *ts; + unsigned long ts_seen = 0; struct cmsghdr *cmsg; int i, err; if (!opt.ts.ena) - return; + return 0; msg->msg_control = cbuf; msg->msg_controllen = cbuf_sz; @@ -396,8 +397,11 @@ cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) printf(" %5s ts%d %lluus\n", cs_ts_info2str(see->ee_info), i, rel_time); + ts_seen |= 1 << see->ee_info; } } + + return ts_seen; } static void ca_set_sockopts(int fd) @@ -509,10 +513,16 @@ int main(int argc, char *argv[]) err = ERN_SUCCESS; if (opt.ts.ena) { - /* Make sure all timestamps have time to loop back */ - usleep(opt.txtime.delay); + unsigned long seen; + int i; - cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf)); + /* Make sure all timestamps have time to loop back */ + for (i = 0; i < 40; i++) { + seen = cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf)); + if (seen & (1 << SCM_TSTAMP_SND)) + break; + usleep(opt.txtime.delay / 20); + } } err_out: -- cgit v1.2.3 From b9d5f5711dd8ea2297b952c6a35e6e918cf57948 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 May 2024 17:57:05 -0700 Subject: selftests: net: increase the delay for relative cmsg_time.sh test Slow machines can delay scheduling of the packets for milliseconds. Increase the delay to 8ms if KSFT_MACHINE_SLOW. Try to limit the variability by moving setsockopts earlier (before we read time). This fixes the "TXTIME rel" failures on debug kernels, like: Case ICMPv4 - TXTIME rel returned '', expected 'OK' Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240510005705.43069-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/cmsg_sender.c | 32 +++++++++++++++++-------------- tools/testing/selftests/net/cmsg_time.sh | 7 +++++-- 2 files changed, 23 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 161db24e3c40..876c2db02a63 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -260,15 +260,8 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) SOL_IPV6, IPV6_HOPLIMIT, &opt.v6.hlimit); if (opt.txtime.ena) { - struct sock_txtime so_txtime = { - .clockid = CLOCK_MONOTONIC, - }; __u64 txtime; - if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, - &so_txtime, sizeof(so_txtime))) - error(ERN_SOCKOPT, errno, "setsockopt TXTIME"); - txtime = time_start_mono.tv_sec * (1000ULL * 1000 * 1000) + time_start_mono.tv_nsec + opt.txtime.delay * 1000; @@ -284,13 +277,6 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) memcpy(CMSG_DATA(cmsg), &txtime, sizeof(txtime)); } if (opt.ts.ena) { - __u32 val = SOF_TIMESTAMPING_SOFTWARE | - SOF_TIMESTAMPING_OPT_TSONLY; - - if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, - &val, sizeof(val))) - error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING"); - cmsg = (struct cmsghdr *)(cbuf + cmsg_len); cmsg_len += CMSG_SPACE(sizeof(__u32)); if (cbuf_sz < cmsg_len) @@ -426,6 +412,24 @@ static void ca_set_sockopts(int fd) setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opt.sockopt.priority, sizeof(opt.sockopt.priority))) error(ERN_SOCKOPT, errno, "setsockopt SO_PRIORITY"); + + if (opt.txtime.ena) { + struct sock_txtime so_txtime = { + .clockid = CLOCK_MONOTONIC, + }; + + if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, + &so_txtime, sizeof(so_txtime))) + error(ERN_SOCKOPT, errno, "setsockopt TXTIME"); + } + if (opt.ts.ena) { + __u32 val = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_TSONLY; + + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, + &val, sizeof(val))) + error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING"); + } } int main(int argc, char *argv[]) diff --git a/tools/testing/selftests/net/cmsg_time.sh b/tools/testing/selftests/net/cmsg_time.sh index af85267ad1e3..1d7e756644bc 100755 --- a/tools/testing/selftests/net/cmsg_time.sh +++ b/tools/testing/selftests/net/cmsg_time.sh @@ -66,10 +66,13 @@ for i in "-4 $TGT4" "-6 $TGT6"; do awk '/SND/ { if ($3 > 1000) print "OK"; }') check_result $? "$ts" "OK" "$prot - TXTIME abs" - ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d 1000 | + [ "$KSFT_MACHINE_SLOW" = yes ] && delay=8000 || delay=1000 + + ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d $delay | awk '/SND/ {snd=$3} /SCHED/ {sch=$3} - END { if (snd - sch > 500) print "OK"; }') + END { if (snd - sch > '$((delay/2))') print "OK"; + else print snd, "-", sch, "<", '$((delay/2))'; }') check_result $? "$ts" "OK" "$prot - TXTIME rel" done done -- cgit v1.2.3 From 9ef30265a483f0405e4f7b3f15cda251b9a2c7da Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 10 May 2024 14:04:51 -0700 Subject: perf annotate: Fix segfault on sample histogram A symbol can have no samples, then accessing the annotated_source->samples hashmap will result in a segfault. Fixes: a3f7768bcf48281d ("perf annotate: Fix memory leak in annotated_source") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240510210452.2449944-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 541988cf6e19..1451caf25e77 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -113,10 +113,11 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src if (src == NULL) return; - hashmap__for_each_entry(src->samples, cur, bkt) - zfree(&cur->pvalue); - - hashmap__free(src->samples); + if (src->samples) { + hashmap__for_each_entry(src->samples, cur, bkt) + zfree(&cur->pvalue); + hashmap__free(src->samples); + } zfree(&src->histograms); free(src); } -- cgit v1.2.3 From 2af1280b190c408bd590704806dd0d2d1cf52db5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 10 May 2024 14:04:52 -0700 Subject: perf annotate-data: Ensure the number of type histograms Arnaldo reported that there is a case where nr_histograms and histograms don't agree each other. It ended up in a segfault trying to access a NULL histograms array. Let's make sure to update the nr_histograms when the histograms array is changed. Reported-by: Arnaldo Carvalho de Melo Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240510210452.2449944-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 57e7d4b3550b..965da6c0b542 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1800,7 +1800,6 @@ static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_en sz += sizeof(struct type_hist_entry) * adt->self.size; /* Allocate a table of pointers for each event */ - adt->nr_histograms = nr_entries; adt->histograms = calloc(nr_entries, sizeof(*adt->histograms)); if (adt->histograms == NULL) return -ENOMEM; @@ -1814,6 +1813,8 @@ static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_en if (adt->histograms[i] == NULL) goto err; } + + adt->nr_histograms = nr_entries; return 0; err: @@ -1827,7 +1828,9 @@ static void delete_data_type_histograms(struct annotated_data_type *adt) { for (int i = 0; i < adt->nr_histograms; i++) zfree(&(adt->histograms[i])); + zfree(&adt->histograms); + adt->nr_histograms = 0; } void annotated_data_type__tree_delete(struct rb_root *root) -- cgit v1.2.3 From 193a9e30207f54777ff42d0d8be8389edc522277 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 9 May 2024 22:13:09 -0700 Subject: perf stat: Don't display metric header for non-leader uncore events On an Intel tigerlake laptop a metric like: { "BriefDescription": "Test", "MetricExpr": "imc_free_running@data_read@ + imc_free_running@data_write@", "MetricGroup": "Test", "MetricName": "Test", "ScaleUnit": "6.103515625e-5MiB" }, Will have 4 events: uncore_imc_free_running_0/data_read/ uncore_imc_free_running_0/data_write/ uncore_imc_free_running_1/data_read/ uncore_imc_free_running_1/data_write/ If aggregration is disabled with metric-only 2 column headers are needed: $ perf stat -M test --metric-only -A -a sleep 1 Performance counter stats for 'system wide': MiB Test MiB Test CPU0 1821.0 1820.5 But when not, the counts aggregated in the metric leader and only 1 column should be shown: $ perf stat -M test --metric-only -a sleep 1 Performance counter stats for 'system wide': MiB Test 5909.4 1.001258915 seconds time elapsed Achieve this by skipping events that aren't metric leaders when printing column headers and aggregation isn't disabled. The bug is long standing, the fixes tag is set to a refactor as that is as far back as is reasonable to backport. Fixes: 088519f318be3a41 ("perf stat: Move the display functions to stat-display.c") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kaige Ye Cc: Kan Liang Cc: K Prateek Nayak Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yicong Yang Link: https://lore.kernel.org/r/20240510051309.2452468-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index bfc1d705f437..91d2f7f65df7 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -1223,6 +1223,9 @@ static void print_metric_headers(struct perf_stat_config *config, /* Print metrics headers only */ evlist__for_each_entry(evlist, counter) { + if (config->aggr_mode != AGGR_NONE && counter->metric_leader != counter) + continue; + os.evsel = counter; perf_stat__print_shadow_stats(config, counter, 0, -- cgit v1.2.3 From d9c5f5f94c2d356fdf3503f7fcaf254512bc032d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 10 May 2024 17:36:01 -0700 Subject: perf pmu: Count sys and cpuid JSON events separately Sys events are eagerly loaded as each event has a compat option that may mean the event is or isn't associated with the PMU. These shouldn't be counted as loaded_json_events as that is used for JSON events matching the CPUID that may or may not have been loaded. The mismatch causes issues on ARM64 that uses sys events. Fixes: e6ff1eed3584362d ("perf pmu: Lazily add JSON events") Closes: https://lore.kernel.org/lkml/20240510024729.1075732-1-justin.he@arm.com/ Reported-by: Jia He Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240511003601.2666907-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 70 +++++++++++++++++++++++++++++++++++---------------- tools/perf/util/pmu.h | 6 +++-- 2 files changed, 53 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index b3b072feef02..888ce9912275 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -36,6 +36,18 @@ struct perf_pmu perf_pmu__fake = { #define UNIT_MAX_LEN 31 /* max length for event unit name */ +enum event_source { + /* An event loaded from /sys/devices//events. */ + EVENT_SRC_SYSFS, + /* An event loaded from a CPUID matched json file. */ + EVENT_SRC_CPU_JSON, + /* + * An event loaded from a /sys/devices//identifier matched json + * file. + */ + EVENT_SRC_SYS_JSON, +}; + /** * struct perf_pmu_alias - An event either read from sysfs or builtin in * pmu-events.c, created by parsing the pmu-events json files. @@ -521,7 +533,7 @@ static int update_alias(const struct pmu_event *pe, static int perf_pmu__new_alias(struct perf_pmu *pmu, const char *name, const char *desc, const char *val, FILE *val_fd, - const struct pmu_event *pe) + const struct pmu_event *pe, enum event_source src) { struct perf_pmu_alias *alias; int ret; @@ -574,25 +586,30 @@ static int perf_pmu__new_alias(struct perf_pmu *pmu, const char *name, } snprintf(alias->unit, sizeof(alias->unit), "%s", unit); } - if (!pe) { - /* Update an event from sysfs with json data. */ - struct update_alias_data data = { - .pmu = pmu, - .alias = alias, - }; - + switch (src) { + default: + case EVENT_SRC_SYSFS: alias->from_sysfs = true; if (pmu->events_table) { + /* Update an event from sysfs with json data. */ + struct update_alias_data data = { + .pmu = pmu, + .alias = alias, + }; if (pmu_events_table__find_event(pmu->events_table, pmu, name, update_alias, &data) == 0) - pmu->loaded_json_aliases++; + pmu->cpu_json_aliases++; } - } - - if (!pe) pmu->sysfs_aliases++; - else - pmu->loaded_json_aliases++; + break; + case EVENT_SRC_CPU_JSON: + pmu->cpu_json_aliases++; + break; + case EVENT_SRC_SYS_JSON: + pmu->sys_json_aliases++; + break; + + } list_add_tail(&alias->list, &pmu->aliases); return 0; } @@ -653,7 +670,8 @@ static int __pmu_aliases_parse(struct perf_pmu *pmu, int events_dir_fd) } if (perf_pmu__new_alias(pmu, name, /*desc=*/ NULL, - /*val=*/ NULL, file, /*pe=*/ NULL) < 0) + /*val=*/ NULL, file, /*pe=*/ NULL, + EVENT_SRC_SYSFS) < 0) pr_debug("Cannot set up %s\n", name); fclose(file); } @@ -946,7 +964,8 @@ static int pmu_add_cpu_aliases_map_callback(const struct pmu_event *pe, { struct perf_pmu *pmu = vdata; - perf_pmu__new_alias(pmu, pe->name, pe->desc, pe->event, /*val_fd=*/ NULL, pe); + perf_pmu__new_alias(pmu, pe->name, pe->desc, pe->event, /*val_fd=*/ NULL, + pe, EVENT_SRC_CPU_JSON); return 0; } @@ -981,13 +1000,14 @@ static int pmu_add_sys_aliases_iter_fn(const struct pmu_event *pe, return 0; if (pmu_uncore_alias_match(pe->pmu, pmu->name) && - pmu_uncore_identifier_match(pe->compat, pmu->id)) { + pmu_uncore_identifier_match(pe->compat, pmu->id)) { perf_pmu__new_alias(pmu, pe->name, pe->desc, pe->event, /*val_fd=*/ NULL, - pe); + pe, + EVENT_SRC_SYS_JSON); } return 0; @@ -1082,6 +1102,12 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char pmu->max_precise = pmu_max_precise(dirfd, pmu); pmu->alias_name = pmu_find_alias_name(pmu, dirfd); pmu->events_table = perf_pmu__find_events_table(pmu); + /* + * Load the sys json events/aliases when loading the PMU as each event + * may have a different compat regular expression. We therefore can't + * know the number of sys json events/aliases without computing the + * regular expressions for them all. + */ pmu_add_sys_aliases(pmu); list_add_tail(&pmu->list, pmus); @@ -1739,12 +1765,14 @@ size_t perf_pmu__num_events(struct perf_pmu *pmu) size_t nr; pmu_aliases_parse(pmu); - nr = pmu->sysfs_aliases; + nr = pmu->sysfs_aliases + pmu->sys_json_aliases;; if (pmu->cpu_aliases_added) - nr += pmu->loaded_json_aliases; + nr += pmu->cpu_json_aliases; else if (pmu->events_table) - nr += pmu_events_table__num_events(pmu->events_table, pmu) - pmu->loaded_json_aliases; + nr += pmu_events_table__num_events(pmu->events_table, pmu) - pmu->cpu_json_aliases; + else + assert(pmu->cpu_json_aliases == 0); return pmu->selectable ? nr + 1 : nr; } diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 561716aa2b25..b2d3fd291f02 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -123,8 +123,10 @@ struct perf_pmu { const struct pmu_events_table *events_table; /** @sysfs_aliases: Number of sysfs aliases loaded. */ uint32_t sysfs_aliases; - /** @sysfs_aliases: Number of json event aliases loaded. */ - uint32_t loaded_json_aliases; + /** @cpu_json_aliases: Number of json event aliases loaded specific to the CPUID. */ + uint32_t cpu_json_aliases; + /** @sys_json_aliases: Number of json event aliases loaded matching the PMU's identifier. */ + uint32_t sys_json_aliases; /** @sysfs_aliases_loaded: Are sysfs aliases loaded from disk? */ bool sysfs_aliases_loaded; /** -- cgit v1.2.3 From 37dc2e0d38d6eb690ee043b43ee6f7cdf2994bf6 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:36 +0200 Subject: selftests/pidfd: Fix config for pidfd_setns_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Required by switch_timens() to open /proc/self/ns/time_for_children. CONFIG_GENERIC_VDSO_TIME_NS is not available on UML, so pidfd_setns_test cannot be run successfully on this architecture. Cc: Shuah Khan Fixes: 2b40c5db73e2 ("selftests/pidfd: add pidfd setns tests") Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20240511171445.904356-2-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/pidfd/config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config index f6f2965e17af..6133524710f7 100644 --- a/tools/testing/selftests/pidfd/config +++ b/tools/testing/selftests/pidfd/config @@ -3,5 +3,7 @@ CONFIG_IPC_NS=y CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y +CONFIG_TIME_NS=y +CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_CGROUPS=y CONFIG_CHECKPOINT_RESTORE=y -- cgit v1.2.3 From 7e4042abe2ee7c0977fd8bb049a6991b174a5e6f Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:37 +0200 Subject: selftests/landlock: Fix FS tests when run on a private mount point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the test environment, the mount point of the test's working directory may be shared or not, which changes the visibility of the nested "tmp" mount point for the test's parent process calling umount("tmp"). This was spotted while running tests in containers [1], where mount points are private. Cc: Günther Noack Cc: Shuah Khan Link: https://github.com/landlock-lsm/landlock-test-tools/pull/4 [1] Fixes: 41cca0542d7c ("selftests/harness: Fix TEST_F()'s vfork handling") Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240511171445.904356-3-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 9a6036fbf289..46b9effd53e4 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -293,7 +293,15 @@ static void prepare_layout(struct __test_metadata *const _metadata) static void cleanup_layout(struct __test_metadata *const _metadata) { set_cap(_metadata, CAP_SYS_ADMIN); - EXPECT_EQ(0, umount(TMP_DIR)); + if (umount(TMP_DIR)) { + /* + * According to the test environment, the mount point of the + * current directory may be shared or not, which changes the + * visibility of the nested TMP_DIR mount point for the test's + * parent process doing this cleanup. + */ + ASSERT_EQ(EINVAL, errno); + } clear_cap(_metadata, CAP_SYS_ADMIN); EXPECT_EQ(0, remove_path(TMP_DIR)); } -- cgit v1.2.3 From fff37bd32c7605d93bf900c4c318d56d12000048 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:38 +0200 Subject: selftests/harness: Fix fixture teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure fixture teardowns are run when test cases failed, including when _metadata->teardown_parent is set to true. Make sure only one fixture teardown is run per test case, handling the case where the test child forks. Cc: Jakub Kicinski Cc: Shengyu Li Cc: Shuah Khan Fixes: 72d7cb5c190b ("selftests/harness: Prevent infinite loop due to Assert in FIXTURE_TEARDOWN") Fixes: 0710a1a73fb4 ("selftests/harness: Merge TEST_F_FORK() into TEST_F()") Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240511171445.904356-4-mic@digikod.net Rule: add Link: https://lore.kernel.org/stable/20240506165518.474504-4-mic%40digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/kselftest_harness.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index d98702b6955d..55699a762c45 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -382,7 +382,10 @@ FIXTURE_DATA(fixture_name) self; \ pid_t child = 1; \ int status = 0; \ - bool jmp = false; \ + /* Makes sure there is only one teardown, even when child forks again. */ \ + bool *teardown = mmap(NULL, sizeof(*teardown), \ + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \ + *teardown = false; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ if (setjmp(_metadata->env) == 0) { \ /* Use the same _metadata. */ \ @@ -399,15 +402,16 @@ _metadata->exit_code = KSFT_FAIL; \ } \ } \ - else \ - jmp = true; \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent && !jmp) \ + if (_metadata->setup_completed && !_metadata->teardown_parent && \ + __sync_bool_compare_and_swap(teardown, false, true)) \ fixture_name##_teardown(_metadata, &self, variant->data); \ _exit(0); \ } \ - if (_metadata->setup_completed && _metadata->teardown_parent) \ + if (_metadata->setup_completed && _metadata->teardown_parent && \ + __sync_bool_compare_and_swap(teardown, false, true)) \ fixture_name##_teardown(_metadata, &self, variant->data); \ + munmap(teardown, sizeof(*teardown)); \ if (!WIFEXITED(status) && WIFSIGNALED(status)) \ /* Forward signal to __wait_for_test(). */ \ kill(getpid(), WTERMSIG(status)); \ -- cgit v1.2.3 From a86f18903db9211e265cc130b61adb175b7a4c42 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:39 +0200 Subject: selftests/harness: Fix interleaved scheduling leading to race conditions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a race condition when running several FIXTURE_TEARDOWN() managing the same resource. This fixes a race condition in the Landlock file system tests when creating or unmounting the same directory. Using clone3() with CLONE_VFORK guarantees that the child and grandchild test processes are sequentially scheduled. This is implemented with a new clone3_vfork() helper replacing the fork() call. This avoids triggering this error in __wait_for_test(): Test ended in some other way [127] Cc: Christian Brauner Cc: David S. Miller Cc: Günther Noack Cc: Jakub Kicinski Cc: Mark Brown Cc: Shuah Khan Cc: Will Drewry Fixes: 41cca0542d7c ("selftests/harness: Fix TEST_F()'s vfork handling") Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240511171445.904356-5-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/kselftest_harness.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 55699a762c45..9d7178a71c2c 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -66,6 +66,8 @@ #include #include #include +#include +#include #include "kselftest.h" @@ -80,6 +82,17 @@ # define TH_LOG_ENABLED 1 #endif +/* Wait for the child process to end but without sharing memory mapping. */ +static inline pid_t clone3_vfork(void) +{ + struct clone_args args = { + .flags = CLONE_VFORK, + .exit_signal = SIGCHLD, + }; + + return syscall(__NR_clone3, &args, sizeof(args)); +} + /** * TH_LOG() * @@ -1183,7 +1196,7 @@ void __run_test(struct __fixture_metadata *f, fflush(stdout); fflush(stderr); - t->pid = fork(); + t->pid = clone3_vfork(); if (t->pid < 0) { ksft_print_msg("ERROR SPAWNING TEST CHILD\n"); t->exit_code = KSFT_FAIL; -- cgit v1.2.3 From 3656bc23429a4d539c81b5cb8f17ceeeeca8901a Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:40 +0200 Subject: selftests/landlock: Do not allocate memory in fixture data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not allocate self->dir_path in the test process because this would not be visible in the FIXTURE_TEARDOWN() process when relying on fork()/clone3() instead of vfork(). This change is required for a following commit removing vfork() call to not break the layout3_fs.* test cases. Cc: Günther Noack Cc: Shuah Khan Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240511171445.904356-6-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 57 ++++++++++++++++++------------ 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 46b9effd53e4..1e2cffde02b5 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -9,6 +9,7 @@ #define _GNU_SOURCE #include +#include #include #include #include @@ -4624,7 +4625,6 @@ FIXTURE(layout3_fs) { bool has_created_dir; bool has_created_file; - char *dir_path; bool skip_test; }; @@ -4683,11 +4683,24 @@ FIXTURE_VARIANT_ADD(layout3_fs, hostfs) { .cwd_fs_magic = HOSTFS_SUPER_MAGIC, }; +static char *dirname_alloc(const char *path) +{ + char *dup; + + if (!path) + return NULL; + + dup = strdup(path); + if (!dup) + return NULL; + + return dirname(dup); +} + FIXTURE_SETUP(layout3_fs) { struct stat statbuf; - const char *slash; - size_t dir_len; + char *dir_path = dirname_alloc(variant->file_path); if (!supports_filesystem(variant->mnt.type) || !cwd_matches_fs(variant->cwd_fs_magic)) { @@ -4697,25 +4710,15 @@ FIXTURE_SETUP(layout3_fs) _metadata->teardown_parent = true; - slash = strrchr(variant->file_path, '/'); - ASSERT_NE(slash, NULL); - dir_len = (size_t)slash - (size_t)variant->file_path; - ASSERT_LT(0, dir_len); - self->dir_path = malloc(dir_len + 1); - self->dir_path[dir_len] = '\0'; - strncpy(self->dir_path, variant->file_path, dir_len); - prepare_layout_opt(_metadata, &variant->mnt); /* Creates directory when required. */ - if (stat(self->dir_path, &statbuf)) { + if (stat(dir_path, &statbuf)) { set_cap(_metadata, CAP_DAC_OVERRIDE); - EXPECT_EQ(0, mkdir(self->dir_path, 0700)) + EXPECT_EQ(0, mkdir(dir_path, 0700)) { TH_LOG("Failed to create directory \"%s\": %s", - self->dir_path, strerror(errno)); - free(self->dir_path); - self->dir_path = NULL; + dir_path, strerror(errno)); } self->has_created_dir = true; clear_cap(_metadata, CAP_DAC_OVERRIDE); @@ -4736,6 +4739,8 @@ FIXTURE_SETUP(layout3_fs) self->has_created_file = true; clear_cap(_metadata, CAP_DAC_OVERRIDE); } + + free(dir_path); } FIXTURE_TEARDOWN(layout3_fs) @@ -4754,16 +4759,17 @@ FIXTURE_TEARDOWN(layout3_fs) } if (self->has_created_dir) { + char *dir_path = dirname_alloc(variant->file_path); + set_cap(_metadata, CAP_DAC_OVERRIDE); /* * Don't check for error because the directory might already * have been removed (cf. release_inode test). */ - rmdir(self->dir_path); + rmdir(dir_path); clear_cap(_metadata, CAP_DAC_OVERRIDE); + free(dir_path); } - free(self->dir_path); - self->dir_path = NULL; cleanup_layout(_metadata); } @@ -4830,7 +4836,10 @@ TEST_F_FORK(layout3_fs, tag_inode_dir_mnt) TEST_F_FORK(layout3_fs, tag_inode_dir_child) { - layer3_fs_tag_inode(_metadata, self, variant, self->dir_path); + char *dir_path = dirname_alloc(variant->file_path); + + layer3_fs_tag_inode(_metadata, self, variant, dir_path); + free(dir_path); } TEST_F_FORK(layout3_fs, tag_inode_file) @@ -4857,9 +4866,13 @@ TEST_F_FORK(layout3_fs, release_inodes) if (self->has_created_file) EXPECT_EQ(0, remove_path(variant->file_path)); - if (self->has_created_dir) + if (self->has_created_dir) { + char *dir_path = dirname_alloc(variant->file_path); + /* Don't check for error because of cgroup specificities. */ - remove_path(self->dir_path); + remove_path(dir_path); + free(dir_path); + } ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_DIR, layer1); -- cgit v1.2.3 From cc80aa9a22c00a5e23ea9b4933f9d3ec8f686cb2 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:41 +0200 Subject: selftests/harness: Constify fixture variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FIXTURE_VARIANT_ADD() types are passed as const pointers to FIXTURE_TEARDOWN(). Make that explicit by constifying the variants declarations. Cc: Shuah Khan Cc: Will Drewry Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240511171445.904356-7-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/kselftest_harness.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 9d7178a71c2c..201040207c85 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -338,7 +338,7 @@ static inline pid_t clone3_vfork(void) * variant. */ #define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ - extern FIXTURE_VARIANT(fixture_name) \ + extern const FIXTURE_VARIANT(fixture_name) \ _##fixture_name##_##variant_name##_variant; \ static struct __fixture_variant_metadata \ _##fixture_name##_##variant_name##_object = \ @@ -350,7 +350,7 @@ static inline pid_t clone3_vfork(void) __register_fixture_variant(&_##fixture_name##_fixture_object, \ &_##fixture_name##_##variant_name##_object); \ } \ - FIXTURE_VARIANT(fixture_name) \ + const FIXTURE_VARIANT(fixture_name) \ _##fixture_name##_##variant_name##_variant = /** -- cgit v1.2.3 From 821bc4a8fd2454ff6d719aae7cac93f60567fe65 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:42 +0200 Subject: selftests/pidfd: Fix wrong expectation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace a wrong EXPECT_GT(self->child_pid_exited, 0) with EXPECT_GE(), which will be actually tested on the parent and child sides with a following commit. Cc: Shuah Khan Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20240511171445.904356-8-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/pidfd/pidfd_setns_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 6e2f2cd400ca..47746b0c6acd 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -158,7 +158,7 @@ FIXTURE_SETUP(current_nsset) /* Create task that exits right away. */ self->child_pid_exited = create_child(&self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET); - EXPECT_GT(self->child_pid_exited, 0); + EXPECT_GE(self->child_pid_exited, 0); if (self->child_pid_exited == 0) _exit(EXIT_SUCCESS); -- cgit v1.2.3 From 24cf65a6226643f0f4be16fb2f9c0575b0edd967 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:43 +0200 Subject: selftests/harness: Share _metadata between forked processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unconditionally share _metadata between all forked processes, which enables to actually catch errors which were previously ignored. This is required for a following commit replacing vfork() with clone3() and CLONE_VFORK (i.e. not sharing the full memory) . It should also be useful to share _metadata to extend expectations to test process's forks. For instance, this change identified a wrong expectation in pidfd_setns_test. Because this _metadata is used by the new XFAIL_ADD(), use a global pointer initialized in TEST_F(). This is OK because only XFAIL_ADD() use it, and XFAIL_ADD() already depends on TEST_F(). Cc: Jakub Kicinski Cc: Shuah Khan Cc: Will Drewry Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240511171445.904356-9-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/kselftest_harness.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 201040207c85..28415798fa60 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -430,19 +430,19 @@ static inline pid_t clone3_vfork(void) kill(getpid(), WTERMSIG(status)); \ __test_check_assert(_metadata); \ } \ - static struct __test_metadata \ - _##fixture_name##_##test_name##_object = { \ - .name = #test_name, \ - .fn = &wrapper_##fixture_name##_##test_name, \ - .fixture = &_##fixture_name##_fixture_object, \ - .termsig = signal, \ - .timeout = tmout, \ - .teardown_parent = false, \ - }; \ + static struct __test_metadata *_##fixture_name##_##test_name##_object; \ static void __attribute__((constructor)) \ _register_##fixture_name##_##test_name(void) \ { \ - __register_test(&_##fixture_name##_##test_name##_object); \ + struct __test_metadata *object = mmap(NULL, sizeof(*object), \ + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \ + object->name = #test_name; \ + object->fn = &wrapper_##fixture_name##_##test_name; \ + object->fixture = &_##fixture_name##_fixture_object; \ + object->termsig = signal; \ + object->timeout = tmout; \ + _##fixture_name##_##test_name##_object = object; \ + __register_test(object); \ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ @@ -850,11 +850,12 @@ struct __test_xfail { { \ .fixture = &_##fixture_name##_fixture_object, \ .variant = &_##fixture_name##_##variant_name##_object, \ - .test = &_##fixture_name##_##test_name##_object, \ }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_##variant_name##_##test_name##_xfail(void) \ { \ + _##fixture_name##_##variant_name##_##test_name##_xfail.test = \ + _##fixture_name##_##test_name##_object; \ __register_xfail(&_##fixture_name##_##variant_name##_##test_name##_xfail); \ } @@ -1181,6 +1182,9 @@ void __run_test(struct __fixture_metadata *f, /* reset test struct */ t->exit_code = KSFT_PASS; t->trigger = 0; + t->aborted = false; + t->setup_completed = false; + memset(t->env, 0, sizeof(t->env)); memset(t->results->reason, 0, sizeof(t->results->reason)); if (asprintf(&test_name, "%s%s%s.%s", f->name, -- cgit v1.2.3 From f453cc30027b184c0a109d689b1335e6c826d514 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:44 +0200 Subject: selftests/harness: Fix vfork() side effects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting the time namespace with CLONE_NEWTIME returns -EUSERS if the calling thread shares memory with another thread (because of the shared vDSO), which is the case when it is created with vfork(). Fix pidfd_setns_test by replacing test harness's vfork() call with a clone3() call with CLONE_VFORK, and an explicit sharing of the _metadata and self objects. Replace _metadata->teardown_parent with a new FIXTURE_TEARDOWN_PARENT() helper that can replace FIXTURE_TEARDOWN(). This is a cleaner approach and it enables to selectively share the fixture data between the child process running tests and the parent process running the fixture teardown. This also avoids updating several tests to not rely on the self object's copy-on-write property (e.g. storing the returned value of a fork() call). Cc: Christian Brauner Cc: David S. Miller Cc: Günther Noack Cc: Jakub Kicinski Cc: Mark Brown Cc: Shuah Khan Cc: Will Drewry Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403291015.1fcfa957-oliver.sang@intel.com Fixes: 0710a1a73fb4 ("selftests/harness: Merge TEST_F_FORK() into TEST_F()") Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240511171445.904356-10-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/kselftest_harness.h | 66 ++++++++++++++++++++++------- tools/testing/selftests/landlock/fs_test.c | 16 +++---- 2 files changed, 57 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 28415798fa60..cbedb4a6cf7b 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -294,6 +294,32 @@ static inline pid_t clone3_vfork(void) * A bare "return;" statement may be used to return early. */ #define FIXTURE_TEARDOWN(fixture_name) \ + static const bool fixture_name##_teardown_parent; \ + __FIXTURE_TEARDOWN(fixture_name) + +/** + * FIXTURE_TEARDOWN_PARENT() + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_TEARDOWN_PARENT(fixture_name) { implementation } + * + * Same as FIXTURE_TEARDOWN() but run this code in a parent process. This + * enables the test process to drop its privileges without impacting the + * related FIXTURE_TEARDOWN_PARENT() (e.g. to remove files from a directory + * where write access was dropped). + * + * To make it possible for the parent process to use *self*, share (MAP_SHARED) + * the fixture data between all forked processes. + */ +#define FIXTURE_TEARDOWN_PARENT(fixture_name) \ + static const bool fixture_name##_teardown_parent = true; \ + __FIXTURE_TEARDOWN(fixture_name) + +#define __FIXTURE_TEARDOWN(fixture_name) \ void fixture_name##_teardown( \ struct __test_metadata __attribute__((unused)) *_metadata, \ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ @@ -368,10 +394,11 @@ static inline pid_t clone3_vfork(void) * Very similar to TEST() except that *self* is the setup instance of fixture's * datatype exposed for use by the implementation. * - * The @test_name code is run in a separate process sharing the same memory - * (i.e. vfork), which means that the test process can update its privileges - * without impacting the related FIXTURE_TEARDOWN() (e.g. to remove files from - * a directory where write access was dropped). + * The _metadata object is shared (MAP_SHARED) with all the potential forked + * processes, which enables them to use EXCEPT_*() and ASSERT_*(). + * + * The *self* object is only shared with the potential forked processes if + * FIXTURE_TEARDOWN_PARENT() is used instead of FIXTURE_TEARDOWN(). */ #define TEST_F(fixture_name, test_name) \ __TEST_F_IMPL(fixture_name, test_name, -1, TEST_TIMEOUT_DEFAULT) @@ -392,39 +419,49 @@ static inline pid_t clone3_vfork(void) struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ - FIXTURE_DATA(fixture_name) self; \ + FIXTURE_DATA(fixture_name) self_private, *self = NULL; \ pid_t child = 1; \ int status = 0; \ /* Makes sure there is only one teardown, even when child forks again. */ \ bool *teardown = mmap(NULL, sizeof(*teardown), \ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \ *teardown = false; \ - memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ + if (sizeof(*self) > 0) { \ + if (fixture_name##_teardown_parent) { \ + self = mmap(NULL, sizeof(*self), PROT_READ | PROT_WRITE, \ + MAP_SHARED | MAP_ANONYMOUS, -1, 0); \ + } else { \ + memset(&self_private, 0, sizeof(self_private)); \ + self = &self_private; \ + } \ + } \ if (setjmp(_metadata->env) == 0) { \ - /* Use the same _metadata. */ \ - child = vfork(); \ + /* _metadata and potentially self are shared with all forks. */ \ + child = clone3_vfork(); \ if (child == 0) { \ - fixture_name##_setup(_metadata, &self, variant->data); \ + fixture_name##_setup(_metadata, self, variant->data); \ /* Let setup failure terminate early. */ \ if (_metadata->exit_code) \ _exit(0); \ _metadata->setup_completed = true; \ - fixture_name##_##test_name(_metadata, &self, variant->data); \ + fixture_name##_##test_name(_metadata, self, variant->data); \ } else if (child < 0 || child != waitpid(child, &status, 0)) { \ ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \ _metadata->exit_code = KSFT_FAIL; \ } \ } \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent && \ + if (_metadata->setup_completed && !fixture_name##_teardown_parent && \ __sync_bool_compare_and_swap(teardown, false, true)) \ - fixture_name##_teardown(_metadata, &self, variant->data); \ + fixture_name##_teardown(_metadata, self, variant->data); \ _exit(0); \ } \ - if (_metadata->setup_completed && _metadata->teardown_parent && \ + if (_metadata->setup_completed && fixture_name##_teardown_parent && \ __sync_bool_compare_and_swap(teardown, false, true)) \ - fixture_name##_teardown(_metadata, &self, variant->data); \ + fixture_name##_teardown(_metadata, self, variant->data); \ munmap(teardown, sizeof(*teardown)); \ + if (self && fixture_name##_teardown_parent) \ + munmap(self, sizeof(*self)); \ if (!WIFEXITED(status) && WIFSIGNALED(status)) \ /* Forward signal to __wait_for_test(). */ \ kill(getpid(), WTERMSIG(status)); \ @@ -898,7 +935,6 @@ struct __test_metadata { bool timed_out; /* did this test timeout instead of exiting? */ bool aborted; /* stopped test due to failed ASSERT */ bool setup_completed; /* did setup finish? */ - bool teardown_parent; /* run teardown in a parent process */ jmp_buf env; /* for exiting out of test early */ struct __test_results *results; struct __test_metadata *prev, *next; diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 1e2cffde02b5..27744524df51 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -286,8 +286,6 @@ static void prepare_layout_opt(struct __test_metadata *const _metadata, static void prepare_layout(struct __test_metadata *const _metadata) { - _metadata->teardown_parent = true; - prepare_layout_opt(_metadata, &mnt_tmp); } @@ -316,7 +314,7 @@ FIXTURE_SETUP(layout0) prepare_layout(_metadata); } -FIXTURE_TEARDOWN(layout0) +FIXTURE_TEARDOWN_PARENT(layout0) { cleanup_layout(_metadata); } @@ -379,7 +377,7 @@ FIXTURE_SETUP(layout1) create_layout1(_metadata); } -FIXTURE_TEARDOWN(layout1) +FIXTURE_TEARDOWN_PARENT(layout1) { remove_layout1(_metadata); @@ -3692,7 +3690,7 @@ FIXTURE_SETUP(ftruncate) create_file(_metadata, file1_s1d1); } -FIXTURE_TEARDOWN(ftruncate) +FIXTURE_TEARDOWN_PARENT(ftruncate) { EXPECT_EQ(0, remove_path(file1_s1d1)); cleanup_layout(_metadata); @@ -3870,7 +3868,7 @@ FIXTURE_SETUP(layout1_bind) clear_cap(_metadata, CAP_SYS_ADMIN); } -FIXTURE_TEARDOWN(layout1_bind) +FIXTURE_TEARDOWN_PARENT(layout1_bind) { /* umount(dir_s2d2)) is handled by namespace lifetime. */ @@ -4275,7 +4273,7 @@ FIXTURE_SETUP(layout2_overlay) clear_cap(_metadata, CAP_SYS_ADMIN); } -FIXTURE_TEARDOWN(layout2_overlay) +FIXTURE_TEARDOWN_PARENT(layout2_overlay) { if (self->skip_test) SKIP(return, "overlayfs is not supported (teardown)"); @@ -4708,8 +4706,6 @@ FIXTURE_SETUP(layout3_fs) SKIP(return, "this filesystem is not supported (setup)"); } - _metadata->teardown_parent = true; - prepare_layout_opt(_metadata, &variant->mnt); /* Creates directory when required. */ @@ -4743,7 +4739,7 @@ FIXTURE_SETUP(layout3_fs) free(dir_path); } -FIXTURE_TEARDOWN(layout3_fs) +FIXTURE_TEARDOWN_PARENT(layout3_fs) { if (self->skip_test) SKIP(return, "this filesystem is not supported (teardown)"); -- cgit v1.2.3 From 323feb3bdb67649bfa5614eb24ec9cb92a60cf33 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sat, 11 May 2024 19:14:45 +0200 Subject: selftests/harness: Handle TEST_F()'s explicit exit codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If TEST_F() explicitly calls exit(code) with code different than 0, then _metadata->exit_code is set to this code (e.g. KVM_ONE_VCPU_TEST()). We need to keep in mind that _metadata->exit_code can be KSFT_SKIP while the process exit code is 0. Cc: Jakub Kicinski Cc: Kees Cook Cc: Mark Brown Cc: Shuah Khan Cc: Will Drewry Reported-by: Sean Christopherson Tested-by: Sean Christopherson Closes: https://lore.kernel.org/r/ZjPelW6-AbtYvslu@google.com Fixes: 0710a1a73fb4 ("selftests/harness: Merge TEST_F_FORK() into TEST_F()") Link: https://lore.kernel.org/r/20240511171445.904356-11-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/kselftest_harness.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index cbedb4a6cf7b..3c8f2965c285 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -462,9 +462,13 @@ static inline pid_t clone3_vfork(void) munmap(teardown, sizeof(*teardown)); \ if (self && fixture_name##_teardown_parent) \ munmap(self, sizeof(*self)); \ - if (!WIFEXITED(status) && WIFSIGNALED(status)) \ + if (WIFEXITED(status)) { \ + if (WEXITSTATUS(status)) \ + _metadata->exit_code = WEXITSTATUS(status); \ + } else if (WIFSIGNALED(status)) { \ /* Forward signal to __wait_for_test(). */ \ kill(getpid(), WTERMSIG(status)); \ + } \ __test_check_assert(_metadata); \ } \ static struct __test_metadata *_##fixture_name##_##test_name##_object; \ -- cgit v1.2.3 From d14d6b0e7da3c2d768418fc7e5ad65c359c35962 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 2 May 2024 10:27:17 -0700 Subject: selftests/damon/_damon_sysfs: support quota goals Patch series "selftests/damon: add DAMOS quota goal test". Extend DAMON selftest-purpose sysfs wrapper to support DAMOS quota goal, and implement a simple selftest for the feature using it. This patch (of 2): The DAMON sysfs test purpose wrapper, _damon_sysfs.py, is not supporting quota goals. Implement the support for testing the feature. The test will be implemented and added by the following commit. Link: https://lkml.kernel.org/r/20240502172718.74166-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240502172718.74166-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 84 ++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index d23d7398a27a..036e64070fdb 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -70,16 +70,56 @@ class DamosAccessPattern: if err != None: return err +qgoal_metric_user_input = 'user_input' +qgoal_metric_some_mem_psi_us = 'some_mem_psi_us' +qgoal_metrics = [qgoal_metric_user_input, qgoal_metric_some_mem_psi_us] + +class DamosQuotaGoal: + metric = None + target_value = None + current_value = None + effective_bytes = None + quota = None # owner quota + idx = None + + def __init__(self, metric, target_value=10000, current_value=0): + self.metric = metric + self.target_value = target_value + self.current_value = current_value + + def sysfs_dir(self): + return os.path.join(self.quota.sysfs_dir(), 'goals', '%d' % self.idx) + + def stage(self): + err = write_file(os.path.join(self.sysfs_dir(), 'target_metric'), + self.metric) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'target_value'), + self.target_value) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'current_value'), + self.current_value) + if err is not None: + return err + return None + class DamosQuota: sz = None # size quota, in bytes ms = None # time quota + goals = None # quota goals reset_interval_ms = None # quota reset interval scheme = None # owner scheme - def __init__(self, sz=0, ms=0, reset_interval_ms=0): + def __init__(self, sz=0, ms=0, goals=None, reset_interval_ms=0): self.sz = sz self.ms = ms self.reset_interval_ms = reset_interval_ms + self.goals = goals if goals is not None else [] + for idx, goal in enumerate(self.goals): + goal.idx = idx + goal.quota = self def sysfs_dir(self): return os.path.join(self.scheme.sysfs_dir(), 'quotas') @@ -96,6 +136,20 @@ class DamosQuota: if err != None: return err + nr_goals_file = os.path.join(self.sysfs_dir(), 'goals', 'nr_goals') + content, err = read_file(nr_goals_file) + if err is not None: + return err + if int(content) != len(self.goals): + err = write_file(nr_goals_file, len(self.goals)) + if err is not None: + return err + for goal in self.goals: + err = goal.stage() + if err is not None: + return err + return None + class DamosStats: nr_tried = None sz_tried = None @@ -361,6 +415,34 @@ class Kdamond: stat_values.append(int(content)) scheme.stats = DamosStats(*stat_values) + def update_schemes_effective_quotas(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_effective_quotas') + if err is not None: + return err + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + content, err = read_file( + os.path.join(scheme.quota.sysfs_dir(), + 'effective_bytes')) + if err is not None: + return err + goal.effective_bytes = int(content) + return None + + def commit_schemes_quota_goals(self): + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + err = goal.stage() + if err is not None: + print('commit_schemes_quota_goals failed stagign: %s'% + err) + exit(1) + return write_file(os.path.join(self.sysfs_dir(), 'state'), + 'commit_schemes_quota_goals') + class Kdamonds: kdamonds = [] -- cgit v1.2.3 From f1c07c0a1662b4f8aa7dae381013729aac6ba691 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 2 May 2024 10:27:18 -0700 Subject: selftests/damon: add a test for DAMOS quota goal Add a selftest for DAMOS quota goal. It tests the feature by setting a user_input metric based goal, change the current feedback, and check if the effective quota size is increased and decreased as expected. Link: https://lkml.kernel.org/r/20240502172718.74166-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- tools/testing/selftests/damon/damos_quota_goal.py | 77 +++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/damos_quota_goal.py (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 789d6949c247..06c248880172 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -16,7 +16,7 @@ TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py -TEST_PROGS += damos_quota.py damos_apply_interval.py +TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py new file mode 100644 index 000000000000..18246f3b62f7 --- /dev/null +++ b/tools/testing/selftests/damon/damos_quota_goal.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + goal = _damon_sysfs.DamosQuotaGoal( + metric=_damon_sysfs.qgoal_metric_user_input, target_value=10000) + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos( + action='stat', + quota=_damon_sysfs.DamosQuota( + goals=[goal], reset_interval_ms=100), + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + score_values_to_test = [0, 15000, 5000, 18000] + while proc.poll() == None: + if len(score_values_to_test) == 0: + time.sleep(0.1) + continue + + goal.current_value = score_values_to_test.pop(0) + expect_increase = goal.current_value < goal.target_value + + err = kdamonds.kdamonds[0].commit_schemes_quota_goals() + if err is not None: + print('commit_schemes_quota_goals failed: %s' % err) + exit(1) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('before-update_schemes_effective_quotas failed: %s' % err) + exit(1) + last_effective_bytes = goal.effective_bytes + + time.sleep(0.5) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('after-update_schemes_effective_quotas failed: %s' % err) + exit(1) + + print('score: %s, effective quota: %d -> %d (%.3fx)' % ( + goal.current_value, last_effective_bytes, goal.effective_bytes, + goal.effective_bytes / last_effective_bytes + if last_effective_bytes != 0 else -1.0)) + + if last_effective_bytes == goal.effective_bytes: + print('efective bytes not changed: %d' % goal.effective_bytes) + exit(1) + + increased = last_effective_bytes < goal.effective_bytes + if expect_increase != increased: + print('expectation of increase (%s) != increased (%s)' % + (expect_increase, increased)) + exit(1) + last_effective_bytes = goal.effective_bytes + +if __name__ == '__main__': + main() -- cgit v1.2.3 From 732b8815c079199d29b0426d9372bb098c63cdc7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 May 2024 11:03:10 -0700 Subject: selftests/damon/_damon_sysfs: check errors from nr_schemes file reads DAMON context staging method in _damon_sysfs.py is not checking the returned error from nr_schemes file read. Check it. Link: https://lkml.kernel.org/r/20240503180318.72798-3-sj@kernel.org Fixes: f5f0e5a2bef9 ("selftests/damon/_damon_sysfs: implement kdamonds start function") Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 036e64070fdb..43239e5b38b2 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -341,6 +341,8 @@ class DamonCtx: nr_schemes_file = os.path.join( self.sysfs_dir(), 'schemes', 'nr_schemes') content, err = read_file(nr_schemes_file) + if err is not None: + return err if int(content) != len(self.schemes): err = write_file(nr_schemes_file, '%d' % len(self.schemes)) if err != None: -- cgit v1.2.3 From e799fda6926ec01f8488023d4f891289996322aa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 May 2024 11:03:11 -0700 Subject: selftests/damon/_damon_sysfs: find sysfs mount point from /proc/mounts _damon_sysfs.py assumes sysfs is mounted at /sys. In some systems, that might not be true. Find the mount point from /proc/mounts file content. Link: https://lkml.kernel.org/r/20240503180318.72798-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 43239e5b38b2..9682e18dd450 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -2,7 +2,18 @@ import os -sysfs_root = '/sys/kernel/mm/damon/admin' +ksft_skip=4 + +sysfs_root = None +with open('/proc/mounts', 'r') as f: + for line in f: + dev_name, mount_point, dev_fs = line.split()[:3] + if dev_fs == 'sysfs': + sysfs_root = '%s/kernel/mm/damon/admin' % mount_point + break +if sysfs_root is None: + print('Seems sysfs not mounted?') + exit(ksft_skip) def write_file(path, string): "Returns error string if failed, or None otherwise" -- cgit v1.2.3 From 06cf8ce12cd659a3064c2e7b0208a2c0a3426838 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 May 2024 11:03:12 -0700 Subject: selftests/damon/_damon_sysfs: use 'is' instead of '==' for 'None' _damon_sysfs.py is using '==' or '!=' for 'None'. Since 'None' is a singleton, using 'is' or 'is not' is more efficient. Use the more efficient one. Link: https://lkml.kernel.org/r/20240503180318.72798-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 80 +++++++++++++-------------- 1 file changed, 40 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 9682e18dd450..2bd44c32be1b 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -45,11 +45,11 @@ class DamosAccessPattern: self.nr_accesses = nr_accesses self.age = age - if self.size == None: + if self.size is None: self.size = [0, 2**64 - 1] - if self.nr_accesses == None: + if self.nr_accesses is None: self.nr_accesses = [0, 2**64 - 1] - if self.age == None: + if self.age is None: self.age = [0, 2**64 - 1] def sysfs_dir(self): @@ -58,27 +58,27 @@ class DamosAccessPattern: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'min'), self.size[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'max'), self.size[1]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'min'), self.nr_accesses[0]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'max'), self.nr_accesses[1]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'min'), self.age[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'max'), self.age[1]) - if err != None: + if err is not None: return err qgoal_metric_user_input = 'user_input' @@ -137,14 +137,14 @@ class DamosQuota: def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'), self.reset_interval_ms) - if err != None: + if err is not None: return err nr_goals_file = os.path.join(self.sysfs_dir(), 'goals', 'nr_goals') @@ -201,30 +201,30 @@ class Damos: def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'action'), self.action) - if err != None: + if err is not None: return err err = self.access_pattern.stage() - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'), '%d' % self.apply_interval_us) - if err != None: + if err is not None: return err err = self.quota.stage() - if err != None: + if err is not None: return err # disable watermarks err = write_file( os.path.join(self.sysfs_dir(), 'watermarks', 'metric'), 'none') - if err != None: + if err is not None: return err # disable filters err = write_file( os.path.join(self.sysfs_dir(), 'filters', 'nr_filters'), '0') - if err != None: + if err is not None: return err class DamonTarget: @@ -243,7 +243,7 @@ class DamonTarget: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') - if err != None: + if err is not None: return err return write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) @@ -275,27 +275,27 @@ class DamonAttrs: def stage(self): err = write_file(os.path.join(self.interval_sysfs_dir(), 'sample_us'), self.sample_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'aggr_us'), self.aggr_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'update_us'), self.update_us) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'min'), self.min_nr_regions) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'max'), self.max_nr_regions) - if err != None: + if err is not None: return err class DamonCtx: @@ -329,24 +329,24 @@ class DamonCtx: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'operations'), self.ops) - if err != None: + if err is not None: return err err = self.monitoring_attrs.stage() - if err != None: + if err is not None: return err nr_targets_file = os.path.join( self.sysfs_dir(), 'targets', 'nr_targets') content, err = read_file(nr_targets_file) - if err != None: + if err is not None: return err if int(content) != len(self.targets): err = write_file(nr_targets_file, '%d' % len(self.targets)) - if err != None: + if err is not None: return err for target in self.targets: err = target.stage() - if err != None: + if err is not None: return err nr_schemes_file = os.path.join( @@ -356,11 +356,11 @@ class DamonCtx: return err if int(content) != len(self.schemes): err = write_file(nr_schemes_file, '%d' % len(self.schemes)) - if err != None: + if err is not None: return err for scheme in self.schemes: err = scheme.stage() - if err != None: + if err is not None: return err return None @@ -384,16 +384,16 @@ class Kdamond: nr_contexts_file = os.path.join(self.sysfs_dir(), 'contexts', 'nr_contexts') content, err = read_file(nr_contexts_file) - if err != None: + if err is not None: return err if int(content) != len(self.contexts): err = write_file(nr_contexts_file, '%d' % len(self.contexts)) - if err != None: + if err is not None: return err for context in self.contexts: err = context.stage() - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err @@ -401,20 +401,20 @@ class Kdamond: def update_schemes_tried_bytes(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_tried_bytes') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: content, err = read_file(os.path.join(scheme.sysfs_dir(), 'tried_regions', 'total_bytes')) - if err != None: + if err is not None: return err scheme.tried_bytes = int(content) def update_schemes_stats(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_stats') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: @@ -423,7 +423,7 @@ class Kdamond: 'sz_applied', 'qt_exceeds']: content, err = read_file( os.path.join(scheme.sysfs_dir(), 'stats', stat)) - if err != None: + if err is not None: return err stat_values.append(int(content)) scheme.stats = DamosStats(*stat_values) @@ -471,10 +471,10 @@ class Kdamonds: def start(self): err = write_file(os.path.join(self.sysfs_dir(), 'nr_kdamonds'), '%s' % len(self.kdamonds)) - if err != None: + if err is not None: return err for kdamond in self.kdamonds: err = kdamond.start() - if err != None: + if err is not None: return err return None -- cgit v1.2.3 From 5965d5615b75bbc770a7f36895f91cc72cfd4118 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 May 2024 11:03:13 -0700 Subject: selftests/damon: classify tests for functionalities and regressions DAMON selftests can be classified into two categories: functionalities and regressions. Functionality tests are for checking if the function is working as specified, while the regression tests are basically reproducers of previously reported and fixed bugs. The tests of the categories are mixed in the selftests Makefile. Separate those for easier understanding of the types of tests. Link: https://lkml.kernel.org/r/20240503180318.72798-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 06c248880172..29a22f50e762 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -7,16 +7,21 @@ TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh + +# functionality tests TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh +TEST_PROGS += sysfs.sh +TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py +TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py +TEST_PROGS += reclaim.sh lru_sort.sh + +# regression tests (reproducers of previously found bugs) TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh TEST_PROGS += debugfs_target_ids_pid_leak.sh -TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh +TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py -TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py -TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py -TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk -- cgit v1.2.3 From e6b331ab0a716c1d9273383ae59c5b2eea5ac00d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 2 May 2024 21:04:26 +0100 Subject: selftests: cgroup: remove redundant enabling of memory controller Memory controller is already enabled in main which invokes the test, hence this does not need to be done in test_no_kmem_bypass. Link: https://lkml.kernel.org/r/20240502200529.4193651-2-usamaarif642@gmail.com Signed-off-by: Usama Arif Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index f0e488ed90d8..0f1023813c03 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -364,8 +364,6 @@ static int test_no_kmem_bypass(const char *root) trigger_allocation_size = sys_info.totalram / 20; /* Set up test memcg */ - if (cg_write(root, "cgroup.subtree_control", "+memory")) - goto out; test_group = cg_name(root, "kmem_bypass_test"); if (!test_group) goto out; -- cgit v1.2.3 From 158863e5d7cc6c9ee7e6c998ef84625caa2e88b3 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 8 May 2024 18:13:59 +0100 Subject: selftests: cgroup: add tests to verify the zswap writeback path Attempt writeback with the below steps and check using memory.stat.zswpwb if zswap writeback occurred: 1. Allocate memory. 2. Reclaim memory equal to the amount that was allocated in step 1. This will move it into zswap. 3. Save current zswap usage. 4. Move the memory allocated in step 1 back in from zswap. 5. Set zswap.max to half the amount that was recorded in step 3. 6. Attempt to reclaim memory equal to the amount that was allocated, this will either trigger writeback if it's enabled, or reclamation will fail if writeback is disabled as there isn't enough zswap space. Link: https://lkml.kernel.org/r/20240508171359.1545744-1-usamaarif642@gmail.com Signed-off-by: Usama Arif Suggested-by: Nhat Pham Acked-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Johannes Weiner Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 130 +++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 0f1023813c03..e0aed703f3da 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,7 +50,7 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_cg_wb_count(const char *cg) +static long get_cg_wb_count(const char *cg) { return cg_read_key_long(cg, "memory.stat", "zswpwb"); } @@ -248,6 +248,132 @@ out: return ret; } +/* + * Attempt writeback with the following steps: + * 1. Allocate memory. + * 2. Reclaim memory equal to the amount that was allocated in step 1. + This will move it into zswap. + * 3. Save current zswap usage. + * 4. Move the memory allocated in step 1 back in from zswap. + * 5. Set zswap.max to half the amount that was recorded in step 3. + * 6. Attempt to reclaim memory equal to the amount that was allocated, + this will either trigger writeback if it's enabled, or reclamation + will fail if writeback is disabled as there isn't enough zswap space. + */ +static int attempt_writeback(const char *cgroup, void *arg) +{ + long pagesize = sysconf(_SC_PAGESIZE); + char *test_group = arg; + size_t memsize = MB(4); + char buf[pagesize]; + long zswap_usage; + bool wb_enabled; + int ret = -1; + char *mem; + + wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); + mem = (char *)malloc(memsize); + if (!mem) + return ret; + + /* + * Fill half of each page with increasing data, and keep other + * half empty, this will result in data that is still compressible + * and ends up in zswap, with material zswap usage. + */ + for (int i = 0; i < pagesize; i++) + buf[i] = i < pagesize/2 ? (char) i : 0; + + for (int i = 0; i < memsize; i += pagesize) + memcpy(&mem[i], buf, pagesize); + + /* Try and reclaim allocated memory */ + if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { + ksft_print_msg("Failed to reclaim all of the requested memory\n"); + goto out; + } + + zswap_usage = cg_read_long(test_group, "memory.zswap.current"); + + /* zswpin */ + for (int i = 0; i < memsize; i += pagesize) { + if (memcmp(&mem[i], buf, pagesize)) { + ksft_print_msg("invalid memory\n"); + goto out; + } + } + + if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) + goto out; + + /* + * If writeback is enabled, trying to reclaim memory now will trigger a + * writeback as zswap.max is half of what was needed when reclaim ran the first time. + * If writeback is disabled, memory reclaim will fail as zswap is limited and + * it can't writeback to swap. + */ + ret = cg_write_numeric(test_group, "memory.reclaim", memsize); + if (!wb_enabled) + ret = (ret == -EAGAIN) ? 0 : -1; + +out: + free(mem); + return ret; +} + +/* Test to verify the zswap writeback path */ +static int test_zswap_writeback(const char *root, bool wb) +{ + long zswpwb_before, zswpwb_after; + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "zswap_writeback_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) + goto out; + + zswpwb_before = get_cg_wb_count(test_group); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + goto out; + } + + if (cg_run(test_group, attempt_writeback, (void *) test_group)) + goto out; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(test_group); + if (zswpwb_after < 0) + goto out; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s", + zswpwb_after, wb ? "enabled" : "disabled"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +static int test_zswap_writeback_enabled(const char *root) +{ + return test_zswap_writeback(root, true); +} + +static int test_zswap_writeback_disabled(const char *root) +{ + return test_zswap_writeback(root, false); +} + /* * When trying to store a memcg page in zswap, if the memcg hits its memory * limit in zswap, writeback should affect only the zswapped pages of that @@ -423,6 +549,8 @@ struct zswap_test { T(test_zswap_usage), T(test_swapin_nozswap), T(test_zswapin), + T(test_zswap_writeback_enabled), + T(test_zswap_writeback_disabled), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), }; -- cgit v1.2.3 From b665eed25fed247510486353985060a9ba50c6a3 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 9 May 2024 15:24:47 +0530 Subject: selftests/mm: hugetlb_madv_vs_map: avoid test skipping by querying hugepage size at runtime Currently, the size used in mmap() is statically defined, leading to skipping of the test on a hugepage size other than 2 MB, since munmap() won't free the hugepage for a size greater than 2 MB. Hence, query the size at runtime. Also, there is no reason why a hugepage allocation should fail, since we are using a simple mmap() using MAP_HUGETLB; hence, instead of skipping the test, make it fail. Link: https://lkml.kernel.org/r/20240509095447.3791573-1-dev.jain@arm.com Signed-off-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Cc: Anshuman Khandual Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb_madv_vs_map.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index d01e8d4901d0..8f122a0f0828 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -27,9 +27,9 @@ #include "vm_util.h" #include "../kselftest.h" -#define MMAP_SIZE (1 << 21) #define INLOOP_ITER 100 +size_t mmap_size; char *huge_ptr; /* Touch the memory while it is being madvised() */ @@ -44,7 +44,7 @@ void *touch(void *unused) void *madv(void *unused) { for (int i = 0; i < INLOOP_ITER; i++) - madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + madvise(huge_ptr, mmap_size, MADV_DONTNEED); return NULL; } @@ -59,7 +59,7 @@ void *map_extra(void *unused) void *ptr; for (int i = 0; i < INLOOP_ITER; i++) { - ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); @@ -93,14 +93,16 @@ int main(void) free_hugepages); } + mmap_size = default_huge_page_size(); + while (max--) { - huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + huge_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if ((unsigned long)huge_ptr == -1) { - ksft_exit_skip("Failed to allocated huge page\n"); - return KSFT_SKIP; + ksft_test_result_fail("Failed to allocate huge page\n"); + return KSFT_FAIL; } pthread_create(&thread1, NULL, madv, NULL); @@ -117,7 +119,7 @@ int main(void) } /* Unmap and restart */ - munmap(huge_ptr, MMAP_SIZE); + munmap(huge_ptr, mmap_size); } return KSFT_PASS; -- cgit v1.2.3 From eb59a58113717df04b8a8229befd8ab1e5dbf86e Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Mon, 29 Apr 2024 23:46:09 +0000 Subject: selftests/kcmp: remove unused open mode Android bionic warns that open modes are ignored if O_CREAT or O_TMPFILE aren't specified. The permissions for the file are set above: fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644); Link: https://lkml.kernel.org/r/20240429234610.191144-1-edliaw@google.com Fixes: d97b46a64674 ("syscalls, x86: add __NR_kcmp syscall") Signed-off-by: Edward Liaw Reviewed-by: Cyrill Gorcunov Cc: Eric Biederman Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/kcmp/kcmp_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c index 25110c7c0b3e..d7a8e321bb16 100644 --- a/tools/testing/selftests/kcmp/kcmp_test.c +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -91,7 +91,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(3); - fd2 = open(kpath, O_RDWR, 0644); + fd2 = open(kpath, O_RDWR); if (fd2 < 0) { perror("Can't open file"); ksft_exit_fail(); -- cgit v1.2.3 From ea558c86248b4955e5c5f3c0c921df450880605e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 29 Apr 2024 16:37:07 -0700 Subject: tools lib subcmd: Show parent options in help I've just realized that help message in a subcommand didn't show one in the parent command. Since the option parser understands the parent, display code should do the same. For example, `perf ftrace latency -h` should show options in the `perf ftrace` command too. Before: $ perf ftrace latency -h Usage: perf ftrace [] [] or: perf ftrace [] -- [] [] or: perf ftrace {trace|latency} [] [] or: perf ftrace {trace|latency} [] -- [] [] -b, --use-bpf Use BPF to measure function latency -n, --use-nsec Use nano-second histogram -T, --trace-funcs Show latency of given function After: $ perf ftrace latency -h Usage: perf ftrace [] [] or: perf ftrace [] -- [] [] or: perf ftrace {trace|latency} [] [] or: perf ftrace {trace|latency} [] -- [] [] -a, --all-cpus System-wide collection from all CPUs -b, --use-bpf Use BPF to measure function latency -C, --cpu List of cpus to monitor -n, --use-nsec Use nano-second histogram -p, --pid Trace on existing process id -T, --trace-funcs Show latency of given function -v, --verbose Be more verbose --tid Trace on existing thread id (exclusive to --pid) Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240429233707.1511175-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/parse-options.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index d943d78b787e..4b60ec03b0bb 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -808,18 +808,30 @@ static int option__cmp(const void *va, const void *vb) static struct option *options__order(const struct option *opts) { - int nr_opts = 0, nr_group = 0, len; - const struct option *o = opts; - struct option *opt, *ordered, *group; - - for (o = opts; o->type != OPTION_END; o++) - ++nr_opts; - - len = sizeof(*o) * (nr_opts + 1); - ordered = malloc(len); - if (!ordered) - goto out; - memcpy(ordered, opts, len); + int nr_opts = 0, nr_group = 0, nr_parent = 0, len; + const struct option *o, *p = opts; + struct option *opt, *ordered = NULL, *group; + + /* flatten the options that have parents */ + for (p = opts; p != NULL; p = o->parent) { + for (o = p; o->type != OPTION_END; o++) + ++nr_opts; + + /* + * the length is given by the number of options plus a null + * terminator for the last loop iteration. + */ + len = sizeof(*o) * (nr_opts + !o->parent); + group = realloc(ordered, len); + if (!group) + goto out; + ordered = group; + memcpy(&ordered[nr_parent], p, sizeof(*o) * (nr_opts - nr_parent)); + + nr_parent = nr_opts; + } + /* copy the last OPTION_END */ + memcpy(&ordered[nr_opts], o, sizeof(*o)); /* sort each option group individually */ for (opt = group = ordered; opt->type != OPTION_END; opt++) { -- cgit v1.2.3 From 73964e9085bbea517a675d5d8ceeb1e609a34748 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:18 -0500 Subject: selftests/bpf: Migrate recvmsg* return code tests to verifier_sock_addr.c This set of tests check that the BPF verifier rejects programs with invalid return codes (recvmsg4 and recvmsg6 hooks can only return 1). This patch replaces the tests in test_sock_addr.c with verifier_sock_addr.c, a new verifier prog_tests for sockaddr hooks, in a step towards fully retiring test_sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-2-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_sock_addr.c | 37 ++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 70 ---------------------- 3 files changed, 39 insertions(+), 70 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_sock_addr.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index c4f9f306646e..c60db8beeb73 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -66,6 +66,7 @@ #include "verifier_sdiv.skel.h" #include "verifier_search_pruning.skel.h" #include "verifier_sock.skel.h" +#include "verifier_sock_addr.skel.h" #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" @@ -181,6 +182,7 @@ void test_verifier_scalar_ids(void) { RUN(verifier_scalar_ids); } void test_verifier_sdiv(void) { RUN(verifier_sdiv); } void test_verifier_search_pruning(void) { RUN(verifier_search_pruning); } void test_verifier_sock(void) { RUN(verifier_sock); } +void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } diff --git a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c new file mode 100644 index 000000000000..5081fa723d3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include +#include +#include +#include "bpf_misc.h" + +SEC("cgroup/recvmsg4") +__success +int recvmsg4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/recvmsg6") +__success +int recvmsg6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index aa2198a0f24d..40e33167bec2 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -94,8 +94,6 @@ static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg_allow_prog_load(const struct sock_addr_test *test); static int sendmsg_deny_prog_load(const struct sock_addr_test *test); -static int recvmsg_allow_prog_load(const struct sock_addr_test *test); -static int recvmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); @@ -373,64 +371,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SYSCALL_EPERM, }, - - /* recvmsg */ - { - "recvmsg4: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg4: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "recvmsg6: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg6: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -527,16 +467,6 @@ static int sendmsg_deny_prog_load(const struct sock_addr_test *test) return xmsg_ret_only_prog_load(test, /*rc*/ 0); } -static int recvmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - -static int recvmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) { struct sockaddr_in dst4_rw_addr; -- cgit v1.2.3 From 86b65c6db0190fb6c119e83da4de0eccf74fb1ff Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:19 -0500 Subject: selftests/bpf: Use program name for skel load/destroy functions In preparation to migrate tests from bpf/test_sock_addr.c to sock_addr.c, update BPF_SKEL_FUNCS so that it generates functions based on prog_name instead of skel_name. This allows us to differentiate between programs in the same skeleton. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-3-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 96 +++++++++++----------- 1 file changed, 50 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 9c709c33f889..039c3e38e1bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -357,7 +357,7 @@ struct sock_addr_test { }; #define BPF_SKEL_FUNCS(skel_name, prog_name) \ -static void *skel_name##_load(int cgroup_fd) \ +static void *prog_name##_load(int cgroup_fd) \ { \ struct skel_name *skel; \ skel = skel_name##__open_and_load(); \ @@ -372,7 +372,7 @@ cleanup: \ skel_name##__destroy(skel); \ return NULL; \ } \ -static void skel_name##_destroy(void *skel) \ +static void prog_name##_destroy(void *skel) \ { \ skel_name##__destroy(skel); \ } @@ -396,8 +396,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind4: bind (stream)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &user_ops, AF_INET, SOCK_STREAM, @@ -405,12 +405,13 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, }, { SOCK_ADDR_TEST_BIND, "bind4: bind (dgram)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &user_ops, AF_INET, SOCK_DGRAM, @@ -418,12 +419,13 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &user_ops, AF_INET6, SOCK_STREAM, @@ -431,12 +433,13 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (dgram)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &user_ops, AF_INET6, SOCK_DGRAM, @@ -444,14 +447,15 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, }, /* bind - kernel calls */ { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (stream)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -463,8 +467,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (dgram)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -476,8 +480,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (stream)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -489,8 +493,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (dgram)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -504,8 +508,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: connect (stream)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &user_ops, AF_INET, SOCK_STREAM, @@ -518,8 +522,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: connect (dgram)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &user_ops, AF_INET, SOCK_DGRAM, @@ -532,8 +536,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &user_ops, AF_INET6, SOCK_STREAM, @@ -546,8 +550,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: connect (dgram)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &user_ops, AF_INET6, SOCK_DGRAM, @@ -576,8 +580,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (stream)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -590,8 +594,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (dgram)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -604,8 +608,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (stream)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -618,8 +622,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (dgram)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -648,8 +652,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg4: sendmsg (dgram)", - sendmsg4_prog_load, - sendmsg4_prog_destroy, + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, &user_ops, AF_INET, SOCK_DGRAM, @@ -662,8 +666,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", - sendmsg6_prog_load, - sendmsg6_prog_destroy, + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, &user_ops, AF_INET6, SOCK_DGRAM, @@ -692,8 +696,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg4: sock_sendmsg (dgram)", - sendmsg4_prog_load, - sendmsg4_prog_destroy, + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -706,8 +710,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sock_sendmsg (dgram)", - sendmsg6_prog_load, - sendmsg6_prog_destroy, + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -736,8 +740,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg4: kernel_sendmsg (dgram)", - sendmsg4_prog_load, - sendmsg4_prog_destroy, + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, &kern_ops_kernel_sendmsg, AF_INET, SOCK_DGRAM, @@ -750,8 +754,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: kernel_sendmsg (dgram)", - sendmsg6_prog_load, - sendmsg6_prog_destroy, + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, &kern_ops_kernel_sendmsg, AF_INET6, SOCK_DGRAM, -- cgit v1.2.3 From 5eff48f33fb733de9b88a5381e0428f3e873c670 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:20 -0500 Subject: selftests/bpf: Handle LOAD_REJECT test cases In preparation to move test cases from bpf/test_sock_addr.c that expect LOAD_REJECT, this patch adds expected_attach_type and extends load_fn to accept an expected attach type and a flag indicating whether or not rejection is expected. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-4-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 103 ++++++++++++++++++++- 1 file changed, 98 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 039c3e38e1bc..3033641fd756 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -52,7 +52,9 @@ enum sock_addr_test_type { SOCK_ADDR_TEST_GETPEERNAME, }; -typedef void *(*load_fn)(int cgroup_fd); +typedef void *(*load_fn)(int cgroup_fd, + enum bpf_attach_type attach_type, + bool expect_reject); typedef void (*destroy_fn)(void *skel); static int cmp_addr(const struct sockaddr_storage *addr1, socklen_t addr1_len, @@ -343,6 +345,7 @@ struct sock_addr_test { /* BPF prog properties */ load_fn loadfn; destroy_fn destroyfn; + enum bpf_attach_type attach_type; /* Socket operations */ struct sock_ops *ops; /* Socket properties */ @@ -354,15 +357,34 @@ struct sock_addr_test { const char *expected_addr; unsigned short expected_port; const char *expected_src_addr; + /* Expected test result */ + enum { + LOAD_REJECT, + ATTACH_REJECT, + SYSCALL_EPERM, + SYSCALL_ENOTSUPP, + SUCCESS, + } expected_result; }; #define BPF_SKEL_FUNCS(skel_name, prog_name) \ -static void *prog_name##_load(int cgroup_fd) \ +static void *prog_name##_load(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ { \ - struct skel_name *skel; \ - skel = skel_name##__open_and_load(); \ + struct skel_name *skel = skel_name##__open(); \ if (!ASSERT_OK_PTR(skel, "skel_open")) \ goto cleanup; \ + if (!ASSERT_OK(bpf_program__set_expected_attach_type(skel->progs.prog_name, \ + attach_type), \ + "set_expected_attach_type")) \ + goto cleanup; \ + if (skel_name##__load(skel)) { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ skel->links.prog_name = bpf_program__attach_cgroup( \ skel->progs.prog_name, cgroup_fd); \ if (!ASSERT_OK_PTR(skel->links.prog_name, "prog_attach")) \ @@ -398,6 +420,7 @@ static struct sock_addr_test tests[] = { "bind4: bind (stream)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &user_ops, AF_INET, SOCK_STREAM, @@ -406,12 +429,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind4: bind (dgram)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &user_ops, AF_INET, SOCK_DGRAM, @@ -420,12 +445,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &user_ops, AF_INET6, SOCK_STREAM, @@ -434,12 +461,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (dgram)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &user_ops, AF_INET6, SOCK_DGRAM, @@ -448,6 +477,7 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, NULL, + SUCCESS, }, /* bind - kernel calls */ @@ -456,6 +486,7 @@ static struct sock_addr_test tests[] = { "bind4: kernel_bind (stream)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -463,12 +494,15 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (dgram)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -476,12 +510,15 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (stream)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -489,12 +526,15 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (dgram)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -502,6 +542,8 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, + SUCCESS, }, /* connect - system calls */ @@ -510,6 +552,7 @@ static struct sock_addr_test tests[] = { "connect4: connect (stream)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &user_ops, AF_INET, SOCK_STREAM, @@ -518,12 +561,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect4: connect (dgram)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &user_ops, AF_INET, SOCK_DGRAM, @@ -532,12 +577,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &user_ops, AF_INET6, SOCK_STREAM, @@ -546,12 +593,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (dgram)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &user_ops, AF_INET6, SOCK_DGRAM, @@ -560,12 +609,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: connect (stream)", connect_unix_prog_load, connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, &user_ops, AF_UNIX, SOCK_STREAM, @@ -574,6 +625,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* connect - kernel calls */ @@ -582,6 +634,7 @@ static struct sock_addr_test tests[] = { "connect4: kernel_connect (stream)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -590,12 +643,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (dgram)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -604,12 +659,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (stream)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -618,12 +675,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (dgram)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -632,12 +691,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: kernel_connect (dgram)", connect_unix_prog_load, connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, &kern_ops_sock_sendmsg, AF_UNIX, SOCK_STREAM, @@ -646,6 +707,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* sendmsg - system calls */ @@ -654,6 +716,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: sendmsg (dgram)", sendmsg_v4_prog_load, sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, &user_ops, AF_INET, SOCK_DGRAM, @@ -662,12 +725,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", sendmsg_v6_prog_load, sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, &user_ops, AF_INET6, SOCK_DGRAM, @@ -676,12 +741,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, &user_ops, AF_UNIX, SOCK_DGRAM, @@ -690,6 +757,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* sendmsg - kernel calls (sock_sendmsg) */ @@ -698,6 +766,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: sock_sendmsg (dgram)", sendmsg_v4_prog_load, sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -706,12 +775,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sock_sendmsg (dgram)", sendmsg_v6_prog_load, sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -720,12 +791,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, &kern_ops_sock_sendmsg, AF_UNIX, SOCK_DGRAM, @@ -734,6 +807,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* sendmsg - kernel calls (kernel_sendmsg) */ @@ -742,6 +816,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: kernel_sendmsg (dgram)", sendmsg_v4_prog_load, sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, &kern_ops_kernel_sendmsg, AF_INET, SOCK_DGRAM, @@ -750,12 +825,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: kernel_sendmsg (dgram)", sendmsg_v6_prog_load, sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, &kern_ops_kernel_sendmsg, AF_INET6, SOCK_DGRAM, @@ -764,12 +841,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, &kern_ops_kernel_sendmsg, AF_UNIX, SOCK_DGRAM, @@ -778,6 +857,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* recvmsg - system calls */ @@ -786,6 +866,7 @@ static struct sock_addr_test tests[] = { "recvmsg4: recvfrom (dgram)", recvmsg4_prog_load, recvmsg4_prog_destroy, + BPF_CGROUP_UDP4_RECVMSG, &user_ops, AF_INET, SOCK_DGRAM, @@ -794,12 +875,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SERV4_IP, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg6: recvfrom (dgram)", recvmsg6_prog_load, recvmsg6_prog_destroy, + BPF_CGROUP_UDP6_RECVMSG, &user_ops, AF_INET6, SOCK_DGRAM, @@ -808,12 +891,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SERV6_IP, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, &user_ops, AF_UNIX, SOCK_DGRAM, @@ -822,12 +907,14 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, &user_ops, AF_UNIX, SOCK_STREAM, @@ -836,6 +923,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, }, /* getsockname - system calls */ @@ -844,6 +932,7 @@ static struct sock_addr_test tests[] = { "getsockname_unix", getsockname_unix_prog_load, getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, &user_ops, AF_UNIX, SOCK_STREAM, @@ -852,6 +941,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* getpeername - system calls */ @@ -860,6 +950,7 @@ static struct sock_addr_test tests[] = { "getpeername_unix", getpeername_unix_prog_load, getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, &user_ops, AF_UNIX, SOCK_STREAM, @@ -868,6 +959,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, }; @@ -1249,7 +1341,8 @@ void test_sock_addr(void) if (!test__start_subtest(test->name)) continue; - skel = test->loadfn(cgroup_fd); + skel = test->loadfn(cgroup_fd, test->attach_type, + test->expected_result == LOAD_REJECT); if (!skel) continue; -- cgit v1.2.3 From 5a047b2226c0511d4528d1467dc90f08fffafc38 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:21 -0500 Subject: selftests/bpf: Handle ATTACH_REJECT test cases In preparation to move test cases from bpf/test_sock_addr.c that expect ATTACH_REJECT, this patch adds BPF_SKEL_FUNCS_RAW to generate load and destroy functions that use bpf_prog_attach() to control the attach_type. The normal load functions use bpf_program__attach_cgroup which does not have the same degree of control over the attach type, as bpf_program_attach_fd() calls bpf_link_create() with the attach type extracted from prog using bpf_program__expected_attach_type(). It is currently not possible to modify the attach type before bpf_program__attach_cgroup() is called, since bpf_program__set_expected_attach_type() has no effect after the program is loaded. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-5-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 35 +++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 3033641fd756..53440458f365 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -367,6 +367,38 @@ struct sock_addr_test { } expected_result; }; +#define BPF_SKEL_FUNCS_RAW(skel_name, prog_name) \ +static void *prog_name##_load_raw(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ +{ \ + struct skel_name *skel = skel_name##__open(); \ + int prog_fd = -1; \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + if (!ASSERT_OK(skel_name##__load(skel), "load")) \ + goto cleanup; \ + prog_fd = bpf_program__fd(skel->progs.prog_name); \ + if (!ASSERT_GT(prog_fd, 0, "prog_fd")) \ + goto cleanup; \ + if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, \ + BPF_F_ALLOW_OVERRIDE), "bpf_prog_attach") { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ +cleanup: \ + if (prog_fd > 0) \ + bpf_prog_detach(cgroup_fd, attach_type); \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void prog_name##_destroy_raw(void *progfd) \ +{ \ + /* No-op. *_load_raw does all cleanup. */ \ +} \ + #define BPF_SKEL_FUNCS(skel_name, prog_name) \ static void *prog_name##_load(int cgroup_fd, \ enum bpf_attach_type attach_type, \ @@ -1342,7 +1374,8 @@ void test_sock_addr(void) continue; skel = test->loadfn(cgroup_fd, test->attach_type, - test->expected_result == LOAD_REJECT); + test->expected_result == LOAD_REJECT || + test->expected_result == ATTACH_REJECT); if (!skel) continue; -- cgit v1.2.3 From a2618c0d854235deaac2325cf8200a55274afa2b Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:22 -0500 Subject: selftests/bpf: Handle SYSCALL_EPERM and SYSCALL_ENOTSUPP test cases In preparation to move test cases from bpf/test_sock_addr.c that expect system calls to return ENOTSUPP or EPERM, this patch propagates errno from relevant system calls up to test_sock_addr() where the result can be checked. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-6-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 78 ++++++++++++++++------ 1 file changed, 58 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 53440458f365..626be900a8fd 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -19,6 +19,10 @@ #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" +#ifndef ENOTSUPP +# define ENOTSUPP 524 +#endif + #define TEST_NS "sock_addr" #define TEST_IF_PREFIX "test_sock_addr" #define TEST_IPV4 "127.0.0.4" @@ -43,6 +47,8 @@ #define SERVUN_REWRITE_ADDRESS "bpf_cgroup_unix_test_rewrite" #define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" +#define save_errno_do(op) ({ int __save = errno; op; errno = __save; }) + enum sock_addr_test_type { SOCK_ADDR_TEST_BIND, SOCK_ADDR_TEST_CONNECT, @@ -98,6 +104,7 @@ static int run_bpf_prog(const char *prog_name, void *ctx, int ctx_size) goto err; err = topts.retval; + errno = -topts.retval; goto out; err: err = -1; @@ -221,8 +228,7 @@ int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, sockle "kernel_init_sock")) goto err; - if (!ASSERT_OK(kernel_connect((struct sockaddr *)addr, addrlen), - "kernel_connect")) + if (kernel_connect((struct sockaddr *)addr, addrlen) < 0) goto err; /* Test code expects a "file descriptor" on success. */ @@ -230,7 +236,7 @@ int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, sockle goto out; err: err = -1; - ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); out: return err; } @@ -248,8 +254,7 @@ int kernel_start_server(int family, int type, const char *addr_str, __u16 port, if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) goto err; - if (!ASSERT_OK(kernel_bind(0, (struct sockaddr *)&addr, addrlen), - "kernel_bind")) + if (kernel_bind(0, (struct sockaddr *)&addr, addrlen) < 0) goto err; if (type == SOCK_STREAM) { @@ -262,7 +267,7 @@ int kernel_start_server(int family, int type, const char *addr_str, __u16 port, goto out; err: err = -1; - ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); out: return err; } @@ -1066,7 +1071,7 @@ static void unload_sock_addr_kern(void) sock_addr_kern__destroy(skel); } -static void test_bind(struct sock_addr_test *test) +static int test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); @@ -1075,8 +1080,10 @@ static void test_bind(struct sock_addr_test *test) serv = test->ops->start_server(test->socket_family, test->socket_type, test->requested_addr, test->requested_port, 0); - if (!ASSERT_GE(serv, 0, "start_server")) - goto cleanup; + if (serv < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, @@ -1095,13 +1102,17 @@ static void test_bind(struct sock_addr_test *test) goto cleanup; cleanup: + err = 0; +err: if (client != -1) close(client); if (serv != -1) test->ops->close(serv); + + return err; } -static void test_connect(struct sock_addr_test *test) +static int test_connect(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr, expected_src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -1121,8 +1132,10 @@ static void test_connect(struct sock_addr_test *test) client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, NULL); - if (!ASSERT_GE(client, 0, "connect_to_addr")) - goto cleanup; + if (client < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, &expected_addr, &expected_addr_len); @@ -1149,13 +1162,17 @@ static void test_connect(struct sock_addr_test *test) goto cleanup; } cleanup: + err = 0; +err: if (client != -1) test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_xmsg(struct sock_addr_test *test) +static int test_xmsg(struct sock_addr_test *test) { struct sockaddr_storage addr, src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -1196,6 +1213,11 @@ static void test_xmsg(struct sock_addr_test *test) if (test->socket_type == SOCK_DGRAM) { err = test->ops->sendmsg(client, (struct sockaddr *)&addr, addr_len, &data, sizeof(data)); + if (err < 0) { + err = errno; + goto err; + } + if (!ASSERT_EQ(err, sizeof(data), "sendmsg")) goto cleanup; } else { @@ -1245,13 +1267,17 @@ static void test_xmsg(struct sock_addr_test *test) } cleanup: + err = 0; +err: if (client != -1) test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_getsockname(struct sock_addr_test *test) +static int test_getsockname(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); @@ -1275,9 +1301,11 @@ static void test_getsockname(struct sock_addr_test *test) cleanup: if (serv != -1) test->ops->close(serv); + + return 0; } -static void test_getpeername(struct sock_addr_test *test) +static int test_getpeername(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -1314,6 +1342,8 @@ cleanup: test->ops->close(client); if (serv != -1) close(serv); + + return 0; } static int setup_test_env(struct nstoken **tok) @@ -1369,6 +1399,7 @@ void test_sock_addr(void) for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; + int err; if (!test__start_subtest(test->name)) continue; @@ -1385,26 +1416,33 @@ void test_sock_addr(void) * the future. */ case SOCK_ADDR_TEST_BIND: - test_bind(test); + err = test_bind(test); break; case SOCK_ADDR_TEST_CONNECT: - test_connect(test); + err = test_connect(test); break; case SOCK_ADDR_TEST_SENDMSG: case SOCK_ADDR_TEST_RECVMSG: - test_xmsg(test); + err = test_xmsg(test); break; case SOCK_ADDR_TEST_GETSOCKNAME: - test_getsockname(test); + err = test_getsockname(test); break; case SOCK_ADDR_TEST_GETPEERNAME: - test_getpeername(test); + err = test_getpeername(test); break; default: ASSERT_TRUE(false, "Unknown sock addr test type"); break; } + if (test->expected_result == SYSCALL_EPERM) + ASSERT_EQ(err, EPERM, "socket operation returns EPERM"); + else if (test->expected_result == SYSCALL_ENOTSUPP) + ASSERT_EQ(err, ENOTSUPP, "socket operation returns ENOTSUPP"); + else if (test->expected_result == SUCCESS) + ASSERT_OK(err, "socket operation succeeds"); + test->destroyfn(skel); } -- cgit v1.2.3 From d1b24fcf1c16290ce8cac467be2f7d6773de9da4 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:23 -0500 Subject: selftests/bpf: Migrate WILDCARD_IP test Move wildcard IP sendmsg test case out of bpf/test_sock_addr.c into prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-7-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 50 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/sendmsg6_prog.c | 6 +++ tools/testing/selftests/bpf/test_sock_addr.c | 20 --------- 3 files changed, 56 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 626be900a8fd..37e9ef5a5ae1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -40,6 +40,7 @@ #define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" #define SRC6_IP "::1" #define SRC6_REWRITE_IP TEST_IPV6 +#define WILDCARD6_IP "::" #define SERV6_PORT 6060 #define SERV6_REWRITE_PORT 6666 @@ -443,6 +444,7 @@ BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); @@ -780,6 +782,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", @@ -830,6 +848,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", @@ -880,6 +914,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index bf9b46b806f6..03956a654ce5 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -59,4 +59,10 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) +{ + return 1; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 40e33167bec2..ab8ef02c9c55 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -92,7 +92,6 @@ static int bind4_prog_load(const struct sock_addr_test *test); static int bind6_prog_load(const struct sock_addr_test *test); static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); -static int sendmsg_allow_prog_load(const struct sock_addr_test *test); static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); @@ -343,20 +342,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: preserve dst IP = [::] (BSD'ism)", - sendmsg_allow_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - WILDCARD6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_PORT, - SRC6_IP, - SUCCESS, - }, { "sendmsg6: deny call", sendmsg_deny_prog_load, @@ -457,11 +442,6 @@ static int xmsg_ret_only_prog_load(const struct sock_addr_test *test, return load_insns(test, insns, ARRAY_SIZE(insns)); } -static int sendmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - static int sendmsg_deny_prog_load(const struct sock_addr_test *test) { return xmsg_ret_only_prog_load(test, /*rc*/ 0); -- cgit v1.2.3 From f46a10483b27cc5a62b45e7e727445de6430e785 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:24 -0500 Subject: selftests/bpf: Migrate sendmsg deny test cases This set of tests checks that sendmsg calls are rejected (return -EPERM) when the sendmsg* hook returns 0. Replace those in bpf/test_sock_addr.c with corresponding tests in prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-8-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 98 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/sendmsg4_prog.c | 6 ++ tools/testing/selftests/bpf/progs/sendmsg6_prog.c | 6 ++ tools/testing/selftests/bpf/test_sock_addr.c | 45 ---------- 4 files changed, 110 insertions(+), 45 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 37e9ef5a5ae1..634f7a31b35d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -443,7 +443,9 @@ BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); @@ -766,6 +768,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", @@ -798,6 +816,22 @@ static struct sock_addr_test tests[] = { SRC6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", @@ -832,6 +866,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sock_sendmsg (dgram)", @@ -864,6 +914,22 @@ static struct sock_addr_test tests[] = { SRC6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", @@ -898,6 +964,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: kernel_sendmsg (dgram)", @@ -930,6 +1012,22 @@ static struct sock_addr_test tests[] = { SRC6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c index 351e79aef2fa..edc159598a0e 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c @@ -49,4 +49,10 @@ int sendmsg_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg4") +int sendmsg_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index 03956a654ce5..0c1825cb994d 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -65,4 +65,10 @@ int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index ab8ef02c9c55..91d88358090e 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -92,7 +92,6 @@ static int bind4_prog_load(const struct sock_addr_test *test); static int bind6_prog_load(const struct sock_addr_test *test); static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); -static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); @@ -258,20 +257,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg4: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SYSCALL_EPERM, - }, { "sendmsg6: load prog with wrong expected attach type", sendmsg6_rw_asm_prog_load, @@ -342,20 +327,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_EPERM, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -431,22 +402,6 @@ static int connect6_prog_load(const struct sock_addr_test *test) return load_path(test, CONNECT6_PROG_PATH); } -static int xmsg_ret_only_prog_load(const struct sock_addr_test *test, - int32_t rc) -{ - struct bpf_insn insns[] = { - /* return rc */ - BPF_MOV64_IMM(BPF_REG_0, rc), - BPF_EXIT_INSN(), - }; - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) { struct sockaddr_in dst4_rw_addr; -- cgit v1.2.3 From 54462e8452f139e313e315959e005408cd31a4e6 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:25 -0500 Subject: selftests/bpf: Migrate sendmsg6 v4 mapped address tests Migrate test case from bpf/test_sock_addr.c ensuring that sendmsg returns -ENOTSUPP when sending to an IPv4-mapped IPv6 address to prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-9-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 17 +++++++++++++++ tools/testing/selftests/bpf/progs/sendmsg6_prog.c | 25 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 20 ----------------- 3 files changed, 42 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 634f7a31b35d..f096203171b1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -447,6 +447,7 @@ BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); @@ -832,6 +833,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg IPv4-mapped IPv6 (dgram)", + sendmsg_v6_v4mapped_prog_load, + sendmsg_v6_v4mapped_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_ENOTSUPP, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index 0c1825cb994d..7611d9e17dd1 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -20,6 +20,11 @@ #define DST_REWRITE_IP6_2 0 #define DST_REWRITE_IP6_3 1 +#define DST_REWRITE_IP6_V4_MAPPED_0 0 +#define DST_REWRITE_IP6_V4_MAPPED_1 0 +#define DST_REWRITE_IP6_V4_MAPPED_2 0x0000FFFF +#define DST_REWRITE_IP6_V4_MAPPED_3 0xc0a80004 // 192.168.0.4 + #define DST_REWRITE_PORT6 6666 SEC("cgroup/sendmsg6") @@ -59,6 +64,26 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_v4mapped_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. */ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_0); + ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_1); + ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_2); + ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_3); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + SEC("cgroup/sendmsg6") int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) { diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 91d88358090e..4ead113753f8 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -94,7 +94,6 @@ static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { @@ -299,20 +298,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: IPv4-mapped IPv6", - sendmsg6_rw_v4mapped_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_ENOTSUPP, - }, { "sendmsg6: set dst IP = [::] (BSD'ism)", sendmsg6_rw_wildcard_prog_load, @@ -512,11 +497,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); } -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP); -} - static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) { return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); -- cgit v1.2.3 From 8eaf8056a44b28a7b198aa699e35854bbec2c452 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:26 -0500 Subject: selftests/bpf: Migrate wildcard destination rewrite test Migrate test case from bpf/test_sock_addr.c ensuring that sendmsg respects when sendmsg6 hooks rewrite the destination IP with the IPv6 wildcard IP, [::]. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-10-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 17 +++++++++++++++++ tools/testing/selftests/bpf/progs/sendmsg6_prog.c | 20 ++++++++++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 20 -------------------- 3 files changed, 37 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index f096203171b1..e3c450d11b9e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -448,6 +448,7 @@ BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); @@ -849,6 +850,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SYSCALL_ENOTSUPP, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram)", + sendmsg_v6_wildcard_prog_load, + sendmsg_v6_wildcard_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index 7611d9e17dd1..36a7f960799f 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -84,6 +84,26 @@ int sendmsg_v6_v4mapped_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_wildcard_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. */ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(0); + ctx->user_ip6[1] = bpf_htonl(0); + ctx->user_ip6[2] = bpf_htonl(0); + ctx->user_ip6[3] = bpf_htonl(0); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + SEC("cgroup/sendmsg6") int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) { diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 4ead113753f8..85fb2a793be5 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -94,7 +94,6 @@ static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { /* bind */ @@ -298,20 +297,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: set dst IP = [::] (BSD'ism)", - sendmsg6_rw_wildcard_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -497,11 +482,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); } -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); -} - static int cmp_addr(const struct sockaddr_storage *addr1, const struct sockaddr_storage *addr2, int cmp_port) { -- cgit v1.2.3 From b0f3af0bffefc54650d9fb10810fc2f974365dfd Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:27 -0500 Subject: selftests/bpf: Migrate expected_attach_type tests Migrates tests from progs/test_sock_addr.c ensuring that programs fail to load when the expected attach type does not match. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-11-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 96 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 84 ------------------- 2 files changed, 96 insertions(+), 84 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index e3c450d11b9e..8c7c56f99754 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -490,6 +490,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: load prog with wrong expected attach type", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", @@ -522,6 +538,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: load prog with wrong expected attach type", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, /* bind - kernel calls */ { @@ -622,6 +654,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: load prog with wrong expected attach type", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", @@ -654,6 +702,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: load prog with wrong expected attach type", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: connect (stream)", @@ -786,6 +850,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: load prog with wrong expected attach type", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", @@ -866,6 +946,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: load prog with wrong expected attach type", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 85fb2a793be5..4ecbc72477f1 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -97,20 +97,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { /* bind */ - { - "bind4: load prog with wrong expected attach type", - bind4_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "bind4: attach prog with wrong attach type", bind4_prog_load, @@ -125,20 +111,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "bind6: load prog with wrong expected attach type", - bind6_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "bind6: attach prog with wrong attach type", bind6_prog_load, @@ -155,20 +127,6 @@ static struct sock_addr_test tests[] = { }, /* connect */ - { - "connect4: load prog with wrong expected attach type", - connect4_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "connect4: attach prog with wrong attach type", connect4_prog_load, @@ -183,20 +141,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "connect6: load prog with wrong expected attach type", - connect6_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "connect6: attach prog with wrong attach type", connect6_prog_load, @@ -213,20 +157,6 @@ static struct sock_addr_test tests[] = { }, /* sendmsg */ - { - "sendmsg4: load prog with wrong expected attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "sendmsg4: attach prog with wrong attach type", sendmsg4_rw_asm_prog_load, @@ -255,20 +185,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: load prog with wrong expected attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "sendmsg6: attach prog with wrong attach type", sendmsg6_rw_asm_prog_load, -- cgit v1.2.3 From cded71f595c0c4396acc9657911c5aa2a289a8dc Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:28 -0500 Subject: selftests/bpf: Migrate ATTACH_REJECT test cases Migrate test case from bpf/test_sock_addr.c ensuring that program attachment fails when using an inappropriate attach type. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-12-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 102 ++++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 146 --------------------- 2 files changed, 102 insertions(+), 146 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 8c7c56f99754..ebd5e58e38c5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -438,13 +438,19 @@ static void prog_name##_destroy(void *skel) \ } BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS_RAW(bind4_prog, bind_v4_prog); BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS_RAW(bind6_prog, bind_v6_prog); BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS_RAW(connect4_prog, connect_v4_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS_RAW(sendmsg6_prog, sendmsg_v6_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); @@ -506,6 +512,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: attach prog with wrong attach type", + bind_v4_prog_load_raw, + bind_v4_prog_destroy_raw, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", @@ -554,6 +576,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: attach prog with wrong attach type", + bind_v6_prog_load_raw, + bind_v6_prog_destroy_raw, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, /* bind - kernel calls */ { @@ -670,6 +708,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: attach prog with wrong attach type", + connect_v4_prog_load_raw, + connect_v4_prog_destroy_raw, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", @@ -718,6 +772,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: attach prog with wrong attach type", + connect_v6_prog_load_raw, + connect_v6_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: connect (stream)", @@ -866,6 +936,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: attach prog with wrong attach type", + sendmsg_v4_prog_load_raw, + sendmsg_v4_prog_destroy_raw, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", @@ -962,6 +1048,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: attach prog with wrong attach type", + sendmsg_v6_prog_load_raw, + sendmsg_v6_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 4ecbc72477f1..311eda4f4864 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -88,89 +88,11 @@ struct sock_addr_test { } expected_result; }; -static int bind4_prog_load(const struct sock_addr_test *test); -static int bind6_prog_load(const struct sock_addr_test *test); -static int connect4_prog_load(const struct sock_addr_test *test); -static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { - /* bind */ - { - "bind4: attach prog with wrong attach type", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "bind6: attach prog with wrong attach type", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - - /* connect */ - { - "connect4: attach prog with wrong attach type", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "connect6: attach prog with wrong attach type", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - /* sendmsg */ - { - "sendmsg4: attach prog with wrong attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, { "sendmsg4: rewrite IP & port (asm)", sendmsg4_rw_asm_prog_load, @@ -185,20 +107,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: attach prog with wrong attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, { "sendmsg6: rewrite IP & port (asm)", sendmsg6_rw_asm_prog_load, @@ -234,60 +142,6 @@ static int load_insns(const struct sock_addr_test *test, return ret; } -static int load_path(const struct sock_addr_test *test, const char *path) -{ - struct bpf_object *obj; - struct bpf_program *prog; - int err; - - obj = bpf_object__open_file(path, NULL); - err = libbpf_get_error(obj); - if (err) { - log_err(">>> Opening BPF object (%s) error.\n", path); - return -1; - } - - prog = bpf_object__next_program(obj, NULL); - if (!prog) - goto err_out; - - bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR); - bpf_program__set_expected_attach_type(prog, test->expected_attach_type); - bpf_program__set_flags(prog, testing_prog_flags()); - - err = bpf_object__load(obj); - if (err) { - if (test->expected_result != LOAD_REJECT) - log_err(">>> Loading program (%s) error.\n", path); - goto err_out; - } - - return bpf_program__fd(prog); -err_out: - bpf_object__close(obj); - return -1; -} - -static int bind4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND4_PROG_PATH); -} - -static int bind6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND6_PROG_PATH); -} - -static int connect4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT4_PROG_PATH); -} - -static int connect6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT6_PROG_PATH); -} - static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) { struct sockaddr_in dst4_rw_addr; -- cgit v1.2.3 From 9c3f17862faef89696d26655a6d10f90137df42e Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:29 -0500 Subject: selftests/bpf: Remove redundant sendmsg test cases Remove these test cases completely, as the same behavior is already covered by other sendmsg* test cases in prog_tests/sock_addr.c. This just rewrites the destination address similar to sendmsg_v4_prog and sendmsg_v6_prog. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-13-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_sock_addr.c | 161 --------------------------- 1 file changed, 161 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 311eda4f4864..a2b587273331 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -88,170 +88,9 @@ struct sock_addr_test { } expected_result; }; -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); - static struct sock_addr_test tests[] = { - /* sendmsg */ - { - "sendmsg4: rewrite IP & port (asm)", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: rewrite IP & port (asm)", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, }; -static int load_insns(const struct sock_addr_test *test, - const struct bpf_insn *insns, size_t insns_cnt) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts); - int ret; - - opts.expected_attach_type = test->expected_attach_type; - opts.log_buf = bpf_log_buf; - opts.log_size = BPF_LOG_BUF_SIZE; - - ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, NULL, "GPL", insns, insns_cnt, &opts); - if (ret < 0 && test->expected_result != LOAD_REJECT) { - log_err(">>> Loading program error.\n" - ">>> Verifier output:\n%s\n-------\n", bpf_log_buf); - } - - return ret; -} - -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) -{ - struct sockaddr_in dst4_rw_addr; - struct in_addr src4_rw_ip; - - if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) { - log_err("Invalid IPv4: %s", SRC4_REWRITE_IP); - return -1; - } - - if (make_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr_storage *)&dst4_rw_addr, - NULL) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8), - - /* sk.type == SOCK_DGRAM) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, type)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6), - - /* msg_src_ip4 = src4_rw_ip */ - BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, msg_src_ip4)), - - /* user_ip4 = dst4_rw_addr.sin_addr */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_ip4)), - - /* user_port = dst4_rw_addr.sin_port */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, - const char *rw_dst_ip) -{ - struct sockaddr_in6 dst6_rw_addr; - struct in6_addr src6_rw_ip; - - if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) { - log_err("Invalid IPv6: %s", SRC6_REWRITE_IP); - return -1; - } - - if (make_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, - (struct sockaddr_storage *)&dst6_rw_addr, - NULL) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET6) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), - -#define STORE_IPV6_WORD_N(DST, SRC, N) \ - BPF_MOV32_IMM(BPF_REG_7, SRC[N]), \ - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ - offsetof(struct bpf_sock_addr, DST[N])) - -#define STORE_IPV6(DST, SRC) \ - STORE_IPV6_WORD_N(DST, SRC, 0), \ - STORE_IPV6_WORD_N(DST, SRC, 1), \ - STORE_IPV6_WORD_N(DST, SRC, 2), \ - STORE_IPV6_WORD_N(DST, SRC, 3) - - STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32), - STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32), - - /* user_port = dst6_rw_addr.sin6_port */ - BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); -} - static int cmp_addr(const struct sockaddr_storage *addr1, const struct sockaddr_storage *addr2, int cmp_port) { -- cgit v1.2.3 From 61ecfdfce2647281e7d14119bfa529922ce2d8b2 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:30 -0500 Subject: selftests/bpf: Retire test_sock_addr.(c|sh) Fully remove test_sock_addr.c and test_sock_addr.sh, as test coverage has been fully moved to prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-14-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/test_sock_addr.c | 574 -------------------------- tools/testing/selftests/bpf/test_sock_addr.sh | 58 --- 4 files changed, 1 insertion(+), 636 deletions(-) delete mode 100644 tools/testing/selftests/bpf/test_sock_addr.c delete mode 100755 tools/testing/selftests/bpf/test_sock_addr.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index f1aebabfb017..5025401323af 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -17,7 +17,6 @@ test_dev_cgroup test_verifier_log feature test_sock -test_sock_addr urandom_read test_sockmap test_lirc_mode2_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 135023a357b3..ed381b0197fe 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -115,7 +115,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ test_xdp_veth.sh \ - test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ @@ -140,7 +139,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ +TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ @@ -296,7 +295,6 @@ NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(NETWORK_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c deleted file mode 100644 index a2b587273331..000000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ /dev/null @@ -1,574 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#define _GNU_SOURCE - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include "cgroup_helpers.h" -#include "network_helpers.h" -#include "testing_helpers.h" -#include "bpf_util.h" - -#ifndef ENOTSUPP -# define ENOTSUPP 524 -#endif - -#define CG_PATH "/foo" -#define CONNECT4_PROG_PATH "./connect4_prog.bpf.o" -#define CONNECT6_PROG_PATH "./connect6_prog.bpf.o" -#define SENDMSG4_PROG_PATH "./sendmsg4_prog.bpf.o" -#define SENDMSG6_PROG_PATH "./sendmsg6_prog.bpf.o" -#define RECVMSG4_PROG_PATH "./recvmsg4_prog.bpf.o" -#define RECVMSG6_PROG_PATH "./recvmsg6_prog.bpf.o" -#define BIND4_PROG_PATH "./bind4_prog.bpf.o" -#define BIND6_PROG_PATH "./bind6_prog.bpf.o" - -#define SERV4_IP "192.168.1.254" -#define SERV4_REWRITE_IP "127.0.0.1" -#define SRC4_IP "172.16.0.1" -#define SRC4_REWRITE_IP "127.0.0.4" -#define SERV4_PORT 4040 -#define SERV4_REWRITE_PORT 4444 - -#define SERV6_IP "face:b00c:1234:5678::abcd" -#define SERV6_REWRITE_IP "::1" -#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" -#define SRC6_IP "::1" -#define SRC6_REWRITE_IP "::6" -#define WILDCARD6_IP "::" -#define SERV6_PORT 6060 -#define SERV6_REWRITE_PORT 6666 - -#define INET_NTOP_BUF 40 - -struct sock_addr_test; - -typedef int (*load_fn)(const struct sock_addr_test *test); -typedef int (*info_fn)(int, struct sockaddr *, socklen_t *); - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -struct sock_addr_test { - const char *descr; - /* BPF prog properties */ - load_fn loadfn; - enum bpf_attach_type expected_attach_type; - enum bpf_attach_type attach_type; - /* Socket properties */ - int domain; - int type; - /* IP:port pairs for BPF prog to override */ - const char *requested_ip; - unsigned short requested_port; - const char *expected_ip; - unsigned short expected_port; - const char *expected_src_ip; - /* Expected test result */ - enum { - LOAD_REJECT, - ATTACH_REJECT, - ATTACH_OKAY, - SYSCALL_EPERM, - SYSCALL_ENOTSUPP, - SUCCESS, - } expected_result; -}; - -static struct sock_addr_test tests[] = { -}; - -static int cmp_addr(const struct sockaddr_storage *addr1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - const struct sockaddr_in *four1, *four2; - const struct sockaddr_in6 *six1, *six2; - - if (addr1->ss_family != addr2->ss_family) - return -1; - - if (addr1->ss_family == AF_INET) { - four1 = (const struct sockaddr_in *)addr1; - four2 = (const struct sockaddr_in *)addr2; - return !((four1->sin_port == four2->sin_port || !cmp_port) && - four1->sin_addr.s_addr == four2->sin_addr.s_addr); - } else if (addr1->ss_family == AF_INET6) { - six1 = (const struct sockaddr_in6 *)addr1; - six2 = (const struct sockaddr_in6 *)addr2; - return !((six1->sin6_port == six2->sin6_port || !cmp_port) && - !memcmp(&six1->sin6_addr, &six2->sin6_addr, - sizeof(struct in6_addr))); - } - - return -1; -} - -static int cmp_sock_addr(info_fn fn, int sock1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - struct sockaddr_storage addr1; - socklen_t len1 = sizeof(addr1); - - memset(&addr1, 0, len1); - if (fn(sock1, (struct sockaddr *)&addr1, (socklen_t *)&len1) != 0) - return -1; - - return cmp_addr(&addr1, addr2, cmp_port); -} - -static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 0); -} - -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 1); -} - -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); -} - -int init_pktinfo(int domain, struct cmsghdr *cmsg) -{ - struct in6_pktinfo *pktinfo6; - struct in_pktinfo *pktinfo4; - - if (domain == AF_INET) { - cmsg->cmsg_level = SOL_IP; - cmsg->cmsg_type = IP_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); - pktinfo4 = (struct in_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo4, 0, sizeof(struct in_pktinfo)); - if (inet_pton(domain, SRC4_IP, - (void *)&pktinfo4->ipi_spec_dst) != 1) - return -1; - } else if (domain == AF_INET6) { - cmsg->cmsg_level = SOL_IPV6; - cmsg->cmsg_type = IPV6_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); - pktinfo6 = (struct in6_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo6, 0, sizeof(struct in6_pktinfo)); - if (inet_pton(domain, SRC6_IP, - (void *)&pktinfo6->ipi6_addr) != 1) - return -1; - } else { - return -1; - } - - return 0; -} - -static int sendmsg_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len, int set_cmsg, int flags, - int *syscall_err) -{ - union { - char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))]; - struct cmsghdr align; - } control6; - union { - char buf[CMSG_SPACE(sizeof(struct in_pktinfo))]; - struct cmsghdr align; - } control4; - struct msghdr hdr; - struct iovec iov; - char data = 'a'; - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)addr; - hdr.msg_namelen = addr_len; - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - if (set_cmsg) { - if (domain == AF_INET) { - hdr.msg_control = &control4; - hdr.msg_controllen = sizeof(control4.buf); - } else if (domain == AF_INET6) { - hdr.msg_control = &control6; - hdr.msg_controllen = sizeof(control6.buf); - } - if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) { - log_err("Fail to init pktinfo"); - goto err; - } - } - - if (sendmsg(fd, &hdr, flags) != sizeof(data)) { - log_err("Fail to send message to server"); - *syscall_err = errno; - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - -static int fastconnect_to_server(const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int sendmsg_err; - - return sendmsg_to_server(SOCK_STREAM, addr, addr_len, /*set_cmsg*/0, - MSG_FASTOPEN, &sendmsg_err); -} - -static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr) -{ - struct timeval tv; - struct msghdr hdr; - struct iovec iov; - char data[64]; - fd_set rfds; - - FD_ZERO(&rfds); - FD_SET(sockfd, &rfds); - - tv.tv_sec = 2; - tv.tv_usec = 0; - - if (select(sockfd + 1, &rfds, NULL, NULL, &tv) <= 0 || - !FD_ISSET(sockfd, &rfds)) - return -1; - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = src_addr; - hdr.msg_namelen = sizeof(struct sockaddr_storage); - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - return recvmsg(sockfd, &hdr, 0); -} - -static int init_addrs(const struct sock_addr_test *test, - struct sockaddr_storage *requested_addr, - struct sockaddr_storage *expected_addr, - struct sockaddr_storage *expected_src_addr) -{ - if (make_sockaddr(test->domain, test->expected_ip, test->expected_port, - expected_addr, NULL) == -1) - goto err; - - if (make_sockaddr(test->domain, test->requested_ip, test->requested_port, - requested_addr, NULL) == -1) - goto err; - - if (test->expected_src_ip && - make_sockaddr(test->domain, test->expected_src_ip, 0, - expected_src_addr, NULL) == -1) - goto err; - - return 0; -err: - return -1; -} - -static int run_bind_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, NULL)) - goto err; - - servfd = start_server_addr(test->type, &requested_addr, addr_len, NULL); - if (servfd == -1) - goto err; - - if (cmp_local_addr(servfd, &expected_addr)) - goto err; - - /* Try to connect to server just in case */ - clientfd = connect_to_addr(test->type, &expected_addr, addr_len, NULL); - if (clientfd == -1) - goto err; - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_connect_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_src_addr; - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, - &expected_src_addr)) - goto err; - - /* Prepare server to connect to */ - servfd = start_server_addr(test->type, &expected_addr, addr_len, NULL); - if (servfd == -1) - goto err; - - clientfd = connect_to_addr(test->type, &requested_addr, addr_len, NULL); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if (cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - - if (test->type == SOCK_STREAM) { - /* Test TCP Fast Open scenario */ - clientfd = fastconnect_to_server(&requested_addr, addr_len); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if (cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_addr; - struct sockaddr_storage server_addr; - struct sockaddr_storage sendmsg_addr; - struct sockaddr_storage recvmsg_addr; - int clientfd = -1; - int servfd = -1; - int set_cmsg; - int err = 0; - - if (test->type != SOCK_DGRAM) - goto err; - - if (init_addrs(test, &sendmsg_addr, &server_addr, &expected_addr)) - goto err; - - /* Prepare server to sendmsg to */ - servfd = start_server_addr(test->type, &server_addr, addr_len, NULL); - if (servfd == -1) - goto err; - - for (set_cmsg = 0; set_cmsg <= max_cmsg; ++set_cmsg) { - if (clientfd >= 0) - close(clientfd); - - clientfd = sendmsg_to_server(test->type, &sendmsg_addr, - addr_len, set_cmsg, /*flags*/0, - &err); - if (err) - goto out; - else if (clientfd == -1) - goto err; - - /* Try to receive message on server instead of using - * getpeername(2) on client socket, to check that client's - * destination address was rewritten properly, since - * getpeername(2) doesn't work with unconnected datagram - * sockets. - * - * Get source address from recvmsg(2) as well to make sure - * source was rewritten properly: getsockname(2) can't be used - * since socket is unconnected and source defined for one - * specific packet may differ from the one used by default and - * returned by getsockname(2). - */ - if (recvmsg_from_client(servfd, &recvmsg_addr) == -1) - goto err; - - if (cmp_addr(&recvmsg_addr, &expected_addr, /*cmp_port*/0)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_test_case(int cgfd, const struct sock_addr_test *test) -{ - int progfd = -1; - int err = 0; - - printf("Test case: %s .. ", test->descr); - - progfd = test->loadfn(test); - if (test->expected_result == LOAD_REJECT && progfd < 0) - goto out; - else if (test->expected_result == LOAD_REJECT || progfd < 0) - goto err; - - err = bpf_prog_attach(progfd, cgfd, test->attach_type, - BPF_F_ALLOW_OVERRIDE); - if (test->expected_result == ATTACH_REJECT && err) { - err = 0; /* error was expected, reset it */ - goto out; - } else if (test->expected_result == ATTACH_REJECT || err) { - goto err; - } else if (test->expected_result == ATTACH_OKAY) { - err = 0; - goto out; - } - - switch (test->attach_type) { - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - err = run_bind_test_case(test); - break; - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - err = run_connect_test_case(test); - break; - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - err = run_xmsg_test_case(test, 1); - break; - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - err = run_xmsg_test_case(test, 0); - break; - default: - goto err; - } - - if (test->expected_result == SYSCALL_EPERM && err == EPERM) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (err || test->expected_result != SUCCESS) - goto err; - - goto out; -err: - err = -1; -out: - /* Detaching w/o checking return code: best effort attempt. */ - if (progfd != -1) - bpf_prog_detach(cgfd, test->attach_type); - close(progfd); - printf("[%s]\n", err ? "FAIL" : "PASS"); - return err; -} - -static int run_tests(int cgfd) -{ - int passes = 0; - int fails = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - if (run_test_case(cgfd, &tests[i])) - ++fails; - else - ++passes; - } - printf("Summary: %d PASSED, %d FAILED\n", passes, fails); - return fails ? -1 : 0; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - if (argc < 2) { - fprintf(stderr, - "%s has to be run via %s.sh. Skip direct run.\n", - argv[0], argv[0]); - exit(err); - } - - cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (run_tests(cgfd)) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - return err; -} diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh deleted file mode 100755 index 3b9fdb8094aa..000000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -set -eu - -ping_once() -{ - type ping${1} >/dev/null 2>&1 && PING="ping${1}" || PING="ping -${1}" - $PING -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1 -} - -wait_for_ip() -{ - local _i - echo -n "Wait for testing IPv4/IPv6 to become available " - for _i in $(seq ${MAX_PING_TRIES}); do - echo -n "." - if ping_once 4 ${TEST_IPv4} && ping_once 6 ${TEST_IPv6}; then - echo " OK" - return - fi - done - echo 1>&2 "ERROR: Timeout waiting for test IP to become available." - exit 1 -} - -setup() -{ - # Create testing interfaces not to interfere with current environment. - ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} - ip link set ${TEST_IF} up - ip link set ${TEST_IF_PEER} up - - ip -4 addr add ${TEST_IPv4} dev ${TEST_IF} - ip -6 addr add ${TEST_IPv6} dev ${TEST_IF} - wait_for_ip -} - -cleanup() -{ - ip link del ${TEST_IF} 2>/dev/null || : - ip link del ${TEST_IF_PEER} 2>/dev/null || : -} - -main() -{ - trap cleanup EXIT 2 3 6 15 - setup - ./test_sock_addr setup_done -} - -BASENAME=$(basename $0 .sh) -TEST_IF="${BASENAME}1" -TEST_IF_PEER="${BASENAME}2" -TEST_IPv4="127.0.0.4/8" -TEST_IPv6="::6/128" -MAX_PING_TRIES=5 - -main -- cgit v1.2.3 From 1e0a8367c89f82816735973d0e65a3c8e1b43179 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:31 -0500 Subject: selftests/bpf: Expand sockaddr program return value tests This patch expands verifier coverage for program return values to cover bind, connect, sendmsg, getsockname, and getpeername hooks. It also rounds out the recvmsg coverage by adding test cases for recvmsg_unix hooks. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-15-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_sock_addr.c | 294 +++++++++++++++++++++ 1 file changed, 294 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c index 5081fa723d3a..9c31448a0f52 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c @@ -34,4 +34,298 @@ int recvmsg6_bad_return_code(struct bpf_sock_addr *ctx) return 0; } +SEC("cgroup/recvmsg_unix") +__success +int recvmsg_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/getpeername4") +__success +int getpeername4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername6") +__success +int getpeername6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername_unix") +__success +int getpeername_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname4") +__success +int getsockname4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname6") +__success +int getsockname6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname_unix") +__success +int getsockname_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname_unix_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind4") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 4; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind6") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 4; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From dfb7539b47b501ccc0d23bae718500ada2157aee Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:32 -0500 Subject: sefltests/bpf: Expand sockaddr hook deny tests This patch expands test coverage for EPERM tests to include connect and bind calls and rounds out the coverage for sendmsg by adding tests for sendmsg_unix. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-16-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 342 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/bind4_prog.c | 6 + tools/testing/selftests/bpf/progs/bind6_prog.c | 6 + tools/testing/selftests/bpf/progs/connect4_prog.c | 6 + tools/testing/selftests/bpf/progs/connect6_prog.c | 6 + .../selftests/bpf/progs/connect_unix_prog.c | 6 + .../selftests/bpf/progs/sendmsg_unix_prog.c | 6 + 7 files changed, 378 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index ebd5e58e38c5..0477b4080b2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -439,13 +439,18 @@ static void prog_name##_destroy(void *skel) \ BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); BPF_SKEL_FUNCS_RAW(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS(bind4_prog, bind_v4_deny_prog); BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); BPF_SKEL_FUNCS_RAW(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_deny_prog); BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); BPF_SKEL_FUNCS_RAW(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_deny_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_deny_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_deny_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); @@ -456,6 +461,7 @@ BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_deny_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); @@ -480,6 +486,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind4: bind (dgram)", @@ -496,6 +518,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind4: load prog with wrong expected attach type", @@ -544,6 +582,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: bind (dgram)", @@ -560,6 +614,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: load prog with wrong expected attach type", @@ -610,6 +680,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (dgram)", @@ -626,6 +712,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (stream)", @@ -642,6 +744,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (dgram)", @@ -658,6 +776,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, /* connect - system calls */ { @@ -676,6 +810,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect4: connect (dgram)", @@ -692,6 +842,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect4: load prog with wrong expected attach type", @@ -740,6 +906,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (dgram)", @@ -756,6 +938,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: load prog with wrong expected attach type", @@ -804,6 +1002,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: connect deny (stream)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* connect - kernel calls */ { @@ -822,6 +1036,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (dgram)", @@ -838,6 +1068,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (stream)", @@ -854,6 +1100,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (dgram)", @@ -870,6 +1132,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: kernel_connect (dgram)", @@ -886,6 +1164,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect deny (dgram)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* sendmsg - system calls */ { @@ -1080,6 +1374,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* sendmsg - kernel calls (sock_sendmsg) */ { @@ -1178,6 +1488,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* sendmsg - kernel calls (kernel_sendmsg) */ { @@ -1276,6 +1602,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: kernel_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* recvmsg - system calls */ { diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 66005c1a5b36..b7ddf8ec4ee8 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -158,4 +158,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind4") +int bind_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 9c86c712348c..501c3fc11d35 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -175,4 +175,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind6") +int bind_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index bec529da7c9d..9e9ebf27b878 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -199,4 +199,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) return do_bind(ctx) ? 1 : 0; } +SEC("cgroup/connect4") +int connect_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c index 40266d2c737c..e98573b00ddb 100644 --- a/tools/testing/selftests/bpf/progs/connect6_prog.c +++ b/tools/testing/selftests/bpf/progs/connect6_prog.c @@ -90,4 +90,10 @@ int connect_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect6") +int connect_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect_unix_prog.c b/tools/testing/selftests/bpf/progs/connect_unix_prog.c index 2ef0e0c46d17..ba60adadb335 100644 --- a/tools/testing/selftests/bpf/progs/connect_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/connect_unix_prog.c @@ -36,4 +36,10 @@ int connect_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect_unix") +int connect_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c index d8869b03dda9..332d0eb1116f 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c @@ -36,4 +36,10 @@ int sendmsg_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg_unix") +int sendmsg_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From bc467e953e4fbafd94d04c355f875bf1adf438e2 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:33 -0500 Subject: selftests/bpf: Expand getsockname and getpeername tests This expands coverage for getsockname and getpeername hooks to include getsockname4, getsockname6, getpeername4, and getpeername6. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-17-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 304 ++++++++++++++++++++- .../selftests/bpf/progs/getpeername4_prog.c | 24 ++ .../selftests/bpf/progs/getpeername6_prog.c | 31 +++ .../selftests/bpf/progs/getsockname4_prog.c | 24 ++ .../selftests/bpf/progs/getsockname6_prog.c | 31 +++ 5 files changed, 412 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/getpeername4_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getpeername6_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getsockname4_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getsockname6_prog.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 0477b4080b2e..a0a40bdcfe45 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -15,7 +15,11 @@ #include "recvmsg6_prog.skel.h" #include "sendmsg_unix_prog.skel.h" #include "recvmsg_unix_prog.skel.h" +#include "getsockname4_prog.skel.h" +#include "getsockname6_prog.skel.h" #include "getsockname_unix_prog.skel.h" +#include "getpeername4_prog.skel.h" +#include "getpeername6_prog.skel.h" #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" @@ -466,7 +470,11 @@ BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS(getsockname6_prog, getsockname_v6_prog); BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS(getpeername6_prog, getpeername_v6_prog); static struct sock_addr_test tests[] = { /* bind - system calls */ @@ -1688,7 +1696,71 @@ static struct sock_addr_test tests[] = { /* getsockname - system calls */ { SOCK_ADDR_TEST_GETSOCKNAME, - "getsockname_unix", + "getsockname4: getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: getsockname", getsockname_unix_prog_load, getsockname_unix_prog_destroy, BPF_CGROUP_UNIX_GETSOCKNAME, @@ -1703,10 +1775,156 @@ static struct sock_addr_test tests[] = { SUCCESS, }, + /* getsockname - kernel calls */ + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: kernel_getsockname", + getsockname_unix_prog_load, + getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + /* getpeername - system calls */ { SOCK_ADDR_TEST_GETPEERNAME, - "getpeername_unix", + "getpeername4: getpeername (stream)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: getpeername", getpeername_unix_prog_load, getpeername_unix_prog_destroy, BPF_CGROUP_UNIX_GETPEERNAME, @@ -1720,6 +1938,88 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + + /* getpeername - kernel calls */ + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (stream)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: kernel_getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: kernel_getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: kernel_getpeername", + getpeername_unix_prog_load, + getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, }; typedef int (*info_fn)(int, struct sockaddr *, socklen_t *); diff --git a/tools/testing/selftests/bpf/progs/getpeername4_prog.c b/tools/testing/selftests/bpf/progs/getpeername4_prog.c new file mode 100644 index 000000000000..4c97208cd25d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + +SEC("cgroup/getpeername4") +int getpeername_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getpeername6_prog.c b/tools/testing/selftests/bpf/progs/getpeername6_prog.c new file mode 100644 index 000000000000..070e4d7f636c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getpeername6") +int getpeername_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname4_prog.c b/tools/testing/selftests/bpf/progs/getsockname4_prog.c new file mode 100644 index 000000000000..e298487c6347 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + +SEC("cgroup/getsockname4") +int getsockname_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname6_prog.c b/tools/testing/selftests/bpf/progs/getsockname6_prog.c new file mode 100644 index 000000000000..811d10cd5525 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getsockname6") +int getsockname_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a3d3eb957ddc733d04c0da67024b1c30d8826cc2 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:34 -0500 Subject: selftests/bpf: Expand ATTACH_REJECT tests This expands coverage for ATTACH_REJECT tests to include connect_unix, sendmsg_unix, recvmsg*, getsockname*, and getpeername*. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-18-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 187 +++++++++++++++++++++ 1 file changed, 187 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index a0a40bdcfe45..b880c564a204 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -454,6 +454,7 @@ BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_deny_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS_RAW(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_deny_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); @@ -465,16 +466,26 @@ BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_deny_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS_RAW(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS_RAW(recvmsg6_prog, recvmsg6_prog); BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(recvmsg_unix_prog, recvmsg_unix_prog); BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS_RAW(getsockname_unix_prog, getsockname_unix_prog); BPF_SKEL_FUNCS(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS_RAW(getsockname4_prog, getsockname_v4_prog); BPF_SKEL_FUNCS(getsockname6_prog, getsockname_v6_prog); +BPF_SKEL_FUNCS_RAW(getsockname6_prog, getsockname_v6_prog); BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS_RAW(getpeername_unix_prog, getpeername_unix_prog); BPF_SKEL_FUNCS(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS_RAW(getpeername4_prog, getpeername_v4_prog); BPF_SKEL_FUNCS(getpeername6_prog, getpeername_v6_prog); +BPF_SKEL_FUNCS_RAW(getpeername6_prog, getpeername_v6_prog); static struct sock_addr_test tests[] = { /* bind - system calls */ @@ -1026,6 +1037,22 @@ static struct sock_addr_test tests[] = { NULL, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: attach prog with wrong attach type", + connect_unix_prog_load_raw, + connect_unix_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* connect - kernel calls */ { @@ -1398,6 +1425,22 @@ static struct sock_addr_test tests[] = { NULL, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: attach prog with wrong attach type", + sendmsg_unix_prog_load_raw, + sendmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* sendmsg - kernel calls (sock_sendmsg) */ { @@ -1644,6 +1687,22 @@ static struct sock_addr_test tests[] = { SERV4_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg4: attach prog with wrong attach type", + recvmsg4_prog_load_raw, + recvmsg4_prog_destroy_raw, + BPF_CGROUP_UDP6_RECVMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg6: recvfrom (dgram)", @@ -1660,6 +1719,22 @@ static struct sock_addr_test tests[] = { SERV6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg6: attach prog with wrong attach type", + recvmsg6_prog_load_raw, + recvmsg6_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg_unix: recvfrom (dgram)", @@ -1692,6 +1767,22 @@ static struct sock_addr_test tests[] = { SERVUN_ADDRESS, SUCCESS, }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg_unix: attach prog with wrong attach type", + recvmsg_unix_prog_load_raw, + recvmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_ADDRESS, + ATTACH_REJECT, + }, /* getsockname - system calls */ { @@ -1726,6 +1817,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: attach prog with wrong attach type", + getsockname_v4_prog_load_raw, + getsockname_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETSOCKNAME, "getsockname6: getsockname (stream)", @@ -1758,6 +1865,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: attach prog with wrong attach type", + getsockname_v6_prog_load_raw, + getsockname_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETSOCKNAME, "getsockname_unix: getsockname", @@ -1774,6 +1897,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: attach prog with wrong attach type", + getsockname_unix_prog_load_raw, + getsockname_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* getsockname - kernel calls */ { @@ -1890,6 +2029,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: attach prog with wrong attach type", + getpeername_v4_prog_load_raw, + getpeername_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETPEERNAME, "getpeername6: getpeername (stream)", @@ -1922,6 +2077,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: attach prog with wrong attach type", + getpeername_v6_prog_load_raw, + getpeername_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETPEERNAME, "getpeername_unix: getpeername", @@ -1938,6 +2109,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: attach prog with wrong attach type", + getpeername_unix_prog_load_raw, + getpeername_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* getpeername - kernel calls */ { -- cgit v1.2.3 From bbe91a9f6889934e661fa924144c7023f0a1c4cf Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Wed, 8 May 2024 10:41:23 +0000 Subject: tools: remove redundant ethtool.h from tooling infra Remove the redundant ethtool.h header file from tools/include/uapi/linux. The file is unnecessary as the system uses the kernel's include/uapi/linux/ethtool.h directly. Signed-off-by: Tushar Vyavahare Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240508104123.434769-1-tushar.vyavahare@intel.com Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/ethtool.h | 2271 ------------------------------------ 1 file changed, 2271 deletions(-) delete mode 100644 tools/include/uapi/linux/ethtool.h (limited to 'tools') diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h deleted file mode 100644 index 11fc18988bc2..000000000000 --- a/tools/include/uapi/linux/ethtool.h +++ /dev/null @@ -1,2271 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * ethtool.h: Defines for Linux ethtool. - * - * Copyright (C) 1998 David S. Miller (davem@redhat.com) - * Copyright 2001 Jeff Garzik - * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) - * Portions Copyright 2002 Intel (eli.kupermann@intel.com, - * christopher.leech@intel.com, - * scott.feldman@intel.com) - * Portions Copyright (C) Sun Microsystems 2008 - */ - -#ifndef _UAPI_LINUX_ETHTOOL_H -#define _UAPI_LINUX_ETHTOOL_H - -#include -#include -#include - -#ifndef __KERNEL__ -#include /* for INT_MAX */ -#endif - -/* All structures exposed to userland should be defined such that they - * have the same layout for 32-bit and 64-bit userland. - */ - -/* Note on reserved space. - * Reserved fields must not be accessed directly by user space because - * they may be replaced by a different field in the future. They must - * be initialized to zero before making the request, e.g. via memset - * of the entire structure or implicitly by not being set in a structure - * initializer. - */ - -/** - * struct ethtool_cmd - DEPRECATED, link control and status - * This structure is DEPRECATED, please use struct ethtool_link_settings. - * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET - * @supported: Bitmask of %SUPPORTED_* flags for the link modes, - * physical connectors and other link features for which the - * interface supports autonegotiation or auto-detection. - * Read-only. - * @advertising: Bitmask of %ADVERTISED_* flags for the link modes, - * physical connectors and other link features that are - * advertised through autonegotiation or enabled for - * auto-detection. - * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN - * @duplex: Duplex mode; one of %DUPLEX_* - * @port: Physical connector type; one of %PORT_* - * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not - * applicable. For clause 45 PHYs this is the PRTAD. - * @transceiver: Historically used to distinguish different possible - * PHY types, but not in a consistent way. Deprecated. - * @autoneg: Enable/disable autonegotiation and auto-detection; - * either %AUTONEG_DISABLE or %AUTONEG_ENABLE - * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO - * protocols supported by the interface; 0 if unknown. - * Read-only. - * @maxtxpkt: Historically used to report TX IRQ coalescing; now - * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. - * @maxrxpkt: Historically used to report RX IRQ coalescing; now - * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. - * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN - * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of - * %ETH_TP_MDI_*. If the status is unknown or not applicable, the - * value will be %ETH_TP_MDI_INVALID. Read-only. - * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of - * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads - * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. - * When written successfully, the link should be renegotiated if - * necessary. - * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes - * and other link features that the link partner advertised - * through autonegotiation; 0 if unknown or not applicable. - * Read-only. - * @reserved: Reserved for future use; see the note on reserved space. - * - * The link speed in Mbps is split between @speed and @speed_hi. Use - * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to - * access it. - * - * If autonegotiation is disabled, the speed and @duplex represent the - * fixed link mode and are writable if the driver supports multiple - * link modes. If it is enabled then they are read-only; if the link - * is up they represent the negotiated link mode; if the link is down, - * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and - * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. - * - * Some hardware interfaces may have multiple PHYs and/or physical - * connectors fitted or do not allow the driver to detect which are - * fitted. For these interfaces @port and/or @phy_address may be - * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. - * Otherwise, attempts to write different values may be ignored or - * rejected. - * - * Users should assume that all fields not marked read-only are - * writable and subject to validation by the driver. They should use - * %ETHTOOL_GSET to get the current values before making specific - * changes and then applying them with %ETHTOOL_SSET. - * - * Deprecated fields should be ignored by both users and drivers. - */ -struct ethtool_cmd { - __u32 cmd; - __u32 supported; - __u32 advertising; - __u16 speed; - __u8 duplex; - __u8 port; - __u8 phy_address; - __u8 transceiver; - __u8 autoneg; - __u8 mdio_support; - __u32 maxtxpkt; - __u32 maxrxpkt; - __u16 speed_hi; - __u8 eth_tp_mdix; - __u8 eth_tp_mdix_ctrl; - __u32 lp_advertising; - __u32 reserved[2]; -}; - -static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, - __u32 speed) -{ - ep->speed = (__u16)(speed & 0xFFFF); - ep->speed_hi = (__u16)(speed >> 16); -} - -static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) -{ - return (ep->speed_hi << 16) | ep->speed; -} - -/* Device supports clause 22 register access to PHY or peripherals - * using the interface defined in . This should not be - * set if there are known to be no such peripherals present or if - * the driver only emulates clause 22 registers for compatibility. - */ -#define ETH_MDIO_SUPPORTS_C22 1 - -/* Device supports clause 45 register access to PHY or peripherals - * using the interface defined in and . - * This should not be set if there are known to be no such peripherals - * present. - */ -#define ETH_MDIO_SUPPORTS_C45 2 - -#define ETHTOOL_FWVERS_LEN 32 -#define ETHTOOL_BUSINFO_LEN 32 -#define ETHTOOL_EROMVERS_LEN 32 - -/** - * struct ethtool_drvinfo - general driver and device information - * @cmd: Command number = %ETHTOOL_GDRVINFO - * @driver: Driver short name. This should normally match the name - * in its bus driver structure (e.g. pci_driver::name). Must - * not be an empty string. - * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; driver defined; may be an - * empty string - * @erom_version: Expansion ROM version string; driver defined; may be - * an empty string - * @bus_info: Device bus address. This should match the dev_name() - * string for the underlying bus device, if there is one. May be - * an empty string. - * @reserved2: Reserved for future use; see the note on reserved space. - * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and - * %ETHTOOL_SPFLAGS commands; also the number of strings in the - * %ETH_SS_PRIV_FLAGS set - * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS - * command; also the number of strings in the %ETH_SS_STATS set - * @testinfo_len: Number of results returned by the %ETHTOOL_TEST - * command; also the number of strings in the %ETH_SS_TEST set - * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM - * and %ETHTOOL_SEEPROM commands, in bytes - * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS - * command, in bytes - * - * Users can use the %ETHTOOL_GSSET_INFO command to get the number of - * strings in any string set (from Linux 2.6.34). - */ -struct ethtool_drvinfo { - __u32 cmd; - char driver[32]; - char version[32]; - char fw_version[ETHTOOL_FWVERS_LEN]; - char bus_info[ETHTOOL_BUSINFO_LEN]; - char erom_version[ETHTOOL_EROMVERS_LEN]; - char reserved2[12]; - __u32 n_priv_flags; - __u32 n_stats; - __u32 testinfo_len; - __u32 eedump_len; - __u32 regdump_len; -}; - -#define SOPASS_MAX 6 - -/** - * struct ethtool_wolinfo - Wake-On-Lan configuration - * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL - * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes. - * Read-only. - * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. - * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE - * is set in @wolopts. - */ -struct ethtool_wolinfo { - __u32 cmd; - __u32 supported; - __u32 wolopts; - __u8 sopass[SOPASS_MAX]; -}; - -/* for passing single values */ -struct ethtool_value { - __u32 cmd; - __u32 data; -}; - -#define PFC_STORM_PREVENTION_AUTO 0xffff -#define PFC_STORM_PREVENTION_DISABLE 0 - -enum tunable_id { - ETHTOOL_ID_UNSPEC, - ETHTOOL_RX_COPYBREAK, - ETHTOOL_TX_COPYBREAK, - ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ - ETHTOOL_TX_COPYBREAK_BUF_SIZE, - /* - * Add your fresh new tunable attribute above and remember to update - * tunable_strings[] in net/ethtool/common.c - */ - __ETHTOOL_TUNABLE_COUNT, -}; - -enum tunable_type_id { - ETHTOOL_TUNABLE_UNSPEC, - ETHTOOL_TUNABLE_U8, - ETHTOOL_TUNABLE_U16, - ETHTOOL_TUNABLE_U32, - ETHTOOL_TUNABLE_U64, - ETHTOOL_TUNABLE_STRING, - ETHTOOL_TUNABLE_S8, - ETHTOOL_TUNABLE_S16, - ETHTOOL_TUNABLE_S32, - ETHTOOL_TUNABLE_S64, -}; - -struct ethtool_tunable { - __u32 cmd; - __u32 id; - __u32 type_id; - __u32 len; - void *data[]; -}; - -#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff -#define DOWNSHIFT_DEV_DISABLE 0 - -/* Time in msecs after which link is reported as down - * 0 = lowest time supported by the PHY - * 0xff = off, link down detection according to standard - */ -#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 -#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff - -/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where - * the PHY's RX & TX blocks are put into a low-power mode when there is no - * link detected (typically cable is un-plugged). For RX, only a minimal - * link-detection is available, and for TX the PHY wakes up to send link pulses - * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode. - * - * Some PHYs may support configuration of the wake-up interval for TX pulses, - * and some PHYs may support only disabling TX pulses entirely. For the latter - * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be - * configured from userspace (should the user want it). - * - * The interval units for TX wake-up are in milliseconds, since this should - * cover a reasonable range of intervals: - * - from 1 millisecond, which does not sound like much of a power-saver - * - to ~65 seconds which is quite a lot to wait for a link to come up when - * plugging a cable - */ -#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff -#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe -#define ETHTOOL_PHY_EDPD_DISABLE 0 - -enum phy_tunable_id { - ETHTOOL_PHY_ID_UNSPEC, - ETHTOOL_PHY_DOWNSHIFT, - ETHTOOL_PHY_FAST_LINK_DOWN, - ETHTOOL_PHY_EDPD, - /* - * Add your fresh new phy tunable attribute above and remember to update - * phy_tunable_strings[] in net/ethtool/common.c - */ - __ETHTOOL_PHY_TUNABLE_COUNT, -}; - -/** - * struct ethtool_regs - hardware register dump - * @cmd: Command number = %ETHTOOL_GREGS - * @version: Dump format version. This is driver-specific and may - * distinguish different chips/revisions. Drivers must use new - * version numbers whenever the dump format changes in an - * incompatible way. - * @len: On entry, the real length of @data. On return, the number of - * bytes used. - * @data: Buffer for the register dump - * - * Users should use %ETHTOOL_GDRVINFO to find the maximum length of - * a register dump for the interface. They must allocate the buffer - * immediately following this structure. - */ -struct ethtool_regs { - __u32 cmd; - __u32 version; - __u32 len; - __u8 data[]; -}; - -/** - * struct ethtool_eeprom - EEPROM dump - * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or - * %ETHTOOL_SEEPROM - * @magic: A 'magic cookie' value to guard against accidental changes. - * The value passed in to %ETHTOOL_SEEPROM must match the value - * returned by %ETHTOOL_GEEPROM for the same device. This is - * unused when @cmd is %ETHTOOL_GMODULEEEPROM. - * @offset: Offset within the EEPROM to begin reading/writing, in bytes - * @len: On entry, number of bytes to read/write. On successful - * return, number of bytes actually read/written. In case of - * error, this may indicate at what point the error occurred. - * @data: Buffer to read/write from - * - * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find - * the length of an on-board or module EEPROM, respectively. They - * must allocate the buffer immediately following this structure. - */ -struct ethtool_eeprom { - __u32 cmd; - __u32 magic; - __u32 offset; - __u32 len; - __u8 data[]; -}; - -/** - * struct ethtool_eee - Energy Efficient Ethernet information - * @cmd: ETHTOOL_{G,S}EEE - * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations - * for which there is EEE support. - * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations - * advertised as eee capable. - * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex - * combinations advertised by the link partner as eee capable. - * @eee_active: Result of the eee auto negotiation. - * @eee_enabled: EEE configured mode (enabled/disabled). - * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given - * that eee was negotiated. - * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting - * its tx lpi (after reaching 'idle' state). Effective only when eee - * was negotiated and tx_lpi_enabled was set. - * @reserved: Reserved for future use; see the note on reserved space. - */ -struct ethtool_eee { - __u32 cmd; - __u32 supported; - __u32 advertised; - __u32 lp_advertised; - __u32 eee_active; - __u32 eee_enabled; - __u32 tx_lpi_enabled; - __u32 tx_lpi_timer; - __u32 reserved[2]; -}; - -/** - * struct ethtool_modinfo - plugin module eeprom information - * @cmd: %ETHTOOL_GMODULEINFO - * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx - * @eeprom_len: Length of the eeprom - * @reserved: Reserved for future use; see the note on reserved space. - * - * This structure is used to return the information to - * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM. - * The type code indicates the eeprom data format - */ -struct ethtool_modinfo { - __u32 cmd; - __u32 type; - __u32 eeprom_len; - __u32 reserved[8]; -}; - -/** - * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates - * @cmd: ETHTOOL_{G,S}COALESCE - * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after - * a packet arrives. - * @rx_max_coalesced_frames: Maximum number of packets to receive - * before an RX interrupt. - * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that - * this value applies while an IRQ is being serviced by the host. - * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames, - * except that this value applies while an IRQ is being serviced - * by the host. - * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after - * a packet is sent. - * @tx_max_coalesced_frames: Maximum number of packets to be sent - * before a TX interrupt. - * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that - * this value applies while an IRQ is being serviced by the host. - * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames, - * except that this value applies while an IRQ is being serviced - * by the host. - * @stats_block_coalesce_usecs: How many usecs to delay in-memory - * statistics block updates. Some drivers do not have an - * in-memory statistic block, and in such cases this value is - * ignored. This value must not be zero. - * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing. - * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing. - * @pkt_rate_low: Threshold for low packet rate (packets per second). - * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after - * a packet arrives, when the packet rate is below @pkt_rate_low. - * @rx_max_coalesced_frames_low: Maximum number of packets to be received - * before an RX interrupt, when the packet rate is below @pkt_rate_low. - * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after - * a packet is sent, when the packet rate is below @pkt_rate_low. - * @tx_max_coalesced_frames_low: Maximum nuumber of packets to be sent before - * a TX interrupt, when the packet rate is below @pkt_rate_low. - * @pkt_rate_high: Threshold for high packet rate (packets per second). - * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after - * a packet arrives, when the packet rate is above @pkt_rate_high. - * @rx_max_coalesced_frames_high: Maximum number of packets to be received - * before an RX interrupt, when the packet rate is above @pkt_rate_high. - * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after - * a packet is sent, when the packet rate is above @pkt_rate_high. - * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before - * a TX interrupt, when the packet rate is above @pkt_rate_high. - * @rate_sample_interval: How often to do adaptive coalescing packet rate - * sampling, measured in seconds. Must not be zero. - * - * Each pair of (usecs, max_frames) fields specifies that interrupts - * should be coalesced until - * (usecs > 0 && time_since_first_completion >= usecs) || - * (max_frames > 0 && completed_frames >= max_frames) - * - * It is illegal to set both usecs and max_frames to zero as this - * would cause interrupts to never be generated. To disable - * coalescing, set usecs = 0 and max_frames = 1. - * - * Some implementations ignore the value of max_frames and use the - * condition time_since_first_completion >= usecs - * - * This is deprecated. Drivers for hardware that does not support - * counting completions should validate that max_frames == !rx_usecs. - * - * Adaptive RX/TX coalescing is an algorithm implemented by some - * drivers to improve latency under low packet rates and improve - * throughput under high packet rates. Some drivers only implement - * one of RX or TX adaptive coalescing. Anything not implemented by - * the driver causes these values to be silently ignored. - * - * When the packet rate is below @pkt_rate_high but above - * @pkt_rate_low (both measured in packets per second) the - * normal {rx,tx}_* coalescing parameters are used. - */ -struct ethtool_coalesce { - __u32 cmd; - __u32 rx_coalesce_usecs; - __u32 rx_max_coalesced_frames; - __u32 rx_coalesce_usecs_irq; - __u32 rx_max_coalesced_frames_irq; - __u32 tx_coalesce_usecs; - __u32 tx_max_coalesced_frames; - __u32 tx_coalesce_usecs_irq; - __u32 tx_max_coalesced_frames_irq; - __u32 stats_block_coalesce_usecs; - __u32 use_adaptive_rx_coalesce; - __u32 use_adaptive_tx_coalesce; - __u32 pkt_rate_low; - __u32 rx_coalesce_usecs_low; - __u32 rx_max_coalesced_frames_low; - __u32 tx_coalesce_usecs_low; - __u32 tx_max_coalesced_frames_low; - __u32 pkt_rate_high; - __u32 rx_coalesce_usecs_high; - __u32 rx_max_coalesced_frames_high; - __u32 tx_coalesce_usecs_high; - __u32 tx_max_coalesced_frames_high; - __u32 rate_sample_interval; -}; - -/** - * struct ethtool_ringparam - RX/TX ring parameters - * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM - * @rx_max_pending: Maximum supported number of pending entries per - * RX ring. Read-only. - * @rx_mini_max_pending: Maximum supported number of pending entries - * per RX mini ring. Read-only. - * @rx_jumbo_max_pending: Maximum supported number of pending entries - * per RX jumbo ring. Read-only. - * @tx_max_pending: Maximum supported number of pending entries per - * TX ring. Read-only. - * @rx_pending: Current maximum number of pending entries per RX ring - * @rx_mini_pending: Current maximum number of pending entries per RX - * mini ring - * @rx_jumbo_pending: Current maximum number of pending entries per RX - * jumbo ring - * @tx_pending: Current maximum supported number of pending entries - * per TX ring - * - * If the interface does not have separate RX mini and/or jumbo rings, - * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0. - * - * There may also be driver-dependent minimum values for the number - * of entries per ring. - */ -struct ethtool_ringparam { - __u32 cmd; - __u32 rx_max_pending; - __u32 rx_mini_max_pending; - __u32 rx_jumbo_max_pending; - __u32 tx_max_pending; - __u32 rx_pending; - __u32 rx_mini_pending; - __u32 rx_jumbo_pending; - __u32 tx_pending; -}; - -/** - * struct ethtool_channels - configuring number of network channel - * @cmd: ETHTOOL_{G,S}CHANNELS - * @max_rx: Read only. Maximum number of receive channel the driver support. - * @max_tx: Read only. Maximum number of transmit channel the driver support. - * @max_other: Read only. Maximum number of other channel the driver support. - * @max_combined: Read only. Maximum number of combined channel the driver - * support. Set of queues RX, TX or other. - * @rx_count: Valid values are in the range 1 to the max_rx. - * @tx_count: Valid values are in the range 1 to the max_tx. - * @other_count: Valid values are in the range 1 to the max_other. - * @combined_count: Valid values are in the range 1 to the max_combined. - * - * This can be used to configure RX, TX and other channels. - */ - -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -/** - * struct ethtool_pauseparam - Ethernet pause (flow control) parameters - * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM - * @autoneg: Flag to enable autonegotiation of pause frame use - * @rx_pause: Flag to enable reception of pause frames - * @tx_pause: Flag to enable transmission of pause frames - * - * Drivers should reject a non-zero setting of @autoneg when - * autoneogotiation is disabled (or not supported) for the link. - * - * If the link is autonegotiated, drivers should use - * mii_advertise_flowctrl() or similar code to set the advertised - * pause frame capabilities based on the @rx_pause and @tx_pause flags, - * even if @autoneg is zero. They should also allow the advertised - * pause frame capabilities to be controlled directly through the - * advertising field of &struct ethtool_cmd. - * - * If @autoneg is non-zero, the MAC is configured to send and/or - * receive pause frames according to the result of autonegotiation. - * Otherwise, it is configured directly based on the @rx_pause and - * @tx_pause flags. - */ -struct ethtool_pauseparam { - __u32 cmd; - __u32 autoneg; - __u32 rx_pause; - __u32 tx_pause; -}; - -/* Link extended state */ -enum ethtool_link_ext_state { - ETHTOOL_LINK_EXT_STATE_AUTONEG, - ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, - ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, - ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, - ETHTOOL_LINK_EXT_STATE_NO_CABLE, - ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, - ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, - ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, - ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, - ETHTOOL_LINK_EXT_STATE_OVERHEAT, - ETHTOOL_LINK_EXT_STATE_MODULE, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ -enum ethtool_link_ext_substate_autoneg { - ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, - ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, - ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, - ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, - ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, - ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. - */ -enum ethtool_link_ext_substate_link_training { - ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, - ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, - ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, - ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. - */ -enum ethtool_link_ext_substate_link_logical_mismatch { - ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. - */ -enum ethtool_link_ext_substate_bad_signal_integrity { - ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, - ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, - ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, - ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ -enum ethtool_link_ext_substate_cable_issue { - ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, - ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */ -enum ethtool_link_ext_substate_module { - ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, -}; - -#define ETH_GSTRING_LEN 32 - -/** - * enum ethtool_stringset - string set ID - * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST - * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS - * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with - * %ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS - * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE; - * now deprecated - * @ETH_SS_FEATURES: Device feature names - * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names - * @ETH_SS_TUNABLES: tunable names - * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS - * @ETH_SS_PHY_TUNABLES: PHY tunable names - * @ETH_SS_LINK_MODES: link mode names - * @ETH_SS_MSG_CLASSES: debug message class names - * @ETH_SS_WOL_MODES: wake-on-lan modes - * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags - * @ETH_SS_TS_TX_TYPES: timestamping Tx types - * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters - * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types - * @ETH_SS_STATS_STD: standardized stats - * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics - * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics - * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics - * @ETH_SS_STATS_RMON: names of RMON statistics - * - * @ETH_SS_COUNT: number of defined string sets - */ -enum ethtool_stringset { - ETH_SS_TEST = 0, - ETH_SS_STATS, - ETH_SS_PRIV_FLAGS, - ETH_SS_NTUPLE_FILTERS, - ETH_SS_FEATURES, - ETH_SS_RSS_HASH_FUNCS, - ETH_SS_TUNABLES, - ETH_SS_PHY_STATS, - ETH_SS_PHY_TUNABLES, - ETH_SS_LINK_MODES, - ETH_SS_MSG_CLASSES, - ETH_SS_WOL_MODES, - ETH_SS_SOF_TIMESTAMPING, - ETH_SS_TS_TX_TYPES, - ETH_SS_TS_RX_FILTERS, - ETH_SS_UDP_TUNNEL_TYPES, - ETH_SS_STATS_STD, - ETH_SS_STATS_ETH_PHY, - ETH_SS_STATS_ETH_MAC, - ETH_SS_STATS_ETH_CTRL, - ETH_SS_STATS_RMON, - - /* add new constants above here */ - ETH_SS_COUNT -}; - -/** - * enum ethtool_mac_stats_src - source of ethtool MAC statistics - * @ETHTOOL_MAC_STATS_SRC_AGGREGATE: - * if device supports a MAC merge layer, this retrieves the aggregate - * statistics of the eMAC and pMAC. Otherwise, it retrieves just the - * statistics of the single (express) MAC. - * @ETHTOOL_MAC_STATS_SRC_EMAC: - * if device supports a MM layer, this retrieves the eMAC statistics. - * Otherwise, it retrieves the statistics of the single (express) MAC. - * @ETHTOOL_MAC_STATS_SRC_PMAC: - * if device supports a MM layer, this retrieves the pMAC statistics. - */ -enum ethtool_mac_stats_src { - ETHTOOL_MAC_STATS_SRC_AGGREGATE, - ETHTOOL_MAC_STATS_SRC_EMAC, - ETHTOOL_MAC_STATS_SRC_PMAC, -}; - -/** - * enum ethtool_module_power_mode_policy - plug-in module power mode policy - * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode. - * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host - * to high power mode when the first port using it is put administratively - * up and to low power mode when the last port using it is put - * administratively down. - */ -enum ethtool_module_power_mode_policy { - ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, - ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, -}; - -/** - * enum ethtool_module_power_mode - plug-in module power mode - * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. - * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. - */ -enum ethtool_module_power_mode { - ETHTOOL_MODULE_POWER_MODE_LOW = 1, - ETHTOOL_MODULE_POWER_MODE_HIGH, -}; - -/** - * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE - * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState - * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are - * unknown - * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled - * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled - */ -enum ethtool_podl_pse_admin_state { - ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, - ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, - ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, -}; - -/** - * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. - * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: - * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE - * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is - * asserted true when the PoDL PSE state diagram variable mr_pse_enable is - * false" - * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is - * asserted true when either of the PSE state diagram variables - * pi_detecting or pi_classifying is true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” - * is asserted true when the PoDL PSE state diagram variable pi_powered is - * true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted - * true when the PoDL PSE state diagram variable pi_sleeping is true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true - * when the logical combination of the PoDL PSE state diagram variables - * pi_prebiased*!pi_sleeping is true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted - * true when the PoDL PSE state diagram variable overload_held is true." - */ -enum ethtool_podl_pse_pw_d_status { - ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, - ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, - ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, - ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, - ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, - ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, - ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, -}; - -/** - * enum ethtool_mm_verify_status - status of MAC Merge Verify function - * @ETHTOOL_MM_VERIFY_STATUS_UNKNOWN: - * verification status is unknown - * @ETHTOOL_MM_VERIFY_STATUS_INITIAL: - * the 802.3 Verify State diagram is in the state INIT_VERIFICATION - * @ETHTOOL_MM_VERIFY_STATUS_VERIFYING: - * the Verify State diagram is in the state VERIFICATION_IDLE, - * SEND_VERIFY or WAIT_FOR_RESPONSE - * @ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: - * indicates that the Verify State diagram is in the state VERIFIED - * @ETHTOOL_MM_VERIFY_STATUS_FAILED: - * the Verify State diagram is in the state VERIFY_FAIL - * @ETHTOOL_MM_VERIFY_STATUS_DISABLED: - * verification of preemption operation is disabled - */ -enum ethtool_mm_verify_status { - ETHTOOL_MM_VERIFY_STATUS_UNKNOWN, - ETHTOOL_MM_VERIFY_STATUS_INITIAL, - ETHTOOL_MM_VERIFY_STATUS_VERIFYING, - ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED, - ETHTOOL_MM_VERIFY_STATUS_FAILED, - ETHTOOL_MM_VERIFY_STATUS_DISABLED, -}; - -/** - * struct ethtool_gstrings - string set for data tagging - * @cmd: Command number = %ETHTOOL_GSTRINGS - * @string_set: String set ID; one of &enum ethtool_stringset - * @len: On return, the number of strings in the string set - * @data: Buffer for strings. Each string is null-padded to a size of - * %ETH_GSTRING_LEN. - * - * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in - * the string set. They must allocate a buffer of the appropriate - * size immediately following this structure. - */ -struct ethtool_gstrings { - __u32 cmd; - __u32 string_set; - __u32 len; - __u8 data[]; -}; - -/** - * struct ethtool_sset_info - string set information - * @cmd: Command number = %ETHTOOL_GSSET_INFO - * @reserved: Reserved for future use; see the note on reserved space. - * @sset_mask: On entry, a bitmask of string sets to query, with bits - * numbered according to &enum ethtool_stringset. On return, a - * bitmask of those string sets queried that are supported. - * @data: Buffer for string set sizes. On return, this contains the - * size of each string set that was queried and supported, in - * order of ID. - * - * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on - * return @sset_mask == 0x6 (sets 1, 2). Then @data[0] contains the - * size of set 1 and @data[1] contains the size of set 2. - * - * Users must allocate a buffer of the appropriate size (4 * number of - * sets queried) immediately following this structure. - */ -struct ethtool_sset_info { - __u32 cmd; - __u32 reserved; - __u64 sset_mask; - __u32 data[]; -}; - -/** - * enum ethtool_test_flags - flags definition of ethtool_test - * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise - * only online tests. - * @ETH_TEST_FL_FAILED: Driver set this flag if test fails. - * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback - * test. - * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test - */ - -enum ethtool_test_flags { - ETH_TEST_FL_OFFLINE = (1 << 0), - ETH_TEST_FL_FAILED = (1 << 1), - ETH_TEST_FL_EXTERNAL_LB = (1 << 2), - ETH_TEST_FL_EXTERNAL_LB_DONE = (1 << 3), -}; - -/** - * struct ethtool_test - device self-test invocation - * @cmd: Command number = %ETHTOOL_TEST - * @flags: A bitmask of flags from &enum ethtool_test_flags. Some - * flags may be set by the user on entry; others may be set by - * the driver on return. - * @reserved: Reserved for future use; see the note on reserved space. - * @len: On return, the number of test results - * @data: Array of test results - * - * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the - * number of test results that will be returned. They must allocate a - * buffer of the appropriate size (8 * number of results) immediately - * following this structure. - */ -struct ethtool_test { - __u32 cmd; - __u32 flags; - __u32 reserved; - __u32 len; - __u64 data[]; -}; - -/** - * struct ethtool_stats - device-specific statistics - * @cmd: Command number = %ETHTOOL_GSTATS - * @n_stats: On return, the number of statistics - * @data: Array of statistics - * - * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the - * number of statistics that will be returned. They must allocate a - * buffer of the appropriate size (8 * number of statistics) - * immediately following this structure. - */ -struct ethtool_stats { - __u32 cmd; - __u32 n_stats; - __u64 data[]; -}; - -/** - * struct ethtool_perm_addr - permanent hardware address - * @cmd: Command number = %ETHTOOL_GPERMADDR - * @size: On entry, the size of the buffer. On return, the size of the - * address. The command fails if the buffer is too small. - * @data: Buffer for the address - * - * Users must allocate the buffer immediately following this structure. - * A buffer size of %MAX_ADDR_LEN should be sufficient for any address - * type. - */ -struct ethtool_perm_addr { - __u32 cmd; - __u32 size; - __u8 data[]; -}; - -/* boolean flags controlling per-interface behavior characteristics. - * When reading, the flag indicates whether or not a certain behavior - * is enabled/present. When writing, the flag indicates whether - * or not the driver should turn on (set) or off (clear) a behavior. - * - * Some behaviors may read-only (unconditionally absent or present). - * If such is the case, return EINVAL in the set-flags operation if the - * flag differs from the read-only value. - */ -enum ethtool_flags { - ETH_FLAG_TXVLAN = (1 << 7), /* TX VLAN offload enabled */ - ETH_FLAG_RXVLAN = (1 << 8), /* RX VLAN offload enabled */ - ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */ - ETH_FLAG_NTUPLE = (1 << 27), /* N-tuple filters enabled */ - ETH_FLAG_RXHASH = (1 << 28), -}; - -/* The following structures are for supporting RX network flow - * classification and RX n-tuple configuration. Note, all multibyte - * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc. are expected to - * be in network byte order. - */ - -/** - * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc. - * @ip4src: Source host - * @ip4dst: Destination host - * @psrc: Source port - * @pdst: Destination port - * @tos: Type-of-service - * - * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow. - */ -struct ethtool_tcpip4_spec { - __be32 ip4src; - __be32 ip4dst; - __be16 psrc; - __be16 pdst; - __u8 tos; -}; - -/** - * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4 - * @ip4src: Source host - * @ip4dst: Destination host - * @spi: Security parameters index - * @tos: Type-of-service - * - * This can be used to specify an IPsec transport or tunnel over IPv4. - */ -struct ethtool_ah_espip4_spec { - __be32 ip4src; - __be32 ip4dst; - __be32 spi; - __u8 tos; -}; - -#define ETH_RX_NFC_IP4 1 - -/** - * struct ethtool_usrip4_spec - general flow specification for IPv4 - * @ip4src: Source host - * @ip4dst: Destination host - * @l4_4_bytes: First 4 bytes of transport (layer 4) header - * @tos: Type-of-service - * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0 - * @proto: Transport protocol number; mask must be 0 - */ -struct ethtool_usrip4_spec { - __be32 ip4src; - __be32 ip4dst; - __be32 l4_4_bytes; - __u8 tos; - __u8 ip_ver; - __u8 proto; -}; - -/** - * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. - * @ip6src: Source host - * @ip6dst: Destination host - * @psrc: Source port - * @pdst: Destination port - * @tclass: Traffic Class - * - * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. - */ -struct ethtool_tcpip6_spec { - __be32 ip6src[4]; - __be32 ip6dst[4]; - __be16 psrc; - __be16 pdst; - __u8 tclass; -}; - -/** - * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 - * @ip6src: Source host - * @ip6dst: Destination host - * @spi: Security parameters index - * @tclass: Traffic Class - * - * This can be used to specify an IPsec transport or tunnel over IPv6. - */ -struct ethtool_ah_espip6_spec { - __be32 ip6src[4]; - __be32 ip6dst[4]; - __be32 spi; - __u8 tclass; -}; - -/** - * struct ethtool_usrip6_spec - general flow specification for IPv6 - * @ip6src: Source host - * @ip6dst: Destination host - * @l4_4_bytes: First 4 bytes of transport (layer 4) header - * @tclass: Traffic Class - * @l4_proto: Transport protocol number (nexthdr after any Extension Headers) - */ -struct ethtool_usrip6_spec { - __be32 ip6src[4]; - __be32 ip6dst[4]; - __be32 l4_4_bytes; - __u8 tclass; - __u8 l4_proto; -}; - -union ethtool_flow_union { - struct ethtool_tcpip4_spec tcp_ip4_spec; - struct ethtool_tcpip4_spec udp_ip4_spec; - struct ethtool_tcpip4_spec sctp_ip4_spec; - struct ethtool_ah_espip4_spec ah_ip4_spec; - struct ethtool_ah_espip4_spec esp_ip4_spec; - struct ethtool_usrip4_spec usr_ip4_spec; - struct ethtool_tcpip6_spec tcp_ip6_spec; - struct ethtool_tcpip6_spec udp_ip6_spec; - struct ethtool_tcpip6_spec sctp_ip6_spec; - struct ethtool_ah_espip6_spec ah_ip6_spec; - struct ethtool_ah_espip6_spec esp_ip6_spec; - struct ethtool_usrip6_spec usr_ip6_spec; - struct ethhdr ether_spec; - __u8 hdata[52]; -}; - -/** - * struct ethtool_flow_ext - additional RX flow fields - * @h_dest: destination MAC address - * @vlan_etype: VLAN EtherType - * @vlan_tci: VLAN tag control information - * @data: user defined data - * @padding: Reserved for future use; see the note on reserved space. - * - * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT - * is set in &struct ethtool_rx_flow_spec @flow_type. - * @h_dest is valid if %FLOW_MAC_EXT is set. - */ -struct ethtool_flow_ext { - __u8 padding[2]; - unsigned char h_dest[ETH_ALEN]; - __be16 vlan_etype; - __be16 vlan_tci; - __be32 data[2]; -}; - -/** - * struct ethtool_rx_flow_spec - classification rule for RX flows - * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW - * @h_u: Flow fields to match (dependent on @flow_type) - * @h_ext: Additional fields to match - * @m_u: Masks for flow field bits to be matched - * @m_ext: Masks for additional field bits to be matched - * Note, all additional fields must be ignored unless @flow_type - * includes the %FLOW_EXT or %FLOW_MAC_EXT flag - * (see &struct ethtool_flow_ext description). - * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC - * if packets should be discarded, or %RX_CLS_FLOW_WAKE if the - * packets should be used for Wake-on-LAN with %WAKE_FILTER - * @location: Location of rule in the table. Locations must be - * numbered such that a flow matching multiple rules will be - * classified according to the first (lowest numbered) rule. - */ -struct ethtool_rx_flow_spec { - __u32 flow_type; - union ethtool_flow_union h_u; - struct ethtool_flow_ext h_ext; - union ethtool_flow_union m_u; - struct ethtool_flow_ext m_ext; - __u64 ring_cookie; - __u32 location; -}; - -/* How rings are laid out when accessing virtual functions or - * offloaded queues is device specific. To allow users to do flow - * steering and specify these queues the ring cookie is partitioned - * into a 32bit queue index with an 8 bit virtual function id. - * This also leaves the 3bytes for further specifiers. It is possible - * future devices may support more than 256 virtual functions if - * devices start supporting PCIe w/ARI. However at the moment I - * do not know of any devices that support this so I do not reserve - * space for this at this time. If a future patch consumes the next - * byte it should be aware of this possibility. - */ -#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL -#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL -#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 -static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) -{ - return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; -} - -static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) -{ - return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> - ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; -} - -/** - * struct ethtool_rxnfc - command to get or set RX flow classification rules - * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, - * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE, - * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS - * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW - * @data: Command-dependent value - * @fs: Flow classification rule - * @rss_context: RSS context to be affected - * @rule_cnt: Number of rules to be affected - * @rule_locs: Array of used rule locations - * - * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating - * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following - * structure fields must not be used, except that if @flow_type includes - * the %FLOW_RSS flag, then @rss_context determines which RSS context to - * act on. - * - * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues - * on return. - * - * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined - * rules on return. If @data is non-zero on return then it is the - * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the - * driver supports any special location values. If that flag is not - * set in @data then special location values should not be used. - * - * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an - * existing rule on entry and @fs contains the rule on return; if - * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is - * filled with the RSS context ID associated with the rule. - * - * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the - * user buffer for @rule_locs on entry. On return, @data is the size - * of the rule table, @rule_cnt is the number of defined rules, and - * @rule_locs contains the locations of the defined rules. Drivers - * must use the second parameter to get_rxnfc() instead of @rule_locs. - * - * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. - * @fs.@location either specifies the location to use or is a special - * location value with %RX_CLS_LOC_SPECIAL flag set. On return, - * @fs.@location is the actual rule location. If @fs.@flow_type - * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to - * use for flow spreading traffic which matches this rule. The value - * from the rxfh indirection table will be added to @fs.@ring_cookie - * to choose which ring to deliver to. - * - * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an - * existing rule on entry. - * - * A driver supporting the special location values for - * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused - * location, and may remove a rule at a later location (lower - * priority) that matches exactly the same set of flows. The special - * values are %RX_CLS_LOC_ANY, selecting any location; - * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum - * priority); and %RX_CLS_LOC_LAST, selecting the last suitable - * location (minimum priority). Additional special values may be - * defined in future and drivers must return -%EINVAL for any - * unrecognised value. - */ -struct ethtool_rxnfc { - __u32 cmd; - __u32 flow_type; - __u64 data; - struct ethtool_rx_flow_spec fs; - union { - __u32 rule_cnt; - __u32 rss_context; - }; - __u32 rule_locs[]; -}; - - -/** - * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection - * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR - * @size: On entry, the array size of the user buffer, which may be zero. - * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware - * indirection table. - * @ring_index: RX ring/queue index for each hash value - * - * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size - * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means - * the table should be reset to default values. This last feature - * is not supported by the original implementations. - */ -struct ethtool_rxfh_indir { - __u32 cmd; - __u32 size; - __u32 ring_index[]; -}; - -/** - * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. - * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH - * @rss_context: RSS context identifier. Context 0 is the default for normal - * traffic; other contexts can be referenced as the destination for RX flow - * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command - * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will - * contain the ID of the newly allocated context. - * @indir_size: On entry, the array size of the user buffer for the - * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), - * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, - * the array size of the hardware indirection table. - * @key_size: On entry, the array size of the user buffer for the hash key, - * which may be zero. On return from %ETHTOOL_GRSSH, the size of the - * hardware hash key. - * @hfunc: Defines the current RSS hash function used by HW (or to be set to). - * Valid values are one of the %ETH_RSS_HASH_*. - * @input_xfrm: Defines how the input data is transformed. Valid values are one - * of %RXH_XFRM_*. - * @rsvd8: Reserved for future use; see the note on reserved space. - * @rsvd32: Reserved for future use; see the note on reserved space. - * @rss_config: RX ring/queue index for each hash value i.e., indirection table - * of @indir_size __u32 elements, followed by hash key of @key_size - * bytes. - * - * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the - * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of - * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested - * and a @indir_size of zero means the indir table should be reset to default - * values (if @rss_context == 0) or that the RSS context should be deleted. - * An hfunc of zero means that hash function setting is not requested. - */ -struct ethtool_rxfh { - __u32 cmd; - __u32 rss_context; - __u32 indir_size; - __u32 key_size; - __u8 hfunc; - __u8 input_xfrm; - __u8 rsvd8[2]; - __u32 rsvd32; - __u32 rss_config[]; -}; -#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff -#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff - -/** - * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter - * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW - * @h_u: Flow field values to match (dependent on @flow_type) - * @m_u: Masks for flow field value bits to be ignored - * @vlan_tag: VLAN tag to match - * @vlan_tag_mask: Mask for VLAN tag bits to be ignored - * @data: Driver-dependent data to match - * @data_mask: Mask for driver-dependent data bits to be ignored - * @action: RX ring/queue index to deliver to (non-negative) or other action - * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP) - * - * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where - * a field value and mask are both zero this is treated as if all mask - * bits are set i.e. the field is ignored. - */ -struct ethtool_rx_ntuple_flow_spec { - __u32 flow_type; - union { - struct ethtool_tcpip4_spec tcp_ip4_spec; - struct ethtool_tcpip4_spec udp_ip4_spec; - struct ethtool_tcpip4_spec sctp_ip4_spec; - struct ethtool_ah_espip4_spec ah_ip4_spec; - struct ethtool_ah_espip4_spec esp_ip4_spec; - struct ethtool_usrip4_spec usr_ip4_spec; - struct ethhdr ether_spec; - __u8 hdata[72]; - } h_u, m_u; - - __u16 vlan_tag; - __u16 vlan_tag_mask; - __u64 data; - __u64 data_mask; - - __s32 action; -#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */ -#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */ -}; - -/** - * struct ethtool_rx_ntuple - command to set or clear RX flow filter - * @cmd: Command number - %ETHTOOL_SRXNTUPLE - * @fs: Flow filter specification - */ -struct ethtool_rx_ntuple { - __u32 cmd; - struct ethtool_rx_ntuple_flow_spec fs; -}; - -#define ETHTOOL_FLASH_MAX_FILENAME 128 -enum ethtool_flash_op_type { - ETHTOOL_FLASH_ALL_REGIONS = 0, -}; - -/* for passing firmware flashing related parameters */ -struct ethtool_flash { - __u32 cmd; - __u32 region; - char data[ETHTOOL_FLASH_MAX_FILENAME]; -}; - -/** - * struct ethtool_dump - used for retrieving, setting device dump - * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or - * %ETHTOOL_SET_DUMP - * @version: FW version of the dump, filled in by driver - * @flag: driver dependent flag for dump setting, filled in by driver during - * get and filled in by ethtool for set operation. - * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when - * firmware dump is disabled. - * @len: length of dump data, used as the length of the user buffer on entry to - * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver - * for %ETHTOOL_GET_DUMP_FLAG command - * @data: data collected for get dump data operation - */ -struct ethtool_dump { - __u32 cmd; - __u32 version; - __u32 flag; - __u32 len; - __u8 data[]; -}; - -#define ETH_FW_DUMP_DISABLE 0 - -/* for returning and changing feature sets */ - -/** - * struct ethtool_get_features_block - block with state of 32 features - * @available: mask of changeable features - * @requested: mask of features requested to be enabled if possible - * @active: mask of currently enabled features - * @never_changed: mask of features not changeable for any device - */ -struct ethtool_get_features_block { - __u32 available; - __u32 requested; - __u32 active; - __u32 never_changed; -}; - -/** - * struct ethtool_gfeatures - command to get state of device's features - * @cmd: command number = %ETHTOOL_GFEATURES - * @size: On entry, the number of elements in the features[] array; - * on return, the number of elements in features[] needed to hold - * all features - * @features: state of features - */ -struct ethtool_gfeatures { - __u32 cmd; - __u32 size; - struct ethtool_get_features_block features[]; -}; - -/** - * struct ethtool_set_features_block - block with request for 32 features - * @valid: mask of features to be changed - * @requested: values of features to be changed - */ -struct ethtool_set_features_block { - __u32 valid; - __u32 requested; -}; - -/** - * struct ethtool_sfeatures - command to request change in device's features - * @cmd: command number = %ETHTOOL_SFEATURES - * @size: array size of the features[] array - * @features: feature change masks - */ -struct ethtool_sfeatures { - __u32 cmd; - __u32 size; - struct ethtool_set_features_block features[]; -}; - -/** - * struct ethtool_ts_info - holds a device's timestamping and PHC association - * @cmd: command number = %ETHTOOL_GET_TS_INFO - * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags - * @phc_index: device index of the associated PHC, or -1 if there is none - * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values - * @tx_reserved: Reserved for future use; see the note on reserved space. - * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values - * @rx_reserved: Reserved for future use; see the note on reserved space. - * - * The bits in the 'tx_types' and 'rx_filters' fields correspond to - * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values, - * respectively. For example, if the device supports HWTSTAMP_TX_ON, - * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set. - * - * Drivers should only report the filters they actually support without - * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWSTAMP request for - * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the - * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op. - */ -struct ethtool_ts_info { - __u32 cmd; - __u32 so_timestamping; - __s32 phc_index; - __u32 tx_types; - __u32 tx_reserved[3]; - __u32 rx_filters; - __u32 rx_reserved[3]; -}; - -/* - * %ETHTOOL_SFEATURES changes features present in features[].valid to the - * values of corresponding bits in features[].requested. Bits in .requested - * not set in .valid or not changeable are ignored. - * - * Returns %EINVAL when .valid contains undefined or never-changeable bits - * or size is not equal to required number of features words (32-bit blocks). - * Returns >= 0 if request was completed; bits set in the value mean: - * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not - * changeable (not present in %ETHTOOL_GFEATURES' features[].available) - * those bits were ignored. - * %ETHTOOL_F_WISH - some or all changes requested were recorded but the - * resulting state of bits masked by .valid is not equal to .requested. - * Probably there are other device-specific constraints on some features - * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered - * here as though ignored bits were cleared. - * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling - * compatibility functions. Requested offload state cannot be properly - * managed by kernel. - * - * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of - * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands - * for ETH_SS_FEATURES string set. First entry in the table corresponds to least - * significant bit in features[0] fields. Empty strings mark undefined features. - */ -enum ethtool_sfeatures_retval_bits { - ETHTOOL_F_UNSUPPORTED__BIT, - ETHTOOL_F_WISH__BIT, - ETHTOOL_F_COMPAT__BIT, -}; - -#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) -#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) -#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) - -#define MAX_NUM_QUEUE 4096 - -/** - * struct ethtool_per_queue_op - apply sub command to the queues in mask. - * @cmd: ETHTOOL_PERQUEUE - * @sub_command: the sub command which apply to each queues - * @queue_mask: Bitmap of the queues which sub command apply to - * @data: A complete command structure following for each of the queues addressed - */ -struct ethtool_per_queue_op { - __u32 cmd; - __u32 sub_command; - __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; - char data[]; -}; - -/** - * struct ethtool_fecparam - Ethernet Forward Error Correction parameters - * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on the port, single bit set, GET only. - * @fec: Bitmask of configured FEC modes. - * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. - * - * Note that @reserved was never validated on input and ethtool user space - * left it uninitialized when calling SET. Hence going forward it can only be - * used to return a value to userspace with GET. - * - * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. - * FEC settings are configured by link autonegotiation whenever it's enabled. - * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. - * - * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. - * It is recommended that drivers only accept a single bit set in @fec. - * When multiple bits are set in @fec drivers may pick mode in an implementation - * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other - * FEC modes, because it's unclear whether in this case other modes constrain - * AUTO or are independent choices. - * Drivers must reject SET requests if they support none of the requested modes. - * - * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead - * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. - * - * See enum ethtool_fec_config_bits for definition of valid bits for both - * @fec and @active_fec. - */ -struct ethtool_fecparam { - __u32 cmd; - /* bitmask of FEC modes */ - __u32 active_fec; - __u32 fec; - __u32 reserved; -}; - -/** - * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration - * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not - * be used together with other bits. GET only. - * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually - * based link mode and SFP parameters read from module's - * EEPROM. This bit does _not_ mean autonegotiation. - * @ETHTOOL_FEC_OFF_BIT: No FEC Mode - * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode - * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode - * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet - * Consortium) - */ -enum ethtool_fec_config_bits { - ETHTOOL_FEC_NONE_BIT, - ETHTOOL_FEC_AUTO_BIT, - ETHTOOL_FEC_OFF_BIT, - ETHTOOL_FEC_RS_BIT, - ETHTOOL_FEC_BASER_BIT, - ETHTOOL_FEC_LLRS_BIT, -}; - -#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) -#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) -#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) -#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) -#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) -#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) - -/* CMDs currently supported */ -#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. - * Please use ETHTOOL_GLINKSETTINGS - */ -#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings. - * Please use ETHTOOL_SLINKSETTINGS - */ -#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */ -#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ -#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ -#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */ -#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ -#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ -#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ -/* Get link status for host, i.e. whether the interface *and* the - * physical port (if there is one) are up (ethtool_value). */ -#define ETHTOOL_GLINK 0x0000000a -#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ -#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ -#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ -#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ -#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ -#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */ -#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ -#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ -#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ -#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ -#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ -#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ -#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable - * (ethtool_value) */ -#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable - * (ethtool_value). */ -#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ -#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ -#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ -#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ -#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ -#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ -#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ -#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ -#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ -#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ -#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ -#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ -#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ -#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ -#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ - -#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ -#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ -#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ -#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ -#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */ -#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ -#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ -#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ -#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ -#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ -#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ -#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ -#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */ -#define ETHTOOL_GRXNTUPLE 0x00000036 /* deprecated */ -#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */ -#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ -#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ - -#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ -#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ -#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */ -#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */ -#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */ -#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */ -#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ -#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ -#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ -#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ -#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ - -#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ -#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ -#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ -#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ -#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ - -#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ - -#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ -#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ -#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ -#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ -#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ -#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ - -/* compatibility with older code */ -#define SPARC_ETH_GSET ETHTOOL_GSET -#define SPARC_ETH_SSET ETHTOOL_SSET - -/* Link mode bit indices */ -enum ethtool_link_mode_bit_indices { - ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, - ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, - ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, - ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, - ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, - ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, - ETHTOOL_LINK_MODE_Autoneg_BIT = 6, - ETHTOOL_LINK_MODE_TP_BIT = 7, - ETHTOOL_LINK_MODE_AUI_BIT = 8, - ETHTOOL_LINK_MODE_MII_BIT = 9, - ETHTOOL_LINK_MODE_FIBRE_BIT = 10, - ETHTOOL_LINK_MODE_BNC_BIT = 11, - ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, - ETHTOOL_LINK_MODE_Pause_BIT = 13, - ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, - ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, - ETHTOOL_LINK_MODE_Backplane_BIT = 16, - ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, - ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, - ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, - ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, - ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, - ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, - ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, - ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, - ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, - ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, - ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, - ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, - ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, - ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, - ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, - - /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit - * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* - * macro for bits > 31. The only way to use indices > 31 is to - * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. - */ - - ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, - ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, - ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, - ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35, - ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36, - ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, - ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, - ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, - ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, - ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, - ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, - ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, - ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, - ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, - ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, - ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, - ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, - - ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, - ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, - ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, - ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52, - ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53, - ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54, - ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55, - ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56, - ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57, - ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58, - ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59, - ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60, - ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61, - ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62, - ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63, - ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, - ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, - ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, - ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, - ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, - ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, - ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, - ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, - ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, - ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, - ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, - ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, - ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, - ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, - ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, - ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, - ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, - ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, - ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, - ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, - ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, - ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, - ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, - ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, - ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, - ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, - ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, - ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, - ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, - ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, - ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, - ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, - ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, - ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, - ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, - ETHTOOL_LINK_MODE_10baseT1S_Full_BIT = 99, - ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, - ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, - - /* must be last entry */ - __ETHTOOL_LINK_MODE_MASK_NBITS -}; - -#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ - (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) - -/* DEPRECATED macros. Please migrate to - * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT - * define any new SUPPORTED_* macro for bits > 31. - */ -#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) -#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) -#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) -#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) -#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) -#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) -#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) -#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) -#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) -#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) -#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) -#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) -#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) -#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) -#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) -#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) -#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) -#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) -#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) -#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) -#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) -#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) -#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) -#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) -#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) -#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) -#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) -#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) -#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) -#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) -#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) -/* Please do not define any new SUPPORTED_* macro for bits > 31, see - * notice above. - */ - -/* - * DEPRECATED macros. Please migrate to - * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT - * define any new ADERTISE_* macro for bits > 31. - */ -#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) -#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) -#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) -#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) -#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) -#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) -#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) -#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) -#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) -#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) -#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) -#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) -#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) -#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) -#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) -#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) -#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) -#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) -#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) -#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) -#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) -#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) -#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) -#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) -#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) -#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) -#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) -#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) -#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) -#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) -#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) -/* Please do not define any new ADVERTISED_* macro for bits > 31, see - * notice above. - */ - -/* The following are all involved in forcing a particular link - * mode for the device for setting things. When getting the - * devices settings, these indicate the current mode and whether - * it was forced up into this mode or autonegotiated. - */ - -/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. - * Update drivers/net/phy/phy.c:phy_speed_to_str() and - * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values. - */ -#define SPEED_10 10 -#define SPEED_100 100 -#define SPEED_1000 1000 -#define SPEED_2500 2500 -#define SPEED_5000 5000 -#define SPEED_10000 10000 -#define SPEED_14000 14000 -#define SPEED_20000 20000 -#define SPEED_25000 25000 -#define SPEED_40000 40000 -#define SPEED_50000 50000 -#define SPEED_56000 56000 -#define SPEED_100000 100000 -#define SPEED_200000 200000 -#define SPEED_400000 400000 -#define SPEED_800000 800000 - -#define SPEED_UNKNOWN -1 - -static inline int ethtool_validate_speed(__u32 speed) -{ - return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN; -} - -/* Duplex, half or full. */ -#define DUPLEX_HALF 0x00 -#define DUPLEX_FULL 0x01 -#define DUPLEX_UNKNOWN 0xff - -static inline int ethtool_validate_duplex(__u8 duplex) -{ - switch (duplex) { - case DUPLEX_HALF: - case DUPLEX_FULL: - case DUPLEX_UNKNOWN: - return 1; - } - - return 0; -} - -#define MASTER_SLAVE_CFG_UNSUPPORTED 0 -#define MASTER_SLAVE_CFG_UNKNOWN 1 -#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 -#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 -#define MASTER_SLAVE_CFG_MASTER_FORCE 4 -#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 -#define MASTER_SLAVE_STATE_UNSUPPORTED 0 -#define MASTER_SLAVE_STATE_UNKNOWN 1 -#define MASTER_SLAVE_STATE_MASTER 2 -#define MASTER_SLAVE_STATE_SLAVE 3 -#define MASTER_SLAVE_STATE_ERR 4 - -/* These are used to throttle the rate of data on the phy interface when the - * native speed of the interface is higher than the link speed. These should - * not be used for phy interfaces which natively support multiple speeds (e.g. - * MII or SGMII). - */ -/* No rate matching performed. */ -#define RATE_MATCH_NONE 0 -/* The phy sends pause frames to throttle the MAC. */ -#define RATE_MATCH_PAUSE 1 -/* The phy asserts CRS to prevent the MAC from transmitting. */ -#define RATE_MATCH_CRS 2 -/* The MAC is programmed with a sufficiently-large IPG. */ -#define RATE_MATCH_OPEN_LOOP 3 - -/* Which connector port. */ -#define PORT_TP 0x00 -#define PORT_AUI 0x01 -#define PORT_MII 0x02 -#define PORT_FIBRE 0x03 -#define PORT_BNC 0x04 -#define PORT_DA 0x05 -#define PORT_NONE 0xef -#define PORT_OTHER 0xff - -/* Which transceiver to use. */ -#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ -#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ -#define XCVR_DUMMY1 0x02 -#define XCVR_DUMMY2 0x03 -#define XCVR_DUMMY3 0x04 - -/* Enable or disable autonegotiation. */ -#define AUTONEG_DISABLE 0x00 -#define AUTONEG_ENABLE 0x01 - -/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then - * the driver is required to renegotiate link - */ -#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */ -#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */ -#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */ -#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */ - -/* Wake-On-Lan options. */ -#define WAKE_PHY (1 << 0) -#define WAKE_UCAST (1 << 1) -#define WAKE_MCAST (1 << 2) -#define WAKE_BCAST (1 << 3) -#define WAKE_ARP (1 << 4) -#define WAKE_MAGIC (1 << 5) -#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ -#define WAKE_FILTER (1 << 7) - -#define WOL_MODE_COUNT 8 - -/* RSS hash function data - * XOR the corresponding source and destination fields of each specified - * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH - * calculation. Note that this XORing reduces the input set entropy and could - * be exploited to reduce the RSS queue spread. - */ -#define RXH_XFRM_SYM_XOR (1 << 0) -#define RXH_XFRM_NO_CHANGE 0xff - -/* L2-L4 network traffic flow types */ -#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ -#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ -#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ -#define AH_ESP_V4_FLOW 0x04 /* hash only */ -#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ -#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ -#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ -#define AH_ESP_V6_FLOW 0x08 /* hash only */ -#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ -#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ -#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ -#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ -#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ -#define IP_USER_FLOW IPV4_USER_FLOW -#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ -#define IPV4_FLOW 0x10 /* hash only */ -#define IPV6_FLOW 0x11 /* hash only */ -#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ - -/* Used for GTP-U IPv4 and IPv6. - * The format of GTP packets only includes - * elements such as TEID and GTP version. - * It is primarily intended for data communication of the UE. - */ -#define GTPU_V4_FLOW 0x13 /* hash only */ -#define GTPU_V6_FLOW 0x14 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * The format of these GTP packets does not include TEID. - * Primarily expected to be used for communication - * to create sessions for UE data communication, - * commonly referred to as CSR (Create Session Request). - */ -#define GTPC_V4_FLOW 0x15 /* hash only */ -#define GTPC_V6_FLOW 0x16 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. - * After session creation, it becomes this packet. - * This is mainly used for requests to realize UE handover. - */ -#define GTPC_TEID_V4_FLOW 0x17 /* hash only */ -#define GTPC_TEID_V6_FLOW 0x18 /* hash only */ - -/* Use for GTP-U and extended headers for the PSC (PDU Session Container). - * The format of these GTP packets includes TEID and QFI. - * In 5G communication using UPF (User Plane Function), - * data communication with this extended header is performed. - */ -#define GTPU_EH_V4_FLOW 0x19 /* hash only */ -#define GTPU_EH_V6_FLOW 0x1a /* hash only */ - -/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. - * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by - * UL/DL included in the PSC. - * There are differences in the data included based on Downlink/Uplink, - * and can be used to distinguish packets. - * The functions described so far are useful when you want to - * handle communication from the mobile network in UPF, PGW, etc. - */ -#define GTPU_UL_V4_FLOW 0x1b /* hash only */ -#define GTPU_UL_V6_FLOW 0x1c /* hash only */ -#define GTPU_DL_V4_FLOW 0x1d /* hash only */ -#define GTPU_DL_V6_FLOW 0x1e /* hash only */ - -/* Flag to enable additional fields in struct ethtool_rx_flow_spec */ -#define FLOW_EXT 0x80000000 -#define FLOW_MAC_EXT 0x40000000 -/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ -#define FLOW_RSS 0x20000000 - -/* L3-L4 network traffic flow hash options */ -#define RXH_L2DA (1 << 1) -#define RXH_VLAN (1 << 2) -#define RXH_L3_PROTO (1 << 3) -#define RXH_IP_SRC (1 << 4) -#define RXH_IP_DST (1 << 5) -#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ -#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ -#define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ -#define RXH_DISCARD (1 << 31) - -#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL -#define RX_CLS_FLOW_WAKE 0xfffffffffffffffeULL - -/* Special RX classification rule insert location values */ -#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */ -#define RX_CLS_LOC_ANY 0xffffffff -#define RX_CLS_LOC_FIRST 0xfffffffe -#define RX_CLS_LOC_LAST 0xfffffffd - -/* EEPROM Standards for plug in modules */ -#define ETH_MODULE_SFF_8079 0x1 -#define ETH_MODULE_SFF_8079_LEN 256 -#define ETH_MODULE_SFF_8472 0x2 -#define ETH_MODULE_SFF_8472_LEN 512 -#define ETH_MODULE_SFF_8636 0x3 -#define ETH_MODULE_SFF_8636_LEN 256 -#define ETH_MODULE_SFF_8436 0x4 -#define ETH_MODULE_SFF_8436_LEN 256 - -#define ETH_MODULE_SFF_8636_MAX_LEN 640 -#define ETH_MODULE_SFF_8436_MAX_LEN 640 - -/* Reset flags */ -/* The reset() operation must clear the flags for the components which - * were actually reset. On successful return, the flags indicate the - * components which were not reset, either because they do not exist - * in the hardware or because they cannot be reset independently. The - * driver must never reset any components that were not requested. - */ -enum ethtool_reset_flags { - /* These flags represent components dedicated to the interface - * the command is addressed to. Shift any flag left by - * ETH_RESET_SHARED_SHIFT to reset a shared component of the - * same type. - */ - ETH_RESET_MGMT = 1 << 0, /* Management processor */ - ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ - ETH_RESET_DMA = 1 << 2, /* DMA engine */ - ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ - ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ - ETH_RESET_MAC = 1 << 5, /* Media access controller */ - ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ - ETH_RESET_RAM = 1 << 7, /* RAM shared between - * multiple components */ - ETH_RESET_AP = 1 << 8, /* Application processor */ - - ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to - * this interface */ - ETH_RESET_ALL = 0xffffffff, /* All components used by this - * interface, even if shared */ -}; -#define ETH_RESET_SHARED_SHIFT 16 - - -/** - * struct ethtool_link_settings - link control and status - * - * IMPORTANT, Backward compatibility notice: When implementing new - * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and - * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link - * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS - * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in - * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use - * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link - * settings; do not use %ETHTOOL_SLINKSETTINGS if - * %ETHTOOL_GLINKSETTINGS failed: stick to - * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. - * - * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS - * @speed: Link speed (Mbps) - * @duplex: Duplex mode; one of %DUPLEX_* - * @port: Physical connector type; one of %PORT_* - * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not - * applicable. For clause 45 PHYs this is the PRTAD. - * @autoneg: Enable/disable autonegotiation and auto-detection; - * either %AUTONEG_DISABLE or %AUTONEG_ENABLE - * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO - * protocols supported by the interface; 0 if unknown. - * Read-only. - * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of - * %ETH_TP_MDI_*. If the status is unknown or not applicable, the - * value will be %ETH_TP_MDI_INVALID. Read-only. - * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of - * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads - * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. - * When written successfully, the link should be renegotiated if - * necessary. - * @link_mode_masks_nwords: Number of 32-bit words for each of the - * supported, advertising, lp_advertising link mode bitmaps. For - * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user - * (>= 0); on return, if handshake in progress, negative if - * request size unsupported by kernel: absolute value indicates - * kernel expected size and all the other fields but cmd - * are 0; otherwise (handshake completed), strictly positive - * to indicate size used by kernel and cmd field stays - * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For - * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive - * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise - * refused. For drivers: ignore this field (use kernel's - * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will - * be overwritten by kernel. - * @transceiver: Used to distinguish different possible PHY types, - * reported consistently by PHYLIB. Read-only. - * @master_slave_cfg: Master/slave port mode. - * @master_slave_state: Master/slave port state. - * @rate_matching: Rate adaptation performed by the PHY - * @reserved: Reserved for future use; see the note on reserved space. - * @link_mode_masks: Variable length bitmaps. - * - * If autonegotiation is disabled, the speed and @duplex represent the - * fixed link mode and are writable if the driver supports multiple - * link modes. If it is enabled then they are read-only; if the link - * is up they represent the negotiated link mode; if the link is down, - * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and - * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. - * - * Some hardware interfaces may have multiple PHYs and/or physical - * connectors fitted or do not allow the driver to detect which are - * fitted. For these interfaces @port and/or @phy_address may be - * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. - * Otherwise, attempts to write different values may be ignored or - * rejected. - * - * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt - * are not available in %ethtool_link_settings. These fields will be - * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will - * fail if any of them is set to non-zero value. - * - * Users should assume that all fields not marked read-only are - * writable and subject to validation by the driver. They should use - * %ETHTOOL_GLINKSETTINGS to get the current values before making specific - * changes and then applying them with %ETHTOOL_SLINKSETTINGS. - * - * Drivers that implement %get_link_ksettings and/or - * %set_link_ksettings should ignore the @cmd - * and @link_mode_masks_nwords fields (any change to them overwritten - * by kernel), and rely only on kernel's internal - * %__ETHTOOL_LINK_MODE_MASK_NBITS and - * %ethtool_link_mode_mask_t. Drivers that implement - * %set_link_ksettings() should validate all fields other than @cmd - * and @link_mode_masks_nwords that are not described as read-only or - * deprecated, and must ignore all fields described as read-only. - * - * @link_mode_masks is divided into three bitfields, each of length - * @link_mode_masks_nwords: - * - supported: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, physical - * connectors and other link features for which the interface - * supports autonegotiation or auto-detection. Read-only. - * - advertising: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, physical - * connectors and other link features that are advertised through - * autonegotiation or enabled for auto-detection. - * - lp_advertising: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, and other - * link features that the link partner advertised through - * autonegotiation; 0 if unknown or not applicable. Read-only. - */ -struct ethtool_link_settings { - __u32 cmd; - __u32 speed; - __u8 duplex; - __u8 port; - __u8 phy_address; - __u8 autoneg; - __u8 mdio_support; - __u8 eth_tp_mdix; - __u8 eth_tp_mdix_ctrl; - __s8 link_mode_masks_nwords; - __u8 transceiver; - __u8 master_slave_cfg; - __u8 master_slave_state; - __u8 rate_matching; - __u32 reserved[7]; - __u32 link_mode_masks[]; - /* layout of link_mode_masks fields: - * __u32 map_supported[link_mode_masks_nwords]; - * __u32 map_advertising[link_mode_masks_nwords]; - * __u32 map_lp_advertising[link_mode_masks_nwords]; - */ -}; -#endif /* _UAPI_LINUX_ETHTOOL_H */ -- cgit v1.2.3 From 792a04bed41caec79c787d105b0d442351b3bcc8 Mon Sep 17 00:00:00 2001 From: David Faust Date: Wed, 8 May 2024 12:35:12 -0700 Subject: bpf: avoid gcc overflow warning in test_xdp_vlan.c This patch fixes an integer overflow warning raised by GCC in xdp_prognum1 of progs/test_xdp_vlan.c: GCC-BPF [test_maps] test_xdp_vlan.bpf.o progs/test_xdp_vlan.c: In function 'xdp_prognum1': progs/test_xdp_vlan.c:163:25: error: integer overflow in expression '(short int)(((__builtin_constant_p((int)vlan_hdr->h_vlan_TCI)) != 0 ? (int)(short unsigned int)((short int)((int)vlan_hdr->h_vlan_TCI << 8 >> 8) << 8 | (short int)((int)vlan_hdr->h_vlan_TCI << 0 >> 8 << 0)) & 61440 : (int)__builtin_bswap16(vlan_hdr->h_vlan_TCI) & 61440) << 8 >> 8) << 8' of type 'short int' results in '0' [-Werror=overflow] 163 | bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000) | ^~~~~~~~~ The problem lies with the expansion of the bpf_htons macro and the expression passed into it. The bpf_htons macro (and similarly the bpf_ntohs macro) expand to a ternary operation using either __builtin_bswap16 or ___bpf_swab16 to swap the bytes, depending on whether the expression is constant. For an expression, with 'value' as a u16, like: bpf_htons (value & 0xf000) The entire (value & 0xf000) is 'x' in the expansion of ___bpf_swab16 and we get as one part of the expanded swab16: ((__u16)(value & 0xf000) << 8 >> 8 << 8 This will always evaluate to 0, which is intentional since this subexpression deals with the byte guaranteed to be 0 by the mask. However, GCC warns because the precise reason this always evaluates to 0 is an overflow. Specifically, the plain 0xf000 in the expression is a signed 32-bit integer, which causes 'value' to also be promoted to a signed 32-bit integer, and the combination of the 8-bit left shift and down-cast back to __u16 results in a signed overflow (really a 'warning: overflow in conversion from int to __u16' which is propegated up through the rest of the expression leading to the ultimate overflow warning above), which is a valid warning despite being the intended result of this code. Clang does not warn on this case, likely because it performs constant folding later in the compilation process relative to GCC. It seems that by the time clang does constant folding for this expression, the side of the ternary with this overflow has already been discarded. Fortunately, this warning is easily silenced by simply making the 0xf000 mask explicitly unsigned. This has no impact on the result. Signed-off-by: David Faust Cc: jose.marchesi@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20240508193512.152759-1-david.faust@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_xdp_vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c index f3ec8086482d..a7588302268d 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c @@ -160,7 +160,7 @@ int xdp_prognum1(struct xdp_md *ctx) /* Modifying VLAN, preserve top 4 bits */ vlan_hdr->h_vlan_TCI = - bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000) + bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000U) | TO_VLAN); } -- cgit v1.2.3 From 5ddafcc377f98778acc08f660dee6400aece6a62 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Fri, 10 May 2024 19:38:50 +0100 Subject: selftests/bpf: Fix a few tests for GCC related warnings. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch corrects a few warnings to allow selftests to compile for GCC. -- progs/cpumask_failure.c -- progs/bpf_misc.h:136:22: error: ‘cpumask’ is used uninitialized [-Werror=uninitialized] 136 | #define __sink(expr) asm volatile("" : "+g"(expr)) | ^~~ progs/cpumask_failure.c:68:9: note: in expansion of macro ‘__sink’ 68 | __sink(cpumask); The macro __sink(cpumask) with the '+' contraint modifier forces the the compiler to expect a read and write from cpumask. GCC detects that cpumask is never initialized and reports an error. This patch removes the spurious non required definitions of cpumask. -- progs/dynptr_fail.c -- progs/dynptr_fail.c:1444:9: error: ‘ptr1’ may be used uninitialized [-Werror=maybe-uninitialized] 1444 | bpf_dynptr_clone(&ptr1, &ptr2); Many of the tests in the file are related to the detection of uninitialized pointers by the verifier. GCC is able to detect possible uninitialized values, and reports this as an error. The patch initializes all of the previous uninitialized structs. -- progs/test_tunnel_kern.c -- progs/test_tunnel_kern.c:590:9: error: array subscript 1 is outside array bounds of ‘struct geneve_opt[1]’ [-Werror=array-bounds=] 590 | *(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef); | ^~~~~~~~~~~~~~~~~~~~~~~ progs/test_tunnel_kern.c:575:27: note: at offset 4 into object ‘gopt’ of size 4 575 | struct geneve_opt gopt; This tests accesses beyond the defined data for the struct geneve_opt which contains as last field "u8 opt_data[0]" which clearly does not get reserved space (in stack) in the function header. This pattern is repeated in ip6geneve_set_tunnel and geneve_set_tunnel functions. GCC is able to see this and emits a warning. The patch introduces a local struct that allocates enough space to safely allow the write to opt_data field. -- progs/jeq_infer_not_null_fail.c -- progs/jeq_infer_not_null_fail.c:21:40: error: array subscript ‘struct bpf_map[0]’ is partly outside array bounds of ‘struct [1]’ [-Werror=array-bounds=] 21 | struct bpf_map *inner_map = map->inner_map_meta; | ^~ progs/jeq_infer_not_null_fail.c:14:3: note: object ‘m_hash’ of size 32 14 | } m_hash SEC(".maps"); This example defines m_hash in the context of the compilation unit and casts it to struct bpf_map which is much smaller than the size of struct bpf_map. It errors out in GCC when it attempts to access an element that would be defined in struct bpf_map outsize of the defined limits for m_hash. This patch disables the warning through a GCC pragma. This changes were tested in bpf-next master selftests without any regressions. Signed-off-by: Cupertino Miranda Cc: jose.marchesi@oracle.com Cc: david.faust@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240510183850.286661-2-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/cpumask_failure.c | 3 -- tools/testing/selftests/bpf/progs/dynptr_fail.c | 12 +++--- .../selftests/bpf/progs/jeq_infer_not_null_fail.c | 4 ++ .../testing/selftests/bpf/progs/test_tunnel_kern.c | 47 +++++++++++++--------- 4 files changed, 37 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index a9bf6ea336cf..a988d2823b52 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -61,11 +61,8 @@ SEC("tp_btf/task_newtask") __failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask") int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags) { - struct bpf_cpumask *cpumask; - /* Can't set the CPU of a non-struct bpf_cpumask. */ bpf_cpumask_set_cpu(0, (struct bpf_cpumask *)task->cpus_ptr); - __sink(cpumask); return 0; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 7ce7e827d5f0..66a60bfb5867 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -80,7 +80,7 @@ SEC("?raw_tp") __failure __msg("Unreleased reference id=2") int ringbuf_missing_release1(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr); @@ -1385,7 +1385,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_adjust_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_adjust(&ptr, 1, 2); @@ -1398,7 +1398,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_null_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_null(&ptr); @@ -1411,7 +1411,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_rdonly_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_rdonly(&ptr); @@ -1424,7 +1424,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_size_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_size(&ptr); @@ -1437,7 +1437,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int clone_invalid1(void *ctx) { - struct bpf_dynptr ptr1; + struct bpf_dynptr ptr1 = {}; struct bpf_dynptr ptr2; /* this should fail */ diff --git a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c index f46965053acb..4d619bea9c75 100644 --- a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c +++ b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c @@ -4,6 +4,10 @@ #include #include "bpf_misc.h" +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + char _license[] SEC("license") = "GPL"; struct { diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3e436e6f7312..3f5abcf3ff13 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -567,12 +567,18 @@ int ip6vxlan_get_tunnel_src(struct __sk_buff *skb) return TC_ACT_OK; } +struct local_geneve_opt { + struct geneve_opt gopt; + int data; +}; + SEC("tc") int geneve_set_tunnel(struct __sk_buff *skb) { int ret; struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; __builtin_memset(&key, 0x0, sizeof(key)); key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ @@ -580,14 +586,14 @@ int geneve_set_tunnel(struct __sk_buff *skb) key.tunnel_tos = 0; key.tunnel_ttl = 64; - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xdeadbeef); ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); @@ -596,7 +602,7 @@ int geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(local_gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -631,7 +637,8 @@ SEC("tc") int ip6geneve_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; int ret; __builtin_memset(&key, 0x0, sizeof(key)); @@ -647,16 +654,16 @@ int ip6geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = bpf_htonl(0xfeedbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xfeedbeef); - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; -- cgit v1.2.3 From a3c1c95538e22283ef6fa529e3ffa0e6d47ee190 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 11 May 2024 16:50:24 +0800 Subject: selftests/bpf: Free strdup memory in xdp_hw_metadata The strdup() function returns a pointer to a new string which is a duplicate of the string "ifname". Memory for the new string is obtained with malloc(), and need to be freed with free(). This patch adds this missing "free(saved_hwtstamp_ifname)" in cleanup() to avoid a potential memory leak in xdp_hw_metadata.c. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/af9bcccb96655e82de5ce2b4510b88c9c8ed5ed0.1715417367.git.tanggeliang@kylinos.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 0859fe727da7..6f9956eed797 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -581,6 +581,8 @@ static void cleanup(void) if (bpf_obj) xdp_hw_metadata__destroy(bpf_obj); + + free((void *)saved_hwtstamp_ifname); } static void handle_signal(int sig) -- cgit v1.2.3 From 73868988c90d2701587ab2a48b5858ab935afb17 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sat, 11 May 2024 23:22:13 +0200 Subject: bpf: disable strict aliasing in test_global_func9.c The BPF selftest test_global_func9.c performs type punning and breaks srict-aliasing rules. In particular, given: int global_func9(struct __sk_buff *skb) { int result = 0; [...] { const struct C c = {.x = skb->len, .y = skb->family }; result |= foo((const struct S *)&c); } } When building with strict-aliasing enabled (the default) the initialization of `c' gets optimized away in its entirely: [... no initialization of `c' ...] r1 = r10 r1 += -40 call foo w0 |= w6 Since GCC knows that `foo' accesses s->x, we get a "maybe uninitialized" warning. On the other hand, when strict-aliasing is disabled GCC only optimizes away the store to `.y': r1 = *(u32 *) (r6+0) *(u32 *) (r10+-40) = r1 ; This is .x = skb->len in `c' r1 = r10 r1 += -40 call foo w0 |= w6 In this case the warning is not emitted, because s-> is initialized. This patch disables strict aliasing in this test when building with GCC. clang seems to not optimize this particular code even when strict aliasing is enabled. Tested in bpf-next master. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Link: https://lore.kernel.org/r/20240511212213.23418-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ed381b0197fe..e0b3887b3d2d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -53,6 +53,7 @@ progs/syscall.c-CFLAGS := -fno-strict-aliasing progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing ifneq ($(LLVM),) # Silence some warnings when compiled with clang -- cgit v1.2.3 From 6a2f786e6905007e82bac212296deca29815916d Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sat, 11 May 2024 23:23:49 +0200 Subject: bpf: ignore expected GCC warning in test_global_func10.c The BPF selftest global_func10 in progs/test_global_func10.c contains: struct Small { long x; }; struct Big { long x; long y; }; [...] __noinline int foo(const struct Big *big) { if (!big) return 0; return bpf_get_prandom_u32() < big->y; } [...] SEC("cgroup_skb/ingress") __failure __msg("invalid indirect access to stack") int global_func10(struct __sk_buff *skb) { const struct Small small = {.x = skb->len }; return foo((struct Big *)&small) ? 1 : 0; } GCC emits a "maybe uninitialized" warning for the code above, because it knows `foo' accesses `big->y'. Since the purpose of this selftest is to check that the verifier will fail on this sort of invalid memory access, this patch just silences the compiler warning. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Link: https://lore.kernel.org/r/20240511212349.23549-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_global_func10.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c index 8fba3f3649e2..5da001ca57a5 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func10.c +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -4,6 +4,10 @@ #include #include "bpf_misc.h" +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + struct Small { long x; }; -- cgit v1.2.3 From ba39486d2c43ba7c103c438540aa56c8bde3b6c7 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sat, 11 May 2024 23:22:43 +0200 Subject: bpf: make list_for_each_entry portable [Changes from V1: - The __compat_break has been abandoned in favor of a more readable can_loop macro that can be used anywhere, including loop conditions.] The macro list_for_each_entry is defined in bpf_arena_list.h as follows: #define list_for_each_entry(pos, head, member) \ for (void * ___tmp = (pos = list_entry_safe((head)->first, \ typeof(*(pos)), member), \ (void *)0); \ pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ cond_break, \ pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) The macro cond_break, in turn, expands to a statement expression that contains a `break' statement. Compound statement expressions, and the subsequent ability of placing statements in the header of a `for' loop, are GNU extensions. Unfortunately, clang implements this GNU extension differently than GCC: - In GCC the `break' statement is bound to the containing "breakable" context in which the defining `for' appears. If there is no such context, GCC emits a warning: break statement without enclosing `for' o `switch' statement. - In clang the `break' statement is bound to the defining `for'. If the defining `for' is itself inside some breakable construct, then clang emits a -Wgcc-compat warning. This patch adds a new macro can_loop to bpf_experimental, that implements the same logic than cond_break but evaluates to a boolean expression. The patch also changes all the current instances of usage of cond_break withing the header of loop accordingly. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Alexei Starovoitov Link: https://lore.kernel.org/r/20240511212243.23477-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_arena_list.h | 4 +-- tools/testing/selftests/bpf/bpf_experimental.h | 33 ++++++++++++++++++++-- tools/testing/selftests/bpf/progs/arena_list.c | 2 +- .../bpf/progs/verifier_iterating_callbacks.c | 9 +++--- 4 files changed, 38 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index b99b9f408eff..85dbc3ea4da5 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -29,6 +29,7 @@ static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { re static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {} static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } #define cond_break ({}) +#define can_loop true #endif /* Safely walk link list elements. Deletion of elements is allowed. */ @@ -36,8 +37,7 @@ static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } for (void * ___tmp = (pos = list_entry_safe((head)->first, \ typeof(*(pos)), member), \ (void *)0); \ - pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ - cond_break, \ + pos && ({ ___tmp = (void *)pos->member.next; 1; }) && can_loop; \ pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 8b9cc87be4c4..3d9e4b8c6b81 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -326,19 +326,48 @@ l_true: \ }) #endif +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. + */ #ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + #define cond_break \ ({ __label__ l_break, l_continue; \ - asm volatile goto("may_goto %l[l_break]" \ + asm volatile goto("may_goto %l[l_break]" \ :::: l_break); \ goto l_continue; \ l_break: break; \ l_continue:; \ }) #else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + #define cond_break \ ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ + asm volatile goto("1:.byte 0xe5; \ .byte 0; \ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ .short 0" \ diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index c0422c58cee2..93bd0600eba0 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -49,7 +49,7 @@ int arena_list_add(void *ctx) list_head = &global_head; - for (i = zero; i < cnt; cond_break, i++) { + for (i = zero; i < cnt && can_loop; i++) { struct elem __arena *n = bpf_alloc(sizeof(*n)); test_val++; diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index 99e561f18f9b..bd676d7e615f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -318,7 +318,7 @@ int cond_break1(const void *ctx) unsigned long i; unsigned int sum = 0; - for (i = zero; i < ARR_SZ; cond_break, i++) + for (i = zero; i < ARR_SZ && can_loop; i++) sum += i; for (i = zero; i < ARR_SZ; i++) { barrier_var(i); @@ -336,12 +336,11 @@ int cond_break2(const void *ctx) int i, j; int sum = 0; - for (i = zero; i < 1000; cond_break, i++) + for (i = zero; i < 1000 && can_loop; i++) for (j = zero; j < 1000; j++) { sum += i + j; cond_break; - } - + } return sum; } @@ -349,7 +348,7 @@ static __noinline int loop(void) { int i, sum = 0; - for (i = zero; i <= 1000000; i++, cond_break) + for (i = zero; i <= 1000000 && can_loop; i++) sum += i; return sum; -- cgit v1.2.3 From b25f7415eb4108aa32dd3e74289d7f997090708f Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 19 Apr 2024 16:11:12 +0000 Subject: landlock: Add IOCTL access right for character and block devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the LANDLOCK_ACCESS_FS_IOCTL_DEV right and increments the Landlock ABI version to 5. This access right applies to device-custom IOCTL commands when they are invoked on block or character device files. Like the truncate right, this right is associated with a file descriptor at the time of open(2), and gets respected even when the file descriptor is used outside of the thread which it was originally opened in. Therefore, a newly enabled Landlock policy does not apply to file descriptors which are already open. If the LANDLOCK_ACCESS_FS_IOCTL_DEV right is handled, only a small number of safe IOCTL commands will be permitted on newly opened device files. These include FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC, as well as other IOCTL commands for regular files which are implemented in fs/ioctl.c. Noteworthy scenarios which require special attention: TTY devices are often passed into a process from the parent process, and so a newly enabled Landlock policy does not retroactively apply to them automatically. In the past, TTY devices have often supported IOCTL commands like TIOCSTI and some TIOCLINUX subcommands, which were letting callers control the TTY input buffer (and simulate keypresses). This should be restricted to CAP_SYS_ADMIN programs on modern kernels though. Known limitations: The LANDLOCK_ACCESS_FS_IOCTL_DEV access right is a coarse-grained control over IOCTL commands. Landlock users may use path-based restrictions in combination with their knowledge about the file system layout to control what IOCTLs can be done. Cc: Paul Moore Cc: Christian Brauner Cc: Arnd Bergmann Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240419161122.2023765-2-gnoack@google.com Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 38 ++++- security/landlock/fs.c | 225 ++++++++++++++++++++++++++- security/landlock/limits.h | 2 +- security/landlock/syscalls.c | 2 +- tools/testing/selftests/landlock/base_test.c | 2 +- tools/testing/selftests/landlock/fs_test.c | 5 +- 6 files changed, 258 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 25c8d7677539..68625e728f43 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -128,7 +128,7 @@ struct landlock_net_port_attr { * files and directories. Files or directories opened before the sandboxing * are not subject to these restrictions. * - * A file can only receive these access rights: + * The following access rights apply only to files: * * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file. * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. Note that @@ -138,12 +138,13 @@ struct landlock_net_port_attr { * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access. * - %LANDLOCK_ACCESS_FS_TRUNCATE: Truncate a file with :manpage:`truncate(2)`, * :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with - * ``O_TRUNC``. Whether an opened file can be truncated with - * :manpage:`ftruncate(2)` is determined during :manpage:`open(2)`, in the - * same way as read and write permissions are checked during - * :manpage:`open(2)` using %LANDLOCK_ACCESS_FS_READ_FILE and - * %LANDLOCK_ACCESS_FS_WRITE_FILE. This access right is available since the - * third version of the Landlock ABI. + * ``O_TRUNC``. This access right is available since the third version of the + * Landlock ABI. + * + * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used + * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as + * read and write permissions are checked during :manpage:`open(2)` using + * %LANDLOCK_ACCESS_FS_READ_FILE and %LANDLOCK_ACCESS_FS_WRITE_FILE. * * A directory can receive access rights related to files or directories. The * following access right is applied to the directory itself, and the @@ -198,13 +199,33 @@ struct landlock_net_port_attr { * If multiple requirements are not met, the ``EACCES`` error code takes * precedence over ``EXDEV``. * + * The following access right applies both to files and directories: + * + * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened + * character or block device. + * + * This access right applies to all `ioctl(2)` commands implemented by device + * drivers. However, the following common IOCTL commands continue to be + * invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right: + * + * * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``), + * * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``), + * * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``, + * ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``) + * * Some IOCTL commands which do not make sense when used with devices, but + * whose implementations are safe and return the right error codes + * (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``) + * + * This access right is available since the fifth version of the Landlock + * ABI. + * * .. warning:: * * It is currently not possible to restrict some file-related actions * accessible through these syscall families: :manpage:`chdir(2)`, * :manpage:`stat(2)`, :manpage:`flock(2)`, :manpage:`chmod(2)`, * :manpage:`chown(2)`, :manpage:`setxattr(2)`, :manpage:`utime(2)`, - * :manpage:`ioctl(2)`, :manpage:`fcntl(2)`, :manpage:`access(2)`. + * :manpage:`fcntl(2)`, :manpage:`access(2)`. * Future Landlock evolutions will enable to restrict them. */ /* clang-format off */ @@ -223,6 +244,7 @@ struct landlock_net_port_attr { #define LANDLOCK_ACCESS_FS_MAKE_SYM (1ULL << 12) #define LANDLOCK_ACCESS_FS_REFER (1ULL << 13) #define LANDLOCK_ACCESS_FS_TRUNCATE (1ULL << 14) +#define LANDLOCK_ACCESS_FS_IOCTL_DEV (1ULL << 15) /* clang-format on */ /** diff --git a/security/landlock/fs.c b/security/landlock/fs.c index c15559432d3d..22d8b7c28074 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -5,8 +5,11 @@ * Copyright © 2016-2020 Mickaël Salaün * Copyright © 2018-2020 ANSSI * Copyright © 2021-2022 Microsoft Corporation + * Copyright © 2022 Günther Noack + * Copyright © 2023-2024 Google LLC */ +#include #include #include #include @@ -14,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +33,7 @@ #include #include #include +#include #include #include "common.h" @@ -84,6 +89,160 @@ static const struct landlock_object_underops landlock_fs_underops = { .release = release_inode }; +/* IOCTL helpers */ + +/** + * is_masked_device_ioctl - Determine whether an IOCTL command is always + * permitted with Landlock for device files. These commands can not be + * restricted on device files by enforcing a Landlock policy. + * + * @cmd: The IOCTL command that is supposed to be run. + * + * By default, any IOCTL on a device file requires the + * LANDLOCK_ACCESS_FS_IOCTL_DEV right. However, we blanket-permit some + * commands, if: + * + * 1. The command is implemented in fs/ioctl.c's do_vfs_ioctl(), + * not in f_ops->unlocked_ioctl() or f_ops->compat_ioctl(). + * + * 2. The command is harmless when invoked on devices. + * + * We also permit commands that do not make sense for devices, but where the + * do_vfs_ioctl() implementation returns a more conventional error code. + * + * Any new IOCTL commands that are implemented in fs/ioctl.c's do_vfs_ioctl() + * should be considered for inclusion here. + * + * Returns: true if the IOCTL @cmd can not be restricted with Landlock for + * device files. + */ +static __attribute_const__ bool is_masked_device_ioctl(const unsigned int cmd) +{ + switch (cmd) { + /* + * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's + * close-on-exec and the file's buffered-IO and async flags. These + * operations are also available through fcntl(2), and are + * unconditionally permitted in Landlock. + */ + case FIOCLEX: + case FIONCLEX: + case FIONBIO: + case FIOASYNC: + /* + * FIOQSIZE queries the size of a regular file, directory, or link. + * + * We still permit it, because it always returns -ENOTTY for + * other file types. + */ + case FIOQSIZE: + /* + * FIFREEZE and FITHAW freeze and thaw the file system which the + * given file belongs to. Requires CAP_SYS_ADMIN. + * + * These commands operate on the file system's superblock rather + * than on the file itself. The same operations can also be + * done through any other file or directory on the same file + * system, so it is safe to permit these. + */ + case FIFREEZE: + case FITHAW: + /* + * FS_IOC_FIEMAP queries information about the allocation of + * blocks within a file. + * + * This IOCTL command only makes sense for regular files and is + * not implemented by devices. It is harmless to permit. + */ + case FS_IOC_FIEMAP: + /* + * FIGETBSZ queries the file system's block size for a file or + * directory. + * + * This command operates on the file system's superblock rather + * than on the file itself. The same operation can also be done + * through any other file or directory on the same file system, + * so it is safe to permit it. + */ + case FIGETBSZ: + /* + * FICLONE, FICLONERANGE and FIDEDUPERANGE make files share + * their underlying storage ("reflink") between source and + * destination FDs, on file systems which support that. + * + * These IOCTL commands only apply to regular files + * and are harmless to permit for device files. + */ + case FICLONE: + case FICLONERANGE: + case FIDEDUPERANGE: + /* + * FS_IOC_GETFSUUID and FS_IOC_GETFSSYSFSPATH both operate on + * the file system superblock, not on the specific file, so + * these operations are available through any other file on the + * same file system as well. + */ + case FS_IOC_GETFSUUID: + case FS_IOC_GETFSSYSFSPATH: + return true; + + /* + * FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR and + * FS_IOC_FSSETXATTR are forwarded to device implementations. + */ + + /* + * file_ioctl() commands (FIBMAP, FS_IOC_RESVSP, FS_IOC_RESVSP64, + * FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and FS_IOC_ZERO_RANGE) are + * forwarded to device implementations, so not permitted. + */ + + /* Other commands are guarded by the access right. */ + default: + return false; + } +} + +/* + * is_masked_device_ioctl_compat - same as the helper above, but checking the + * "compat" IOCTL commands. + * + * The IOCTL commands with special handling in compat-mode should behave the + * same as their non-compat counterparts. + */ +static __attribute_const__ bool +is_masked_device_ioctl_compat(const unsigned int cmd) +{ + switch (cmd) { + /* FICLONE is permitted, same as in the non-compat variant. */ + case FICLONE: + return true; + +#if defined(CONFIG_X86_64) + /* + * FS_IOC_RESVSP_32, FS_IOC_RESVSP64_32, FS_IOC_UNRESVSP_32, + * FS_IOC_UNRESVSP64_32, FS_IOC_ZERO_RANGE_32: not blanket-permitted, + * for consistency with their non-compat variants. + */ + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + case FS_IOC_UNRESVSP_32: + case FS_IOC_UNRESVSP64_32: + case FS_IOC_ZERO_RANGE_32: +#endif + + /* + * FS_IOC32_GETFLAGS, FS_IOC32_SETFLAGS are forwarded to their device + * implementations. + */ + case FS_IOC32_GETFLAGS: + case FS_IOC32_SETFLAGS: + return false; + default: + return is_masked_device_ioctl(cmd); + } +} + /* Ruleset management */ static struct landlock_object *get_inode_object(struct inode *const inode) @@ -148,7 +307,8 @@ retry: LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL_DEV) /* clang-format on */ /* @@ -1332,11 +1492,18 @@ static int hook_file_alloc_security(struct file *const file) return 0; } +static bool is_device(const struct file *const file) +{ + const struct inode *inode = file_inode(file); + + return S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode); +} + static int hook_file_open(struct file *const file) { layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; - access_mask_t open_access_request, full_access_request, allowed_access; - const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE; + access_mask_t open_access_request, full_access_request, allowed_access, + optional_access; const struct landlock_ruleset *const dom = get_fs_domain(landlock_cred(file->f_cred)->domain); @@ -1354,6 +1521,10 @@ static int hook_file_open(struct file *const file) * We look up more access than what we immediately need for open(), so * that we can later authorize operations on opened files. */ + optional_access = LANDLOCK_ACCESS_FS_TRUNCATE; + if (is_device(file)) + optional_access |= LANDLOCK_ACCESS_FS_IOCTL_DEV; + full_access_request = open_access_request | optional_access; if (is_access_to_paths_allowed( @@ -1410,6 +1581,52 @@ static int hook_file_truncate(struct file *const file) return -EACCES; } +static int hook_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + access_mask_t allowed_access = landlock_file(file)->allowed_access; + + /* + * It is the access rights at the time of opening the file which + * determine whether IOCTL can be used on the opened file later. + * + * The access right is attached to the opened file in hook_file_open(). + */ + if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV) + return 0; + + if (!is_device(file)) + return 0; + + if (is_masked_device_ioctl(cmd)) + return 0; + + return -EACCES; +} + +static int hook_file_ioctl_compat(struct file *file, unsigned int cmd, + unsigned long arg) +{ + access_mask_t allowed_access = landlock_file(file)->allowed_access; + + /* + * It is the access rights at the time of opening the file which + * determine whether IOCTL can be used on the opened file later. + * + * The access right is attached to the opened file in hook_file_open(). + */ + if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV) + return 0; + + if (!is_device(file)) + return 0; + + if (is_masked_device_ioctl_compat(cmd)) + return 0; + + return -EACCES; +} + static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_free_security, hook_inode_free_security), @@ -1432,6 +1649,8 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security), LSM_HOOK_INIT(file_open, hook_file_open), LSM_HOOK_INIT(file_truncate, hook_file_truncate), + LSM_HOOK_INIT(file_ioctl, hook_file_ioctl), + LSM_HOOK_INIT(file_ioctl_compat, hook_file_ioctl_compat), }; __init void landlock_add_fs_hooks(void) diff --git a/security/landlock/limits.h b/security/landlock/limits.h index 93c9c6f91556..20fdb5ff3514 100644 --- a/security/landlock/limits.h +++ b/security/landlock/limits.h @@ -18,7 +18,7 @@ #define LANDLOCK_MAX_NUM_LAYERS 16 #define LANDLOCK_MAX_NUM_RULES U32_MAX -#define LANDLOCK_LAST_ACCESS_FS LANDLOCK_ACCESS_FS_TRUNCATE +#define LANDLOCK_LAST_ACCESS_FS LANDLOCK_ACCESS_FS_IOCTL_DEV #define LANDLOCK_MASK_ACCESS_FS ((LANDLOCK_LAST_ACCESS_FS << 1) - 1) #define LANDLOCK_NUM_ACCESS_FS __const_hweight64(LANDLOCK_MASK_ACCESS_FS) #define LANDLOCK_SHIFT_ACCESS_FS 0 diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 6788e73b6681..03b470f5a85a 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -149,7 +149,7 @@ static const struct file_operations ruleset_fops = { .write = fop_dummy_write, }; -#define LANDLOCK_ABI_VERSION 4 +#define LANDLOCK_ABI_VERSION 5 /** * sys_landlock_create_ruleset - Create a new ruleset diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index a6f89aaea77d..3c1e9f35b531 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -75,7 +75,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(4, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(5, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 27744524df51..17b00e6c778d 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -536,9 +536,10 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL_DEV) -#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE +#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL_DEV #define ACCESS_ALL ( \ ACCESS_FILE | \ -- cgit v1.2.3 From 3ecf19e56843a6fe65f109c773728c36d220f947 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 19 Apr 2024 16:11:13 +0000 Subject: selftests/landlock: Test IOCTL support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exercises Landlock's IOCTL feature in different combinations of handling and permitting the LANDLOCK_ACCESS_FS_IOCTL_DEV right, and in different combinations of using files and directories. Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240419161122.2023765-3-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 192 ++++++++++++++++++++++++++++- 1 file changed, 189 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 17b00e6c778d..fd7793b413d1 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -8,6 +8,7 @@ */ #define _GNU_SOURCE +#include #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +26,12 @@ #include #include +/* + * Intentionally included last to work around header conflict. + * See https://sourceware.org/glibc/wiki/Synchronizing_Headers. + */ +#include + #include "common.h" #ifndef renameat2 @@ -744,6 +752,9 @@ static int create_ruleset(struct __test_metadata *const _metadata, } for (i = 0; rules[i].path; i++) { + if (!rules[i].access) + continue; + add_path_beneath(_metadata, ruleset_fd, rules[i].access, rules[i].path); } @@ -3452,7 +3463,7 @@ TEST_F_FORK(layout1, truncate_unhandled) LANDLOCK_ACCESS_FS_WRITE_FILE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3535,7 +3546,7 @@ TEST_F_FORK(layout1, truncate) LANDLOCK_ACCESS_FS_TRUNCATE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3761,7 +3772,7 @@ TEST_F_FORK(ftruncate, open_and_ftruncate) }; int fd, ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, variant->handled, rules); ASSERT_LE(0, ruleset_fd); enforce_ruleset(_metadata, ruleset_fd); @@ -3854,6 +3865,181 @@ TEST(memfd_ftruncate) ASSERT_EQ(0, close(fd)); } +static int test_fionread_ioctl(int fd) +{ + size_t sz = 0; + + if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) + return errno; + return 0; +} + +/* clang-format off */ +FIXTURE(ioctl) {}; + +FIXTURE_SETUP(ioctl) {}; + +FIXTURE_TEARDOWN(ioctl) {}; +/* clang-format on */ + +FIXTURE_VARIANT(ioctl) +{ + const __u64 handled; + const __u64 allowed; + const mode_t open_mode; + /* + * FIONREAD is used as a characteristic device-specific IOCTL command. + * It is implemented in fs/ioctl.c for regular files, + * but we do not blanket-permit it for devices. + */ + const int expected_fionread_result; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_none) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = 0, + .open_mode = O_RDWR, + .expected_fionread_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_i) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, unhandled) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_EXECUTE, + .allowed = LANDLOCK_ACCESS_FS_EXECUTE, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +TEST_F_FORK(ioctl, handle_dir_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd); + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + +TEST_F_FORK(ioctl, handle_dir_access_dir) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int dir_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Ignore variant->open_mode for this test, as we intend to open a + * directory. If the directory can not be opened, the variant is + * infeasible to test with an opened directory. + */ + dir_fd = open("/dev", O_RDONLY); + if (dir_fd < 0) + return; + + /* + * Checks that IOCTL commands return the expected errors. + * We do not use the expected values from the fixture here. + * + * When using IOCTL on a directory, no Landlock restrictions apply. + */ + EXPECT_EQ(0, test_fionread_ioctl(dir_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(dir_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(dir_fd)); +} + +TEST_F_FORK(ioctl, handle_file_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev/zero", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd) + { + TH_LOG("Failed to open /dev/zero: %s", strerror(errno)); + } + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + /* clang-format off */ FIXTURE(layout1_bind) {}; /* clang-format on */ -- cgit v1.2.3 From dd6d32afdf5f1869a8f543e8efdd191f7e4b0368 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 19 Apr 2024 16:11:14 +0000 Subject: selftests/landlock: Test IOCTL with memfds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because the LANDLOCK_ACCESS_FS_IOCTL_DEV right is associated with the opened file during open(2), IOCTLs are supposed to work with files which are opened by means other than open(2). Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240419161122.2023765-4-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 44 ++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index fd7793b413d1..193dc58bac7a 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3849,20 +3849,48 @@ TEST_F_FORK(ftruncate, open_and_ftruncate_in_different_processes) ASSERT_EQ(0, close(socket_fds[1])); } -TEST(memfd_ftruncate) +/* Invokes the FS_IOC_GETFLAGS IOCTL and returns its errno or 0. */ +static int test_fs_ioc_getflags_ioctl(int fd) { - int fd; + uint32_t flags; + + if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) + return errno; + return 0; +} - fd = memfd_create("name", MFD_CLOEXEC); - ASSERT_LE(0, fd); +TEST(memfd_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd, i; /* - * Checks that ftruncate is permitted on file descriptors that are - * created in ways other than open(2). + * We exercise the same test both with and without Landlock enabled, to + * ensure that it behaves the same in both cases. */ - EXPECT_EQ(0, test_ftruncate(fd)); + for (i = 0; i < 2; i++) { + /* Creates a new memfd. */ + fd = memfd_create("name", MFD_CLOEXEC); + ASSERT_LE(0, fd); - ASSERT_EQ(0, close(fd)); + /* + * Checks that operations associated with the opened file + * (ftruncate, ioctl) are permitted on file descriptors that are + * created in ways other than open(2). + */ + EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_EQ(0, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + } } static int test_fionread_ioctl(int fd) -- cgit v1.2.3 From 7954a1d155975a3fbc9570151358738f59605121 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 19 Apr 2024 16:11:15 +0000 Subject: selftests/landlock: Test ioctl(2) and ftruncate(2) with open(O_PATH) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioctl(2) and ftruncate(2) operations on files opened with O_PATH should always return EBADF, independent of the LANDLOCK_ACCESS_FS_TRUNCATE and LANDLOCK_ACCESS_FS_IOCTL_DEV access rights in that file hierarchy. Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240419161122.2023765-5-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 193dc58bac7a..cb6e9330fcf5 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3902,6 +3902,46 @@ static int test_fionread_ioctl(int fd) return 0; } +TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd; + + /* + * Checks that for files opened with O_PATH, both ioctl(2) and + * ftruncate(2) yield EBADF, as it is documented in open(2) for the + * O_PATH flag. + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Checks that after enabling Landlock, + * - the file can still be opened with O_PATH + * - both ioctl and truncate still yield EBADF (not EACCES). + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); +} + /* clang-format off */ FIXTURE(ioctl) {}; -- cgit v1.2.3 From 56ffd377c7abe670ba5f024328e0c0fc0f49920c Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 19 Apr 2024 16:11:16 +0000 Subject: selftests/landlock: Test IOCTLs on named pipes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Named pipes should behave like pipes created with pipe(2), so we don't want to restrict IOCTLs on them. Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240419161122.2023765-6-gnoack@google.com Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 43 ++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index cb6e9330fcf5..b133020c7761 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -3942,6 +3942,49 @@ TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) ASSERT_EQ(0, close(fd)); } +/* + * Named pipes are not governed by the LANDLOCK_ACCESS_FS_IOCTL_DEV right, + * because they are not character or block devices. + */ +TEST_F_FORK(layout1, named_pipe_ioctl) +{ + pid_t child_pid; + int fd, ruleset_fd; + const char *const path = file1_s1d1; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + ASSERT_EQ(0, unlink(path)); + ASSERT_EQ(0, mkfifo(path, 0600)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* The child process opens the pipe for writing. */ + child_pid = fork(); + ASSERT_NE(-1, child_pid); + if (child_pid == 0) { + fd = open(path, O_WRONLY); + close(fd); + exit(0); + } + + fd = open(path, O_RDONLY); + ASSERT_LE(0, fd); + + /* FIONREAD is implemented by pipefifo_fops. */ + EXPECT_EQ(0, test_fionread_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + ASSERT_EQ(0, unlink(path)); + + ASSERT_EQ(child_pid, waitpid(child_pid, NULL, 0)); +} + /* clang-format off */ FIXTURE(ioctl) {}; -- cgit v1.2.3 From f83d51a5bdfe5ab56c27fd5dfe01c613b37e2dad Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 19 Apr 2024 16:11:17 +0000 Subject: selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LANDLOCK_ACCESS_FS_IOCTL_DEV right should have no effect on the use of named UNIX domain sockets. Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240419161122.2023765-7-gnoack@google.com [mic: Add missing stddef.h for offsetof()] Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index b133020c7761..8443f329ac7c 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -21,8 +22,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -3985,6 +3988,56 @@ TEST_F_FORK(layout1, named_pipe_ioctl) ASSERT_EQ(child_pid, waitpid(child_pid, NULL, 0)); } +/* For named UNIX domain sockets, no IOCTL restrictions apply. */ +TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) +{ + const char *const path = file1_s1d1; + int srv_fd, cli_fd, ruleset_fd; + socklen_t size; + struct sockaddr_un srv_un, cli_un; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + /* Sets up a server */ + srv_un.sun_family = AF_UNIX; + strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); + + ASSERT_EQ(0, unlink(path)); + srv_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, srv_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path); + ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size)); + ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Sets up a client connection to it */ + cli_un.sun_family = AF_UNIX; + cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, cli_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size)); + + bzero(&cli_un, sizeof(cli_un)); + cli_un.sun_family = AF_UNIX; + strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + + ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size)); + + /* FIONREAD and other IOCTLs should not be forbidden. */ + EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); + + ASSERT_EQ(0, close(cli_fd)); +} + /* clang-format off */ FIXTURE(ioctl) {}; -- cgit v1.2.3 From bce605e0cfa55da513b672500dd838be1ef41de7 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 19 Apr 2024 16:11:18 +0000 Subject: selftests/landlock: Exhaustive test for the IOCTL allow-list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This test checks all IOCTL commands implemented in do_vfs_ioctl(). Test coverage for security/landlock is 90.9% of 722 lines according to gcc/gcov-13. Suggested-by: Mickaël Salaün Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20240419161122.2023765-8-gnoack@google.com [mic: Add test coverage] Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 114 +++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 8443f329ac7c..6b5a9ff88c3d 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -3945,6 +3946,119 @@ TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) ASSERT_EQ(0, close(fd)); } +/* + * ioctl_error - generically call the given ioctl with a pointer to a + * sufficiently large zeroed-out memory region. + * + * Returns the IOCTLs error, or 0. + */ +static int ioctl_error(struct __test_metadata *const _metadata, int fd, + unsigned int cmd) +{ + char buf[128]; /* sufficiently large */ + int res, stdinbak_fd; + + /* + * Depending on the IOCTL command, parts of the zeroed-out buffer might + * be interpreted as file descriptor numbers. We do not want to + * accidentally operate on file descriptor 0 (stdin), so we temporarily + * move stdin to a different FD and close FD 0 for the IOCTL call. + */ + stdinbak_fd = dup(0); + ASSERT_LT(0, stdinbak_fd); + ASSERT_EQ(0, close(0)); + + /* Invokes the IOCTL with a zeroed-out buffer. */ + bzero(&buf, sizeof(buf)); + res = ioctl(fd, cmd, &buf); + + /* Restores the old FD 0 and closes the backup FD. */ + ASSERT_EQ(0, dup2(stdinbak_fd, 0)); + ASSERT_EQ(0, close(stdinbak_fd)); + + if (res < 0) + return errno; + + return 0; +} + +/* Define some linux/falloc.h IOCTL commands which are not available in uapi headers. */ +struct space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserved area */ +}; + +#define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) +#define FS_IOC_UNRESVSP _IOW('X', 41, struct space_resv) +#define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) +#define FS_IOC_UNRESVSP64 _IOW('X', 43, struct space_resv) +#define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv) + +/* + * Tests a series of blanket-permitted and denied IOCTLs. + */ +TEST_F_FORK(layout1, blanket_permitted_ioctls) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + int ruleset_fd, fd; + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + fd = open("/dev/null", O_RDWR | O_CLOEXEC); + ASSERT_LE(0, fd); + + /* + * Checks permitted commands. + * These ones may return errors, but should not be blocked by Landlock. + */ + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONBIO)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOASYNC)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOQSIZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIFREEZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FITHAW)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_FIEMAP)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIGETBSZ)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIDEDUPERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSUUID)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSSYSFSPATH)); + + /* + * Checks blocked commands. + * A call to a blocked IOCTL command always returns EACCES. + */ + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIONREAD)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_SETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSGETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSSETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIBMAP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_ZERO_RANGE)); + + /* Default case is also blocked. */ + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, 0xc00ffeee)); + + ASSERT_EQ(0, close(fd)); +} + /* * Named pipes are not governed by the LANDLOCK_ACCESS_FS_IOCTL_DEV right, * because they are not character or block devices. -- cgit v1.2.3 From 78490b74435a8c738e91260e7df387e7cb6d6568 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 23 Apr 2024 16:23:38 +0000 Subject: selftests/user_events: Add non-spacing separator check The ABI documentation indicates that field separators do not need a space between them, only a ';'. When no spacing is used, the register must work. Any subsequent register, with or without spaces, must match and not return -EADDRINUSE. Add a non-spacing separator case to our self-test register case to ensure it works going forward. Link: https://lore.kernel.org/linux-trace-kernel/20240423162338.292-3-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/user_events/ftrace_test.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index dcd7509fe2e0..0bb46793dcd4 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -261,6 +261,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); + /* Register without separator spacing should still match */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;u32 field2"; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ reg.enable_bit = 29; reg.name_args = (__u64)"__test_event u32 field1;"; @@ -288,6 +294,8 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); + unreg.disable_bit = 29; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); /* Delete should have been auto-done after close and unregister */ close(self->data_fd); -- cgit v1.2.3 From c1457d9aad5ee2feafcf85aa9a58ab50500159d2 Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Fri, 10 May 2024 00:06:25 +0000 Subject: selftests/cgroup: Drop define _GNU_SOURCE _GNU_SOURCE is provided by lib.mk, so it should be dropped to prevent redefinition warnings. Signed-off-by: Edward Liaw Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/cgroup_util.c | 3 --- tools/testing/selftests/cgroup/test_core.c | 2 -- tools/testing/selftests/cgroup/test_cpu.c | 2 -- tools/testing/selftests/cgroup/test_hugetlb_memcg.c | 2 -- tools/testing/selftests/cgroup/test_kmem.c | 2 -- tools/testing/selftests/cgroup/test_memcontrol.c | 2 -- tools/testing/selftests/cgroup/test_zswap.c | 2 -- 7 files changed, 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 432db923bced..ce16a50ecff8 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -1,7 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ - -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index a5672a91d273..de8baad46022 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -1,6 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ - -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index dad2ed82f3ef..5a4a314f6af7 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 - -#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c index 856f9508ea56..80d05d50a42d 100644 --- a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 96693d8772be..2e453ac50c0d 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 41ae8047b889..c871630d62a3 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1,6 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define _GNU_SOURCE - #include #include #include diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index fa571bfd2432..8418a8d7439f 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE - #include #include #include -- cgit v1.2.3 From f37dc28ac6e2624afd7916faacea259f57f5ca82 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 10 May 2024 09:48:11 +0100 Subject: selftest: epoll_busy_poll: Fix spelling mistake "couldnt" -> "couldn't" There is a spelling mistake in a TH_LOG message. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240510084811.3299685-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/epoll_busy_poll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/epoll_busy_poll.c b/tools/testing/selftests/net/epoll_busy_poll.c index 9dd2830fd67c..16e457c2f877 100644 --- a/tools/testing/selftests/net/epoll_busy_poll.c +++ b/tools/testing/selftests/net/epoll_busy_poll.c @@ -232,7 +232,7 @@ TEST_F(epoll_busy_poll, test_set_invalid) ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_CLEAR); EXPECT_EQ(0, ret) - TH_LOG("couldnt clear CAP_NET_ADMIN"); + TH_LOG("couldn't clear CAP_NET_ADMIN"); ret = cap_set_proc(self->caps); EXPECT_EQ(0, ret) -- cgit v1.2.3 From cfc2eefd40f1bc121a41a2acd54458046d77f9ae Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 10 May 2024 14:28:56 +0300 Subject: selftests: net: use upstream mtools Joachim kindly merged the IPv6 support in https://github.com/troglobit/mtools/pull/2, so we can just use his version now. A few more fixes subsequently came in for IPv6, so even better. Check that the deployed mtools version is 3.0 or above. Note that the version check breaks compatibility with my fork where I didn't bump the version, but I assume that won't be a problem. Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20240510112856.1262901-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 3353a1745946..112c85c35092 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -309,6 +309,21 @@ require_command() fi } +# IPv6 support was added in v3.0 +check_mtools_version() +{ + local version="$(msend -v)" + local major + + version=${version##msend version } + major=$(echo $version | cut -d. -f1) + + if [ $major -lt 3 ]; then + echo "SKIP: expected mtools version 3.0, got $version" + exit $ksft_skip + fi +} + if [[ "$REQUIRE_JQ" = "yes" ]]; then require_command jq fi @@ -316,10 +331,10 @@ if [[ "$REQUIRE_MZ" = "yes" ]]; then require_command $MZ fi if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then - # https://github.com/vladimiroltean/mtools/ - # patched for IPv6 support + # https://github.com/troglobit/mtools require_command msend require_command mreceive + check_mtools_version fi ############################################################################## -- cgit v1.2.3 From 5fcc17dfe05e127a38d24b0e5bf93aaba01fdddc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 11 May 2024 08:48:03 +0200 Subject: selftests: netfilter: nft_flowtable.sh: bump socat timeout to 1m Now that this test runs in netdev CI it looks like 10s isn't enough for debug kernels: selftests: net/netfilter: nft_flowtable.sh 2024/05/10 20:33:08 socat[12204] E write(7, 0x563feb16a000, 8192): Broken pipe FAIL: file mismatch for ns1 -> ns2 -rw------- 1 root root 37345280 May 10 20:32 /tmp/tmp.Am0yEHhNqI ... Looks like socat gets zapped too quickly, so increase timeout to 1m. Could also reduce tx file size for KSFT_MACHINE_SLOW, but its preferrable to have same test for both debug and nondebug. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240511064814.561525-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nft_flowtable.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index 86d516e8acd6..b3995550856a 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -17,6 +17,7 @@ source lib.sh ret=0 +SOCAT_TIMEOUT=60 nsin="" ns1out="" @@ -350,12 +351,12 @@ test_tcp_forwarding_ip() local dstport=$4 local lret=0 - timeout 10 ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & lpid=$! busywait 1000 listener_ready - timeout 10 ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" wait $lpid -- cgit v1.2.3 From bc21faefbe58feff0ae7cae8b52c4145073e2208 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Thu, 9 May 2024 21:08:19 +0200 Subject: selftests/net: add flush id selftests Added flush id selftests to test different cases where DF flag is set or unset and id value changes in the following packets. All cases where the packets should coalesce or should not coalesce are tested. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240509190819.2985-4-richardbgobert@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/gro.c | 138 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index 6038b96ecee8..b2184847e388 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -93,6 +93,7 @@ static bool tx_socket = true; static int tcp_offset = -1; static int total_hdr_len = -1; static int ethhdr_proto = -1; +static const int num_flush_id_cases = 6; static void vlog(const char *fmt, ...) { @@ -620,6 +621,113 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE); } +static void fix_ip4_checksum(struct iphdr *iph) +{ + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); +} + +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +{ + static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf3[MAX_HDR_LEN + PAYLOAD_LEN]; + bool send_three = false; + struct iphdr *iph1; + struct iphdr *iph2; + struct iphdr *iph3; + + iph1 = (struct iphdr *)(buf1 + ETH_HLEN); + iph2 = (struct iphdr *)(buf2 + ETH_HLEN); + iph3 = (struct iphdr *)(buf3 + ETH_HLEN); + + create_packet(buf1, 0, 0, PAYLOAD_LEN, 0); + create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + + switch (tcase) { + case 0: /* DF=1, Incrementing - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + break; + + case 1: /* DF=1, Fixed - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + break; + + case 2: /* DF=0, Incrementing - should coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(9); + break; + + case 3: /* DF=0, Fixed - should not coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(8); + break; + + case 4: /* DF=1, two packets incrementing, and one fixed - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + + case 5: /* DF=1, two packets fixed, and one incrementing - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + } + + fix_ip4_checksum(iph1); + fix_ip4_checksum(iph2); + write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr); + write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr); + + if (send_three) { + fix_ip4_checksum(iph3); + write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr); + } +} + +static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) +{ + for (int i = 0; i < num_flush_id_cases; i++) { + sleep(1); + send_flush_id_case(fd, daddr, i); + sleep(1); + write_packet(fd, fin_pkt, total_hdr_len, daddr); + } +} + static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) { static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -938,6 +1046,8 @@ static void gro_sender(void) send_fragment4(txfd, &daddr); sleep(1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + test_flush_id(txfd, &daddr, fin_pkt); } else if (proto == PF_INET6) { sleep(1); send_fragment6(txfd, &daddr); @@ -1064,6 +1174,34 @@ static void gro_receiver(void) printf("fragmented ip4 doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); + + /* is_atomic checks */ + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Fixed - should not coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); } else if (proto == PF_INET6) { /* GRO doesn't check for ipv6 hop limit when flushing. * Hence no corresponding test to the ipv4 case. -- cgit v1.2.3 From b56035101e1cdd9c4420ea5da17f09f87fb69285 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Fri, 10 May 2024 23:19:26 +0300 Subject: netdev: Add queue stats for TX stop and wake TX queue stop and wake are counted by some drivers. Support reporting these via netdev-genl queue stats. Signed-off-by: Daniel Jurgens Reviewed-by: Jiri Pirko Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20240510201927.1821109-2-danielj@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 14 ++++++++++++++ include/net/netdev_queues.h | 3 +++ include/uapi/linux/netdev.h | 2 ++ net/core/netdev-genl.c | 4 +++- tools/include/uapi/linux/netdev.h | 2 ++ 5 files changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 2be4b3714d17..11a32373365a 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -439,6 +439,20 @@ attribute-sets: Number of the packets dropped by the device due to the transmit packets bitrate exceeding the device rate limit. type: uint + - + name: tx-stop + doc: | + Number of times driver paused accepting new tx packets + from the stack to this queue, because the queue was full. + Note that if BQL is supported and enabled on the device + the networking stack will avoid queuing a lot of data at once. + type: uint + - + name: tx-wake + doc: | + Number of times driver re-started accepting send + requests to this queue from the stack. + type: uint operations: list: diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index e7b84f018cee..a8a7e48dfa6c 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -41,6 +41,9 @@ struct netdev_queue_stats_tx { u64 hw_gso_wire_bytes; u64 hw_drop_ratelimits; + + u64 stop; + u64 wake; }; /** diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index cf24f1d9adf8..a8188202413e 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -165,6 +165,8 @@ enum { NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_STOP, + NETDEV_A_QSTATS_TX_WAKE, __NETDEV_A_QSTATS_MAX, NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 4b5054087309..1f6ae6379e0f 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -517,7 +517,9 @@ netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || - netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits)) + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) return -EMSGSIZE; return 0; } diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index cf24f1d9adf8..a8188202413e 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -165,6 +165,8 @@ enum { NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_STOP, + NETDEV_A_QSTATS_TX_WAKE, __NETDEV_A_QSTATS_MAX, NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) -- cgit v1.2.3 From ec8c25746e3232634f7842755a0c4bc08def6a49 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sat, 11 May 2024 01:22:02 +0200 Subject: ynl: ensure exact-len value is resolved For type String and Binary we are currently usinig the exact-len limit value as is without attempting any name resolution. However, the spec may specify the name of a constant rather than an actual value, which would result in using the constant name as is and thus break the policy. Ensure the limit value is passed to get_limit(), which will always attempt resolving the name before printing the policy rule. Signed-off-by: Antonio Quartulli Link: https://lore.kernel.org/r/20240510232202.24051-1-a@unstable.cc Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index c0b90c104d92..a42d62b23ee0 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -413,7 +413,7 @@ class TypeString(Type): def _attr_policy(self, policy): if 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' else: mem = '{ .type = ' + policy if 'max-len' in self.checks: @@ -465,7 +465,7 @@ class TypeBinary(Type): def _attr_policy(self, policy): if 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' else: mem = '{ ' if len(self.checks) == 1 and 'min-len' in self.checks: -- cgit v1.2.3 From 75961e55415cdc368ca2d7f6203fb627c259d58a Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 10 May 2024 15:04:34 +0100 Subject: ring-buffer/selftest: Add ring-buffer mapping test Map a ring-buffer, validate the meta-page before and after emitting few events. Also check ring-buffer mapping boundaries and finally ensure the tracing snapshot is mutually exclusive. Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-6-vdonnefort@google.com Cc: Shuah Khan Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Acked-by: Muhammad Usama Anjum Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/ring-buffer/.gitignore | 1 + tools/testing/selftests/ring-buffer/Makefile | 8 + tools/testing/selftests/ring-buffer/config | 2 + tools/testing/selftests/ring-buffer/map_test.c | 294 +++++++++++++++++++++++++ 4 files changed, 305 insertions(+) create mode 100644 tools/testing/selftests/ring-buffer/.gitignore create mode 100644 tools/testing/selftests/ring-buffer/Makefile create mode 100644 tools/testing/selftests/ring-buffer/config create mode 100644 tools/testing/selftests/ring-buffer/map_test.c (limited to 'tools') diff --git a/tools/testing/selftests/ring-buffer/.gitignore b/tools/testing/selftests/ring-buffer/.gitignore new file mode 100644 index 000000000000..3aed1a2a6c67 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/.gitignore @@ -0,0 +1 @@ +map_test diff --git a/tools/testing/selftests/ring-buffer/Makefile b/tools/testing/selftests/ring-buffer/Makefile new file mode 100644 index 000000000000..627c5fa6d1ab --- /dev/null +++ b/tools/testing/selftests/ring-buffer/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -D_GNU_SOURCE + +TEST_GEN_PROGS = map_test + +include ../lib.mk diff --git a/tools/testing/selftests/ring-buffer/config b/tools/testing/selftests/ring-buffer/config new file mode 100644 index 000000000000..d936f8f00e78 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/config @@ -0,0 +1,2 @@ +CONFIG_FTRACE=y +CONFIG_TRACER_SNAPSHOT=y diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c new file mode 100644 index 000000000000..a9006fa7097e --- /dev/null +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ring-buffer memory mapping tests + * + * Copyright (c) 2024 Vincent Donnefort + */ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "../user_events/user_events_selftests.h" /* share tracefs setup */ +#include "../kselftest_harness.h" + +#define TRACEFS_ROOT "/sys/kernel/tracing" + +static int __tracefs_write(const char *path, const char *value) +{ + int fd, ret; + + fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) + return fd; + + ret = write(fd, value, strlen(value)); + + close(fd); + + return ret == -1 ? -errno : 0; +} + +static int __tracefs_write_int(const char *path, int value) +{ + char *str; + int ret; + + if (asprintf(&str, "%d", value) < 0) + return -1; + + ret = __tracefs_write(path, str); + + free(str); + + return ret; +} + +#define tracefs_write_int(path, value) \ + ASSERT_EQ(__tracefs_write_int((path), (value)), 0) + +#define tracefs_write(path, value) \ + ASSERT_EQ(__tracefs_write((path), (value)), 0) + +static int tracefs_reset(void) +{ + if (__tracefs_write_int(TRACEFS_ROOT"/tracing_on", 0)) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/trace", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/set_event", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/current_tracer", "nop")) + return -1; + + return 0; +} + +struct tracefs_cpu_map_desc { + struct trace_buffer_meta *meta; + int cpu_fd; +}; + +int tracefs_cpu_map(struct tracefs_cpu_map_desc *desc, int cpu) +{ + int page_size = getpagesize(); + char *cpu_path; + void *map; + + if (asprintf(&cpu_path, + TRACEFS_ROOT"/per_cpu/cpu%d/trace_pipe_raw", + cpu) < 0) + return -ENOMEM; + + desc->cpu_fd = open(cpu_path, O_RDONLY | O_NONBLOCK); + free(cpu_path); + if (desc->cpu_fd < 0) + return -ENODEV; + + map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, desc->cpu_fd, 0); + if (map == MAP_FAILED) + return -errno; + + desc->meta = (struct trace_buffer_meta *)map; + + return 0; +} + +void tracefs_cpu_unmap(struct tracefs_cpu_map_desc *desc) +{ + munmap(desc->meta, desc->meta->meta_page_size); + close(desc->cpu_fd); +} + +FIXTURE(map) { + struct tracefs_cpu_map_desc map_desc; + bool umount; +}; + +FIXTURE_VARIANT(map) { + int subbuf_size; +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_4k) { + .subbuf_size = 4, +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_8k) { + .subbuf_size = 8, +}; + +FIXTURE_SETUP(map) +{ + int cpu = sched_getcpu(); + cpu_set_t cpu_mask; + bool fail, umount; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; + + ASSERT_GE(cpu, 0); + + ASSERT_EQ(tracefs_reset(), 0); + + tracefs_write_int(TRACEFS_ROOT"/buffer_subbuf_size_kb", variant->subbuf_size); + + ASSERT_EQ(tracefs_cpu_map(&self->map_desc, cpu), 0); + + /* + * Ensure generated events will be found on this very same ring-buffer. + */ + CPU_ZERO(&cpu_mask); + CPU_SET(cpu, &cpu_mask); + ASSERT_EQ(sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask), 0); +} + +FIXTURE_TEARDOWN(map) +{ + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); + + tracefs_cpu_unmap(&self->map_desc); +} + +TEST_F(map, meta_page_check) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + int cnt = 0; + + ASSERT_EQ(desc->meta->entries, 0); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 0); + + ASSERT_EQ(desc->meta->reader.id, 0); + ASSERT_EQ(desc->meta->reader.read, 0); + + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + ASSERT_EQ(desc->meta->reader.id, 0); + + tracefs_write_int(TRACEFS_ROOT"/tracing_on", 1); + for (int i = 0; i < 16; i++) + tracefs_write_int(TRACEFS_ROOT"/trace_marker", i); +again: + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + + ASSERT_EQ(desc->meta->entries, 16); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 16); + + ASSERT_EQ(desc->meta->reader.id, 1); + + if (!(cnt++)) + goto again; +} + +TEST_F(map, data_mmap) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + unsigned long meta_len, data_len; + void *data; + + meta_len = desc->meta->meta_page_size; + data_len = desc->meta->subbuf_size * desc->meta->nr_subbufs; + + /* Map all the available subbufs */ + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Map all the available subbufs - 1 */ + data_len -= desc->meta->subbuf_size; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Overflow the available subbufs by 1 */ + meta_len += desc->meta->subbuf_size * 2; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_EQ(data, MAP_FAILED); +} + +FIXTURE(snapshot) { + bool umount; +}; + +FIXTURE_SETUP(snapshot) +{ + bool fail, umount; + struct stat sb; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (stat(TRACEFS_ROOT"/snapshot", &sb)) + SKIP(return, "Skipping: %s", "snapshot not available"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; +} + +FIXTURE_TEARDOWN(snapshot) +{ + __tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "!snapshot"); + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); +} + +TEST_F(snapshot, excludes_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_GE(cpu, 0); + tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"); + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), -EBUSY); +} + +TEST_F(snapshot, excluded_by_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), 0); + + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"), -EBUSY); + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/snapshot", + "1"), -EBUSY); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From eafbf0574e05160c9a256666261d04a1bc59fa71 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Fri, 10 May 2024 16:37:10 +0200 Subject: test: hsr: Extend the hsr_redbox.sh to have more SAN devices connected After this change the single SAN device (ns3eth1) is now replaced with two SAN devices - respectively ns4eth1 and ns5eth1. It is possible to extend this script to have more SAN devices connected by adding them to ns3br1 bridge. Signed-off-by: Lukasz Majewski Link: https://lore.kernel.org/r/20240510143710.3916631-1-lukma@denx.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/hsr/hsr_redbox.sh | 71 ++++++++++++++++++--------- 1 file changed, 49 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/hsr/hsr_redbox.sh b/tools/testing/selftests/net/hsr/hsr_redbox.sh index db69be95ecb3..1f36785347c0 100755 --- a/tools/testing/selftests/net/hsr/hsr_redbox.sh +++ b/tools/testing/selftests/net/hsr/hsr_redbox.sh @@ -8,12 +8,19 @@ source ./hsr_common.sh do_complete_ping_test() { echo "INFO: Initial validation ping (HSR-SAN/RedBox)." - # Each node has to be able each one. + # Each node has to be able to reach each one. do_ping "${ns1}" 100.64.0.2 do_ping "${ns2}" 100.64.0.1 - # Ping from SAN to hsr1 (via hsr2) + # Ping between SANs (test bridge) + do_ping "${ns4}" 100.64.0.51 + do_ping "${ns5}" 100.64.0.41 + # Ping from SANs to hsr1 (via hsr2) (and opposite) do_ping "${ns3}" 100.64.0.1 do_ping "${ns1}" 100.64.0.3 + do_ping "${ns1}" 100.64.0.41 + do_ping "${ns4}" 100.64.0.1 + do_ping "${ns1}" 100.64.0.51 + do_ping "${ns5}" 100.64.0.1 stop_if_error "Initial validation failed." # Wait for MGNT HSR frames being received and nodes being @@ -23,8 +30,12 @@ do_complete_ping_test() echo "INFO: Longer ping test (HSR-SAN/RedBox)." # Ping from SAN to hsr1 (via hsr2) do_ping_long "${ns3}" 100.64.0.1 - # Ping from hsr1 (via hsr2) to SAN + # Ping from hsr1 (via hsr2) to SANs (and opposite) do_ping_long "${ns1}" 100.64.0.3 + do_ping_long "${ns1}" 100.64.0.41 + do_ping_long "${ns4}" 100.64.0.1 + do_ping_long "${ns1}" 100.64.0.51 + do_ping_long "${ns5}" 100.64.0.1 stop_if_error "Longer ping test failed." echo "INFO: All good." @@ -35,22 +46,26 @@ setup_hsr_interfaces() local HSRv="$1" echo "INFO: preparing interfaces for HSRv${HSRv} (HSR-SAN/RedBox)." - -# |NS1 | -# | | -# | /-- hsr1 --\ | -# | ns1eth1 ns1eth2 | -# |------------------------| -# | | -# | | -# | | -# |------------------------| |-----------| -# | ns2eth1 ns2eth2 | | | -# | \-- hsr2 --/ | | | -# | \ | | | -# | ns2eth3 |--------| ns3eth1 | -# | (interlink)| | | -# |NS2 (RedBOX) | |NS3 (SAN) | +# +# IPv4 addresses (100.64.X.Y/24), and [X.Y] is presented on below diagram: +# +# +# |NS1 | |NS4 | +# | [0.1] | | | +# | /-- hsr1 --\ | | [0.41] | +# | ns1eth1 ns1eth2 | | ns4eth1 (SAN) | +# |------------------------| |-------------------| +# | | | +# | | | +# | | | +# |------------------------| |-------------------------------| +# | ns2eth1 ns2eth2 | | ns3eth2 | +# | \-- hsr2 --/ | | / | +# | [0.2] \ | | / | |------------| +# | ns2eth3 |---| ns3eth1 -- ns3br1 -- ns3eth3--|--| ns5eth1 | +# | (interlink)| | [0.3] [0.11] | | [0.51] | +# |NS2 (RedBOX) | |NS3 (BR) | | NS5 (SAN) | +# # # Check if iproute2 supports adding interlink port to hsrX device ip link help hsr | grep -q INTERLINK @@ -59,7 +74,9 @@ setup_hsr_interfaces() # Create interfaces for name spaces ip link add ns1eth1 netns "${ns1}" type veth peer name ns2eth1 netns "${ns2}" ip link add ns1eth2 netns "${ns1}" type veth peer name ns2eth2 netns "${ns2}" - ip link add ns3eth1 netns "${ns3}" type veth peer name ns2eth3 netns "${ns2}" + ip link add ns2eth3 netns "${ns2}" type veth peer name ns3eth1 netns "${ns3}" + ip link add ns3eth2 netns "${ns3}" type veth peer name ns4eth1 netns "${ns4}" + ip link add ns3eth3 netns "${ns3}" type veth peer name ns5eth1 netns "${ns5}" sleep 1 @@ -70,21 +87,31 @@ setup_hsr_interfaces() ip -n "${ns2}" link set ns2eth2 up ip -n "${ns2}" link set ns2eth3 up - ip -n "${ns3}" link set ns3eth1 up + ip -n "${ns3}" link add name ns3br1 type bridge + ip -n "${ns3}" link set ns3br1 up + ip -n "${ns3}" link set ns3eth1 master ns3br1 up + ip -n "${ns3}" link set ns3eth2 master ns3br1 up + ip -n "${ns3}" link set ns3eth3 master ns3br1 up + + ip -n "${ns4}" link set ns4eth1 up + ip -n "${ns5}" link set ns5eth1 up ip -net "${ns1}" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version ${HSRv} proto 0 ip -net "${ns2}" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 interlink ns2eth3 supervision 45 version ${HSRv} proto 0 ip -n "${ns1}" addr add 100.64.0.1/24 dev hsr1 ip -n "${ns2}" addr add 100.64.0.2/24 dev hsr2 + ip -n "${ns3}" addr add 100.64.0.11/24 dev ns3br1 ip -n "${ns3}" addr add 100.64.0.3/24 dev ns3eth1 + ip -n "${ns4}" addr add 100.64.0.41/24 dev ns4eth1 + ip -n "${ns5}" addr add 100.64.0.51/24 dev ns5eth1 ip -n "${ns1}" link set hsr1 up ip -n "${ns2}" link set hsr2 up } check_prerequisites -setup_ns ns1 ns2 ns3 +setup_ns ns1 ns2 ns3 ns4 ns5 trap cleanup_all_ns EXIT -- cgit v1.2.3 From d9bab776ed9e50fa076d2b42544b0c612be16d7c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 13 May 2024 16:46:02 +0200 Subject: tools arch x86: Add dell-uart-backlight-emulator Dell All In One (AIO) models released after 2017 use a backlight controller board connected to an UART. Add a small emulator to allow development and testing of the drivers/platform/x86/dell/dell-uart-backlight.c driver for this board, without requiring access to an actual Dell All In One. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240513144603.93874-3-hdegoede@redhat.com --- .../x86/dell-uart-backlight-emulator/.gitignore | 1 + .../arch/x86/dell-uart-backlight-emulator/Makefile | 19 +++ tools/arch/x86/dell-uart-backlight-emulator/README | 46 ++++++ .../dell-uart-backlight-emulator.c | 163 +++++++++++++++++++++ 4 files changed, 229 insertions(+) create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/.gitignore create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/Makefile create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/README create mode 100644 tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c (limited to 'tools') diff --git a/tools/arch/x86/dell-uart-backlight-emulator/.gitignore b/tools/arch/x86/dell-uart-backlight-emulator/.gitignore new file mode 100644 index 000000000000..5c8cad8d72b9 --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/.gitignore @@ -0,0 +1 @@ +dell-uart-backlight-emulator diff --git a/tools/arch/x86/dell-uart-backlight-emulator/Makefile b/tools/arch/x86/dell-uart-backlight-emulator/Makefile new file mode 100644 index 000000000000..6ea1d9fd534b --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for Intel Software Defined Silicon provisioning tool + +dell-uart-backlight-emulator: dell-uart-backlight-emulator.c + +BINDIR ?= /usr/bin + +override CFLAGS += -O2 -Wall + +%: %.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +.PHONY : clean +clean : + @rm -f dell-uart-backlight-emulator + +install : dell-uart-backlight-emulator + install -d $(DESTDIR)$(BINDIR) + install -m 755 -p dell-uart-backlight-emulator $(DESTDIR)$(BINDIR)/dell-uart-backlight-emulator diff --git a/tools/arch/x86/dell-uart-backlight-emulator/README b/tools/arch/x86/dell-uart-backlight-emulator/README new file mode 100644 index 000000000000..c0d8e52046ee --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/README @@ -0,0 +1,46 @@ +Emulator for DELL0501 UART attached backlight controller +-------------------------------------------------------- + +Dell All In One (AIO) models released after 2017 use a backlight controller +board connected to an UART. + +In DSDT this uart port will be defined as: + + Name (_HID, "DELL0501") + Name (_CID, EisaId ("PNP0501") + +With the DELL0501 indicating that we are dealing with an UART with +the backlight controller board attached. + +This small emulator allows testing +the drivers/platform/x86/dell/dell-uart-backlight.c driver without access +to an actual Dell All In One. + +This requires: +1. A (desktop) PC with a 16550 UART on the motherboard and a standard DB9 + connector connected to this UART. +2. A DB9 NULL modem cable. +3. A second DB9 serial port, this can e.g. be a USB to serial converter + with a DB9 connector plugged into the same desktop PC. +4. A DSDT overlay for the desktop PC replacing the _HID of the 16550 UART + ACPI Device() with "DELL0501" and adding a _CID of "PNP0501", see + DSDT.patch for an example of the necessary DSDT changes. + +With everything setup and the NULL modem cable connected between +the 2 serial ports run: + +./dell-uart-backlight-emulator + +For example when using an USB to serial converter for the second port: + +./dell-uart-backlight-emulator /dev/ttyUSB0 + +And then (re)load the dell-uart-backlight driver: + +sudo rmmod dell-uart-backlight; sudo modprobe dell-uart-backlight dyndbg + +After this check "dmesg" to see if the driver correctly received +the firmware version string from the emulator. If this works there +should be a /sys/class/backlight/dell_uart_backlight/ directory now +and writes to the brightness or bl_power files should be reflected +by matching output from the emulator. diff --git a/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c b/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c new file mode 100644 index 000000000000..655b6c96d8cf --- /dev/null +++ b/tools/arch/x86/dell-uart-backlight-emulator/dell-uart-backlight-emulator.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Dell AIO Serial Backlight board emulator for testing + * the Linux dell-uart-backlight driver. + * + * Copyright (C) 2024 Hans de Goede + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int serial_fd; +int brightness = 50; + +static unsigned char dell_uart_checksum(unsigned char *buf, int len) +{ + unsigned char val = 0; + + while (len-- > 0) + val += buf[len]; + + return val ^ 0xff; +} + +/* read() will return -1 on SIGINT / SIGTERM causing the mainloop to cleanly exit */ +void signalhdlr(int signum) +{ +} + +int main(int argc, char *argv[]) +{ + struct sigaction sigact = { .sa_handler = signalhdlr }; + unsigned char buf[4], csum, response[32]; + const char *version_str = "PHI23-V321"; + struct termios tty, saved_tty; + int ret, idx, len = 0; + + if (argc != 2) { + fprintf(stderr, "Invalid or missing arguments\n"); + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + serial_fd = open(argv[1], O_RDWR | O_NOCTTY); + if (serial_fd == -1) { + fprintf(stderr, "Error opening %s: %s\n", argv[1], strerror(errno)); + return 1; + } + + ret = tcgetattr(serial_fd, &tty); + if (ret == -1) { + fprintf(stderr, "Error getting tcattr: %s\n", strerror(errno)); + goto out_close; + } + saved_tty = tty; + + cfsetspeed(&tty, 9600); + cfmakeraw(&tty); + tty.c_cflag &= ~CSTOPB; + tty.c_cflag &= ~CRTSCTS; + tty.c_cflag |= CLOCAL | CREAD; + + ret = tcsetattr(serial_fd, TCSANOW, &tty); + if (ret == -1) { + fprintf(stderr, "Error setting tcattr: %s\n", strerror(errno)); + goto out_restore; + } + + sigaction(SIGINT, &sigact, 0); + sigaction(SIGTERM, &sigact, 0); + + idx = 0; + while (read(serial_fd, &buf[idx], 1) == 1) { + if (idx == 0) { + switch (buf[0]) { + /* 3 MSB bits: cmd-len + 01010 SOF marker */ + case 0x6a: len = 3; break; + case 0x8a: len = 4; break; + default: + fprintf(stderr, "Error unexpected first byte: 0x%02x\n", buf[0]); + continue; /* Try to sync up with sender */ + } + } + + /* Process msg when len bytes have been received */ + if (idx != (len - 1)) { + idx++; + continue; + } + + /* Reset idx for next command */ + idx = 0; + + csum = dell_uart_checksum(buf, len - 1); + if (buf[len - 1] != csum) { + fprintf(stderr, "Error checksum mismatch got 0x%02x expected 0x%02x\n", + buf[len - 1], csum); + continue; + } + + switch ((buf[0] << 8) | buf[1]) { + case 0x6a06: /* cmd = 0x06, get version */ + len = strlen(version_str); + strcpy((char *)&response[2], version_str); + printf("Get version, reply: %s\n", version_str); + break; + case 0x8a0b: /* cmd = 0x0b, set brightness */ + if (buf[2] > 100) { + fprintf(stderr, "Error invalid brightness param: %d\n", buf[2]); + continue; + } + + len = 0; + brightness = buf[2]; + printf("Set brightness %d\n", brightness); + break; + case 0x6a0c: /* cmd = 0x0c, get brightness */ + len = 1; + response[2] = brightness; + printf("Get brightness, reply: %d\n", brightness); + break; + case 0x8a0e: /* cmd = 0x0e, set backlight power */ + if (buf[2] != 0 && buf[2] != 1) { + fprintf(stderr, "Error invalid set power param: %d\n", buf[2]); + continue; + } + + len = 0; + printf("Set power %d\n", buf[2]); + break; + default: + fprintf(stderr, "Error unknown cmd 0x%04x\n", + (buf[0] << 8) | buf[1]); + continue; + } + + /* Respond with */ + response[0] = len + 3; /* response length in bytes */ + response[1] = buf[1]; /* ack cmd */ + csum = dell_uart_checksum(response, len + 2); + response[len + 2] = csum; + ret = write(serial_fd, response, response[0]); + if (ret != (response[0])) + fprintf(stderr, "Error writing %d bytes: %d\n", + response[0], ret); + } + + ret = 0; +out_restore: + tcsetattr(serial_fd, TCSANOW, &saved_tty); +out_close: + close(serial_fd); + return ret; +} -- cgit v1.2.3 From dc9dfd8ae4b5ac28e457a830556b53b15f4b9a1c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 May 2024 16:44:09 +0200 Subject: selftests: netfilter: fix packetdrill conntrack testcase Some versions of conntrack(8) default to ipv4-only, so this needs to request ipv6 explicitly, like all other spots already do. Fixes: a8a388c2aae4 ("selftests: netfilter: add packetdrill based conntrack tests") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240513114649.6d764307@kernel.org/ Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240514144415.11433-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt index 21e1bb6395e4..842242f8ccf7 100644 --- a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt @@ -30,5 +30,5 @@ 1.5 < S 2000:2000(0) win 32792 -+0 `conntrack -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV` ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV` +0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` -- cgit v1.2.3 From 06080ea23095afe04a2cb7a8d05fab4311782623 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 13 May 2024 13:52:57 +0300 Subject: selftests: net: bridge: increase IGMP/MLD exclude timeout membership interval When running the bridge IGMP/MLD selftests on debug kernels we can get spurious errors when setting up the IGMP/MLD exclude timeout tests because the membership interval is just 3 seconds and the setup has 2 seconds of sleep plus various validations, the one second that is left is not enough. Increase the membership interval from 3 to 5 seconds to make room for the setup validation and 2 seconds of sleep. Fixes: 34d7ecb3d4f7 ("selftests: net: bridge: update IGMP/MLD membership interval value") Reported-by: Jakub Kicinski Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/bridge_igmp.sh | 6 +++--- tools/testing/selftests/net/forwarding/bridge_mld.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh index 2aa66d2a1702..e6a3e04fd83f 100755 --- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh +++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh @@ -478,10 +478,10 @@ v3exc_timeout_test() RET=0 local X=("192.0.2.20" "192.0.2.30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ v3exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW2" -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh index e2b9ff773c6b..f84ab2e65754 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mld.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh @@ -478,10 +478,10 @@ mldv2exc_timeout_test() RET=0 local X=("2001:db8:1::20" "2001:db8:1::30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 mldv2exclude_prepare $h1 ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ mldv2exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 $MZPKT_ALLOW2 -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ -- cgit v1.2.3 From 5f0769331a965675cdfec97c09f3f6e875d7c246 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 24 Apr 2024 16:36:50 +0200 Subject: rtla/timerlat: Simplify "no value" printing on top Instead of printing three times the same output, print it only once, reducing lines and being sure that all no values have the same length. It also fixes an extra '\n' when running the with kernel threads, like here: =============== %< ============== Timer Latency 0 00:00:01 | IRQ Timer Latency (us) | Thread Timer Latency (us) CPU COUNT | cur min avg max | cur min avg max 2 #0 | - - - - | 161 161 161 161 3 #0 | - - - - | 161 161 161 161 8 #1 | 54 54 54 54 | - - - -'\n' ---------------|----------------------------------------|--------------------------------------- ALL #1 e0 | 54 54 54 | 161 161 161 =============== %< ============== This '\n' should have been removed with the user-space support that added another '\n' if not running with kernel threads. Link: https://lkml.kernel.org/r/0a4d8085e7cd706733a5dc10a81ca38b82bd4992.1713968967.git.bristot@kernel.org Cc: stable@vger.kernel.org Cc: Jonathan Corbet Cc: Juri Lelli Fixes: cdca4f4e5e8e ("rtla/timerlat_top: Add timerlat user-space support") Signed-off-by: Daniel Bristot de Oliveira --- tools/tracing/rtla/src/timerlat_top.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 8a3fa64319c6..2665e0bb5f1e 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -212,6 +212,8 @@ static void timerlat_top_header(struct osnoise_tool *top) trace_seq_printf(s, "\n"); } +static const char *no_value = " -"; + /* * timerlat_top_print - prints the output of a given CPU */ @@ -239,10 +241,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) trace_seq_printf(s, "%3d #%-9d |", cpu, cpu_data->irq_count); if (!cpu_data->irq_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - |"); + trace_seq_printf(s, "%s %s %s %s |", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_irq / params->output_divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_irq / params->output_divisor); @@ -251,10 +250,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) } if (!cpu_data->thread_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " -\n"); + trace_seq_printf(s, "%s %s %s %s", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_thread / divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_thread / divisor); @@ -271,10 +267,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) trace_seq_printf(s, " |"); if (!cpu_data->user_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " -\n"); + trace_seq_printf(s, "%s %s %s %s\n", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_user / divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_user / divisor); -- cgit v1.2.3 From a40e5e4dd0207485dee75e2b8e860d5853bcc5f7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 24 Apr 2024 16:36:51 +0200 Subject: rtla/auto-analysis: Replace \t with spaces When copying timerlat auto-analysis from a terminal to some web pages or chats, the \t are being replaced with a single ' ' or ' ', breaking the output. For example: ## CPU 3 hit stop tracing, analyzing it ## IRQ handler delay: 1.30 us (0.11 %) IRQ latency: 1.90 us Timerlat IRQ duration: 3.00 us (0.24 %) Blocking thread: 1223.16 us (99.00 %) insync:4048 1223.16 us IRQ interference 4.93 us (0.40 %) local_timer:236 4.93 us ------------------------------------------------------------------------ Thread latency: 1235.47 us (100%) Replace \t with spaces to avoid this problem. Link: https://lkml.kernel.org/r/ec7ed2b2809c22ab0dfc8eb7c805ab9cddc4254a.1713968967.git.bristot@kernel.org Cc: stable@vger.kernel.org Cc: Jonathan Corbet Cc: Juri Lelli Fixes: 27e348b221f6 ("rtla/timerlat: Add auto-analysis core") Signed-off-by: Daniel Bristot de Oliveira --- tools/tracing/rtla/src/timerlat_aa.c | 109 ++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat_aa.c b/tools/tracing/rtla/src/timerlat_aa.c index 7093fd5333be..7bd80ee2a5b4 100644 --- a/tools/tracing/rtla/src/timerlat_aa.c +++ b/tools/tracing/rtla/src/timerlat_aa.c @@ -16,6 +16,9 @@ enum timelat_state { TIMERLAT_WAITING_THREAD, }; +/* Used to fill spaces in the output */ +static const char *spaces = " "; + #define MAX_COMM 24 /* @@ -274,14 +277,17 @@ static int timerlat_aa_nmi_handler(struct trace_seq *s, struct tep_record *recor taa_data->prev_irq_timstamp = start; trace_seq_reset(taa_data->prev_irqs_seq); - trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s \t\t\t%9.2f us\n", - "nmi", ns_to_usf(duration)); + trace_seq_printf(taa_data->prev_irqs_seq, " %24s %.*s %9.2f us\n", + "nmi", + 24, spaces, + ns_to_usf(duration)); return 0; } taa_data->thread_nmi_sum += duration; - trace_seq_printf(taa_data->nmi_seq, " %24s \t\t\t%9.2f us\n", - "nmi", ns_to_usf(duration)); + trace_seq_printf(taa_data->nmi_seq, " %24s %.*s %9.2f us\n", + "nmi", + 24, spaces, ns_to_usf(duration)); return 0; } @@ -323,8 +329,10 @@ static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *recor taa_data->prev_irq_timstamp = start; trace_seq_reset(taa_data->prev_irqs_seq); - trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s:%-3llu \t\t%9.2f us\n", - desc, vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->prev_irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + desc, vector, + 15, spaces, + ns_to_usf(duration)); return 0; } @@ -372,8 +380,10 @@ static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *recor * IRQ interference. */ taa_data->thread_irq_sum += duration; - trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu \t %9.2f us\n", - desc, vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + desc, vector, + 24, spaces, + ns_to_usf(duration)); return 0; } @@ -408,8 +418,10 @@ static int timerlat_aa_softirq_handler(struct trace_seq *s, struct tep_record *r taa_data->thread_softirq_sum += duration; - trace_seq_printf(taa_data->softirqs_seq, "\t%24s:%-3llu \t %9.2f us\n", - softirq_name[vector], vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->softirqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + softirq_name[vector], vector, + 24, spaces, + ns_to_usf(duration)); return 0; } @@ -452,8 +464,10 @@ static int timerlat_aa_thread_handler(struct trace_seq *s, struct tep_record *re } else { taa_data->thread_thread_sum += duration; - trace_seq_printf(taa_data->threads_seq, "\t%24s:%-3llu \t\t%9.2f us\n", - comm, pid, ns_to_usf(duration)); + trace_seq_printf(taa_data->threads_seq, " %24s:%-12llu %.*s %9.2f us\n", + comm, pid, + 15, spaces, + ns_to_usf(duration)); } return 0; @@ -482,7 +496,8 @@ static int timerlat_aa_stack_handler(struct trace_seq *s, struct tep_record *rec function = tep_find_function(taa_ctx->tool->trace.tep, caller[i]); if (!function) break; - trace_seq_printf(taa_data->stack_seq, "\t\t-> %s\n", function); + trace_seq_printf(taa_data->stack_seq, " %.*s -> %s\n", + 14, spaces, function); } } return 0; @@ -568,23 +583,24 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, exp_irq_ts = taa_data->timer_irq_start_time - taa_data->timer_irq_start_delay; if (exp_irq_ts < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) { if (taa_data->prev_irq_timstamp < taa_data->timer_irq_start_time) - printf(" Previous IRQ interference: \t\t up to %9.2f us\n", - ns_to_usf(taa_data->prev_irq_duration)); + printf(" Previous IRQ interference: %.*s up to %9.2f us\n", + 16, spaces, + ns_to_usf(taa_data->prev_irq_duration)); } /* * The delay that the IRQ suffered before starting. */ - printf(" IRQ handler delay: %16s %9.2f us (%.2f %%)\n", - (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "", - ns_to_usf(taa_data->timer_irq_start_delay), - ns_to_per(total, taa_data->timer_irq_start_delay)); + printf(" IRQ handler delay: %.*s %16s %9.2f us (%.2f %%)\n", 16, spaces, + (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "", + ns_to_usf(taa_data->timer_irq_start_delay), + ns_to_per(total, taa_data->timer_irq_start_delay)); /* * Timerlat IRQ. */ - printf(" IRQ latency: \t\t\t\t %9.2f us\n", - ns_to_usf(taa_data->tlat_irq_latency)); + printf(" IRQ latency: %.*s %9.2f us\n", 40, spaces, + ns_to_usf(taa_data->tlat_irq_latency)); if (irq) { /* @@ -595,15 +611,16 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * so it will be displayed, it is the key. */ printf(" Blocking thread:\n"); - printf(" %24s:%-9llu\n", - taa_data->run_thread_comm, taa_data->run_thread_pid); + printf(" %.*s %24s:%-9llu\n", 6, spaces, taa_data->run_thread_comm, + taa_data->run_thread_pid); } else { /* * The duration of the IRQ handler that handled the timerlat IRQ. */ - printf(" Timerlat IRQ duration: \t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->timer_irq_duration), - ns_to_per(total, taa_data->timer_irq_duration)); + printf(" Timerlat IRQ duration: %.*s %9.2f us (%.2f %%)\n", + 30, spaces, + ns_to_usf(taa_data->timer_irq_duration), + ns_to_per(total, taa_data->timer_irq_duration)); /* * The amount of time that the current thread postponed the scheduler. @@ -611,13 +628,13 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Recalling that it is net from NMI/IRQ/Softirq interference, so there * is no need to compute values here. */ - printf(" Blocking thread: \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_blocking_duration), - ns_to_per(total, taa_data->thread_blocking_duration)); + printf(" Blocking thread: %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_blocking_duration), + ns_to_per(total, taa_data->thread_blocking_duration)); - printf(" %24s:%-9llu %9.2f us\n", - taa_data->run_thread_comm, taa_data->run_thread_pid, - ns_to_usf(taa_data->thread_blocking_duration)); + printf(" %.*s %24s:%-9llu %.*s %9.2f us\n", 6, spaces, + taa_data->run_thread_comm, taa_data->run_thread_pid, + 12, spaces, ns_to_usf(taa_data->thread_blocking_duration)); } /* @@ -629,9 +646,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * NMIs can happen during the IRQ, so they are always possible. */ if (taa_data->thread_nmi_sum) - printf(" NMI interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_nmi_sum), - ns_to_per(total, taa_data->thread_nmi_sum)); + printf(" NMI interference %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_nmi_sum), + ns_to_per(total, taa_data->thread_nmi_sum)); /* * If it is an IRQ latency, the other factors can be skipped. @@ -643,9 +660,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Prints the interference caused by IRQs to the thread latency. */ if (taa_data->thread_irq_sum) { - printf(" IRQ interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_irq_sum), - ns_to_per(total, taa_data->thread_irq_sum)); + printf(" IRQ interference %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_irq_sum), + ns_to_per(total, taa_data->thread_irq_sum)); trace_seq_do_printf(taa_data->irqs_seq); } @@ -654,9 +671,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Prints the interference caused by Softirqs to the thread latency. */ if (taa_data->thread_softirq_sum) { - printf(" Softirq interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_softirq_sum), - ns_to_per(total, taa_data->thread_softirq_sum)); + printf(" Softirq interference %.*s %9.2f us (%.2f %%)\n", 32, spaces, + ns_to_usf(taa_data->thread_softirq_sum), + ns_to_per(total, taa_data->thread_softirq_sum)); trace_seq_do_printf(taa_data->softirqs_seq); } @@ -670,9 +687,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * timer handling latency. */ if (taa_data->thread_thread_sum) { - printf(" Thread interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_thread_sum), - ns_to_per(total, taa_data->thread_thread_sum)); + printf(" Thread interference %.*s %9.2f us (%.2f %%)\n", 33, spaces, + ns_to_usf(taa_data->thread_thread_sum), + ns_to_per(total, taa_data->thread_thread_sum)); trace_seq_do_printf(taa_data->threads_seq); } @@ -682,8 +699,8 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, */ print_total: printf("------------------------------------------------------------------------\n"); - printf(" %s latency: \t\t\t %9.2f us (100%%)\n", irq ? "IRQ" : "Thread", - ns_to_usf(total)); + printf(" %s latency: %.*s %9.2f us (100%%)\n", irq ? " IRQ" : "Thread", + 37, spaces, ns_to_usf(total)); } static int timerlat_auto_analysis_collect_trace(struct timerlat_aa_context *taa_ctx) -- cgit v1.2.3 From f5c0cdad6684aa4212346f48554636ec2ab98434 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 24 Apr 2024 16:36:52 +0200 Subject: rtla/timerlat: Use pretty formatting only on interactive tty timerlat top does some background/font color formatting. While useful on terminal, it breaks the output on other formats. For example, when piping the output for pastebin tools, the format strings are printed as characters. For instance: [2;37;40m Timer Latency [0;0;0m 0 00:00:01 | IRQ Timer Latency (us) | Thread Timer Latency (us) [2;30;47mCPU COUNT | cur min avg max | cur min avg max[0;0;0m 0 #1013 | 1 0 1 54 | 5 2 4 57 1 #1013 | 3 0 1 10 | 6 2 4 15 To avoid this problem, do the formatting only if running on a tty, and in !quiet mode. Link: https://lkml.kernel.org/r/8288e1544ceab21557d5dda93a0f00339497c649.1713968967.git.bristot@kernel.org Cc: Jonathan Corbet Cc: Juri Lelli Signed-off-by: Daniel Bristot de Oliveira --- tools/tracing/rtla/src/timerlat_top.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 2665e0bb5f1e..c9cf90ed4e6d 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -44,6 +44,7 @@ struct timerlat_top_params { int hk_cpus; int user_top; int user_workload; + int pretty_output; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -179,19 +180,22 @@ timerlat_top_handler(struct trace_seq *s, struct tep_record *record, /* * timerlat_top_header - print the header of the tool output */ -static void timerlat_top_header(struct osnoise_tool *top) +static void timerlat_top_header(struct timerlat_top_params *params, struct osnoise_tool *top) { - struct timerlat_top_params *params = top->params; struct trace_seq *s = top->trace.seq; char duration[26]; get_duration(top->start_time, duration, sizeof(duration)); - trace_seq_printf(s, "\033[2;37;40m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;37;40m"); + trace_seq_printf(s, " Timer Latency "); if (params->user_top) trace_seq_printf(s, " "); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); trace_seq_printf(s, "%-6s | IRQ Timer Latency (%s) | Thread Timer Latency (%s)", duration, @@ -204,11 +208,15 @@ static void timerlat_top_header(struct osnoise_tool *top) } trace_seq_printf(s, "\n"); - trace_seq_printf(s, "\033[2;30;47m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;30;47m"); + trace_seq_printf(s, "CPU COUNT | cur min avg max | cur min avg max"); if (params->user_top) trace_seq_printf(s, " | cur min avg max"); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); } @@ -305,7 +313,7 @@ timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *to if (!params->quiet) clear_terminal(trace->seq); - timerlat_top_header(top); + timerlat_top_header(params, top); for (i = 0; i < nr_cpus; i++) { if (params->cpus && !CPU_ISSET(i, ¶ms->monitored_cpus)) @@ -693,6 +701,9 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * } } + if (isatty(1) && !params->quiet) + params->pretty_output = 1; + return 0; out_err: -- cgit v1.2.3 From 285dcb7665ae83d07f194a517ba290f02d4f5f73 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 24 Apr 2024 16:36:53 +0200 Subject: rtla/timerlat: Add a summary for top mode While the per-cpu values are the results to take into consideration, the overall system values are also useful. Add a summary at the bottom of rtla timerlat top showing the overall results. For instance: Timer Latency 0 00:00:10 | IRQ Timer Latency (us) | Thread Timer Latency (us) CPU COUNT | cur min avg max | cur min avg max 0 #10003 | 113 19 150 441 | 134 35 170 459 1 #10003 | 63 8 99 462 | 84 15 119 481 2 #10003 | 3 2 89 396 | 21 8 108 414 3 #10002 | 206 11 210 394 | 223 21 228 415 ---------------|----------------------------------------|--------------------------------------- ALL #40011 e0 | 2 137 462 | 8 156 481 Link: https://lkml.kernel.org/r/5eb510d6faeb4ce745e09395196752df75a2dd1a.1713968967.git.bristot@kernel.org Cc: Jonathan Corbet Suggested-by: Juri Lelli Signed-off-by: Daniel Bristot de Oliveira --- tools/tracing/rtla/src/timerlat_top.c | 108 ++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index c9cf90ed4e6d..bdcf8ef8f815 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -119,6 +119,37 @@ cleanup: return NULL; } +static void +timerlat_top_reset_sum(struct timerlat_top_cpu *summary) +{ + memset(summary, 0, sizeof(*summary)); + summary->min_irq = ~0; + summary->min_thread = ~0; + summary->min_user = ~0; +} + +static void +timerlat_top_update_sum(struct osnoise_tool *tool, int cpu, struct timerlat_top_cpu *sum) +{ + struct timerlat_top_data *data = tool->data; + struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu]; + + sum->irq_count += cpu_data->irq_count; + update_min(&sum->min_irq, &cpu_data->min_irq); + update_sum(&sum->sum_irq, &cpu_data->sum_irq); + update_max(&sum->max_irq, &cpu_data->max_irq); + + sum->thread_count += cpu_data->thread_count; + update_min(&sum->min_thread, &cpu_data->min_thread); + update_sum(&sum->sum_thread, &cpu_data->sum_thread); + update_max(&sum->max_thread, &cpu_data->max_thread); + + sum->user_count += cpu_data->user_count; + update_min(&sum->min_user, &cpu_data->min_user); + update_sum(&sum->sum_user, &cpu_data->sum_user); + update_max(&sum->max_user, &cpu_data->max_user); +} + /* * timerlat_hist_update - record a new timerlat occurent on cpu, updating data */ @@ -285,6 +316,77 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) } } +/* + * timerlat_top_print_sum - prints the summary output + */ +static void +timerlat_top_print_sum(struct osnoise_tool *top, struct timerlat_top_cpu *summary) +{ + const char *split = "----------------------------------------"; + struct timerlat_top_params *params = top->params; + unsigned long long count = summary->irq_count; + int divisor = params->output_divisor; + struct trace_seq *s = top->trace.seq; + int e = 0; + + if (divisor == 0) + return; + + /* + * Skip if no data is available: is this cpu offline? + */ + if (!summary->irq_count && !summary->thread_count) + return; + + while (count > 999999) { + e++; + count /= 10; + } + + trace_seq_printf(s, "%.*s|%.*s|%.*s", 15, split, 40, split, 39, split); + if (params->user_top) + trace_seq_printf(s, "-|%.*s", 39, split); + trace_seq_printf(s, "\n"); + + trace_seq_printf(s, "ALL #%-6llu e%d |", count, e); + + if (!summary->irq_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_irq / params->output_divisor); + trace_seq_printf(s, "%9llu ", (summary->sum_irq / summary->irq_count) / divisor); + trace_seq_printf(s, "%9llu |", summary->max_irq / divisor); + } + + if (!summary->thread_count) { + trace_seq_printf(s, "%s %s %s %s", no_value, no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_thread / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_thread / summary->thread_count) / divisor); + trace_seq_printf(s, "%9llu", summary->max_thread / divisor); + } + + if (!params->user_top) { + trace_seq_printf(s, "\n"); + return; + } + + trace_seq_printf(s, " |"); + + if (!summary->user_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_user / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_user / summary->user_count) / divisor); + trace_seq_printf(s, "%9llu\n", summary->max_user / divisor); + } +} + /* * clear_terminal - clears the output terminal */ @@ -301,6 +403,7 @@ static void timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *top) { struct trace_instance *trace = &top->trace; + struct timerlat_top_cpu summary; static int nr_cpus = -1; int i; @@ -313,14 +416,19 @@ timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *to if (!params->quiet) clear_terminal(trace->seq); + timerlat_top_reset_sum(&summary); + timerlat_top_header(params, top); for (i = 0; i < nr_cpus; i++) { if (params->cpus && !CPU_ISSET(i, ¶ms->monitored_cpus)) continue; timerlat_top_print(top, i); + timerlat_top_update_sum(top, i, &summary); } + timerlat_top_print_sum(top, &summary); + trace_seq_do_printf(trace->seq); trace_seq_reset(trace->seq); } -- cgit v1.2.3 From 1462501c7a8d565f5949d3d5635b2111d889aaaa Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 24 Apr 2024 16:36:54 +0200 Subject: rtla/timerlat: Add a summary for hist mode Like on rtla timerlat top, add an overall summary at the bottom of timerlat hist. For instance: # timerlat hist -c 0-1 -d 10s -E 20 # RTLA timerlat histogram # Time unit is microseconds (us) # Duration: 0 00:00:10 Index IRQ-000 Thr-000 IRQ-001 Thr-001 6 1 0 0 0 7 1 0 0 0 8 1 0 1 0 9 7 0 0 0 10 16 0 0 0 11 1 0 3 0 15 0 0 3 0 16 0 0 12 0 17 0 0 28 0 18 0 2 26 0 19 1 1 80 1 over: 9973 9998 9848 10000 count: 10001 10001 10001 10001 min: 6 18 8 19 avg: 185 204 95 113 max: 428 450 341 371 ALL: IRQ Thr count: 20002 20002 min: 6 18 avg: 140 159 max: 428 450 Link: https://lkml.kernel.org/r/a6bc06c798f72127edc57d1f99da8d57e1187cee.1713968967.git.bristot@kernel.org Cc: Jonathan Corbet Suggested-by: Juri Lelli Signed-off-by: Daniel Bristot de Oliveira --- tools/tracing/rtla/src/timerlat_hist.c | 130 ++++++++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 8bd51aab6513..b12c6b571dd4 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -401,8 +401,135 @@ timerlat_print_summary(struct timerlat_hist_params *params, trace_seq_reset(trace->seq); } +static void +timerlat_print_stats_all(struct timerlat_hist_params *params, + struct trace_instance *trace, + struct timerlat_hist_data *data) +{ + struct timerlat_hist_cpu *cpu_data; + struct timerlat_hist_cpu sum; + int cpu; + + if (params->no_summary) + return; + + memset(&sum, 0, sizeof(sum)); + sum.min_irq = ~0; + sum.min_thread = ~0; + sum.min_user = ~0; + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !CPU_ISSET(cpu, ¶ms->monitored_cpus)) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + cpu_data = &data->hist[cpu]; + + sum.irq_count += cpu_data->irq_count; + update_min(&sum.min_irq, &cpu_data->min_irq); + update_sum(&sum.sum_irq, &cpu_data->sum_irq); + update_max(&sum.max_irq, &cpu_data->max_irq); + + sum.thread_count += cpu_data->thread_count; + update_min(&sum.min_thread, &cpu_data->min_thread); + update_sum(&sum.sum_thread, &cpu_data->sum_thread); + update_max(&sum.max_thread, &cpu_data->max_thread); + + sum.user_count += cpu_data->user_count; + update_min(&sum.min_user, &cpu_data->min_user); + update_sum(&sum.sum_user, &cpu_data->sum_user); + update_max(&sum.max_user, &cpu_data->max_user); + } + + if (!params->no_index) + trace_seq_printf(trace->seq, "ALL: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, " IRQ"); + + if (!params->no_thread) + trace_seq_printf(trace->seq, " Thr"); + + if (params->user_hist) + trace_seq_printf(trace->seq, " Usr"); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "count:"); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9d ", + sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9d ", + sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9d ", + sum.user_count); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "min: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_user); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "avg: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_irq / sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_thread / sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_user / sum.user_count); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "max: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_user); + + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + /* - * timerlat_print_stats - print data for all CPUs + * timerlat_print_stats - print data for each CPUs */ static void timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *tool) @@ -485,6 +612,7 @@ timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *t trace_seq_reset(trace->seq); timerlat_print_summary(params, trace, data); + timerlat_print_stats_all(params, trace, data); } /* -- cgit v1.2.3 From cdbf71962bb07493d67fee34536a5724a8bb5886 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 24 Apr 2024 16:36:55 +0200 Subject: rtla: Add the --warm-up option On many cases, the results right after the startup are different from the rest of the execution, biasing the results. For example, on osnoise, the scheduler might take some time to adapt to the new busy-loop workload. Add the --warm-up option, adding a warm-up phase (in seconds) where the workload is set, but the results are discarded. Link: https://lkml.kernel.org/r/e682d5ce5af90f123bd13220f63d5c3d118a92be.1713968967.git.bristot@kernel.org Cc: Jonathan Corbet Cc: Juri Lelli Signed-off-by: Daniel Bristot de Oliveira --- Documentation/tools/rtla/common_options.rst | 4 +++ tools/tracing/rtla/src/osnoise_hist.c | 30 ++++++++++++++++-- tools/tracing/rtla/src/osnoise_top.c | 29 +++++++++++++++-- tools/tracing/rtla/src/timerlat_hist.c | 49 +++++++++++++++++++---------- tools/tracing/rtla/src/timerlat_top.c | 47 ++++++++++++++++----------- 5 files changed, 119 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/Documentation/tools/rtla/common_options.rst b/Documentation/tools/rtla/common_options.rst index aeb91ff3bd68..a96ea0ed662e 100644 --- a/Documentation/tools/rtla/common_options.rst +++ b/Documentation/tools/rtla/common_options.rst @@ -50,6 +50,10 @@ Set a *cgroup* to the tracer's threads. If the **-C** option is passed without arguments, the tracer's thread will inherit **rtla**'s *cgroup*. Otherwise, the threads will be placed on the *cgroup* passed to the option. +**--warm-up** *s* + + After starting the workload, let it run for *s* seconds before starting collecting the data, allowing the system to warm-up. Statistical data generated during warm-up is discarded. + **-h**, **--help** Print help menu. diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 01870d50942a..c6100ff46a7f 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -36,13 +36,13 @@ struct osnoise_hist_params { cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; - char no_header; char no_summary; char no_index; char with_zeros; int bucket_size; int entries; + int warmup; }; struct osnoise_hist_cpu { @@ -438,7 +438,7 @@ static void osnoise_hist_usage(char *usage) " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", " [-T us] [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [-C[=cgroup_name]]", + " [--no-index] [--with-zeros] [-C[=cgroup_name]] [--warm-up]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -468,6 +468,7 @@ static void osnoise_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up: let the workload run for s seconds before collecting data", NULL, }; @@ -531,13 +532,14 @@ static struct osnoise_hist_params {"with-zeros", no_argument, 0, '3'}, {"trigger", required_argument, 0, '4'}, {"filter", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:", long_options, &option_index); /* detect the end of the options. */ @@ -680,6 +682,9 @@ static struct osnoise_hist_params osnoise_hist_usage("--filter requires a previous -e\n"); } break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; default: osnoise_hist_usage("Invalid option"); } @@ -899,6 +904,25 @@ int osnoise_hist_main(int argc, char *argv[]) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. + */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_hist; + } + + } + tool->start_time = time(NULL); osnoise_hist_set_signals(params); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 457360db0767..53a074c1222e 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -40,6 +40,7 @@ struct osnoise_top_params { int set_sched; int cgroup; int hk_cpus; + int warmup; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -282,7 +283,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) static const char * const msg[] = { " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", " [-T us] [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] \\", - " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]]", + " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -307,6 +308,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up s: let the workload run for s seconds before collecting data", NULL, }; @@ -381,13 +383,14 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) {"trace", optional_argument, 0, 't'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, + {"warm-up", required_argument, 0, '2'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:", long_options, &option_index); /* Detect the end of the options. */ @@ -511,6 +514,9 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) osnoise_top_usage(params, "--filter requires a previous -e\n"); } break; + case '2': + params->warmup = get_llong_from_str(optarg); + break; default: osnoise_top_usage(params, "Invalid option"); } @@ -732,6 +738,25 @@ int osnoise_top_main(int argc, char **argv) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_top; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. + */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_top; + } + + } + tool->start_time = time(NULL); osnoise_top_set_signals(params); diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index b12c6b571dd4..da1c353bdac9 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -52,6 +52,7 @@ struct timerlat_hist_params { char with_zeros; int bucket_size; int entries; + int warmup; }; struct timerlat_hist_cpu { @@ -628,6 +629,7 @@ static void timerlat_hist_usage(char *usage) " [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u]", + " [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -664,6 +666,7 @@ static void timerlat_hist_usage(char *usage) " in nanoseconds", " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", NULL, }; @@ -738,13 +741,14 @@ static struct timerlat_hist_params {"dma-latency", required_argument, 0, '8'}, {"no-aa", no_argument, 0, '9'}, {"dump-task", no_argument, 0, '\1'}, + {"warm-up", required_argument, 0, '\2'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:uU0123456:7:8:9\1", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:uU0123456:7:8:9\1\2:", long_options, &option_index); /* detect the end of the options. */ @@ -913,6 +917,9 @@ static struct timerlat_hist_params case '\1': params->dump_tasks = 1; break; + case '\2': + params->warmup = get_llong_from_str(optarg); + break; default: timerlat_hist_usage("Invalid option"); } @@ -1167,22 +1174,6 @@ int timerlat_hist_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - tool->start_time = time(NULL); - timerlat_hist_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -1202,6 +1193,29 @@ int timerlat_hist_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + } + + /* + * Start the tracers here, after having set all instances. + * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. + */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + tool->start_time = time(NULL); + timerlat_hist_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); @@ -1227,6 +1241,7 @@ int timerlat_hist_main(int argc, char *argv[]) } } } + if (params->user_workload && !params_u.stopped_running) { params_u.should_run = 0; sleep(1); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index bdcf8ef8f815..410eb5174913 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -45,6 +45,7 @@ struct timerlat_top_params { int user_top; int user_workload; int pretty_output; + int warmup; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -444,7 +445,7 @@ static void timerlat_top_usage(char *usage) "", " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", " [[-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", - " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u]", + " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -475,6 +476,7 @@ static void timerlat_top_usage(char *usage) " in nanoseconds", " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", NULL, }; @@ -541,13 +543,14 @@ static struct timerlat_top_params {"no-aa", no_argument, 0, '3'}, {"dump-tasks", no_argument, 0, '4'}, {"aa-only", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:uU0:1:2:345:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:uU0:1:2:345:6:", long_options, &option_index); /* detect the end of the options. */ @@ -704,6 +707,9 @@ static struct timerlat_top_params case '4': params->dump_tasks = 1; break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; default: timerlat_top_usage("Invalid option"); } @@ -971,22 +977,6 @@ int timerlat_top_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa && aa != top) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - top->start_time = time(NULL); - timerlat_top_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -1006,6 +996,27 @@ int timerlat_top_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + } + + /* + * Start the tracers here, after having set all instances. + * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. + */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa && aa != top) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + top->start_time = time(NULL); + timerlat_top_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); -- cgit v1.2.3 From fb9e90a67ee9a42779a8ea296a4cf7734258b27d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 24 Apr 2024 16:36:56 +0200 Subject: rtla/timerlat: Make user-space threads the default After ther -u addition, most of the known users are setting it. And it makes sense, as it adds more information, and inherits the default setup for the threads - e.g., cgroups configs. Thus, if the user-space interface is available, enable -u. Otherwise, use the in-kernel thread. Add the -k option to allow the user to request kernel-threads. Link: https://lkml.kernel.org/r/9241d3089de4091b124f780ed832a0e6646cadaa.1713968967.git.bristot@kernel.org Cc: Jonathan Corbet Cc: Juri Lelli Signed-off-by: Daniel Bristot de Oliveira --- .../tools/rtla/common_timerlat_options.rst | 6 ++++- tools/tracing/rtla/src/timerlat_hist.c | 31 +++++++++++++++++++--- tools/tracing/rtla/src/timerlat_top.c | 31 +++++++++++++++++++--- 3 files changed, 61 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/Documentation/tools/rtla/common_timerlat_options.rst b/Documentation/tools/rtla/common_timerlat_options.rst index d3255ed70195..090700a6ae9f 100644 --- a/Documentation/tools/rtla/common_timerlat_options.rst +++ b/Documentation/tools/rtla/common_timerlat_options.rst @@ -27,12 +27,16 @@ *cyclictest* sets this value to *0* by default, use **--dma-latency** *0* to have similar results. +**-k**, **--kernel-threads** + + Use timerlat kernel-space threads, in contrast of **-u**. + **-u**, **--user-threads** Set timerlat to run without a workload, and then dispatches user-space workloads to wait on the timerlat_fd. Once the workload is awakes, it goes to sleep again adding so the measurement for the kernel-to-user and user-to-kernel to the tracer - output. + output. **--user-threads** will be used unless the user specify **-k**. **-U**, **--user-load** diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index da1c353bdac9..6eb6e38d4a05 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -40,6 +40,7 @@ struct timerlat_hist_params { int no_aa; int dump_tasks; int user_workload; + int kernel_workload; int user_hist; cpu_set_t hk_cpu_set; struct sched_attr sched_param; @@ -628,7 +629,7 @@ static void timerlat_hist_usage(char *usage) " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", " [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u]", + " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", " [--warm-up s]", "", " -h/--help: print this menu", @@ -664,7 +665,8 @@ static void timerlat_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", " --warm-up s: let the workload run for s seconds before collecting data", NULL, @@ -728,6 +730,7 @@ static struct timerlat_hist_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"event", required_argument, 0, 'e'}, {"no-irq", no_argument, 0, '0'}, @@ -748,7 +751,7 @@ static struct timerlat_hist_params /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:uU0123456:7:8:9\1\2:", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:", long_options, &option_index); /* detect the end of the options. */ @@ -831,6 +834,9 @@ static struct timerlat_hist_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = 1; + break; case 'n': params->output_divisor = 1; break; @@ -942,6 +948,9 @@ static struct timerlat_hist_params if (!params->stop_us && !params->stop_total_us) params->no_aa = 1; + if (params->kernel_workload && params->user_workload) + timerlat_hist_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -1017,6 +1026,22 @@ timerlat_hist_apply_config(struct osnoise_tool *tool, struct timerlat_hist_param auto_house_keeping(¶ms->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. + */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_hist = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_hist) { retval = osnoise_set_workload(tool->context, 0); if (retval) { diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 410eb5174913..0acfefe151f7 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -44,6 +44,7 @@ struct timerlat_top_params { int hk_cpus; int user_top; int user_workload; + int kernel_workload; int pretty_output; int warmup; cpu_set_t hk_cpu_set; @@ -445,7 +446,7 @@ static void timerlat_top_usage(char *usage) "", " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", " [[-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", - " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u] [--warm-up s]", + " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -474,7 +475,8 @@ static void timerlat_top_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", " --warm-up s: let the workload run for s seconds before collecting data", NULL, @@ -536,6 +538,7 @@ static struct timerlat_top_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, @@ -550,7 +553,7 @@ static struct timerlat_top_params /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:uU0:1:2:345:6:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:", long_options, &option_index); /* detect the end of the options. */ @@ -635,6 +638,9 @@ static struct timerlat_top_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = true; + break; case 'n': params->output_divisor = 1; break; @@ -729,6 +735,9 @@ static struct timerlat_top_params if (params->no_aa && params->aa_only) timerlat_top_usage("--no-aa and --aa-only are mutually exclusive!"); + if (params->kernel_workload && params->user_workload) + timerlat_top_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -807,6 +816,22 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * auto_house_keeping(¶ms->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. + */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_top = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_top) { retval = osnoise_set_workload(top->context, 0); if (retval) { -- cgit v1.2.3 From 1de27bba6d50a909647f304eadc0f7c59a842a50 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 13 May 2024 11:08:03 -0700 Subject: libbpf: fix feature detectors when using token_fd Adjust `union bpf_attr` size passed to kernel in two feature-detecting functions to take into account prog_token_fd field. Libbpf is avoiding memset()'ing entire `union bpf_attr` by only using minimal set of bpf_attr's fields. Two places have been missed when wiring BPF token support in libbpf's feature detection logic. Fix them trivially. Fixes: f3dcee938f48 ("libbpf: Wire up token_fd into feature probing logic") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240513180804.403775-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 2 +- tools/lib/bpf/features.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 466a29d80124..2a4c71501a17 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -105,7 +105,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) */ int probe_memcg_account(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), BPF_EXIT_INSN(), diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 4e783cc7fc4b..a336786a22a3 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -22,7 +22,7 @@ int probe_fd(int fd) static int probe_kern_prog_name(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), -- cgit v1.2.3 From 2322113ac9d0c5653017adbab504fb307b0e92e2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 14 May 2024 23:24:40 -0700 Subject: selftests/bpf: add more variations of map-in-map situations Add test cases validating usage of PERCPU_ARRAY and PERCPU_HASH maps as inner maps. Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240515062440.846086-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/map_kptr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index da30f0d59364..ab0ce1d01a4a 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -110,10 +110,14 @@ DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_map, array_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_map, array_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_malloc_map, array_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, lru_hash_map, array_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_array_map, array_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_hash_map, array_of_pcpu_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, array_map, hash_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_map, hash_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_malloc_map, hash_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, lru_hash_map, hash_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_array_map, hash_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_hash_map, hash_of_pcpu_hash_maps); #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) @@ -204,6 +208,8 @@ int test_map_kptr(struct __sk_buff *ctx) TEST(hash_map); TEST(hash_malloc_map); TEST(lru_hash_map); + TEST(pcpu_array_map); + TEST(pcpu_hash_map); #undef TEST return 0; @@ -281,10 +287,14 @@ int test_map_in_map_kptr(struct __sk_buff *ctx) TEST(array_of_hash_maps); TEST(array_of_hash_malloc_maps); TEST(array_of_lru_hash_maps); + TEST(array_of_pcpu_array_maps); + TEST(array_of_pcpu_hash_maps); TEST(hash_of_array_maps); TEST(hash_of_hash_maps); TEST(hash_of_hash_malloc_maps); TEST(hash_of_lru_hash_maps); + TEST(hash_of_pcpu_array_maps); + TEST(hash_of_pcpu_hash_maps); #undef TEST return 0; -- cgit v1.2.3 From cba23f333fedf8e39743b0c9787b45a5bd7d03af Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 May 2024 13:40:08 -0400 Subject: selftests/kvm: remove dead file This file was supposed to be removed in commit 2b7deea3ec7c ("Revert "kvm: selftests: move base kvm_util.h declarations to kvm_util_base.h""), but it survived. Remove it now. Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 1135 -------------------- 1 file changed, 1135 deletions(-) delete mode 100644 tools/testing/selftests/kvm/include/kvm_util_base.h (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h deleted file mode 100644 index e850269a3219..000000000000 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ /dev/null @@ -1,1135 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/kvm_util_base.h - * - * Copyright (C) 2018, Google LLC. - */ -#ifndef SELFTEST_KVM_UTIL_BASE_H -#define SELFTEST_KVM_UTIL_BASE_H - -#include "test_util.h" - -#include -#include "linux/hashtable.h" -#include "linux/list.h" -#include -#include -#include "linux/rbtree.h" -#include - -#include -#include - -#include - -#include "kvm_util_arch.h" -#include "sparsebit.h" - -/* - * Provide a version of static_assert() that is guaranteed to have an optional - * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE - * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and - * #defines static_assert() as a direct alias to _Static_assert() (see - * usr/include/assert.h). Define a custom macro instead of redefining - * static_assert() to avoid creating non-deterministic behavior that is - * dependent on include order. - */ -#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) -#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) - -#define KVM_DEV_PATH "/dev/kvm" -#define KVM_MAX_VCPUS 512 - -#define NSEC_PER_SEC 1000000000L - -typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ -typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ - -struct userspace_mem_region { - struct kvm_userspace_memory_region2 region; - struct sparsebit *unused_phy_pages; - struct sparsebit *protected_phy_pages; - int fd; - off_t offset; - enum vm_mem_backing_src_type backing_src_type; - void *host_mem; - void *host_alias; - void *mmap_start; - void *mmap_alias; - size_t mmap_size; - struct rb_node gpa_node; - struct rb_node hva_node; - struct hlist_node slot_node; -}; - -struct kvm_vcpu { - struct list_head list; - uint32_t id; - int fd; - struct kvm_vm *vm; - struct kvm_run *run; -#ifdef __x86_64__ - struct kvm_cpuid2 *cpuid; -#endif - struct kvm_dirty_gfn *dirty_gfns; - uint32_t fetch_index; - uint32_t dirty_gfns_count; -}; - -struct userspace_mem_regions { - struct rb_root gpa_tree; - struct rb_root hva_tree; - DECLARE_HASHTABLE(slot_hash, 9); -}; - -enum kvm_mem_region_type { - MEM_REGION_CODE, - MEM_REGION_DATA, - MEM_REGION_PT, - MEM_REGION_TEST_DATA, - NR_MEM_REGIONS, -}; - -struct kvm_vm { - int mode; - unsigned long type; - int kvm_fd; - int fd; - unsigned int pgtable_levels; - unsigned int page_size; - unsigned int page_shift; - unsigned int pa_bits; - unsigned int va_bits; - uint64_t max_gfn; - struct list_head vcpus; - struct userspace_mem_regions regions; - struct sparsebit *vpages_valid; - struct sparsebit *vpages_mapped; - bool has_irqchip; - bool pgd_created; - vm_paddr_t ucall_mmio_addr; - vm_paddr_t pgd; - vm_vaddr_t gdt; - vm_vaddr_t tss; - vm_vaddr_t idt; - vm_vaddr_t handlers; - uint32_t dirty_ring_size; - uint64_t gpa_tag_mask; - - struct kvm_vm_arch arch; - - /* Cache of information for binary stats interface */ - int stats_fd; - struct kvm_stats_header stats_header; - struct kvm_stats_desc *stats_desc; - - /* - * KVM region slots. These are the default memslots used by page - * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] - * memslot. - */ - uint32_t memslots[NR_MEM_REGIONS]; -}; - -struct vcpu_reg_sublist { - const char *name; - long capability; - int feature; - int feature_type; - bool finalize; - __u64 *regs; - __u64 regs_n; - __u64 *rejects_set; - __u64 rejects_set_n; - __u64 *skips_set; - __u64 skips_set_n; -}; - -struct vcpu_reg_list { - char *name; - struct vcpu_reg_sublist sublists[]; -}; - -#define for_each_sublist(c, s) \ - for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) - -#define kvm_for_each_vcpu(vm, i, vcpu) \ - for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ - if (!((vcpu) = vm->vcpus[i])) \ - continue; \ - else - -struct userspace_mem_region * -memslot2region(struct kvm_vm *vm, uint32_t memslot); - -static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, - enum kvm_mem_region_type type) -{ - assert(type < NR_MEM_REGIONS); - return memslot2region(vm, vm->memslots[type]); -} - -/* Minimum allocated guest virtual and physical addresses */ -#define KVM_UTIL_MIN_VADDR 0x2000 -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - -#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 -#define DEFAULT_STACK_PGS 5 - -enum vm_guest_mode { - VM_MODE_P52V48_4K, - VM_MODE_P52V48_16K, - VM_MODE_P52V48_64K, - VM_MODE_P48V48_4K, - VM_MODE_P48V48_16K, - VM_MODE_P48V48_64K, - VM_MODE_P40V48_4K, - VM_MODE_P40V48_16K, - VM_MODE_P40V48_64K, - VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ - VM_MODE_P47V64_4K, - VM_MODE_P44V64_4K, - VM_MODE_P36V48_4K, - VM_MODE_P36V48_16K, - VM_MODE_P36V48_64K, - VM_MODE_P36V47_16K, - NUM_VM_MODES, -}; - -struct vm_shape { - uint32_t type; - uint8_t mode; - uint8_t pad0; - uint16_t pad1; -}; - -kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); - -#define VM_TYPE_DEFAULT 0 - -#define VM_SHAPE(__mode) \ -({ \ - struct vm_shape shape = { \ - .mode = (__mode), \ - .type = VM_TYPE_DEFAULT \ - }; \ - \ - shape; \ -}) - -#if defined(__aarch64__) - -extern enum vm_guest_mode vm_mode_default; - -#define VM_MODE_DEFAULT vm_mode_default -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#elif defined(__x86_64__) - -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#elif defined(__s390x__) - -#define VM_MODE_DEFAULT VM_MODE_P44V64_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 16) - -#elif defined(__riscv) - -#if __riscv_xlen == 32 -#error "RISC-V 32-bit kvm selftests not supported" -#endif - -#define VM_MODE_DEFAULT VM_MODE_P40V48_4K -#define MIN_PAGE_SHIFT 12U -#define ptes_per_page(page_size) ((page_size) / 8) - -#endif - -#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) - -#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) -#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) - -struct vm_guest_mode_params { - unsigned int pa_bits; - unsigned int va_bits; - unsigned int page_size; - unsigned int page_shift; -}; -extern const struct vm_guest_mode_params vm_guest_mode_params[]; - -int open_path_or_exit(const char *path, int flags); -int open_kvm_dev_path_or_exit(void); - -bool get_kvm_param_bool(const char *param); -bool get_kvm_intel_param_bool(const char *param); -bool get_kvm_amd_param_bool(const char *param); - -int get_kvm_param_integer(const char *param); -int get_kvm_intel_param_integer(const char *param); -int get_kvm_amd_param_integer(const char *param); - -unsigned int kvm_check_cap(long cap); - -static inline bool kvm_has_cap(long cap) -{ - return kvm_check_cap(cap); -} - -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -/* - * Use the "inner", double-underscore macro when reporting errors from within - * other macros so that the name of ioctl() and not its literal numeric value - * is printed on error. The "outer" macro is strongly preferred when reporting - * errors "directly", i.e. without an additional layer of macros, as it reduces - * the probability of passing in the wrong string. - */ -#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) -#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) - -#define kvm_do_ioctl(fd, cmd, arg) \ -({ \ - kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ - ioctl(fd, cmd, arg); \ -}) - -#define __kvm_ioctl(kvm_fd, cmd, arg) \ - kvm_do_ioctl(kvm_fd, cmd, arg) - -#define kvm_ioctl(kvm_fd, cmd, arg) \ -({ \ - int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ - \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ -}) - -static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } - -#define __vm_ioctl(vm, cmd, arg) \ -({ \ - static_assert_is_vm(vm); \ - kvm_do_ioctl((vm)->fd, cmd, arg); \ -}) - -/* - * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if - * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, - * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before - * selftests existed and (b) should never outright fail, i.e. is supposed to - * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the - * VM and its vCPUs, including KVM_CHECK_EXTENSION. - */ -#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ -do { \ - int __errno = errno; \ - \ - static_assert_is_vm(vm); \ - \ - if (cond) \ - break; \ - \ - if (errno == EIO && \ - __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ - TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ - TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ - } \ - errno = __errno; \ - TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ -} while (0) - -#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ - __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) - -#define vm_ioctl(vm, cmd, arg) \ -({ \ - int ret = __vm_ioctl(vm, cmd, arg); \ - \ - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ -}) - -static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } - -#define __vcpu_ioctl(vcpu, cmd, arg) \ -({ \ - static_assert_is_vcpu(vcpu); \ - kvm_do_ioctl((vcpu)->fd, cmd, arg); \ -}) - -#define vcpu_ioctl(vcpu, cmd, arg) \ -({ \ - int ret = __vcpu_ioctl(vcpu, cmd, arg); \ - \ - __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ -}) - -/* - * Looks up and returns the value corresponding to the capability - * (KVM_CAP_*) given by cap. - */ -static inline int vm_check_cap(struct kvm_vm *vm, long cap) -{ - int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); - - TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); - return ret; -} - -static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); -} -static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); -} - -static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, - uint64_t size, uint64_t attributes) -{ - struct kvm_memory_attributes attr = { - .attributes = attributes, - .address = gpa, - .size = size, - .flags = 0, - }; - - /* - * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows - * need significant enhancements to support multiple attributes. - */ - TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, - "Update me to support multiple attributes!"); - - vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); -} - - -static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); -} - -static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_set_memory_attributes(vm, gpa, size, 0); -} - -void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, - bool punch_hole); - -static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_guest_mem_fallocate(vm, gpa, size, true); -} - -static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, - uint64_t size) -{ - vm_guest_mem_fallocate(vm, gpa, size, false); -} - -void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); -const char *vm_guest_mode_string(uint32_t i); - -void kvm_vm_free(struct kvm_vm *vmp); -void kvm_vm_restart(struct kvm_vm *vmp); -void kvm_vm_release(struct kvm_vm *vmp); -int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, - size_t len); -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); -int kvm_memfd_alloc(size_t size, bool hugepages); - -void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) -{ - struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; - - vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); -} - -static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages) -{ - struct kvm_clear_dirty_log args = { - .dirty_bitmap = log, - .slot = slot, - .first_page = first_page, - .num_pages = num_pages - }; - - vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); -} - -static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) -{ - return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); -} - -static inline int vm_get_stats_fd(struct kvm_vm *vm) -{ - int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); - - TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); - return fd; -} - -static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) -{ - ssize_t ret; - - ret = pread(stats_fd, header, sizeof(*header), 0); - TEST_ASSERT(ret == sizeof(*header), - "Failed to read '%lu' header bytes, ret = '%ld'", - sizeof(*header), ret); -} - -struct kvm_stats_desc *read_stats_descriptors(int stats_fd, - struct kvm_stats_header *header); - -static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) -{ - /* - * The base size of the descriptor is defined by KVM's ABI, but the - * size of the name field is variable, as far as KVM's ABI is - * concerned. For a given instance of KVM, the name field is the same - * size for all stats and is provided in the overall stats header. - */ - return sizeof(struct kvm_stats_desc) + header->name_size; -} - -static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, - int index, - struct kvm_stats_header *header) -{ - /* - * Note, size_desc includes the size of the name field, which is - * variable. i.e. this is NOT equivalent to &stats_desc[i]. - */ - return (void *)stats + index * get_stats_descriptor_size(header); -} - -void read_stat_data(int stats_fd, struct kvm_stats_header *header, - struct kvm_stats_desc *desc, uint64_t *data, - size_t max_elements); - -void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, - size_t max_elements); - -static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) -{ - uint64_t data; - - __vm_get_stat(vm, stat_name, &data, 1); - return data; -} - -void vm_create_irqchip(struct kvm_vm *vm); - -static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, - uint64_t flags) -{ - struct kvm_create_guest_memfd guest_memfd = { - .size = size, - .flags = flags, - }; - - return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); -} - -static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, - uint64_t flags) -{ - int fd = __vm_create_guest_memfd(vm, size, flags); - - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); - return fd; -} - -void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva); -int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva); -void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva, - uint32_t guest_memfd, uint64_t guest_memfd_offset); -int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, - uint64_t gpa, uint64_t size, void *hva, - uint32_t guest_memfd, uint64_t guest_memfd_offset); - -void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags); -void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); - -#ifndef vm_arch_has_protected_memory -static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) -{ - return false; -} -#endif - -void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); -void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); -void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); -struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); -void vm_populate_vaddr_bitmap(struct kvm_vm *vm); -vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); -vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, - vm_vaddr_t vaddr_min, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); -vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, - enum kvm_mem_region_type type); -vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); - -void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages); -void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); -void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); -vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); -void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); - -#ifndef vcpu_arch_put_guest -#define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0) -#endif - -static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) -{ - return gpa & ~vm->gpa_tag_mask; -} - -void vcpu_run(struct kvm_vcpu *vcpu); -int _vcpu_run(struct kvm_vcpu *vcpu); - -static inline int __vcpu_run(struct kvm_vcpu *vcpu) -{ - return __vcpu_ioctl(vcpu, KVM_RUN, NULL); -} - -void vcpu_run_complete_io(struct kvm_vcpu *vcpu); -struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); - -static inline void vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, - uint64_t arg0) -{ - struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - - vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); -} - -static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *debug) -{ - vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); -} - -static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); -} -static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); -} - -static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_ioctl(vcpu, KVM_GET_REGS, regs); -} - -static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu_ioctl(vcpu, KVM_SET_REGS, regs); -} -static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); - -} -static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); -} -static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); -} -static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); -} -static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); -} - -static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - - return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); -} -static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - - return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); -} -static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - - vcpu_ioctl(vcpu, KVM_GET_ONE_REG, ®); -} -static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - - vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); -} - -#ifdef __KVM_HAVE_VCPU_EVENTS -static inline void vcpu_events_get(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); -} -static inline void vcpu_events_set(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); -} -#endif -#ifdef __x86_64__ -static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); -} -static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); -} - -static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); -} -#endif -static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) -{ - int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); - - TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); - return fd; -} - -int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); - -static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) -{ - int ret = __kvm_has_device_attr(dev_fd, group, attr); - - TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); -} - -int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); - -static inline void kvm_device_attr_get(int dev_fd, uint32_t group, - uint64_t attr, void *val) -{ - int ret = __kvm_device_attr_get(dev_fd, group, attr, val); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); -} - -int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); - -static inline void kvm_device_attr_set(int dev_fd, uint32_t group, - uint64_t attr, void *val) -{ - int ret = __kvm_device_attr_set(dev_fd, group, attr, val); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); -} - -static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr) -{ - return __kvm_has_device_attr(vcpu->fd, group, attr); -} - -static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr) -{ - kvm_has_device_attr(vcpu->fd, group, attr); -} - -static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_get(vcpu->fd, group, attr, val); -} - -static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_get(vcpu->fd, group, attr, val); -} - -static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_set(vcpu->fd, group, attr, val); -} - -static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_set(vcpu->fd, group, attr, val); -} - -int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); -int __kvm_create_device(struct kvm_vm *vm, uint64_t type); - -static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) -{ - int fd = __kvm_create_device(vm, type); - - TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); - return fd; -} - -void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); - -/* - * VM VCPU Args Set - * - * Input Args: - * vm - Virtual Machine - * num - number of arguments - * ... - arguments, each of type uint64_t - * - * Output Args: None - * - * Return: None - * - * Sets the first @num input parameters for the function at @vcpu's entry point, - * per the C calling convention of the architecture, to the values given as - * variable args. Each of the variable args is expected to be of type uint64_t. - * The maximum @num can be is specific to the architecture. - */ -void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); - -void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); -int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); - -#define KVM_MAX_IRQ_ROUTES 4096 - -struct kvm_irq_routing *kvm_gsi_routing_create(void); -void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, - uint32_t gsi, uint32_t pin); -int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); -void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); - -const char *exit_reason_str(unsigned int exit_reason); - -vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, - uint32_t memslot); -vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot, - bool protected); -vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); - -static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot) -{ - /* - * By default, allocate memory as protected for VMs that support - * protected memory, as the majority of memory for such VMs is - * protected, i.e. using shared memory is effectively opt-in. - */ - return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, - vm_arch_has_protected_memory(vm)); -} - -/* - * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also - * loads the test binary into guest memory and creates an IRQ chip (x86 only). - * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to - * calculate the amount of memory needed for per-vCPU data, e.g. stacks. - */ -struct kvm_vm *____vm_create(struct vm_shape shape); -struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, - uint64_t nr_extra_pages); - -static inline struct kvm_vm *vm_create_barebones(void) -{ - return ____vm_create(VM_SHAPE_DEFAULT); -} - -static inline struct kvm_vm *vm_create_barebones_type(unsigned long type) -{ - const struct vm_shape shape = { - .mode = VM_MODE_DEFAULT, - .type = type, - }; - - return ____vm_create(shape); -} - -static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) -{ - return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); -} - -struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, - uint64_t extra_mem_pages, - void *guest_code, struct kvm_vcpu *vcpus[]); - -static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, - void *guest_code, - struct kvm_vcpu *vcpus[]) -{ - return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, - guest_code, vcpus); -} - - -struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, - struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code); - -/* - * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages - * additional pages of guest memory. Returns the VM and vCPU (via out param). - */ -static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code) -{ - return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, - extra_mem_pages, guest_code); -} - -static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - void *guest_code) -{ - return __vm_create_with_one_vcpu(vcpu, 0, guest_code); -} - -static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, - struct kvm_vcpu **vcpu, - void *guest_code) -{ - return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); -} - -struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); - -void kvm_pin_this_task_to_pcpu(uint32_t pcpu); -void kvm_print_vcpu_pinning_help(void); -void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], - int nr_vcpus); - -unsigned long vm_compute_max_gfn(struct kvm_vm *vm); -unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); -unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); -unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); -static inline unsigned int -vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) -{ - unsigned int n; - n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); -#ifdef __s390x__ - /* s390 requires 1M aligned guest sizes */ - n = (n + 255) & ~255; -#endif - return n; -} - -#define sync_global_to_guest(vm, g) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - memcpy(_p, &(g), sizeof(g)); \ -}) - -#define sync_global_from_guest(vm, g) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - memcpy(&(g), _p, sizeof(g)); \ -}) - -/* - * Write a global value, but only in the VM's (guest's) domain. Primarily used - * for "globals" that hold per-VM values (VMs always duplicate code and global - * data into their own region of physical memory), but can be used anytime it's - * undesirable to change the host's copy of the global. - */ -#define write_guest_global(vm, g, val) ({ \ - typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ - typeof(g) _val = val; \ - \ - memcpy(_p, &(_val), sizeof(g)); \ -}) - -void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); - -void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, - uint8_t indent); - -static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, - uint8_t indent) -{ - vcpu_arch_dump(stream, vcpu, indent); -} - -/* - * Adds a vCPU with reasonable defaults (e.g. a stack) - * - * Input Args: - * vm - Virtual Machine - * vcpu_id - The id of the VCPU to add to the VM. - */ -struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); -void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); - -static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, - void *guest_code) -{ - struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); - - vcpu_arch_set_entry_point(vcpu, guest_code); - - return vcpu; -} - -/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */ -struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); - -static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, - uint32_t vcpu_id) -{ - return vm_arch_vcpu_recreate(vm, vcpu_id); -} - -void vcpu_arch_free(struct kvm_vcpu *vcpu); - -void virt_arch_pgd_alloc(struct kvm_vm *vm); - -static inline void virt_pgd_alloc(struct kvm_vm *vm) -{ - virt_arch_pgd_alloc(vm); -} - -/* - * VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * vaddr - VM Virtual Address - * paddr - VM Physical Address - * memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within @vm, creates a virtual translation for the page starting - * at @vaddr to the page starting at @paddr. - */ -void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); - -static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) -{ - virt_arch_pg_map(vm, vaddr, paddr); -} - - -/* - * Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gva - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Returns the VM physical address of the translated VM virtual - * address given by @gva. - */ -vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); - -static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) -{ - return addr_arch_gva2gpa(vm, gva); -} - -/* - * Virtual Translation Tables Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps to the FILE stream given by @stream, the contents of all the - * virtual translation tables for the VM given by @vm. - */ -void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) -{ - virt_arch_dump(stream, vm, indent); -} - - -static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) -{ - return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); -} - -/* - * Arch hook that is invoked via a constructor, i.e. before exeucting main(), - * to allow for arch-specific setup that is common to all tests, e.g. computing - * the default guest "mode". - */ -void kvm_selftest_arch_init(void); - -void kvm_arch_vm_post_create(struct kvm_vm *vm); - -bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); - -uint32_t guest_get_vcpuid(void); - -#endif /* SELFTEST_KVM_UTIL_BASE_H */ -- cgit v1.2.3 From 78464d7681f79bb48995c3d29d7e93d27ba69bca Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 24 Apr 2024 21:12:18 -0400 Subject: tools/power turbostat: Add columns for clustered uncore frequency New machines have multiple uncore frequencies per package, visible in /sys/devices/system/cpu/intel_uncore_frequency/uncore##/ turbostat now samples these frequencies each measurement interval. For each package, turbostat now prints "UMHzX.Y" columns, where X = domain_id, and Y = fabric_cluster_id. The system summary for each UMHzX.Y column is the average value for across all of the packages in the system. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 4 +- tools/power/x86/turbostat/turbostat.c | 342 ++++++++++++++++++++++------------ 2 files changed, 226 insertions(+), 120 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d3672e5d9ed..8d37acd39201 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -155,7 +155,9 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .PP -\fBUncMHz\fP uncore MHz, instantaneous sample. +\fBUncMHz\fP per-package uncore MHz, instantaneous sample. +.PP +\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. This is ideal for remote debugging, use the "--out" option to save everything to a text file, and get that file to the expert helping you debug. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f92b46cfda31..abfddff6aebd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -59,15 +59,21 @@ #define MAX_NOFILE 0x8000 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; -enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; -enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +struct sysfs_path { + char path[PATH_BYTES]; + int id; + struct sysfs_path *next; +}; + struct msr_counter { unsigned int msr_num; char name[NAME_BYTES]; - char path[PATH_BYTES]; + struct sysfs_path *sp; unsigned int width; enum counter_type type; enum counter_format format; @@ -79,64 +85,64 @@ struct msr_counter { }; struct msr_counter bic[] = { - { 0x0, "usec", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Package", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Node", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Bzy_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "TSC_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Core", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU", "", 0, 0, 0, NULL, 0 }, - { 0x0, "APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Die", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, - { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "usec", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Package", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Node", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, + { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Core", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -300,6 +306,9 @@ struct gfx_sysfs_info { static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); +int add_counter(unsigned int msr_num, char *path, char *name, + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags, int package_num); /* Model specific support Start */ @@ -999,8 +1008,9 @@ char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; -#define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 +#define MAX_ADDED_CORE_COUNTERS 8 +#define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 /* Indexes used to map data read from perf and MSRs into global variables */ @@ -1201,7 +1211,7 @@ struct core_data { struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1234,7 +1244,7 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1955,13 +1965,15 @@ void print_header(char *delim) if (mp->format == FORMAT_RAW) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else + else if (mp->width == 32) outp += sprintf(outp, "%s%10.10s", delim, mp->name); + else + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } else { if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) outp += sprintf(outp, "%s%8s", delim, mp->name); else - outp += sprintf(outp, "%s%s", delim, mp->name); + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } } @@ -1993,7 +2005,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - t->counter[i], mp->path); + t->counter[i], mp->sp->path); } } @@ -2014,7 +2026,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - c->counter[i], mp->path); + c->counter[i], mp->sp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -2050,7 +2062,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { outp += sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - p->counter[i], mp->path); + p->counter[i], mp->sp->path); } } @@ -2415,7 +2427,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); - } + } else if (mp->type == COUNTER_K2M) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000); } done: @@ -2525,6 +2538,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; + else if (mp->format == FORMAT_AVERAGE) + old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; } @@ -2997,7 +3012,7 @@ unsigned long long snapshot_sysfs_counter(char *path) return counter; } -int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) +int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path) { if (mp->msr_num != 0) { assert(!no_msr); @@ -3007,11 +3022,11 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path); *counterp = snapshot_sysfs_counter(path); } else { - *counterp = snapshot_sysfs_counter(mp->path); + *counterp = snapshot_sysfs_counter(counter_path); } } @@ -3486,6 +3501,18 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data return 0; } +char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) +{ + while (sp) { + if (sp->id == id) + return (sp->path); + sp = sp->next; + } + if (debug) + warnx("%s: id%d not found", __func__, id); + return NULL; +} + /* * get_counters(...) * migrate to cpu @@ -3547,7 +3574,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &t->counter[i])) + if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) return -10; } @@ -3602,7 +3629,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) get_core_throt_cnt(cpu, &c->core_throt_cnt); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &c->counter[i])) + if (get_mp(cpu, mp, &c->counter[i], mp->sp->path)) return -10; } @@ -3694,7 +3721,16 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &p->counter[i])) + char *path = NULL; + + if (mp->msr_num == 0) { + path = find_sysfs_path_by_id(mp->sp, p->package_id); + if (path == NULL) { + warnx("%s: package_id %d not found", __func__, p->package_id); + return -10; + } + } + if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } done: @@ -5377,8 +5413,9 @@ static void probe_intel_uncore_frequency_legacy(void) static void probe_intel_uncore_frequency_cluster(void) { - int i; + int i, uncore_max_id; char path[256]; + char path_base[128]; if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; @@ -5386,16 +5423,25 @@ static void probe_intel_uncore_frequency_cluster(void) if (quiet) return; - for (i = 0;; ++i) { + for (uncore_max_id = 0;; ++uncore_max_id) { + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id); + + /* uncore## start at 00 and skips no numbers, so stop upon first missing */ + if (access(path_base, R_OK)) { + uncore_max_id -= 1; + break; + } + } + for (i = uncore_max_id; i >= 0; --i) { int k, l; - char path_base[128]; int package_id, domain_id, cluster_id; + char name_buf[16]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); - /* uncore## start at 00 and skip no numbers, so stop upon first missing */ if (access(path_base, R_OK)) - break; + err(1, "%s: %s\n", __func__, path_base); sprintf(path, "%s/package_id", path_base); package_id = read_sysfs_int(path); @@ -5422,6 +5468,11 @@ static void probe_intel_uncore_frequency_cluster(void) sprintf(path, "%s/current_freq_khz", path_base); k = read_sysfs_int(path); fprintf(outf, " %d MHz\n", k / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id); + + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); } } @@ -7575,61 +7626,114 @@ void print_bootcmd(void) fclose(fp); } +struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) +{ + struct msr_counter *mp; + + for (mp = head; mp; mp = mp->next) { + if (debug) + printf("%s: %s %s\n", __func__, name, mp->name); + if (!strncmp(name, mp->name, strlen(mp->name))) + return mp; + } + return NULL; +} + int add_counter(unsigned int msr_num, char *path, char *name, unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags) + enum counter_type type, enum counter_format format, int flags, int id) { struct msr_counter *msrp; if (no_msr && msr_num) errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); - msrp = calloc(1, sizeof(struct msr_counter)); - if (msrp == NULL) { - perror("calloc"); - exit(1); - } - - msrp->msr_num = msr_num; - strncpy(msrp->name, name, NAME_BYTES - 1); - if (path) - strncpy(msrp->path, path, PATH_BYTES - 1); - msrp->width = width; - msrp->type = type; - msrp->format = format; - msrp->flags = flags; + if (debug) + printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, + path, name, width, scope, type, format, flags, id); switch (scope) { case SCOPE_CPU: - msrp->next = sys.tp; - sys.tp = msrp; - sys.added_thread_counters++; - if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) { - fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.tp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter %s", name); + return -1; } break; - case SCOPE_CORE: - msrp->next = sys.cp; - sys.cp = msrp; - sys.added_core_counters++; - if (sys.added_core_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.cp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter %s", name); + return -1; } break; - case SCOPE_PACKAGE: - msrp->next = sys.pp; - sys.pp = msrp; - sys.added_package_counters++; - if (sys.added_package_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.pp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter %s", name); + return -1; } break; + default: + warnx("ignoring counter %s with unknown scope", name); + return -1; + } + + if (msrp == NULL) { + msrp = calloc(1, sizeof(struct msr_counter)); + if (msrp == NULL) + err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; + strncpy(msrp->name, name, NAME_BYTES - 1); + msrp->width = width; + msrp->type = type; + msrp->format = format; + msrp->flags = flags; + + switch (scope) { + case SCOPE_CPU: + msrp->next = sys.tp; + sys.tp = msrp; + break; + case SCOPE_CORE: + msrp->next = sys.cp; + sys.cp = msrp; + break; + case SCOPE_PACKAGE: + msrp->next = sys.pp; + sys.pp = msrp; + break; + } + } + + if (path) { + struct sysfs_path *sp; + + sp = calloc(1, sizeof(struct sysfs_path)); + if (sp == NULL) { + perror("calloc"); + exit(1); + } + strncpy(sp->path, path, PATH_BYTES - 1); + sp->id = id; + sp->next = msrp->sp; + msrp->sp = sp; } return 0; @@ -7731,7 +7835,7 @@ next: sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0)) + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) fail++; if (fail) { @@ -7796,7 +7900,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0); } for (state = 10; state >= 0; --state) { @@ -7824,7 +7928,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0); } } -- cgit v1.2.3 From 3559ea813ad3a9627934325c68ad05b18008a077 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 6 May 2024 15:39:08 +0200 Subject: tools/power turbostat: Avoid possible memory corruption due to sparse topology IDs Save the highest core and package id when parsing topology to allocate enough memory when get_rapl_counters() is called with a core or a package id as a domain. Note that RAPL domains are per-package on Intel, but per-core on AMD. Thus, the RAPL code effectively runs in different modes on those two product lines. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index abfddff6aebd..9ee9e4e5522c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1052,6 +1052,7 @@ struct rapl_counter_info_t { /* struct rapl_counter_info_t for each RAPL domain */ struct rapl_counter_info_t *rapl_counter_info_perdomain; +unsigned int rapl_counter_info_perdomain_size; #define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) @@ -1451,6 +1452,8 @@ struct topo_params { int allowed_cpus; int allowed_cores; int max_cpu_num; + int max_core_id; + int max_package_id; int max_die_id; int max_node_num; int nodes_per_pkg; @@ -3425,15 +3428,18 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci rc->scale = rci->scale[idx]; } -int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; - struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + struct rapl_counter_info_t *rci; if (debug) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); + assert(domain < rapl_counter_info_perdomain_size); + + rci = &rapl_counter_info_perdomain[domain]; /* * If we have any perf counters to read, read them all now, in bulk @@ -4257,7 +4263,7 @@ void free_fd_rapl_percpu(void) if (!rapl_counter_info_perdomain) return; - const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + const int num_domains = rapl_counter_info_perdomain_size; for (int domain_id = 0; domain_id < num_domains; ++domain_id) { if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) @@ -4265,6 +4271,8 @@ void free_fd_rapl_percpu(void) } free(rapl_counter_info_perdomain); + rapl_counter_info_perdomain = NULL; + rapl_counter_info_perdomain_size = 0; } void free_all_buffers(void) @@ -6582,17 +6590,18 @@ void linux_perf_init(void) void rapl_perf_init(void) { - const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1; bool *domain_visited = calloc(num_domains, sizeof(bool)); rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); if (rapl_counter_info_perdomain == NULL) err(-1, "calloc rapl_counter_info_percpu"); + rapl_counter_info_perdomain_size = num_domains; /* * Initialize rapl_counter_info_percpu */ - for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + for (unsigned int domain_id = 0; domain_id < num_domains; ++domain_id) { struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; rci->fd_perf = -1; @@ -6612,7 +6621,7 @@ void rapl_perf_init(void) bool has_counter = 0; double scale; enum rapl_unit unit; - int next_domain; + unsigned int next_domain; memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); @@ -6625,6 +6634,8 @@ void rapl_perf_init(void) next_domain = platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + assert(next_domain < num_domains); + if (domain_visited[next_domain]) continue; @@ -7207,6 +7218,8 @@ void topology_probe(bool startup) if (cpus[i].thread_id == 0) topo.num_cores++; } + topo.max_core_id = max_core_id; + topo.max_package_id = max_package_id; topo.cores_per_node = max_core_id + 1; if (debug > 1) -- cgit v1.2.3 From 1f9e46da9cba54d12880948fd2adac31bb0eaadb Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 11 Mar 2024 18:06:16 +0100 Subject: tools/power turbostat: Read Core-cstates via perf Reading the counters via perf can be done in bulk with a single syscall, making the counter values more accurate with respect to one another by minimizing the time gap between individual counter reads. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 379 ++++++++++++++++++++++++++++++---- 1 file changed, 335 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9ee9e4e5522c..9d0278d17965 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -63,6 +63,7 @@ enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; struct sysfs_path { char path[PATH_BYTES]; @@ -1183,6 +1184,77 @@ struct rapl_counter { double scale; }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum ccstate_rci_index { + CCSTATE_RCI_INDEX_C1_RESIDENCY = 0, + CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, + CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, + CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, + NUM_CCSTATE_COUNTERS, +}; + +struct cstate_counter_info_t { + unsigned long long data[NUM_CCSTATE_COUNTERS]; + enum cstate_source source[NUM_CCSTATE_COUNTERS]; + unsigned long long msr[NUM_CCSTATE_COUNTERS]; + int fd_perf; +}; + +struct cstate_counter_info_t *ccstate_counter_info; +unsigned int ccstate_counter_info_size; + +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD (1u << 0) +#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 1) + +struct cstate_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + unsigned long long flags; +}; + +static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { + { + .feature_mask = CC1, + .perf_subsys = "cstate_core", + .perf_name = "c1-residency", + .msr = MSR_CORE_C1_RES, + .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, + .bic = BIC_CPU_c1, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + }, + { + .feature_mask = CC3, + .perf_subsys = "cstate_core", + .perf_name = "c3-residency", + .msr = MSR_CORE_C3_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_CPU_c3, + .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + }, + { + .feature_mask = CC6, + .perf_subsys = "cstate_core", + .perf_name = "c6-residency", + .msr = MSR_CORE_C6_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_CPU_c6, + .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + }, + { + .feature_mask = CC7, + .perf_subsys = "cstate_core", + .perf_name = "c7-residency", + .msr = MSR_CORE_C7_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_CPU_c7, + .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1571,10 +1643,6 @@ static void bic_disable_msr_access(void) { const unsigned long bic_msrs = BIC_SMI | - BIC_CPU_c1 | - BIC_CPU_c3 | - BIC_CPU_c6 | - BIC_CPU_c7 | BIC_Mod_c6 | BIC_CoreTmp | BIC_Totl_c0 | @@ -3421,6 +3489,17 @@ size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) return ret; } +static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) + if (cci->source[i] == CSTATE_SOURCE_PERF) + ++ret; + + return ret; +} + void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { rc->raw_value = rci->data[idx]; @@ -3519,6 +3598,90 @@ char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) return NULL; } +int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c) +{ + unsigned long long perf_data[NUM_CCSTATE_COUNTERS + 1]; + struct cstate_counter_info_t *cci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(ccstate_counter_info); + assert(cpu <= ccstate_counter_info_size); + + cci = &ccstate_counter_info[cpu]; + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + if (cci->fd_perf != -1) { + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + const ssize_t expected_read_size = + (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = + read(cci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", + __func__, expected_read_size, actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_CCSTATE_COUNTERS; ++i) { + switch (cci->source[i]) { + case CSTATE_SOURCE_NONE: + break; + + case CSTATE_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(cci->fd_perf != -1); + + if (debug) { + fprintf(stderr, "cstate via %s %u: %llu\n", + "perf", i, perf_data[pi]); + } + + cci->data[i] = perf_data[pi]; + + ++pi; + break; + + case CSTATE_SOURCE_MSR: + assert(!no_msr); + if (get_msr(cpu, cci->msr[i], &cci->data[i])) + return -13 - i; + + if (debug) { + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", + "msr", cci->msr[i], i, cci->data[i]); + } + + break; + } + } + + /* + * Helper to write the data only if the source of + * the counter for the current cpu is not none. + * + * Otherwise we would overwrite core data with 0 (default value), + * when invoked for the thread sibling. + */ +#define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ + if (cci->source[index] != CSTATE_SOURCE_NONE) \ + out_counter = cci->data[index]; \ +} while (0) + + BUILD_BUG_ON(NUM_CCSTATE_COUNTERS != 4); + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + +#undef PERF_COUNTER_WRITE_DATA + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -3574,10 +3737,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { - if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) - return -6; - } + + get_cstate_counters(cpu, t, c); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) @@ -3594,31 +3755,14 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return status; } - if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { - if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) - return -6; - } - - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { - if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { - if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } - - if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) { - if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) - return -8; - else if (t->is_atom) { - /* - * For Atom CPUs that has core cstate deeper than c6, - * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. - * Minus CC7 (and deeper cstates) residency to get - * accturate cc6 residency. - */ - c->c6 -= c->c7; - } + if (DO_BIC(BIC_CPU_c7) && t->is_atom) { + /* + * For Atom CPUs that has core cstate deeper than c6, + * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. + * Minus CC7 (and deeper cstates) residency to get + * accturate cc6 residency. + */ + c->c6 -= c->c7; } if (DO_BIC(BIC_Mod_c6)) @@ -4258,6 +4402,23 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_cstate(void) +{ + if (!ccstate_counter_info) + return; + + const int counter_info_num = ccstate_counter_info_size; + + for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) { + if (ccstate_counter_info[counter_id].fd_perf != -1) + close(ccstate_counter_info[counter_id].fd_perf); + } + + free(ccstate_counter_info); + ccstate_counter_info = NULL; + ccstate_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) @@ -4319,6 +4480,7 @@ void free_all_buffers(void) free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free_fd_rapl_percpu(); + free_fd_cstate(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4654,6 +4816,7 @@ static void update_effective_set(bool startup) void linux_perf_init(void); void rapl_perf_init(void); +void cstate_perf_init(void); void re_initialize(void) { @@ -4661,6 +4824,7 @@ void re_initialize(void) setup_all_buffers(false); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -6517,7 +6681,8 @@ bool is_aperf_access_required(void) return BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC); + || BIC_IS_ENABLED(BIC_IPC) + || BIC_IS_ENABLED(BIC_CPU_c1); } int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, @@ -6749,21 +6914,132 @@ static int has_amperf_access(void) return 0; } -void probe_cstates(void) +int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, + const struct cstate_counter_arch_info *cai) { - probe_cst_limit(); + if (no_perf) + return -1; - if (platform->supported_cstates & CC1) - BIC_PRESENT(BIC_CPU_c1); + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = + open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (cci->fd_perf == -1) + cci->fd_perf = fd_counter; + + return fd_counter; +} + +int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, + const struct cstate_counter_arch_info *cai) +{ + int ret = add_cstate_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); + + return ret; +} + +void cstate_perf_init_(bool soft_c1) +{ + bool has_counter; + bool *cores_visited; + const int cores_visited_elems = topo.max_core_id + 1; + const int cci_num = topo.max_cpu_num + 1; + + ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info)); + if (!ccstate_counter_info) + err(1, "calloc ccstate_counter_arch_info"); + ccstate_counter_info_size = cci_num; + + cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited)); + if (!cores_visited) + err(1, "calloc cores_visited"); + + /* Initialize cstate_counter_info_percpu */ + for (int cpu = 0; cpu < cci_num; ++cpu) + ccstate_counter_info[cpu].fd_perf = -1; + + for (int cidx = 0; cidx < NUM_CCSTATE_COUNTERS; ++cidx) { + has_counter = false; + memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited)); + + const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx]; + + for (int cpu = 0; cpu < cci_num; ++cpu) { - if (platform->supported_cstates & CC3) - BIC_PRESENT(BIC_CPU_c3); + struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu]; - if (platform->supported_cstates & CC6) - BIC_PRESENT(BIC_CPU_c6); + if (cpu_is_not_allowed(cpu)) + continue; + + const int core_id = cpus[cpu].physical_core_id; + + assert(core_id < cores_visited_elems); + + const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD; + + if (!per_thread && cores_visited[core_id]) + continue; + + const bool counter_needed = BIC_IS_ENABLED(cai->bic) || + (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); + const bool counter_supported = + platform->supported_cstates & cai->feature_mask; + + if (counter_needed && counter_supported) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name + && add_cstate_perf_counter(cpu, cci, cai) != -1) { - if (platform->supported_cstates & CC7) - BIC_PRESENT(BIC_CPU_c7); + cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + } + } + + if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + has_counter = true; + cores_visited[core_id] = true; + } + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(cores_visited); +} + +void cstate_perf_init(void) +{ + /* + * If we don't have a C1 residency MSR, we calculate it "in software", + * but we need APERF, MPERF too. + */ + const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access() + && platform->supported_cstates & CC1; + + if (soft_c1) + BIC_PRESENT(BIC_CPU_c1); + + cstate_perf_init_(soft_c1); +} + +void probe_cstates(void) +{ + probe_cst_limit(); if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) BIC_PRESENT(BIC_Pkgpc2); @@ -7042,6 +7318,19 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); } +static void counter_info_init(void) +{ + for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) { + struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i]; + + if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY) + cai->msr = MSR_KNL_CORE_C6_RESIDENCY; + + if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES) + cai->msr = 0; + } +} + void probe_pm_features(void) { probe_pstates(); @@ -7519,10 +7808,12 @@ void turbostat_init() check_msr_access(); check_perf_access(); process_cpuid(); + counter_info_init(); probe_pm_features(); set_amperf_source(); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); -- cgit v1.2.3 From 0451adf4d46d5df91f888a3d010a4109aa23a7ae Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 8 May 2024 15:00:14 +0200 Subject: tools/power turbostat: Read Package-cstates via perf Reading the counters via perf can be done in bulk with a single syscall, making the counter values more accurate with respect to one another by minimizing the time gap between individual counter reads. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 374 ++++++++++++++++++++++------------ 1 file changed, 244 insertions(+), 130 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9d0278d17965..a3842e927799 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -224,6 +224,28 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +/* + * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: + * If you change the values, note they are used both in comparisons + * (>= PCL__7) and to index pkg_cstate_limit_strings[]. + */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ + struct amperf_group_fd; char *proc_stat = "/proc/stat"; @@ -1190,21 +1212,30 @@ enum ccstate_rci_index { CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, - NUM_CCSTATE_COUNTERS, + PCSTATE_RCI_INDEX_C2_RESIDENCY = 4, + PCSTATE_RCI_INDEX_C3_RESIDENCY = 5, + PCSTATE_RCI_INDEX_C6_RESIDENCY = 6, + PCSTATE_RCI_INDEX_C7_RESIDENCY = 7, + PCSTATE_RCI_INDEX_C8_RESIDENCY = 8, + PCSTATE_RCI_INDEX_C9_RESIDENCY = 9, + PCSTATE_RCI_INDEX_C10_RESIDENCY = 10, + NUM_CSTATE_COUNTERS, }; struct cstate_counter_info_t { - unsigned long long data[NUM_CCSTATE_COUNTERS]; - enum cstate_source source[NUM_CCSTATE_COUNTERS]; - unsigned long long msr[NUM_CCSTATE_COUNTERS]; - int fd_perf; + unsigned long long data[NUM_CSTATE_COUNTERS]; + enum cstate_source source[NUM_CSTATE_COUNTERS]; + unsigned long long msr[NUM_CSTATE_COUNTERS]; + int fd_perf_core; + int fd_perf_pkg; }; struct cstate_counter_info_t *ccstate_counter_info; unsigned int ccstate_counter_info_size; -#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD (1u << 0) -#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 1) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE (1u << 0) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE) +#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2) struct cstate_counter_arch_info { int feature_mask; /* Mask for testing if the counter is supported on host */ @@ -1214,6 +1245,7 @@ struct cstate_counter_arch_info { unsigned int rci_index; /* Maps data from perf counters to global variables */ unsigned long long bic; unsigned long long flags; + int pkg_cstate_limit; }; static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { @@ -1225,6 +1257,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, .bic = BIC_CPU_c1, .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + .pkg_cstate_limit = 0, }, { .feature_mask = CC3, @@ -1233,7 +1266,8 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .msr = MSR_CORE_C3_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, .bic = BIC_CPU_c3, - .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, }, { .feature_mask = CC6, @@ -1242,7 +1276,8 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .msr = MSR_CORE_C6_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, .bic = BIC_CPU_c6, - .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, }, { .feature_mask = CC7, @@ -1251,7 +1286,78 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .msr = MSR_CORE_C7_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, .bic = BIC_CPU_c7, - .flags = CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = PC2, + .perf_subsys = "cstate_pkg", + .perf_name = "c2-residency", + .msr = MSR_PKG_C2_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY, + .bic = BIC_Pkgpc2, + .flags = 0, + .pkg_cstate_limit = PCL__2, + }, + { + .feature_mask = PC3, + .perf_subsys = "cstate_pkg", + .perf_name = "c3-residency", + .msr = MSR_PKG_C3_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_Pkgpc3, + .flags = 0, + .pkg_cstate_limit = PCL__3, + }, + { + .feature_mask = PC6, + .perf_subsys = "cstate_pkg", + .perf_name = "c6-residency", + .msr = MSR_PKG_C6_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_Pkgpc6, + .flags = 0, + .pkg_cstate_limit = PCL__6, + }, + { + .feature_mask = PC7, + .perf_subsys = "cstate_pkg", + .perf_name = "c7-residency", + .msr = MSR_PKG_C7_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_Pkgpc7, + .flags = 0, + .pkg_cstate_limit = PCL__7, + }, + { + .feature_mask = PC8, + .perf_subsys = "cstate_pkg", + .perf_name = "c8-residency", + .msr = MSR_PKG_C8_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY, + .bic = BIC_Pkgpc8, + .flags = 0, + .pkg_cstate_limit = PCL__8, + }, + { + .feature_mask = PC9, + .perf_subsys = "cstate_pkg", + .perf_name = "c9-residency", + .msr = MSR_PKG_C9_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY, + .bic = BIC_Pkgpc9, + .flags = 0, + .pkg_cstate_limit = PCL__9, + }, + { + .feature_mask = PC10, + .perf_subsys = "cstate_pkg", + .perf_name = "c10-residency", + .msr = MSR_PKG_C10_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY, + .bic = BIC_Pkgpc10, + .flags = 0, + .pkg_cstate_limit = PCL_10, }, }; @@ -1641,15 +1747,8 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = - BIC_SMI | - BIC_Mod_c6 | - BIC_CoreTmp | - BIC_Totl_c0 | - BIC_Any_c0 | - BIC_GFX_c0 | - BIC_CPUGFX | - BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; @@ -3493,7 +3592,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t { size_t ret = 0; - for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) if (cci->source[i] == CSTATE_SOURCE_PERF) ++ret; @@ -3598,9 +3697,16 @@ char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) return NULL; } -int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c) +int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p) { - unsigned long long perf_data[NUM_CCSTATE_COUNTERS + 1]; + /* + * Overcommit memory a little bit here, + * but skip calculating exact sizes for the buffers. + */ + unsigned long long perf_data[NUM_CSTATE_COUNTERS]; + unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1]; + unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1]; + struct cstate_counter_info_t *cci; if (debug) @@ -3609,35 +3715,72 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat assert(ccstate_counter_info); assert(cpu <= ccstate_counter_info_size); + memset(perf_data, 0, sizeof(perf_data)); + memset(perf_data_core, 0, sizeof(perf_data_core)); + memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + cci = &ccstate_counter_info[cpu]; /* * If we have any perf counters to read, read them all now, in bulk */ - if (cci->fd_perf != -1) { - const size_t num_perf_counters = cstate_counter_info_count_perf(cci); - const ssize_t expected_read_size = - (num_perf_counters + 1) * sizeof(unsigned long long); - const ssize_t actual_read_size = - read(cci->fd_perf, &perf_data[0], sizeof(perf_data)); + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long); + ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0; - if (actual_read_size != expected_read_size) - err(-1, "%s: failed to read perf_data (%zu %zu)", - __func__, expected_read_size, actual_read_size); + if (cci->fd_perf_core != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core)); + + if (actual_read_size_core <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core); + } + + if (cci->fd_perf_pkg != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg)); + + if (actual_read_size_pkg <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg); } - for (unsigned int i = 0, pi = 1; i < NUM_CCSTATE_COUNTERS; ++i) { + const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg; + + if (actual_read_size_total != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total); + + /* + * Copy ccstate and pcstate data into unified buffer. + * + * Skip first element from core and pkg buffers. + * Kernel puts there how many counters were read. + */ + const size_t num_core_counters = perf_data_core[0]; + const size_t num_pkg_counters = perf_data_pkg[0]; + + assert(num_perf_counters == num_core_counters + num_pkg_counters); + + /* Copy ccstate perf data */ + memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long)); + + /* Copy pcstate perf data */ + memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long)); + + for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { switch (cci->source[i]) { case CSTATE_SOURCE_NONE: break; case CSTATE_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); - assert(cci->fd_perf != -1); + assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); if (debug) { - fprintf(stderr, "cstate via %s %u: %llu\n", - "perf", i, perf_data[pi]); + fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); } cci->data[i] = perf_data[pi]; @@ -3651,8 +3794,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat return -13 - i; if (debug) { - fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", - "msr", cci->msr[i], i, cci->data[i]); + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); } break; @@ -3671,12 +3813,21 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat out_counter = cci->data[index]; \ } while (0) - BUILD_BUG_ON(NUM_CCSTATE_COUNTERS != 4); + BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11); + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY); + #undef PERF_COUNTER_WRITE_DATA return 0; @@ -3738,7 +3889,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) t->smi_count = msr & 0xFFFFFFFF; } - get_cstate_counters(cpu, t, c); + get_cstate_counters(cpu, t, c, p); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) @@ -3803,34 +3954,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0)) return -13; } - if (DO_BIC(BIC_Pkgpc3)) - if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) - return -9; - if (DO_BIC(BIC_Pkgpc6)) { - if (platform->has_msr_atom_pkg_c6_residency) { - if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } else { - if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } - } - - if (DO_BIC(BIC_Pkgpc2)) - if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2)) - return -11; - if (DO_BIC(BIC_Pkgpc7)) - if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) - return -12; - if (DO_BIC(BIC_Pkgpc8)) - if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8)) - return -13; - if (DO_BIC(BIC_Pkgpc9)) - if (get_msr(cpu, MSR_PKG_C9_RESIDENCY, &p->pc9)) - return -13; - if (DO_BIC(BIC_Pkgpc10)) - if (get_msr(cpu, MSR_PKG_C10_RESIDENCY, &p->pc10)) - return -13; if (DO_BIC(BIC_CPU_LPI)) p->cpu_lpi = cpuidle_cur_cpu_lpi_us; @@ -3889,29 +4012,6 @@ done: return 0; } -/* - * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: - * If you change the values, note they are used both in comparisons - * (>= PCL__7) and to index pkg_cstate_limit_strings[]. - */ - -#define PCLUKN 0 /* Unknown */ -#define PCLRSV 1 /* Reserved */ -#define PCL__0 2 /* PC0 */ -#define PCL__1 3 /* PC1 */ -#define PCL__2 4 /* PC2 */ -#define PCL__3 5 /* PC3 */ -#define PCL__4 6 /* PC4 */ -#define PCL__6 7 /* PC6 */ -#define PCL_6N 8 /* PC6 No Retention */ -#define PCL_6R 9 /* PC6 Retention */ -#define PCL__7 10 /* PC7 */ -#define PCL_7S 11 /* PC7 Shrink */ -#define PCL__8 12 /* PC8 */ -#define PCL__9 13 /* PC9 */ -#define PCL_10 14 /* PC10 */ -#define PCLUNL 15 /* Unlimited */ - int pkg_cstate_limit = PCLUKN; char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" @@ -4410,8 +4510,11 @@ void free_fd_cstate(void) const int counter_info_num = ccstate_counter_info_size; for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) { - if (ccstate_counter_info[counter_id].fd_perf != -1) - close(ccstate_counter_info[counter_id].fd_perf); + if (ccstate_counter_info[counter_id].fd_perf_core != -1) + close(ccstate_counter_info[counter_id].fd_perf_core); + + if (ccstate_counter_info[counter_id].fd_perf_pkg != -1) + close(ccstate_counter_info[counter_id].fd_perf_pkg); } free(ccstate_counter_info); @@ -6914,30 +7017,43 @@ static int has_amperf_access(void) return 0; } -int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, - const struct cstate_counter_arch_info *cai) +int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) +{ + if (strcmp(group_name, "cstate_core") == 0) + return &cci->fd_perf_core; + + if (strcmp(group_name, "cstate_pkg") == 0) + return &cci->fd_perf_pkg; + + return NULL; +} + +int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) { if (no_perf) return -1; + int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys); + + if (pfd_group == NULL) + return -1; + const unsigned int type = read_perf_type(cai->perf_subsys); const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); - const int fd_counter = - open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); if (fd_counter == -1) return -1; /* If it's the first counter opened, make it a group descriptor */ - if (cci->fd_perf == -1) - cci->fd_perf = fd_counter; + if (*pfd_group == -1) + *pfd_group = fd_counter; return fd_counter; } -int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, - const struct cstate_counter_arch_info *cai) +int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) { int ret = add_cstate_perf_counter_(cpu, cci, cai); @@ -6950,8 +7066,9 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, void cstate_perf_init_(bool soft_c1) { bool has_counter; - bool *cores_visited; + bool *cores_visited = NULL, *pkg_visited = NULL; const int cores_visited_elems = topo.max_core_id + 1; + const int pkg_visited_elems = topo.max_package_id + 1; const int cci_num = topo.max_cpu_num + 1; ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info)); @@ -6963,13 +7080,20 @@ void cstate_perf_init_(bool soft_c1) if (!cores_visited) err(1, "calloc cores_visited"); + pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited)); + if (!pkg_visited) + err(1, "calloc pkg_visited"); + /* Initialize cstate_counter_info_percpu */ - for (int cpu = 0; cpu < cci_num; ++cpu) - ccstate_counter_info[cpu].fd_perf = -1; + for (int cpu = 0; cpu < cci_num; ++cpu) { + ccstate_counter_info[cpu].fd_perf_core = -1; + ccstate_counter_info[cpu].fd_perf_pkg = -1; + } - for (int cidx = 0; cidx < NUM_CCSTATE_COUNTERS; ++cidx) { + for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) { has_counter = false; memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited)); + memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited)); const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx]; @@ -6981,23 +7105,29 @@ void cstate_perf_init_(bool soft_c1) continue; const int core_id = cpus[cpu].physical_core_id; + const int pkg_id = cpus[cpu].physical_package_id; assert(core_id < cores_visited_elems); + assert(pkg_id < pkg_visited_elems); const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD; + const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE; if (!per_thread && cores_visited[core_id]) continue; + if (!per_core && pkg_visited[pkg_id]) + continue; + const bool counter_needed = BIC_IS_ENABLED(cai->bic) || (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); const bool counter_supported = - platform->supported_cstates & cai->feature_mask; + (platform->supported_cstates & cai->feature_mask) && + (pkg_cstate_limit >= cai->pkg_cstate_limit); if (counter_needed && counter_supported) { /* Use perf API for this counter */ - if (!no_perf && cai->perf_name - && add_cstate_perf_counter(cpu, cci, cai) != -1) { + if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; @@ -7011,6 +7141,7 @@ void cstate_perf_init_(bool soft_c1) if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { has_counter = true; cores_visited[core_id] = true; + pkg_visited[pkg_id] = true; } } @@ -7020,6 +7151,7 @@ void cstate_perf_init_(bool soft_c1) } free(cores_visited); + free(pkg_visited); } void cstate_perf_init(void) @@ -7029,7 +7161,7 @@ void cstate_perf_init(void) * but we need APERF, MPERF too. */ const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access() - && platform->supported_cstates & CC1; + && platform->supported_cstates & CC1; if (soft_c1) BIC_PRESENT(BIC_CPU_c1); @@ -7041,27 +7173,6 @@ void probe_cstates(void) { probe_cst_limit(); - if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); - - if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) - BIC_PRESENT(BIC_Pkgpc3); - - if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) - BIC_PRESENT(BIC_Pkgpc6); - - if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); - - if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) - BIC_PRESENT(BIC_Pkgpc8); - - if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) - BIC_PRESENT(BIC_Pkgpc9); - - if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) - BIC_PRESENT(BIC_Pkgpc10); - if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); @@ -7320,7 +7431,7 @@ void process_cpuid() static void counter_info_init(void) { - for (int i = 0; i < NUM_CCSTATE_COUNTERS; ++i) { + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) { struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i]; if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY) @@ -7328,6 +7439,9 @@ static void counter_info_init(void) if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES) cai->msr = 0; + + if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) + cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; } } -- cgit v1.2.3 From 4e7ee02300805d26d9731fd24c4de8e10a43ffea Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 9 May 2024 12:24:02 +0200 Subject: tools/power turbostat: Fix order of strings in pkg_cstate_limit_strings Change the order so that it matches the indexes defined in: Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a3842e927799..b45b2f494416 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4013,7 +4013,7 @@ done: } int pkg_cstate_limit = PCLUKN; -char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", +char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" }; -- cgit v1.2.3 From 29fea61cd8d4d0e646022c0479aa35381cf1e990 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 9 May 2024 12:39:47 +0200 Subject: tools/power turbostat: Ignore pkg_cstate_limit when it is not available When running in no-msr mode, the pkg_cstate_limit is not populated, thus we use perf to determine if given pcstate counter is present on the platform. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b45b2f494416..8cb18d42c189 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -7121,9 +7121,7 @@ void cstate_perf_init_(bool soft_c1) const bool counter_needed = BIC_IS_ENABLED(cai->bic) || (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); - const bool counter_supported = - (platform->supported_cstates & cai->feature_mask) && - (pkg_cstate_limit >= cai->pkg_cstate_limit); + const bool counter_supported = (platform->supported_cstates & cai->feature_mask); if (counter_needed && counter_supported) { /* Use perf API for this counter */ @@ -7132,7 +7130,8 @@ void cstate_perf_init_(bool soft_c1) cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; /* User MSR for this counter */ - } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit + && probe_msr(cpu, cai->msr) == 0) { cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; cci->msr[cai->rci_index] = cai->msr; } -- cgit v1.2.3 From 256d218ec6aea99855dc5c54af550fcff96fc732 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 27 Apr 2024 22:15:48 -0400 Subject: tools/power turbostat: version 2024.05.10 New since 2024.04.08: Len Brown (6): tools/power turbostat: Add "snapshot:" Makefile target tools/power turbostat: Harden probe_intel_uncore_frequency() tools/power turbostat: Remember global max_die_id tools/power turbostat: Survive sparse die_id tools/power turbostat: Add columns for clustered uncore frequency tools/power turbostat: version 2024.05.10 Patryk Wlazlyn (7): tools/power turbostat: Replace _Static_assert with BUILD_BUG_ON tools/power turbostat: Enable non-privileged users to read sysfs counters tools/power turbostat: Avoid possible memory corruption due to sparse topology IDs tools/power turbostat: Read Core-cstates via perf tools/power turbostat: Read Package-cstates via perf tools/power turbostat: Fix order of strings in pkg_cstate_limit_strings tools/power turbostat: Ignore pkg_cstate_limit when it is not available Zhang Rui (2): tools/power turbostat: Enhance ARL/LNL support tools/power turbostat: Add ARL-H support Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8cb18d42c189..8cdf41906e98 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -8017,7 +8017,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.04.08 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.05.10 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 -- cgit v1.2.3 From 83e93942796db58652288f0391ac00072401816f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 14 May 2024 10:33:59 +0800 Subject: selftests/net/lib: no need to record ns name if it already exist There is no need to add the name to ns_list again if the netns already recoreded. Fixes: 25ae948b4478 ("selftests/net: add lib.sh") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/lib.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index c868c0aec121..72b191e4e064 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -127,15 +127,17 @@ setup_ns() local ns="" local ns_name="" local ns_list="" + local ns_exist= for ns_name in "$@"; do # Some test may setup/remove same netns multi times if unset ${ns_name} 2> /dev/null; then ns="${ns_name,,}-$(mktemp -u XXXXXX)" eval readonly ${ns_name}="$ns" + ns_exist=false else eval ns='$'${ns_name} cleanup_ns "$ns" - + ns_exist=true fi if ! ip netns add "$ns"; then @@ -144,7 +146,7 @@ setup_ns() return $ksft_skip fi ip -n "$ns" link set lo up - ns_list="$ns_list $ns" + ! $ns_exist && ns_list="$ns_list $ns" done NS_LIST="$NS_LIST $ns_list" } -- cgit v1.2.3 From e9a4062e1527238c5649d0f4be794a8566fd77c9 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 16 May 2024 16:15:22 +0200 Subject: rtla: Add --trace-buffer-size option Add the option allow the users to set a different buffer size for the trace. For example, in large systems, the user might be interested on reducing the trace buffer to avoid large tracing files. The buffer size is specified in kB, and it is only affecting the tracing instance. The function trace_set_buffer_size() appears on libtracefs v1.6, so increase the minimum required version on Makefile.config. Link: https://lkml.kernel.org/r/e7c9ca5b3865f28e131a49ec3b984fadf2d056c6.1715860611.git.bristot@kernel.org Cc: Jonathan Corbet Cc: Juri Lelli Cc: John Kacur Signed-off-by: Daniel Bristot de Oliveira --- Documentation/tools/rtla/common_options.rst | 3 +++ tools/tracing/rtla/Makefile.config | 2 +- tools/tracing/rtla/src/osnoise_hist.c | 13 ++++++++++++- tools/tracing/rtla/src/osnoise_top.c | 14 +++++++++++++- tools/tracing/rtla/src/timerlat_hist.c | 14 +++++++++++++- tools/tracing/rtla/src/timerlat_top.c | 14 +++++++++++++- tools/tracing/rtla/src/trace.c | 15 +++++++++++++++ tools/tracing/rtla/src/trace.h | 1 + 8 files changed, 71 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/Documentation/tools/rtla/common_options.rst b/Documentation/tools/rtla/common_options.rst index a96ea0ed662e..7ac7b7581466 100644 --- a/Documentation/tools/rtla/common_options.rst +++ b/Documentation/tools/rtla/common_options.rst @@ -54,6 +54,9 @@ After starting the workload, let it run for *s* seconds before starting collecting the data, allowing the system to warm-up. Statistical data generated during warm-up is discarded. +**--trace-buffer-size** *kB* + Set the per-cpu trace buffer size in kB for the tracing output. + **-h**, **--help** Print help menu. diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config index 6d4ba77847b6..0b7ecfb30d19 100644 --- a/tools/tracing/rtla/Makefile.config +++ b/tools/tracing/rtla/Makefile.config @@ -3,7 +3,7 @@ STOP_ERROR := LIBTRACEEVENT_MIN_VERSION = 1.5 -LIBTRACEFS_MIN_VERSION = 1.3 +LIBTRACEFS_MIN_VERSION = 1.6 define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index c6100ff46a7f..198a17a3ea2e 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -43,6 +43,7 @@ struct osnoise_hist_params { int bucket_size; int entries; int warmup; + int buffer_size; }; struct osnoise_hist_cpu { @@ -469,6 +470,7 @@ static void osnoise_hist_usage(char *usage) " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", " --warm-up: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -533,13 +535,14 @@ static struct osnoise_hist_params {"trigger", required_argument, 0, '4'}, {"filter", required_argument, 0, '5'}, {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, &option_index); /* detect the end of the options. */ @@ -685,6 +688,9 @@ static struct osnoise_hist_params case '6': params->warmup = get_llong_from_str(optarg); break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_hist_usage("Invalid option"); } @@ -891,6 +897,11 @@ int osnoise_hist_main(int argc, char *argv[]) goto out_hist; } + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } /* diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 53a074c1222e..7e5aab22727d 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -41,6 +41,7 @@ struct osnoise_top_params { int cgroup; int hk_cpus; int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -309,6 +310,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -384,13 +386,14 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, {"warm-up", required_argument, 0, '2'}, + {"trace-buffer-size", required_argument, 0, '3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, &option_index); /* Detect the end of the options. */ @@ -517,6 +520,9 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) case '2': params->warmup = get_llong_from_str(optarg); break; + case '3': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_top_usage(params, "Invalid option"); } @@ -725,6 +731,12 @@ int osnoise_top_main(int argc, char **argv) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_top; + } } /* diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 6eb6e38d4a05..d4bab86ca1b9 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -54,6 +54,7 @@ struct timerlat_hist_params { int bucket_size; int entries; int warmup; + int buffer_size; }; struct timerlat_hist_cpu { @@ -669,6 +670,7 @@ static void timerlat_hist_usage(char *usage) " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -745,13 +747,14 @@ static struct timerlat_hist_params {"no-aa", no_argument, 0, '9'}, {"dump-task", no_argument, 0, '\1'}, {"warm-up", required_argument, 0, '\2'}, + {"trace-buffer-size", required_argument, 0, '\3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3", long_options, &option_index); /* detect the end of the options. */ @@ -926,6 +929,9 @@ static struct timerlat_hist_params case '\2': params->warmup = get_llong_from_str(optarg); break; + case '\3': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_hist_usage("Invalid option"); } @@ -1179,6 +1185,12 @@ int timerlat_hist_main(int argc, char *argv[]) if (retval) goto out_hist; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } if (!params->no_aa) { diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 0acfefe151f7..3a23e8d481c6 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -47,6 +47,7 @@ struct timerlat_top_params { int kernel_workload; int pretty_output; int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -479,6 +480,7 @@ static void timerlat_top_usage(char *usage) " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -547,13 +549,14 @@ static struct timerlat_top_params {"dump-tasks", no_argument, 0, '4'}, {"aa-only", required_argument, 0, '5'}, {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, &option_index); /* detect the end of the options. */ @@ -716,6 +719,9 @@ static struct timerlat_top_params case '6': params->warmup = get_llong_from_str(optarg); break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_top_usage("Invalid option"); } @@ -973,6 +979,12 @@ int timerlat_top_main(int argc, char *argv[]) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_top; + } } if (!params->no_aa) { diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c index e1ba6d9f4265..170a706248ab 100644 --- a/tools/tracing/rtla/src/trace.c +++ b/tools/tracing/rtla/src/trace.c @@ -540,3 +540,18 @@ int trace_is_off(struct trace_instance *tool, struct trace_instance *trace) return 0; } + +/* + * trace_set_buffer_size - set the per-cpu tracing buffer size. + */ +int trace_set_buffer_size(struct trace_instance *trace, int size) +{ + int retval; + + debug_msg("Setting trace buffer size to %d Kb\n", size); + retval = tracefs_instance_set_buffer_size(trace->inst, size, -1); + if (retval) + err_msg("Error setting trace buffer size\n"); + + return retval; +} diff --git a/tools/tracing/rtla/src/trace.h b/tools/tracing/rtla/src/trace.h index 2e9a89a25615..c7c92dc9a18a 100644 --- a/tools/tracing/rtla/src/trace.h +++ b/tools/tracing/rtla/src/trace.h @@ -48,3 +48,4 @@ int trace_events_enable(struct trace_instance *instance, int trace_event_add_filter(struct trace_events *event, char *filter); int trace_event_add_trigger(struct trace_events *event, char *trigger); int trace_is_off(struct trace_instance *tool, struct trace_instance *trace); +int trace_set_buffer_size(struct trace_instance *trace, int size); -- cgit v1.2.3 From 01b05fc0e5f3aec443a9a8ffa0022cbca2fd3608 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Fri, 10 May 2024 15:03:18 -0400 Subject: rtla/timerlat: Fix histogram report when a cpu count is 0 On short runs it is possible to get no samples on a cpu, like this: # rtla timerlat hist -u -T50 Index IRQ-001 Thr-001 Usr-001 IRQ-002 Thr-002 Usr-002 2 1 0 0 0 0 0 33 0 1 0 0 0 0 36 0 0 1 0 0 0 49 0 0 0 1 0 0 52 0 0 0 0 1 0 over: 0 0 0 0 0 0 count: 1 1 1 1 1 0 min: 2 33 36 49 52 18446744073709551615 avg: 2 33 36 49 52 - max: 2 33 36 49 52 0 rtla timerlat hit stop tracing IRQ handler delay: (exit from idle) 48.21 us (91.09 %) IRQ latency: 49.11 us Timerlat IRQ duration: 2.17 us (4.09 %) Blocking thread: 1.01 us (1.90 %) swapper/2:0 1.01 us ------------------------------------------------------------------------ Thread latency: 52.93 us (100%) Max timerlat IRQ latency from idle: 49.11 us in cpu 2 Note, the value 18446744073709551615 is the same as ~0. Fix this by reporting no results for the min, avg and max if the count is 0. Link: https://lkml.kernel.org/r/20240510190318.44295-1-jkacur@redhat.com Cc: stable@vger.kernel.org Fixes: 1eeb6328e8b3 ("rtla/timerlat: Add timerlat hist mode") Suggested-by: Daniel Bristot de Oliveria Signed-off-by: John Kacur Signed-off-by: Daniel Bristot de Oliveira --- tools/tracing/rtla/src/timerlat_hist.c | 60 ++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index d4bab86ca1b9..fbe2c6549bf9 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -327,17 +327,29 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_irq); + if (!params->no_irq) { + if (data->hist[cpu].irq_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_irq); + else + trace_seq_printf(trace->seq, " - "); + } - if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_thread); + if (!params->no_thread) { + if (data->hist[cpu].thread_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_thread); + else + trace_seq_printf(trace->seq, " - "); + } - if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_user); + if (params->user_hist) { + if (data->hist[cpu].user_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_user); + else + trace_seq_printf(trace->seq, " - "); + } } trace_seq_printf(trace->seq, "\n"); @@ -387,17 +399,29 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_irq); + if (!params->no_irq) { + if (data->hist[cpu].irq_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_irq); + else + trace_seq_printf(trace->seq, " - "); + } - if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_thread); + if (!params->no_thread) { + if (data->hist[cpu].thread_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_thread); + else + trace_seq_printf(trace->seq, " - "); + } - if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_user); + if (params->user_hist) { + if (data->hist[cpu].user_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_user); + else + trace_seq_printf(trace->seq, " - "); + } } trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); -- cgit v1.2.3 From 842fc5b87a5058d475b9411dbb94ed49f8d6bce3 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Wed, 15 May 2024 14:30:23 -0400 Subject: rtla: Fix -t\--trace[=file] The -t option has an optional argument. The usual case is for a short option to be specified without an '=' and for the long version to be specified with an '=' Various forms of this do not work as expected. For example: rtla timerlat hist -T50 -tfile.txt will result in a truncated file name of "ile.txt" Another example is that the long form without the '=' will result in the default file name instead of the requested file name. This patch properly parses the optional argument with and without '=' and with and without spaces for the short form. This patch was also tested using -t and --trace without providing a file name both as the last requested option and with a following long and short option. For example: rtla timerlat hist -T50 -t -u rtla timerlat hist -T50 --trace -u This fix is applied to both timerlat top and hist and to osnoise top and hist. Here is the full testing for rtla timerlat hist. Before applying the patch rtla timerlat hist -T50 -t=file.txt Works as expected, "file.txt" rtla timerlat hist -T50 -tfile.txt Truncated file name "ile.txt" rtla timerlat hist -T50 -t file.txt Default file name instead of file.txt rtla timerlat hist -T50 --trace=file.txt Truncated file name "ile.txt" rtla timerlat hist -T50 --trace file.txt Default file name "timerlat_trace.txt" instead of "file.txt" After applying the patch: rtla timerlat hist -T50 -t=file.txt Works as expected, "file.txt" rtla timerlat hist -T50 -tfile.txt Works as expected, "file.txt" rtla timerlat hist -T50 -t file.txt Works as expected, "file.txt" rtla timerlat hist -T50 --trace=file.txt Works as expected, "file.txt" rtla timerlat hist -T50 --trace file.txt Works as expected, "file.txt" In addition the following tests were performed to make sure that the default file name worked as expected including with trailing options. rtla timerlat hist -T50 -t Works as expected "timerlat_trace.txt" rtla timerlat hist -T50 --trace Works as expected "timerlat_trace.txt" rtla timerlat hist -T50 -t -u Works as expected "timerlat_trace.txt" rtla timerlat hist -T50 --trace -u Works as expected "timerlat_trace.txt" Link: https://lkml.kernel.org/r/20240515183024.59985-1-jkacur@redhat.com Cc: Daniel Bristot de Oliveria Signed-off-by: John Kacur Signed-off-by: Daniel Bristot de Oliveira --- tools/tracing/rtla/src/osnoise_hist.c | 14 +++++++++----- tools/tracing/rtla/src/osnoise_top.c | 14 +++++++++----- tools/tracing/rtla/src/timerlat_hist.c | 14 +++++++++----- tools/tracing/rtla/src/timerlat_top.c | 14 +++++++++----- 4 files changed, 36 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 198a17a3ea2e..7be17d09f7e8 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -437,7 +437,7 @@ static void osnoise_hist_usage(char *usage) static const char * const msg[] = { "", " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] \\", + " [-T us] [-t[file]] [-e sys[:event]] [--filter ] [--trigger ] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros] [-C[=cgroup_name]] [--warm-up]", "", @@ -453,7 +453,7 @@ static void osnoise_hist_usage(char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -645,9 +645,13 @@ static struct osnoise_hist_params params->threshold = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '0') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 7e5aab22727d..07ba55d4ec06 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -283,7 +283,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) static const char * const msg[] = { " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] \\", + " [-T us] [-t[file]] [-e sys[:event]] [--filter ] [--trigger ] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]] [--warm-up s]", "", " -h/--help: print this menu", @@ -298,7 +298,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -486,9 +486,13 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index fbe2c6549bf9..a3907c390d67 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -652,7 +652,7 @@ static void timerlat_hist_usage(char *usage) char *msg[] = { "", " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", - " [-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", + " [-t[file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", " [--warm-up s]", @@ -669,7 +669,7 @@ static void timerlat_hist_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -885,9 +885,13 @@ static struct timerlat_hist_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; break; diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 3a23e8d481c6..8c16419fe22a 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -446,7 +446,7 @@ static void timerlat_top_usage(char *usage) static const char *const msg[] = { "", " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", - " [[-t[=file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", + " [[-t[file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s]", "", " -h/--help: print this menu", @@ -462,7 +462,7 @@ static void timerlat_top_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " -D/--debug: print debug info", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", - " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event : enable the in the trace instance, multiple -e are allowed", " --filter : enable a trace event filter to the previous -e event", " --trigger : enable a trace event trigger to the previous -e event", @@ -668,9 +668,13 @@ static struct timerlat_top_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; -- cgit v1.2.3 From 5405807edd4168c2dc2f307f3c6b70e9579bf7be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 16 May 2024 10:01:40 -0700 Subject: selftests/bpf: Adjust test_access_variable_array after a kernel function name change After commit 4c3e509ea9f2 ("sched/balancing: Rename load_balance() => sched_balance_rq()"), the load_balance kernel function is renamed to sched_balance_rq. This patch adjusts the fentry program in test_access_variable_array.c to reflect this kernel function name change. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240516170140.2689430-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/progs/test_access_variable_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_access_variable_array.c b/tools/testing/selftests/bpf/progs/test_access_variable_array.c index 808c49b79889..326b7d1f496a 100644 --- a/tools/testing/selftests/bpf/progs/test_access_variable_array.c +++ b/tools/testing/selftests/bpf/progs/test_access_variable_array.c @@ -7,7 +7,7 @@ unsigned long span = 0; -SEC("fentry/load_balance") +SEC("fentry/sched_balance_rq") int BPF_PROG(fentry_fentry, int this_cpu, struct rq *this_rq, struct sched_domain *sd) { -- cgit v1.2.3 From 51e2b8d33199df9675d2a36ec6aad0c27e91c6fe Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 16 May 2024 09:43:10 -0700 Subject: selftests/bpf: Adjust btf_dump test to reflect recent change in file_operations The btf_dump test fails: test_btf_dump_struct_data:FAIL:file_operations unexpected file_operations: actual '(struct file_operations){ .owner = (struct module *)0xffffffffffffffff, .fop_flags = (fop_flags_t)4294967295, .llseek = (loff_t (*)(struct f' != expected '(struct file_operations){ .owner = (struct module *)0xffffffffffffffff, .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,' The "fop_flags" is a recent addition to the struct file_operations in commit 210a03c9d51a ("fs: claw back a few FMODE_* bits") This patch changes the test_btf_dump_struct_data() to reflect this change. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/20240516164310.2481460-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index e9ea38aa8248..09a8e6f9b379 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -653,7 +653,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, cmpstr = "(struct file_operations){\n" " .owner = (struct module *)0xffffffffffffffff,\n" -" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,"; +" .fop_flags = (fop_flags_t)4294967295,"; ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations"); } -- cgit v1.2.3 From 988af276360bc555c7685d5a607d4da0588616d9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 14 May 2024 17:52:27 +0800 Subject: selftests/net: reduce xfrm_policy test time The check_random_order test add/get plenty of xfrm rules, which consume a lot time on debug kernel and always TIMEOUT. Let's reduce the test loop and see if it works. Signed-off-by: Hangbin Liu Link: https://lore.kernel.org/r/20240514095227.2597730-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/xfrm_policy.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh index 457789530645..3eeeeffb4005 100755 --- a/tools/testing/selftests/net/xfrm_policy.sh +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -293,7 +293,7 @@ check_random_order() local ns=$1 local log=$2 - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do ip -net $ns xfrm policy add dst $j.0.0.0/24 dir out priority 10 action allow @@ -306,7 +306,7 @@ check_random_order() done done - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do local addr=$(printf "e000:0000:%02x00::/56" $j) -- cgit v1.2.3 From fe56d6e4a99a40f50e64d5a8043f1fa838b1f7a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 16 May 2024 08:25:13 -0700 Subject: selftests: net: local_termination: annotate the expected failures Vladimir said when adding this test: The bridge driver fares particularly badly [...] mainly because it does not implement IFF_UNICAST_FLT. See commit 90b9566aa5cd ("selftests: forwarding: add a test for local_termination.sh"). We don't want to hide the known gaps, but having a test which always fails prevents us from catching regressions. Report the cases we know may fail as XFAIL. Reviewed-by: Simon Horman Reviewed-by: Hangbin Liu Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20240516152513.1115270-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/local_termination.sh | 30 +++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c5b0cbc85b3e..4b364cdf3ef0 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -155,25 +155,30 @@ run_test() "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Multicast IPv4 to unknown group" \ - "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Multicast IPv4 to unknown group" \ + "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ @@ -187,9 +192,10 @@ run_test() "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ true - check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ - "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ + "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ + false check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \ -- cgit v1.2.3 From 6bb955fce08cbc8495a72755130d2d220994faee Mon Sep 17 00:00:00 2001 From: Tao Su Date: Thu, 9 May 2024 13:31:12 +0800 Subject: Revert "selftests/harness: remove use of LINE_MAX" Patch series "Selftests: Fix compilation warnings due to missing _GNU_SOURCE definition", v2. Since kselftest_harness.h introduces asprintf()[1], many selftests have compilation warnings or errors due to missing _GNU_SOURCE definitions. The issue stems from a lack of a LINE_MAX definition in Android (see commit 38c957f07038), which is the reason why asprintf() was introduced. We tried adding _GNU_SOURCE definitions to more selftests to fix, but asprintf() may continue to cause problems, and since it is quite late in the 6.9 cycle, we would like to revert 809216233555 first to provide testing for forks[2]. [1] https://lore.kernel.org/all/20240411231954.62156-1-edliaw@google.com [2] https://lore.kernel.org/linux-kselftest/ZjuA3aY_iHkjP7bQ@google.com This patch (of 2): This reverts commit 8092162335554c8ef5e7f50eff68aa9cfbdbf865. asprintf() is declared in stdio.h when defining _GNU_SOURCE, but stdio.h is so common that many files don't define _GNU_SOURCE before including stdio.h, and defining _GNU_SOURCE after including stdio.h will no longer take effect, which causes warnings or even errors during compilation in many selftests. Revert 'commit 809216233555 ("selftests/harness: remove use of LINE_MAX")' as that came in quite late in the 6.9 cycle. Link: https://lkml.kernel.org/r/20240509053113.43462-1-tao1.su@linux.intel.com Link: https://lore.kernel.org/linux-kselftest/ZjuA3aY_iHkjP7bQ@google.com/ Link: https://lkml.kernel.org/r/20240509053113.43462-2-tao1.su@linux.intel.com Fixes: 809216233555 ("selftests/harness: remove use of LINE_MAX") Signed-off-by: Tao Su Reviewed-by: Simon Horman Cc: Alexandre Belloni Cc: Bongsu Jeon Cc: Dave Hansen Cc: David S. Miller Cc: Edward Liaw Cc: Eric Dumazet Cc: Ivan Orlov Cc: Jakub Kicinski Cc: Jarkko Sakkinen Cc: Jaroslav Kysela Cc: Mark Brown Cc: Paolo Abeni Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Shuah Khan Cc: Takashi Iwai Signed-off-by: Andrew Morton --- tools/testing/selftests/kselftest_harness.h | 12 ++++-------- tools/testing/selftests/mm/mdwe_test.c | 1 - 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 37b03f1b8741..605372356dd6 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -56,6 +56,7 @@ static_assert(0, "kselftest harness requires _GNU_SOURCE to be defined"); #include #include #include +#include #include #include #include @@ -1216,7 +1217,7 @@ void __run_test(struct __fixture_metadata *f, struct __test_metadata *t) { struct __test_xfail *xfail; - char *test_name; + char test_name[LINE_MAX]; const char *diagnostic; /* reset test struct */ @@ -1227,12 +1228,8 @@ void __run_test(struct __fixture_metadata *f, memset(t->env, 0, sizeof(t->env)); memset(t->results->reason, 0, sizeof(t->results->reason)); - if (asprintf(&test_name, "%s%s%s.%s", f->name, - variant->name[0] ? "." : "", variant->name, t->name) == -1) { - ksft_print_msg("ERROR ALLOCATING MEMORY\n"); - t->exit_code = KSFT_FAIL; - _exit(t->exit_code); - } + snprintf(test_name, sizeof(test_name), "%s%s%s.%s", + f->name, variant->name[0] ? "." : "", variant->name, t->name); ksft_print_msg(" RUN %s ...\n", test_name); @@ -1270,7 +1267,6 @@ void __run_test(struct __fixture_metadata *f, ksft_test_result_code(t->exit_code, test_name, diagnostic ? "%s" : NULL, diagnostic); - free(test_name); } static int test_harness_run(int argc, char **argv) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c index 1e01d3ddc11c..200bedcdc32e 100644 --- a/tools/testing/selftests/mm/mdwe_test.c +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -7,7 +7,6 @@ #include #include -#define _GNU_SOURCE #include #include #include -- cgit v1.2.3 From 28d2188709d9c19a7c4601c6870edd9fa0527379 Mon Sep 17 00:00:00 2001 From: Tao Su Date: Thu, 9 May 2024 13:31:13 +0800 Subject: selftests/harness: use 1024 in place of LINE_MAX Android was seeing a compilation error because its C library does not define LINE_MAX. Since LINE_MAX is only used to determine the size of test_name[] and 1024 should be enough for the test name, use 1024 instead of LINE_MAX. Link: https://lkml.kernel.org/r/20240509053113.43462-3-tao1.su@linux.intel.com Fixes: 38c957f07038 ("selftests: kselftest_harness: generate test name once") Signed-off-by: Tao Su Reviewed-by: Simon Horman Cc: Alexandre Belloni Cc: Bongsu Jeon Cc: Dave Hansen Cc: David S. Miller Cc: Edward Liaw Cc: Eric Dumazet Cc: Ivan Orlov Cc: Jakub Kicinski Cc: Jarkko Sakkinen Cc: Jaroslav Kysela Cc: Mark Brown Cc: Paolo Abeni Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Shuah Khan Cc: Takashi Iwai Signed-off-by: Andrew Morton --- tools/testing/selftests/kselftest_harness.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 605372356dd6..98749f5a2e56 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -56,7 +56,6 @@ static_assert(0, "kselftest harness requires _GNU_SOURCE to be defined"); #include #include #include -#include #include #include #include @@ -1217,7 +1216,7 @@ void __run_test(struct __fixture_metadata *f, struct __test_metadata *t) { struct __test_xfail *xfail; - char test_name[LINE_MAX]; + char test_name[1024]; const char *diagnostic; /* reset test struct */ -- cgit v1.2.3 From cc563e749810f5636451d4b833fbd689899ecdb9 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 18 May 2024 13:20:52 +0000 Subject: selftests: net: kill smcrouted in the cleanup logic in amt.sh The amt.sh requires smcrouted for multicasting routing. So, it starts smcrouted before forwarding tests. It must be stopped after all tests, but it isn't. To fix this issue, it kills smcrouted in the cleanup logic. Fixes: c08e8baea78e ("selftests: add amt interface selftest script") Signed-off-by: Taehee Yoo Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/amt.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh index 5175a42cbe8a..7e7ed6c558da 100755 --- a/tools/testing/selftests/net/amt.sh +++ b/tools/testing/selftests/net/amt.sh @@ -77,6 +77,7 @@ readonly LISTENER=$(mktemp -u listener-XXXXXXXX) readonly GATEWAY=$(mktemp -u gateway-XXXXXXXX) readonly RELAY=$(mktemp -u relay-XXXXXXXX) readonly SOURCE=$(mktemp -u source-XXXXXXXX) +readonly SMCROUTEDIR="$(mktemp -d)" ERR=4 err=0 @@ -85,6 +86,11 @@ exit_cleanup() for ns in "$@"; do ip netns delete "${ns}" 2>/dev/null || true done + if [ -f "$SMCROUTEDIR/amt.pid" ]; then + smcpid=$(< $SMCROUTEDIR/amt.pid) + kill $smcpid + fi + rm -rf $SMCROUTEDIR exit $ERR } @@ -167,7 +173,7 @@ setup_iptables() setup_mcast_routing() { - ip netns exec "${RELAY}" smcrouted + ip netns exec "${RELAY}" smcrouted -P $SMCROUTEDIR/amt.pid ip netns exec "${RELAY}" smcroutectl a relay_src \ 172.17.0.2 239.0.0.1 amtr ip netns exec "${RELAY}" smcroutectl a relay_src \ -- cgit v1.2.3 From cee27ae5f1fb8bc4762f5d5de19ec6de6c45e239 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 16 May 2024 20:51:07 -0600 Subject: Revert "selftests: Compile kselftest headers with -D_GNU_SOURCE" This reverts commit daef47b89efd0b745e8478d69a3ad724bd8b4dc6. This framework change to add D_GNU_SOURCE to KHDR_INCLUDES to Makefile, lib.mk, and kselftest_harness.h is causing build failures and warnings. Revert this change. Reported-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 4 ++-- tools/testing/selftests/kselftest_harness.h | 2 +- tools/testing/selftests/lib.mk | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index f0431e6cb67e..9039f3709aff 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -170,11 +170,11 @@ ifneq ($(KBUILD_OUTPUT),) # $(realpath ...) resolves symlinks abs_objtree := $(realpath $(abs_objtree)) BUILD := $(abs_objtree)/kselftest - KHDR_INCLUDES := -D_GNU_SOURCE -isystem ${abs_objtree}/usr/include + KHDR_INCLUDES := -isystem ${abs_objtree}/usr/include else BUILD := $(CURDIR) abs_srctree := $(shell cd $(top_srcdir) && pwd) - KHDR_INCLUDES := -D_GNU_SOURCE -isystem ${abs_srctree}/usr/include + KHDR_INCLUDES := -isystem ${abs_srctree}/usr/include DEFAULT_INSTALL_HDR_PATH := 1 endif diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 37b03f1b8741..3c8f2965c285 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -51,7 +51,7 @@ #define __KSELFTEST_HARNESS_H #ifndef _GNU_SOURCE -static_assert(0, "kselftest harness requires _GNU_SOURCE to be defined"); +#define _GNU_SOURCE #endif #include #include diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 33c24ceddfb7..09e8a854f0fc 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -67,7 +67,7 @@ MAKEFLAGS += --no-print-directory endif ifeq ($(KHDR_INCLUDES),) -KHDR_INCLUDES := -D_GNU_SOURCE -isystem $(top_srcdir)/usr/include +KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include endif # The following are built by lib.mk common compile rules. -- cgit v1.2.3 From 3da164023582969280df17636a9d829752787b1c Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 16 May 2024 20:58:26 -0600 Subject: Revert "selftests/sgx: Include KHDR_INCLUDES in Makefile" This reverts commit 2c3b8f8f37c6c0c926d584cf4158db95e62b960c. The framework change to add D_GNU_SOURCE to KHDR_INCLUDES to Makefile, lib.mk, and kselftest_harness.h is reverted as it is causing build failures and warnings. Revert this change as this change depends on the framework change. Reported-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/sgx/Makefile | 2 +- tools/testing/selftests/sgx/sigstruct.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index 26ea30fae23c..867f88ce2570 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -12,7 +12,7 @@ OBJCOPY := $(CROSS_COMPILE)objcopy endif INCLUDES := -I$(top_srcdir)/tools/include -HOST_CFLAGS := -Wall -Werror $(KHDR_INCLUDES) -g $(INCLUDES) -fPIC +HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC HOST_LDFLAGS := -z noexecstack -lcrypto ENCL_CFLAGS += -Wall -Werror -static-pie -nostdlib -ffreestanding -fPIE \ -fno-stack-protector -mrdrnd $(INCLUDES) diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index 200034a0fee5..d73b29becf5b 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ +#define _GNU_SOURCE #include #include #include -- cgit v1.2.3 From a97853f25b06f71c23b2d7a59fbd40f3f42d55ac Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 16 May 2024 20:22:37 -0600 Subject: Revert "selftests/cgroup: Drop define _GNU_SOURCE" This reverts commit c1457d9aad5ee2feafcf85aa9a58ab50500159d2. The framework change to add D_GNU_SOURCE to KHDR_INCLUDES to Makefile, lib.mk, and kselftest_harness.h is reverted as it is causing build failures and warnings. Revert this change as this change depends on the framework change. Reported-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/cgroup/cgroup_util.c | 3 +++ tools/testing/selftests/cgroup/test_core.c | 2 ++ tools/testing/selftests/cgroup/test_cpu.c | 2 ++ tools/testing/selftests/cgroup/test_hugetlb_memcg.c | 2 ++ tools/testing/selftests/cgroup/test_kmem.c | 2 ++ tools/testing/selftests/cgroup/test_memcontrol.c | 2 ++ tools/testing/selftests/cgroup/test_zswap.c | 2 ++ 7 files changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index ce16a50ecff8..432db923bced 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE + #include #include #include diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index de8baad46022..a5672a91d273 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index 5a4a314f6af7..dad2ed82f3ef 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE #include #include #include diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c index 80d05d50a42d..856f9508ea56 100644 --- a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + #include #include #include diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 2e453ac50c0d..96693d8772be 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + #include #include #include diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index c871630d62a3..41ae8047b889 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE + #include #include #include diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 8418a8d7439f..fa571bfd2432 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + #include #include #include -- cgit v1.2.3 From ea63ac14292564eefc7dffe868ed354ff9ed6f4b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 17 May 2024 09:03:27 +0800 Subject: selftests/net: use tc rule to filter the na packet Test arp_ndisc_untracked_subnets use tcpdump to filter the unsolicited and untracked na messages. It set -e before calling tcpdump. But if tcpdump filters 0 packet, it will return none zero, and cause the script to exit. Instead of using slow tcpdump to capture packets, let's using tc rule to filter out the na message. At the same time, fix function setup_v6 which only needs one parameter. Move all the related helpers from forwarding lib.sh to net lib.sh. Fixes: 0ea7b0a454ca ("selftests: net: arp_ndisc_untracked_subnets: test for arp_accept and accept_untracked_na") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240517010327.2631319-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- .../selftests/net/arp_ndisc_untracked_subnets.sh | 53 +++++++------------- tools/testing/selftests/net/forwarding/lib.sh | 58 ---------------------- tools/testing/selftests/net/lib.sh | 58 ++++++++++++++++++++++ 3 files changed, 75 insertions(+), 94 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh b/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh index a40c0e9bd023..eef5cbf6eecc 100755 --- a/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh +++ b/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh @@ -73,25 +73,19 @@ setup_v6() { # namespaces. veth0 is veth-router, veth1 is veth-host. # first, set up the inteface's link to the namespace # then, set the interface "up" - ip -6 -netns ${ROUTER_NS_V6} link add name ${ROUTER_INTF} \ - type veth peer name ${HOST_INTF} - - ip -6 -netns ${ROUTER_NS_V6} link set dev ${ROUTER_INTF} up - ip -6 -netns ${ROUTER_NS_V6} link set dev ${HOST_INTF} netns \ - ${HOST_NS_V6} + ip -n ${ROUTER_NS_V6} link add name ${ROUTER_INTF} \ + type veth peer name ${HOST_INTF} netns ${HOST_NS_V6} - ip -6 -netns ${HOST_NS_V6} link set dev ${HOST_INTF} up - ip -6 -netns ${ROUTER_NS_V6} addr add \ - ${ROUTER_ADDR_V6}/${PREFIX_WIDTH_V6} dev ${ROUTER_INTF} nodad + # Add tc rule to filter out host na message + tc -n ${ROUTER_NS_V6} qdisc add dev ${ROUTER_INTF} clsact + tc -n ${ROUTER_NS_V6} filter add dev ${ROUTER_INTF} \ + ingress protocol ipv6 pref 1 handle 101 \ + flower src_ip ${HOST_ADDR_V6} ip_proto icmpv6 type 136 skip_hw action pass HOST_CONF=net.ipv6.conf.${HOST_INTF} ip netns exec ${HOST_NS_V6} sysctl -qw ${HOST_CONF}.ndisc_notify=1 ip netns exec ${HOST_NS_V6} sysctl -qw ${HOST_CONF}.disable_ipv6=0 - ip -6 -netns ${HOST_NS_V6} addr add ${HOST_ADDR_V6}/${PREFIX_WIDTH_V6} \ - dev ${HOST_INTF} - ROUTER_CONF=net.ipv6.conf.${ROUTER_INTF} - ip netns exec ${ROUTER_NS_V6} sysctl -w \ ${ROUTER_CONF}.forwarding=1 >/dev/null 2>&1 ip netns exec ${ROUTER_NS_V6} sysctl -w \ @@ -99,6 +93,13 @@ setup_v6() { ip netns exec ${ROUTER_NS_V6} sysctl -w \ ${ROUTER_CONF}.accept_untracked_na=${accept_untracked_na} \ >/dev/null 2>&1 + + ip -n ${ROUTER_NS_V6} link set dev ${ROUTER_INTF} up + ip -n ${HOST_NS_V6} link set dev ${HOST_INTF} up + ip -n ${ROUTER_NS_V6} addr add ${ROUTER_ADDR_V6}/${PREFIX_WIDTH_V6} \ + dev ${ROUTER_INTF} nodad + ip -n ${HOST_NS_V6} addr add ${HOST_ADDR_V6}/${PREFIX_WIDTH_V6} \ + dev ${HOST_INTF} set +e } @@ -162,26 +163,6 @@ arp_test_gratuitous_combinations() { arp_test_gratuitous 2 1 } -cleanup_tcpdump() { - set -e - [[ ! -z ${tcpdump_stdout} ]] && rm -f ${tcpdump_stdout} - [[ ! -z ${tcpdump_stderr} ]] && rm -f ${tcpdump_stderr} - tcpdump_stdout= - tcpdump_stderr= - set +e -} - -start_tcpdump() { - set -e - tcpdump_stdout=`mktemp` - tcpdump_stderr=`mktemp` - ip netns exec ${ROUTER_NS_V6} timeout 15s \ - tcpdump --immediate-mode -tpni ${ROUTER_INTF} -c 1 \ - "icmp6 && icmp6[0] == 136 && src ${HOST_ADDR_V6}" \ - > ${tcpdump_stdout} 2> /dev/null - set +e -} - verify_ndisc() { local accept_untracked_na=$1 local same_subnet=$2 @@ -222,8 +203,9 @@ ndisc_test_untracked_advertisements() { HOST_ADDR_V6=2001:db8:abcd:0012::3 fi fi - setup_v6 $1 $2 - start_tcpdump + setup_v6 $1 + slowwait_for_counter 15 1 \ + tc_rule_handle_stats_get "dev ${ROUTER_INTF} ingress" 101 ".packets" "-n ${ROUTER_NS_V6}" if verify_ndisc $1 $2; then printf " TEST: %-60s [ OK ]\n" "${test_msg[*]}" @@ -231,7 +213,6 @@ ndisc_test_untracked_advertisements() { printf " TEST: %-60s [FAIL]\n" "${test_msg[*]}" fi - cleanup_tcpdump cleanup_v6 set +e } diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 112c85c35092..eabbdf00d8ca 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -129,14 +129,6 @@ fi source "$net_forwarding_dir/../lib.sh" -# timeout in seconds -slowwait() -{ - local timeout_sec=$1; shift - - loopy_wait "sleep 0.1" "$((timeout_sec * 1000))" "$@" -} - ############################################################################## # Sanity checks @@ -678,33 +670,6 @@ wait_for_trap() "$@" | grep -q trap } -until_counter_is() -{ - local expr=$1; shift - local current=$("$@") - - echo $((current)) - ((current $expr)) -} - -busywait_for_counter() -{ - local timeout=$1; shift - local delta=$1; shift - - local base=$("$@") - busywait "$timeout" until_counter_is ">= $((base + delta))" "$@" -} - -slowwait_for_counter() -{ - local timeout=$1; shift - local delta=$1; shift - - local base=$("$@") - slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@" -} - setup_wait_dev() { local dev=$1; shift @@ -1023,29 +988,6 @@ link_stats_rx_errors_get() link_stats_get $1 rx errors } -tc_rule_stats_get() -{ - local dev=$1; shift - local pref=$1; shift - local dir=$1; shift - local selector=${1:-.packets}; shift - - tc -j -s filter show dev $dev ${dir:-ingress} pref $pref \ - | jq ".[1].options.actions[].stats$selector" -} - -tc_rule_handle_stats_get() -{ - local id=$1; shift - local handle=$1; shift - local selector=${1:-.packets}; shift - local netns=${1:-""}; shift - - tc $netns -j -s filter show $id \ - | jq ".[] | select(.options.handle == $handle) | \ - .options.actions[0].stats$selector" -} - ethtool_stats_get() { local dev=$1; shift diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 72b191e4e064..edc030e81a46 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -91,6 +91,41 @@ busywait() loopy_wait : "$timeout_ms" "$@" } +# timeout in seconds +slowwait() +{ + local timeout_sec=$1; shift + + loopy_wait "sleep 0.1" "$((timeout_sec * 1000))" "$@" +} + +until_counter_is() +{ + local expr=$1; shift + local current=$("$@") + + echo $((current)) + ((current $expr)) +} + +busywait_for_counter() +{ + local timeout=$1; shift + local delta=$1; shift + + local base=$("$@") + busywait "$timeout" until_counter_is ">= $((base + delta))" "$@" +} + +slowwait_for_counter() +{ + local timeout=$1; shift + local delta=$1; shift + + local base=$("$@") + slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@" +} + cleanup_ns() { local ns="" @@ -150,3 +185,26 @@ setup_ns() done NS_LIST="$NS_LIST $ns_list" } + +tc_rule_stats_get() +{ + local dev=$1; shift + local pref=$1; shift + local dir=$1; shift + local selector=${1:-.packets}; shift + + tc -j -s filter show dev $dev ${dir:-ingress} pref $pref \ + | jq ".[1].options.actions[].stats$selector" +} + +tc_rule_handle_stats_get() +{ + local id=$1; shift + local handle=$1; shift + local selector=${1:-.packets}; shift + local netns=${1:-""}; shift + + tc $netns -j -s filter show $id \ + | jq ".[] | select(.options.handle == $handle) | \ + .options.actions[0].stats$selector" +} -- cgit v1.2.3 From e060e433e51246d970c5a8aa1c5ccd9ecc7ba4bf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 17 May 2024 11:27:02 +0200 Subject: selftest: af_unix: Make SCM_RIGHTS into OOB data. scm_rights.c covers various test cases for inflight file descriptors and garbage collector for AF_UNIX sockets. Currently, SCM_RIGHTS messages are sent with 3-bytes string, and it's not good for MSG_OOB cases, as SCM_RIGTS cmsg goes with the first 2-bytes, which is non-OOB data. Let's send SCM_RIGHTS messages with 1-byte character to pack SCM_RIGHTS into OOB data. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Michal Luczaj Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/scm_rights.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c index bab606c9f1eb..2bfed46e0b19 100644 --- a/tools/testing/selftests/net/af_unix/scm_rights.c +++ b/tools/testing/selftests/net/af_unix/scm_rights.c @@ -197,8 +197,8 @@ void __send_fd(struct __test_metadata *_metadata, const FIXTURE_VARIANT(scm_rights) *variant, int inflight, int receiver) { -#define MSG "nop" -#define MSGLEN 3 +#define MSG "x" +#define MSGLEN 1 struct { struct cmsghdr cmsghdr; int fd[2]; -- cgit v1.2.3 From f8ea6ab92748e69216b44b07ea7213cb02070dba Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 25 Apr 2024 12:58:03 -0700 Subject: riscv: selftests: Add hwprobe binaries to .gitignore The cbo and which-cpu hwprobe selftests leave their artifacts in the kernel tree and end up being tracked by git. Add the binaries to the hwprobe selftest .gitignore so this no longer happens. Signed-off-by: Charlie Jenkins Fixes: a29e2a48afe3 ("RISC-V: selftests: Add CBO tests") Fixes: ef7d6abb2cf5 ("RISC-V: selftests: Add which-cpus hwprobe test") Reviewed-by: Muhammad Usama Anjum Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20240425-gitignore_hwprobe_artifacts-v1-1-dfc5a20da469@rivosinc.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/hwprobe/.gitignore | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/riscv/hwprobe/.gitignore b/tools/testing/selftests/riscv/hwprobe/.gitignore index 8113dc3bdd03..6e384e80ea1a 100644 --- a/tools/testing/selftests/riscv/hwprobe/.gitignore +++ b/tools/testing/selftests/riscv/hwprobe/.gitignore @@ -1 +1,3 @@ hwprobe +cbo +which-cpus -- cgit v1.2.3 From 9d5328eeb18597749b18f42ff7df1c9f485d3c3c Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 3 Apr 2024 16:50:29 -0700 Subject: riscv: selftests: Add signal handling vector tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two tests to check vector save/restore when a signal is received during a vector routine. One test ensures that a value is not clobbered during signal handling. The other verifies that vector registers modified in the signal handler are properly reflected when the signal handling is complete. Signed-off-by: Charlie Jenkins Reviewed-by: Björn Töpel Reviewed-by: Andy Chiu Tested-by: Andy Chiu Link: https://lore.kernel.org/r/20240403-vector_sigreturn_tests-v1-1-2e68b7a3b8d7@rivosinc.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/Makefile | 2 +- tools/testing/selftests/riscv/sigreturn/.gitignore | 1 + tools/testing/selftests/riscv/sigreturn/Makefile | 12 ++++ .../testing/selftests/riscv/sigreturn/sigreturn.c | 82 ++++++++++++++++++++++ 4 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/riscv/sigreturn/.gitignore create mode 100644 tools/testing/selftests/riscv/sigreturn/Makefile create mode 100644 tools/testing/selftests/riscv/sigreturn/sigreturn.c (limited to 'tools') diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 4a9ff515a3a0..7ce03d832b64 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),riscv)) -RISCV_SUBTARGETS ?= hwprobe vector mm +RISCV_SUBTARGETS ?= hwprobe vector mm sigreturn else RISCV_SUBTARGETS := endif diff --git a/tools/testing/selftests/riscv/sigreturn/.gitignore b/tools/testing/selftests/riscv/sigreturn/.gitignore new file mode 100644 index 000000000000..35002b8ae780 --- /dev/null +++ b/tools/testing/selftests/riscv/sigreturn/.gitignore @@ -0,0 +1 @@ +sigreturn diff --git a/tools/testing/selftests/riscv/sigreturn/Makefile b/tools/testing/selftests/riscv/sigreturn/Makefile new file mode 100644 index 000000000000..eb8bac9279a8 --- /dev/null +++ b/tools/testing/selftests/riscv/sigreturn/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2021 ARM Limited +# Originally tools/testing/arm64/abi/Makefile + +CFLAGS += -I$(top_srcdir)/tools/include + +TEST_GEN_PROGS := sigreturn + +include ../../lib.mk + +$(OUTPUT)/sigreturn: sigreturn.c + $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/sigreturn/sigreturn.c b/tools/testing/selftests/riscv/sigreturn/sigreturn.c new file mode 100644 index 000000000000..62397d5934f1 --- /dev/null +++ b/tools/testing/selftests/riscv/sigreturn/sigreturn.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include "../../kselftest_harness.h" + +#define RISCV_V_MAGIC 0x53465457 +#define DEFAULT_VALUE 2 +#define SIGNAL_HANDLER_OVERRIDE 3 + +static void simple_handle(int sig_no, siginfo_t *info, void *vcontext) +{ + ucontext_t *context = vcontext; + + context->uc_mcontext.__gregs[REG_PC] = context->uc_mcontext.__gregs[REG_PC] + 4; +} + +static void vector_override(int sig_no, siginfo_t *info, void *vcontext) +{ + ucontext_t *context = vcontext; + + // vector state + struct __riscv_extra_ext_header *ext; + struct __riscv_v_ext_state *v_ext_state; + + /* Find the vector context. */ + ext = (void *)(&context->uc_mcontext.__fpregs); + if (ext->hdr.magic != RISCV_V_MAGIC) { + fprintf(stderr, "bad vector magic: %x\n", ext->hdr.magic); + abort(); + } + + v_ext_state = (void *)((char *)(ext) + sizeof(*ext)); + + *(int *)v_ext_state->datap = SIGNAL_HANDLER_OVERRIDE; + + context->uc_mcontext.__gregs[REG_PC] = context->uc_mcontext.__gregs[REG_PC] + 4; +} + +static int vector_sigreturn(int data, void (*handler)(int, siginfo_t *, void *)) +{ + int after_sigreturn; + struct sigaction sig_action = { + .sa_sigaction = handler, + .sa_flags = SA_SIGINFO + }; + + sigaction(SIGSEGV, &sig_action, 0); + + asm(".option push \n\ + .option arch, +v \n\ + vsetivli x0, 1, e32, ta, ma \n\ + vmv.s.x v0, %1 \n\ + # Generate SIGSEGV \n\ + lw a0, 0(x0) \n\ + vmv.x.s %0, v0 \n\ + .option pop" : "=r" (after_sigreturn) : "r" (data)); + + return after_sigreturn; +} + +TEST(vector_restore) +{ + int result; + + result = vector_sigreturn(DEFAULT_VALUE, &simple_handle); + + EXPECT_EQ(DEFAULT_VALUE, result); +} + +TEST(vector_restore_signal_handler_override) +{ + int result; + + result = vector_sigreturn(DEFAULT_VALUE, &vector_override); + + EXPECT_EQ(SIGNAL_HANDLER_OVERRIDE, result); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From a61a459f58221f09810d6f60c657dda7add739fa Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Mon, 20 May 2024 23:58:43 +0000 Subject: testing: net-drv: use stats64 for testing Testing a network device that has large numbers of bytes/packets may overflow. Using stats64 when comparing fixes this problem. I tripped on this while iterating on a qstats patch for mlx5. See below for confirmation without my added code that this is a bug. Before this patch (with added debugging output): $ NETIF=eth0 tools/testing/selftests/drivers/net/stats.py KTAP version 1 1..4 ok 1 stats.check_pause ok 2 stats.check_fec rstat: 481708634 qstat: 666201639514 key: tx-bytes not ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex Note the huge delta above ^^^ in the rtnl vs qstats. After this patch: $ NETIF=eth0 tools/testing/selftests/drivers/net/stats.py KTAP version 1 1..4 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex It looks like rtnl_fill_stats in net/core/rtnetlink.c will attempt to copy the 64bit stats into a 32bit structure which is probably why this behavior is occurring. To show this is happening, you can get the underlying stats that the stats.py test uses like this: $ ./cli.py --spec ../../../Documentation/netlink/specs/rt_link.yaml \ --do getlink --json '{"ifi-index": 7}' And examine the output (heavily snipped to show relevant fields): 'stats': { 'multicast': 3739197, 'rx-bytes': 1201525399, 'rx-packets': 56807158, 'tx-bytes': 492404458, 'tx-packets': 1200285371, 'stats64': { 'multicast': 3739197, 'rx-bytes': 35561263767, 'rx-packets': 56807158, 'tx-bytes': 666212335338, 'tx-packets': 1200285371, The stats.py test prior to this patch was using the 'stats' structure above, which matches the failure output on my system. Comparing side by side, rx-bytes and tx-bytes, and getting ethtool -S output: rx-bytes stats: 1201525399 rx-bytes stats64: 35561263767 rx-bytes ethtool: 36203402638 tx-bytes stats: 492404458 tx-bytes stats64: 666212335338 tx-bytes ethtool: 666215360113 Note that the above was taken from a system with an mlx5 NIC, which only exposes ndo_get_stats64. Based on the ethtool output and qstat output, it appears that stats.py should be updated to use the 'stats64' structure for accurate comparisons when packet/byte counters get very large. To confirm that this was not related to the qstats code I was iterating on, I booted a kernel without my driver changes and re-ran the test which shows the qstats are skipped (as they don't exist for mlx5): NETIF=eth0 tools/testing/selftests/drivers/net/stats.py KTAP version 1 1..4 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum # SKIP qstats not supported by the device ok 4 stats.qstat_by_ifindex # SKIP No ifindex supports qstats But, fetching the stats using the CLI $ ./cli.py --spec ../../../Documentation/netlink/specs/rt_link.yaml \ --do getlink --json '{"ifi-index": 7}' Shows the same issue (heavily snipped for relevant fields only): 'stats': { 'multicast': 105489, 'rx-bytes': 530879526, 'rx-packets': 751415, 'tx-bytes': 2510191396, 'tx-packets': 27700323, 'stats64': { 'multicast': 105489, 'rx-bytes': 530879526, 'rx-packets': 751415, 'tx-bytes': 15395093284, 'tx-packets': 27700323, Comparing side by side with ethtool -S on the unmodified mlx5 driver: tx-bytes stats: 2510191396 tx-bytes stats64: 15395093284 tx-bytes ethtool: 17718435810 Fixes: f0e6c86e4bab ("testing: net-drv: add a driver test for stats reporting") Signed-off-by: Joe Damato Link: https://lore.kernel.org/r/20240520235850.190041-1-jdamato@fastly.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 7a7b16b180e2..820b8e0a22c6 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -69,7 +69,7 @@ def pkt_byte_sum(cfg) -> None: return 0 for _ in range(10): - rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats'] + rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] if stat_cmp(rtstat, qstat) < 0: raise Exception("RTNL stats are lower, fetched later") qstat = get_qstat(cfg) -- cgit v1.2.3 From df73757cf8f66fa54c4721c53b0916af3c4d9818 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 3 Apr 2024 19:10:09 -0600 Subject: tools/latency-collector: Fix -Wformat-security compile warns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following -Wformat-security compile warnings adding missing format arguments: latency-collector.c: In function ‘show_available’: latency-collector.c:938:17: warning: format not a string literal and no format arguments [-Wformat-security] 938 | warnx(no_tracer_msg); | ^~~~~ latency-collector.c:943:17: warning: format not a string literal and no format arguments [-Wformat-security] 943 | warnx(no_latency_tr_msg); | ^~~~~ latency-collector.c: In function ‘find_default_tracer’: latency-collector.c:986:25: warning: format not a string literal and no format arguments [-Wformat-security] 986 | errx(EXIT_FAILURE, no_tracer_msg); | ^~~~ latency-collector.c: In function ‘scan_arguments’: latency-collector.c:1881:33: warning: format not a string literal and no format arguments [-Wformat-security] 1881 | errx(EXIT_FAILURE, no_tracer_msg); | ^~~~ Link: https://lore.kernel.org/linux-trace-kernel/20240404011009.32945-1-skhan@linuxfoundation.org Cc: stable@vger.kernel.org Fixes: e23db805da2df ("tracing/tools: Add the latency-collector to tools directory") Signed-off-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- tools/tracing/latency/latency-collector.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index 0fd9c747d396..cf263fe9deaf 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -935,12 +935,12 @@ static void show_available(void) } if (!tracers) { - warnx(no_tracer_msg); + warnx("%s", no_tracer_msg); return; } if (!found) { - warnx(no_latency_tr_msg); + warnx("%s", no_latency_tr_msg); tracefs_list_free(tracers); return; } @@ -983,7 +983,7 @@ static const char *find_default_tracer(void) for (i = 0; relevant_tracers[i]; i++) { valid = tracer_valid(relevant_tracers[i], ¬racer); if (notracer) - errx(EXIT_FAILURE, no_tracer_msg); + errx(EXIT_FAILURE, "%s", no_tracer_msg); if (valid) return relevant_tracers[i]; } @@ -1878,7 +1878,7 @@ static void scan_arguments(int argc, char *argv[]) } valid = tracer_valid(current_tracer, ¬racer); if (notracer) - errx(EXIT_FAILURE, no_tracer_msg); + errx(EXIT_FAILURE, "%s", no_tracer_msg); if (!valid) errx(EXIT_FAILURE, "The tracer %s is not supported by your kernel!\n", current_tracer); -- cgit v1.2.3 From 4926c7a52de75c7219a04de7fa857ab30653704d Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Mon, 15 Apr 2024 16:35:22 +0000 Subject: selftest mm/mseal memory sealing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit selftest for memory sealing change in mmap() and mseal(). Link: https://lkml.kernel.org/r/20240415163527.626541-4-jeffxu@chromium.org Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Reviewed-by: Liam R. Howlett Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: Jann Horn Cc: Jeff Xu Cc: Jonathan Corbet Cc: Jorge Lucangeli Obes Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Cc: Muhammad Usama Anjum Cc: Pedro Falcato Cc: Stephen Röttger Cc: Suren Baghdasaryan Cc: Amer Al Shanawany Cc: Javier Carrasco Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/mseal_test.c | 1836 +++++++++++++++++++++++++++++++ 3 files changed, 1838 insertions(+) create mode 100644 tools/testing/selftests/mm/mseal_test.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index d26e962f2ac4..98eaa4590f11 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -47,3 +47,4 @@ mkdirty va_high_addr_switch hugetlb_fault_after_madv hugetlb_madv_vs_map +mseal_test diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 60b8feb6a5ec..3482b5aef984 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -59,6 +59,7 @@ TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mrelease_test TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += mremap_test +TEST_GEN_FILES += mseal_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += thuge-gen diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c new file mode 100644 index 000000000000..06c780d1d8e5 --- /dev/null +++ b/tools/testing/selftests/mm/mseal_test.c @@ -0,0 +1,1836 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * need those definition for manually build using gcc. + * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 mseal_test.c -o mseal_test + */ +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#ifndef PKEY_BITS_PER_KEY +#define PKEY_BITS_PER_PKEY 2 +#endif + +#ifndef PKEY_MASK +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif + +#define FAIL_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + +#define SKIP_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + + +#define TEST_END_CHECK() {\ + ksft_test_result_pass("%s\n", __func__);\ + return;\ +test_end:\ + return;\ +} + +#ifndef u64 +#define u64 unsigned long long +#endif + +static unsigned long get_vma_size(void *addr, int *prot) +{ + FILE *maps; + char line[256]; + int size = 0; + uintptr_t addr_start, addr_end; + char protstr[5]; + *prot = 0; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) + return 0; + + while (fgets(line, sizeof(line), maps)) { + if (sscanf(line, "%lx-%lx %4s", &addr_start, &addr_end, &protstr) == 3) { + if (addr_start == (uintptr_t) addr) { + size = addr_end - addr_start; + if (protstr[0] == 'r') + *prot |= 0x4; + if (protstr[1] == 'w') + *prot |= 0x2; + if (protstr[2] == 'x') + *prot |= 0x1; + break; + } + } + } + fclose(maps); + return size; +} + +/* + * define sys_xyx to call syscall directly. + */ +static int sys_mseal(void *start, size_t len) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mseal, start, len, 0); + return sret; +} + +static int sys_mprotect(void *ptr, size_t size, unsigned long prot) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mprotect, ptr, size, prot); + return sret; +} + +static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + errno = 0; + sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); + return sret; +} + +static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, unsigned long offset) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mmap, addr, len, prot, + flags, fd, offset); + return sret; +} + +static int sys_munmap(void *ptr, size_t size) +{ + int sret; + + errno = 0; + sret = syscall(__NR_munmap, ptr, size); + return sret; +} + +static int sys_madvise(void *start, size_t len, int types) +{ + int sret; + + errno = 0; + sret = syscall(__NR_madvise, start, len, types); + return sret; +} + +static int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(__NR_pkey_alloc, flags, init_val); + + return ret; +} + +static unsigned int __read_pkey_reg(void) +{ + unsigned int pkey_reg = 0; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax, edx; + unsigned int ecx = 0; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; +#endif + return pkey_reg; +} + +static void __write_pkey_reg(u64 pkey_reg) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +#endif +} + +static unsigned long pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + unsigned long shift = pkey_bit_position(pkey); + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static void set_pkey(int pkey, unsigned long pkey_value) +{ + unsigned long mask = (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE); + u64 new_pkey_reg; + + assert(!(pkey_value & ~mask)); + new_pkey_reg = set_pkey_bits(__read_pkey_reg(), pkey, pkey_value); + __write_pkey_reg(new_pkey_reg); +} + +static void setup_single_address(int size, void **ptrOut) +{ + void *ptr; + + ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + assert(ptr != (void *)-1); + *ptrOut = ptr; +} + +static void setup_single_address_rw(int size, void **ptrOut) +{ + void *ptr; + unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; + + ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); + assert(ptr != (void *)-1); + *ptrOut = ptr; +} + +static void clean_single_address(void *ptr, int size) +{ + int ret; + + ret = munmap(ptr, size); + assert(!ret); +} + +static void seal_single_address(void *ptr, int size) +{ + int ret; + + ret = sys_mseal(ptr, size); + assert(!ret); +} + +bool seal_support(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + + ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == (void *) -1) + return false; + + ret = sys_mseal(ptr, page_size); + if (ret < 0) + return false; + + return true; +} + +bool pkey_supported(void) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + int pkey = sys_pkey_alloc(0, 0); + + if (pkey > 0) + return true; +#endif + return false; +} + +static void test_seal_addseal(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_start(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + /* munmap 2 pages from ptr. */ + ret = sys_munmap(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail because 2 pages from ptr are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail because 2 pages from ptr are unmapped. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + ret = sys_mseal(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_middle(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + /* munmap 2 pages from ptr + page. */ + ret = sys_munmap(ptr + page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail, since middle 2 pages are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* we still can add seal to the first page and last page*/ + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mseal(ptr + 3 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_end(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + /* unmap last 2 pages. */ + ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail since last 2 pages are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* The first 2 pages is not sealed, and can add seals */ + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_multiple_vmas(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + /* use mprotect to split the vma into 3. */ + ret = sys_mprotect(ptr + page_size, 2 * page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will get applied to all 4 pages - 3 VMAs. */ + ret = sys_mprotect(ptr, size, PROT_READ); + FAIL_TEST_IF_FALSE(!ret); + + /* use mprotect to split the vma into 3. */ + ret = sys_mprotect(ptr + page_size, 2 * page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mseal get applied to all 4 pages - 3 VMAs. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_split_start(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + /* use mprotect to split at middle */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first page, this will split the VMA */ + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* add seal to the remain 3 pages */ + ret = sys_mseal(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_split_end(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + /* use mprotect to split at middle */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the last page */ + ret = sys_mseal(ptr + 3 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* Adding seals to the first 3 pages */ + ret = sys_mseal(ptr, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_invalid_input(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(8 * page_size, &ptr); + clean_single_address(ptr + 4 * page_size, 4 * page_size); + + /* invalid flag */ + ret = syscall(__NR_mseal, ptr, size, 0x20); + FAIL_TEST_IF_FALSE(ret < 0); + + /* unaligned address */ + ret = sys_mseal(ptr + 1, 2 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* length too big */ + ret = sys_mseal(ptr, 5 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* length overflow */ + ret = sys_mseal(ptr, UINT64_MAX/page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* start is not in a valid VMA */ + ret = sys_mseal(ptr - page_size, 5 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + TEST_END_CHECK(); +} + +static void test_seal_zero_length(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + ret = sys_mprotect(ptr, 0, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal 0 length will be OK, same as mprotect */ + ret = sys_mseal(ptr, 0); + FAIL_TEST_IF_FALSE(!ret); + + /* verify the 4 pages are not sealed by previous call. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_zero_address(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + /* use mmap to change protection. */ + ptr = sys_mmap(0, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + FAIL_TEST_IF_FALSE(ptr == 0); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == 4 * page_size); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + /* verify the 4 pages are sealed by previous call. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret); + + TEST_END_CHECK(); +} + +static void test_seal_twice(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + /* apply the same seal will be OK. idempotent. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + if (seal) + seal_single_address(ptr, size); + + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_start_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + if (seal) + seal_single_address(ptr, page_size); + + /* the first page is sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* pages after the first page is not sealed. */ + ret = sys_mprotect(ptr + page_size, page_size * 3, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_end_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + if (seal) + seal_single_address(ptr + page_size, 3 * page_size); + + /* first page is not sealed */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* last 3 page are sealed */ + ret = sys_mprotect(ptr + page_size, page_size * 3, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_unalign_len(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + if (seal) + seal_single_address(ptr, page_size * 2 - 1); + + /* 2 pages are sealed. */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 2, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_unalign_len_variant_2(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + if (seal) + seal_single_address(ptr, page_size * 2 + 1); + + /* 3 pages are sealed. */ + ret = sys_mprotect(ptr, page_size * 3, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 3, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + /* use mprotect to split */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) + seal_single_address(ptr, page_size * 4); + + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 2, page_size * 2, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma_with_split(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + /* use mprotect to split as two vma. */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mseal can apply across 2 vma, also split them. */ + if (seal) + seal_single_address(ptr + page_size, page_size * 2); + + /* the first page is not sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* the second page is sealed. */ + ret = sys_mprotect(ptr + page_size, page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* the third page is sealed. */ + ret = sys_mprotect(ptr + 2 * page_size, page_size, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* the fouth page is not sealed. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_partial_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + /* seal one page. */ + if (seal) + seal_single_address(ptr, page_size); + + /* mprotect first 2 page will fail, since the first page are sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma_with_gap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* use munmap to free two pages in the middle */ + ret = sys_munmap(ptr + page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail, because there is a gap in the address. */ + /* notes, internally mprotect still updated the first page. */ + ret = sys_mprotect(ptr, 4 * page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, 4 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* the first page is not sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + /* the last page is not sealed. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_split(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal all 4 pages. */ + if (seal) { + ret = sys_mseal(ptr, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mprotect is sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + + ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_merge(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + /* use mprotect to split one page. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal first two pages. */ + if (seal) { + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 2 pages are sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* last 2 pages are not sealed. */ + ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + TEST_END_CHECK(); +} + +static void test_seal_munmap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 4 pages are sealed. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +/* + * allocate 4 pages, + * use mprotect to split it as two VMAs + * seal the whole range + * munmap will fail on both + */ +static void test_seal_munmap_two_vma(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + /* use mprotect to split */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_munmap(ptr, page_size * 2); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr + page_size, page_size * 2); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +/* + * allocate a VMA with 4 pages. + * munmap the middle 2 pages. + * seal the whole 4 pages, will fail. + * munmap the first page will be OK. + * munmap the last page will be OK. + */ +static void test_seal_munmap_vma_with_gap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + ret = sys_munmap(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + /* can't have gap in the middle. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + } + + ret = sys_munmap(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr + page_size * 2, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_munmap_start_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + + /* unmap the first page. */ + ret = sys_munmap(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the last 3 pages. */ + if (seal) { + ret = sys_mseal(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* unmap from the first page. */ + ret = sys_munmap(ptr, size); + if (seal) { + FAIL_TEST_IF_FALSE(ret < 0); + + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size * 3); + } else { + /* note: this will be OK, even the first page is */ + /* already unmapped. */ + FAIL_TEST_IF_FALSE(!ret); + + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 0); + } + + TEST_END_CHECK(); +} + +static void test_munmap_end_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + /* unmap last page. */ + ret = sys_munmap(ptr + page_size * 3, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first 3 pages. */ + if (seal) { + ret = sys_mseal(ptr, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* unmap all pages. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_munmap_middle_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + /* unmap 2 pages in the middle. */ + ret = sys_munmap(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first page. */ + if (seal) { + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* munmap all 4 pages. */ + ret = sys_munmap(ptr, size); + if (seal) { + FAIL_TEST_IF_FALSE(ret < 0); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + + size = get_vma_size(ptr + page_size * 3, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + } else { + FAIL_TEST_IF_FALSE(!ret); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == 0); + + size = get_vma_size(ptr + page_size * 3, &prot); + FAIL_TEST_IF_FALSE(size == 0); + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_shrink(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* shrink from 4 pages to 2 pages. */ + ret2 = mremap(ptr, size, 2 * page_size, 0, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_expand(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + /* ummap last 2 pages. */ + ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* expand from 2 page to 4 pages. */ + ret2 = mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 == ptr); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move(bool seal) +{ + void *ptr, *newPtr; + unsigned long page_size = getpagesize(); + unsigned long size = page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + setup_single_address(size, &newPtr); + clean_single_address(newPtr, size); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* move from ptr to fixed address. */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mmap_overwrite_prot(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to change protection. */ + ret2 = sys_mmap(ptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mmap_expand(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 12 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + /* ummap last 4 pages. */ + ret = sys_munmap(ptr + 8 * page_size, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, 8 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to expand. */ + ret2 = sys_mmap(ptr, size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mmap_shrink(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 12 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to shrink. */ + ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_shrink_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + setup_single_address(size, &newAddr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move and shrink to fixed address */ + ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_expand_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(page_size, &ptr); + setup_single_address(size, &newAddr); + + if (seal) { + ret = sys_mseal(newAddr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move and expand to fixed address */ + ret2 = mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, + newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + setup_single_address(size, &newAddr); + + if (seal) { + ret = sys_mseal(newAddr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move to fixed address */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_fixed_zero(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * MREMAP_FIXED can move the mapping to zero address + */ + ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 == 0); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_dontunmap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move, and don't unmap src addr. */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * The 0xdeaddead should not have effect on dest addr + * when MREMAP_DONTUNMAP is set. + */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + 0xdeaddead); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + FAIL_TEST_IF_FALSE((long)ret2 != 0xdeaddead); + + } + + TEST_END_CHECK(); +} + + +static void test_seal_merge_and_split(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size; + int ret; + int prot; + + /* (24 RO) */ + setup_single_address(24 * page_size, &ptr); + + /* use mprotect(NONE) to set out boundary */ + /* (1 NONE) (22 RO) (1 NONE) */ + ret = sys_mprotect(ptr, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(!ret); + ret = sys_mprotect(ptr + 23 * page_size, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 22 * page_size); + FAIL_TEST_IF_FALSE(prot == 4); + + /* use mseal to split from beginning */ + /* (1 NONE) (1 RO_SEAL) (21 RO) (1 NONE) */ + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 2 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 21 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* use mseal to split from the end. */ + /* (1 NONE) (1 RO_SEAL) (20 RO) (1 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 22 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 22 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 2 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 20 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge with prev. */ + /* (1 NONE) (2 RO_SEAL) (19 RO) (1 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 2 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge with after. */ + /* (1 NONE) (2 RO_SEAL) (18 RO) (2 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 21 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 21 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 2 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* split and merge from prev */ + /* (1 NONE) (3 RO_SEAL) (17 RO) (2 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 1 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 3 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + ret = sys_munmap(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(ret < 0); + ret = sys_mprotect(ptr + 2 * page_size, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* split and merge from next */ + /* (1 NONE) (3 RO_SEAL) (16 RO) (3 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 20 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 20 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 3 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge from middle of prev and middle of next. */ + /* (1 NONE) (22 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, 20 * page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 22 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_rw(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address_rw(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't take effect on RW memory. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + /* base seal still apply. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_pkey(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int pkey; + + SKIP_TEST_IF_FALSE(pkey_supported()); + + setup_single_address_rw(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + pkey = sys_pkey_alloc(0, 0); + FAIL_TEST_IF_FALSE(pkey > 0); + + ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't take effect if PKRU allow write. */ + set_pkey(pkey, 0); + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + /* sealing will take effect if PKRU deny write. */ + set_pkey(pkey, PKEY_DISABLE_WRITE); + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* base seal still apply. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_filebacked(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int fd; + unsigned long mapflags = MAP_PRIVATE; + + fd = memfd_create("test", 0); + FAIL_TEST_IF_FALSE(fd > 0); + + ret = fallocate(fd, 0, 0, size); + FAIL_TEST_IF_FALSE(!ret); + + ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0); + FAIL_TEST_IF_FALSE(ptr != MAP_FAILED); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't apply for file backed mapping. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + close(fd); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_shared(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED; + + ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't apply for shared mapping. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + + if (seal) + seal_single_address(ptr, size); + + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +int main(int argc, char **argv) +{ + bool test_seal = seal_support(); + + ksft_print_header(); + + if (!test_seal) + ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); + + if (!pkey_supported()) + ksft_print_msg("PKEY not supported\n"); + + ksft_set_plan(80); + + test_seal_addseal(); + test_seal_unmapped_start(); + test_seal_unmapped_middle(); + test_seal_unmapped_end(); + test_seal_multiple_vmas(); + test_seal_split_start(); + test_seal_split_end(); + test_seal_invalid_input(); + test_seal_zero_length(); + test_seal_twice(); + + test_seal_mprotect(false); + test_seal_mprotect(true); + + test_seal_start_mprotect(false); + test_seal_start_mprotect(true); + + test_seal_end_mprotect(false); + test_seal_end_mprotect(true); + + test_seal_mprotect_unalign_len(false); + test_seal_mprotect_unalign_len(true); + + test_seal_mprotect_unalign_len_variant_2(false); + test_seal_mprotect_unalign_len_variant_2(true); + + test_seal_mprotect_two_vma(false); + test_seal_mprotect_two_vma(true); + + test_seal_mprotect_two_vma_with_split(false); + test_seal_mprotect_two_vma_with_split(true); + + test_seal_mprotect_partial_mprotect(false); + test_seal_mprotect_partial_mprotect(true); + + test_seal_mprotect_two_vma_with_gap(false); + test_seal_mprotect_two_vma_with_gap(true); + + test_seal_mprotect_merge(false); + test_seal_mprotect_merge(true); + + test_seal_mprotect_split(false); + test_seal_mprotect_split(true); + + test_seal_munmap(false); + test_seal_munmap(true); + test_seal_munmap_two_vma(false); + test_seal_munmap_two_vma(true); + test_seal_munmap_vma_with_gap(false); + test_seal_munmap_vma_with_gap(true); + + test_munmap_start_freed(false); + test_munmap_start_freed(true); + test_munmap_middle_freed(false); + test_munmap_middle_freed(true); + test_munmap_end_freed(false); + test_munmap_end_freed(true); + + test_seal_mremap_shrink(false); + test_seal_mremap_shrink(true); + test_seal_mremap_expand(false); + test_seal_mremap_expand(true); + test_seal_mremap_move(false); + test_seal_mremap_move(true); + + test_seal_mremap_shrink_fixed(false); + test_seal_mremap_shrink_fixed(true); + test_seal_mremap_expand_fixed(false); + test_seal_mremap_expand_fixed(true); + test_seal_mremap_move_fixed(false); + test_seal_mremap_move_fixed(true); + test_seal_mremap_move_dontunmap(false); + test_seal_mremap_move_dontunmap(true); + test_seal_mremap_move_fixed_zero(false); + test_seal_mremap_move_fixed_zero(true); + test_seal_mremap_move_dontunmap_anyaddr(false); + test_seal_mremap_move_dontunmap_anyaddr(true); + test_seal_discard_ro_anon(false); + test_seal_discard_ro_anon(true); + test_seal_discard_ro_anon_on_rw(false); + test_seal_discard_ro_anon_on_rw(true); + test_seal_discard_ro_anon_on_shared(false); + test_seal_discard_ro_anon_on_shared(true); + test_seal_discard_ro_anon_on_filebacked(false); + test_seal_discard_ro_anon_on_filebacked(true); + test_seal_mmap_overwrite_prot(false); + test_seal_mmap_overwrite_prot(true); + test_seal_mmap_expand(false); + test_seal_mmap_expand(true); + test_seal_mmap_shrink(false); + test_seal_mmap_shrink(true); + + test_seal_merge_and_split(); + test_seal_zero_address(); + + test_seal_discard_ro_anon_on_pkey(false); + test_seal_discard_ro_anon_on_pkey(true); + + ksft_finished(); + return 0; +} -- cgit v1.2.3 From a52b4f11a2e17109c4b9f7df4ff19215b1752efc Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Mon, 15 Apr 2024 16:35:24 +0000 Subject: selftest mm/mseal read-only elf memory segment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sealing read-only of elf mapping so it can't be changed by mprotect. [jeffxu@chromium.org: style change] Link: https://lkml.kernel.org/r/20240416220944.2481203-2-jeffxu@chromium.org [amer.shanawany@gmail.com: fix linker error for inline function] Link: https://lkml.kernel.org/r/20240420202346.546444-1-amer.shanawany@gmail.com [jeffxu@chromium.org: fix compile warning] Link: https://lkml.kernel.org/r/20240420003515.345982-2-jeffxu@chromium.org [jeffxu@chromium.org: fix arm build] Link: https://lkml.kernel.org/r/20240502225331.3806279-2-jeffxu@chromium.org Link: https://lkml.kernel.org/r/20240415163527.626541-6-jeffxu@chromium.org Signed-off-by: Jeff Xu Signed-off-by: Amer Al Shanawany Reviewed-by: Kees Cook Reviewed-by: Liam R. Howlett Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: Jann Horn Cc: Jeff Xu Cc: Jonathan Corbet Cc: Jorge Lucangeli Obes Cc: Linus Torvalds Cc: Matthew Wilcox (Oracle) Cc: Muhammad Usama Anjum Cc: Pedro Falcato Cc: Stephen Röttger Cc: Suren Baghdasaryan Cc: Amer Al Shanawany Cc: Javier Carrasco Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/mseal_test.c | 130 ++++++++++++++++------- tools/testing/selftests/mm/seal_elf.c | 179 ++++++++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 36 deletions(-) create mode 100644 tools/testing/selftests/mm/seal_elf.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 98eaa4590f11..0b9ab987601c 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -48,3 +48,4 @@ va_high_addr_switch hugetlb_fault_after_madv hugetlb_madv_vs_map mseal_test +seal_elf diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 3482b5aef984..3b49bc3d0a3b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -60,6 +60,7 @@ TEST_GEN_FILES += mrelease_test TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += mremap_test TEST_GEN_FILES += mseal_test +TEST_GEN_FILES += seal_elf TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += thuge-gen diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 06c780d1d8e5..41998cf1dcf5 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #define _GNU_SOURCE +#include #include #include #include @@ -12,9 +13,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -31,7 +30,7 @@ # define PKEY_DISABLE_WRITE 0x2 #endif -#ifndef PKEY_BITS_PER_KEY +#ifndef PKEY_BITS_PER_PKEY #define PKEY_BITS_PER_PKEY 2 #endif @@ -81,7 +80,7 @@ static unsigned long get_vma_size(void *addr, int *prot) return 0; while (fgets(line, sizeof(line), maps)) { - if (sscanf(line, "%lx-%lx %4s", &addr_start, &addr_end, &protstr) == 3) { + if (sscanf(line, "%lx-%lx %4s", &addr_start, &addr_end, protstr) == 3) { if (addr_start == (uintptr_t) addr) { size = addr_end - addr_start; if (protstr[0] == 'r') @@ -189,7 +188,6 @@ static void __write_pkey_reg(u64 pkey_reg) asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkey_reg == __read_pkey_reg()); #endif } @@ -211,10 +209,8 @@ static u64 set_pkey_bits(u64 reg, int pkey, u64 flags) static void set_pkey(int pkey, unsigned long pkey_value) { - unsigned long mask = (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE); u64 new_pkey_reg; - assert(!(pkey_value & ~mask)); new_pkey_reg = set_pkey_bits(__read_pkey_reg(), pkey, pkey_value); __write_pkey_reg(new_pkey_reg); } @@ -224,7 +220,6 @@ static void setup_single_address(int size, void **ptrOut) void *ptr; ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - assert(ptr != (void *)-1); *ptrOut = ptr; } @@ -234,24 +229,21 @@ static void setup_single_address_rw(int size, void **ptrOut) unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); - assert(ptr != (void *)-1); *ptrOut = ptr; } -static void clean_single_address(void *ptr, int size) +static int clean_single_address(void *ptr, int size) { int ret; - ret = munmap(ptr, size); - assert(!ret); + return ret; } -static void seal_single_address(void *ptr, int size) +static int seal_single_address(void *ptr, int size) { int ret; - ret = sys_mseal(ptr, size); - assert(!ret); + return ret; } bool seal_support(void) @@ -290,6 +282,7 @@ static void test_seal_addseal(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); ret = sys_mseal(ptr, size); FAIL_TEST_IF_FALSE(!ret); @@ -305,6 +298,7 @@ static void test_seal_unmapped_start(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* munmap 2 pages from ptr. */ ret = sys_munmap(ptr, 2 * page_size); @@ -332,6 +326,7 @@ static void test_seal_unmapped_middle(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* munmap 2 pages from ptr + page. */ ret = sys_munmap(ptr + page_size, 2 * page_size); @@ -363,6 +358,7 @@ static void test_seal_unmapped_end(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* unmap last 2 pages. */ ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); @@ -391,6 +387,7 @@ static void test_seal_multiple_vmas(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split the vma into 3. */ ret = sys_mprotect(ptr + page_size, 2 * page_size, @@ -421,6 +418,7 @@ static void test_seal_split_start(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split at middle */ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); @@ -445,6 +443,7 @@ static void test_seal_split_end(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split at middle */ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); @@ -469,7 +468,9 @@ static void test_seal_invalid_input(void) int ret; setup_single_address(8 * page_size, &ptr); - clean_single_address(ptr + 4 * page_size, 4 * page_size); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + ret = clean_single_address(ptr + 4 * page_size, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); /* invalid flag */ ret = syscall(__NR_mseal, ptr, size, 0x20); @@ -502,6 +503,7 @@ static void test_seal_zero_length(void) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); ret = sys_mprotect(ptr, 0, PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); @@ -551,6 +553,7 @@ static void test_seal_twice(void) unsigned long size = 4 * page_size; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); ret = sys_mseal(ptr, size); FAIL_TEST_IF_FALSE(!ret); @@ -570,9 +573,12 @@ static void test_seal_mprotect(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); - if (seal) - seal_single_address(ptr, size); + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); if (seal) @@ -591,9 +597,12 @@ static void test_seal_start_mprotect(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); - if (seal) - seal_single_address(ptr, page_size); + if (seal) { + ret = seal_single_address(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } /* the first page is sealed. */ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); @@ -618,9 +627,12 @@ static void test_seal_end_mprotect(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); - if (seal) - seal_single_address(ptr + page_size, 3 * page_size); + if (seal) { + ret = seal_single_address(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } /* first page is not sealed */ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); @@ -645,9 +657,12 @@ static void test_seal_mprotect_unalign_len(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); - if (seal) - seal_single_address(ptr, page_size * 2 - 1); + if (seal) { + ret = seal_single_address(ptr, page_size * 2 - 1); + FAIL_TEST_IF_FALSE(!ret); + } /* 2 pages are sealed. */ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); @@ -671,8 +686,11 @@ static void test_seal_mprotect_unalign_len_variant_2(bool seal) int ret; setup_single_address(size, &ptr); - if (seal) - seal_single_address(ptr, page_size * 2 + 1); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + if (seal) { + ret = seal_single_address(ptr, page_size * 2 + 1); + FAIL_TEST_IF_FALSE(!ret); + } /* 3 pages are sealed. */ ret = sys_mprotect(ptr, page_size * 3, PROT_READ | PROT_WRITE); @@ -696,13 +714,16 @@ static void test_seal_mprotect_two_vma(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split */ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); - if (seal) - seal_single_address(ptr, page_size * 4); + if (seal) { + ret = seal_single_address(ptr, page_size * 4); + FAIL_TEST_IF_FALSE(!ret); + } ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); if (seal) @@ -728,14 +749,17 @@ static void test_seal_mprotect_two_vma_with_split(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split as two vma. */ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); FAIL_TEST_IF_FALSE(!ret); /* mseal can apply across 2 vma, also split them. */ - if (seal) - seal_single_address(ptr + page_size, page_size * 2); + if (seal) { + ret = seal_single_address(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + } /* the first page is not sealed. */ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); @@ -772,10 +796,13 @@ static void test_seal_mprotect_partial_mprotect(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* seal one page. */ - if (seal) - seal_single_address(ptr, page_size); + if (seal) { + ret = seal_single_address(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } /* mprotect first 2 page will fail, since the first page are sealed. */ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); @@ -795,6 +822,7 @@ static void test_seal_mprotect_two_vma_with_gap(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split. */ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); @@ -837,6 +865,7 @@ static void test_seal_mprotect_split(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split. */ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); @@ -873,6 +902,7 @@ static void test_seal_mprotect_merge(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split one page. */ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); @@ -906,6 +936,7 @@ static void test_seal_munmap(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -936,6 +967,7 @@ static void test_seal_munmap_two_vma(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect to split */ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); @@ -976,6 +1008,7 @@ static void test_seal_munmap_vma_with_gap(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); ret = sys_munmap(ptr + page_size, page_size * 2); FAIL_TEST_IF_FALSE(!ret); @@ -1007,6 +1040,7 @@ static void test_munmap_start_freed(bool seal) int prot; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* unmap the first page. */ ret = sys_munmap(ptr, page_size); @@ -1045,6 +1079,8 @@ static void test_munmap_end_freed(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + /* unmap last page. */ ret = sys_munmap(ptr + page_size * 3, page_size); FAIL_TEST_IF_FALSE(!ret); @@ -1074,6 +1110,8 @@ static void test_munmap_middle_freed(bool seal) int prot; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + /* unmap 2 pages in the middle. */ ret = sys_munmap(ptr + page_size, page_size * 2); FAIL_TEST_IF_FALSE(!ret); @@ -1116,6 +1154,7 @@ static void test_seal_mremap_shrink(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -1144,6 +1183,7 @@ static void test_seal_mremap_expand(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* ummap last 2 pages. */ ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); FAIL_TEST_IF_FALSE(!ret); @@ -1175,8 +1215,11 @@ static void test_seal_mremap_move(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); setup_single_address(size, &newPtr); - clean_single_address(newPtr, size); + FAIL_TEST_IF_FALSE(newPtr != (void *)-1); + ret = clean_single_address(newPtr, size); + FAIL_TEST_IF_FALSE(!ret); if (seal) { ret = sys_mseal(ptr, size); @@ -1205,6 +1248,7 @@ static void test_seal_mmap_overwrite_prot(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -1232,6 +1276,7 @@ static void test_seal_mmap_expand(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* ummap last 4 pages. */ ret = sys_munmap(ptr + 8 * page_size, 4 * page_size); FAIL_TEST_IF_FALSE(!ret); @@ -1262,6 +1307,7 @@ static void test_seal_mmap_shrink(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -1290,7 +1336,9 @@ static void test_seal_mremap_shrink_fixed(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -1319,7 +1367,9 @@ static void test_seal_mremap_expand_fixed(bool seal) void *ret2; setup_single_address(page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); if (seal) { ret = sys_mseal(newAddr, size); @@ -1348,7 +1398,9 @@ static void test_seal_mremap_move_fixed(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); if (seal) { ret = sys_mseal(newAddr, size); @@ -1375,6 +1427,7 @@ static void test_seal_mremap_move_fixed_zero(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -1406,6 +1459,7 @@ static void test_seal_mremap_move_dontunmap(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -1434,6 +1488,7 @@ static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) void *ret2; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { ret = sys_mseal(ptr, size); @@ -1469,6 +1524,7 @@ static void test_seal_merge_and_split(void) /* (24 RO) */ setup_single_address(24 * page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); /* use mprotect(NONE) to set out boundary */ /* (1 NONE) (22 RO) (1 NONE) */ @@ -1700,9 +1756,12 @@ static void test_seal_discard_ro_anon(bool seal) int ret; setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); - if (seal) - seal_single_address(ptr, size); + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } ret = sys_madvise(ptr, size, MADV_DONTNEED); if (seal) @@ -1832,5 +1891,4 @@ int main(int argc, char **argv) test_seal_discard_ro_anon_on_pkey(true); ksft_finished(); - return 0; } diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c new file mode 100644 index 000000000000..f2babec79bb6 --- /dev/null +++ b/tools/testing/selftests/mm/seal_elf.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * need those definition for manually build using gcc. + * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 seal_elf.c -o seal_elf + */ +#define FAIL_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + +#define SKIP_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + + +#define TEST_END_CHECK() {\ + ksft_test_result_pass("%s\n", __func__);\ + return;\ +test_end:\ + return;\ +} + +#ifndef u64 +#define u64 unsigned long long +#endif + +/* + * define sys_xyx to call syscall directly. + */ +static int sys_mseal(void *start, size_t len) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mseal, start, len, 0); + return sret; +} + +static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, unsigned long offset) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mmap, addr, len, prot, + flags, fd, offset); + return sret; +} + +static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mprotect, ptr, size, prot); + return sret; +} + +static bool seal_support(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + + ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == (void *) -1) + return false; + + ret = sys_mseal(ptr, page_size); + if (ret < 0) + return false; + + return true; +} + +const char somestr[4096] = {"READONLY"}; + +static void test_seal_elf(void) +{ + int ret; + FILE *maps; + char line[512]; + uintptr_t addr_start, addr_end; + char prot[5]; + char filename[256]; + unsigned long page_size = getpagesize(); + unsigned long long ptr = (unsigned long long) somestr; + char *somestr2 = (char *)somestr; + + /* + * Modify the protection of readonly somestr + */ + if (((unsigned long long)ptr % page_size) != 0) + ptr = (unsigned long long)ptr & ~(page_size - 1); + + ksft_print_msg("somestr = %s\n", somestr); + ksft_print_msg("change protection to rw\n"); + ret = sys_mprotect((void *)ptr, page_size, PROT_READ|PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + *somestr2 = 'A'; + ksft_print_msg("somestr is modified to: %s\n", somestr); + ret = sys_mprotect((void *)ptr, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(!ret); + + maps = fopen("/proc/self/maps", "r"); + FAIL_TEST_IF_FALSE(maps); + + /* + * apply sealing to elf binary + */ + while (fgets(line, sizeof(line), maps)) { + if (sscanf(line, "%lx-%lx %4s %*x %*x:%*x %*u %255[^\n]", + &addr_start, &addr_end, prot, filename) == 4) { + if (strlen(filename)) { + /* + * seal the mapping if read only. + */ + if (strstr(prot, "r-")) { + ret = sys_mseal((void *)addr_start, addr_end - addr_start); + FAIL_TEST_IF_FALSE(!ret); + ksft_print_msg("sealed: %lx-%lx %s %s\n", + addr_start, addr_end, prot, filename); + if ((uintptr_t) somestr >= addr_start && + (uintptr_t) somestr <= addr_end) + ksft_print_msg("mapping for somestr found\n"); + } + } + } + } + fclose(maps); + + ret = sys_mprotect((void *)ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + ksft_print_msg("somestr is sealed, mprotect is rejected\n"); + + TEST_END_CHECK(); +} + +int main(int argc, char **argv) +{ + bool test_seal = seal_support(); + + ksft_print_header(); + ksft_print_msg("pid=%d\n", getpid()); + + if (!test_seal) + ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); + + ksft_set_plan(1); + + test_seal_elf(); + + ksft_finished(); +} -- cgit v1.2.3 From d4202e66a4b1fe6968f17f9f09bbc30d08f028a1 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 21 May 2024 13:13:56 +0530 Subject: selftests/mm: compaction_test: fix bogus test success on Aarch64 Patch series "Fixes for compaction_test", v2. The compaction_test memory selftest introduces fragmentation in memory and then tries to allocate as many hugepages as possible. This series addresses some problems. On Aarch64, if nr_hugepages == 0, then the test trivially succeeds since compaction_index becomes 0, which is less than 3, due to no division by zero exception being raised. We fix that by checking for division by zero. Secondly, correctly set the number of hugepages to zero before trying to set a large number of them. Now, consider a situation in which, at the start of the test, a non-zero number of hugepages have been already set (while running the entire selftests/mm suite, or manually by the admin). The test operates on 80% of memory to avoid OOM-killer invocation, and because some memory is already blocked by hugepages, it would increase the chance of OOM-killing. Also, since mem_free used in check_compaction() is the value before we set nr_hugepages to zero, the chance that the compaction_index will be small is very high if the preset nr_hugepages was high, leading to a bogus test success. This patch (of 3): Currently, if at runtime we are not able to allocate a huge page, the test will trivially pass on Aarch64 due to no exception being raised on division by zero while computing compaction_index. Fix that by checking for nr_hugepages == 0. Anyways, in general, avoid a division by zero by exiting the program beforehand. While at it, fix a typo, and handle the case where the number of hugepages may overflow an integer. Link: https://lkml.kernel.org/r/20240521074358.675031-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20240521074358.675031-2-dev.jain@arm.com Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory") Signed-off-by: Dev Jain Cc: Anshuman Khandual Cc: Shuah Khan Cc: Sri Jayaramappa Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 4f42eb7d7636..0b249a06a60b 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -82,12 +82,13 @@ int prereq(void) return -1; } -int check_compaction(unsigned long mem_free, unsigned int hugepage_size) +int check_compaction(unsigned long mem_free, unsigned long hugepage_size) { + unsigned long nr_hugepages_ul; int fd, ret = -1; int compaction_index = 0; - char initial_nr_hugepages[10] = {0}; - char nr_hugepages[10] = {0}; + char initial_nr_hugepages[20] = {0}; + char nr_hugepages[20] = {0}; /* We want to test with 80% of available memory. Else, OOM killer comes in to play */ @@ -134,7 +135,12 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) /* We should have been able to request at least 1/3 rd of the memory in huge pages */ - compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); + nr_hugepages_ul = strtoul(nr_hugepages, NULL, 10); + if (!nr_hugepages_ul) { + ksft_print_msg("ERROR: No memory is available as huge pages\n"); + goto close_fd; + } + compaction_index = mem_free/(nr_hugepages_ul * hugepage_size); lseek(fd, 0, SEEK_SET); @@ -145,11 +151,11 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) goto close_fd; } - ksft_print_msg("Number of huge pages allocated = %d\n", - atoi(nr_hugepages)); + ksft_print_msg("Number of huge pages allocated = %lu\n", + nr_hugepages_ul); if (compaction_index > 3) { - ksft_print_msg("ERROR: Less that 1/%d of memory is available\n" + ksft_print_msg("ERROR: Less than 1/%d of memory is available\n" "as huge pages\n", compaction_index); goto close_fd; } -- cgit v1.2.3 From 9ad665ef55eaad1ead1406a58a34f615a7c18b5e Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 21 May 2024 13:13:57 +0530 Subject: selftests/mm: compaction_test: fix incorrect write of zero to nr_hugepages Currently, the test tries to set nr_hugepages to zero, but that is not actually done because the file offset is not reset after read(). Fix that using lseek(). Link: https://lkml.kernel.org/r/20240521074358.675031-3-dev.jain@arm.com Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory") Signed-off-by: Dev Jain Cc: Cc: Anshuman Khandual Cc: Shuah Khan Cc: Sri Jayaramappa Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 0b249a06a60b..5e9bd1da9370 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -108,6 +108,8 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size) goto close_fd; } + lseek(fd, 0, SEEK_SET); + /* Start with the initial condition of 0 huge pages*/ if (write(fd, "0", sizeof(char)) != sizeof(char)) { ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", -- cgit v1.2.3 From fb9293b6b0156fbf6ab97a1625d99a29c36d9f0c Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 21 May 2024 13:13:58 +0530 Subject: selftests/mm: compaction_test: fix bogus test success and reduce probability of OOM-killer invocation Reset nr_hugepages to zero before the start of the test. If a non-zero number of hugepages is already set before the start of the test, the following problems arise: - The probability of the test getting OOM-killed increases. Proof: The test wants to run on 80% of available memory to prevent OOM-killing (see original code comments). Let the value of mem_free at the start of the test, when nr_hugepages = 0, be x. In the other case, when nr_hugepages > 0, let the memory consumed by hugepages be y. In the former case, the test operates on 0.8 * x of memory. In the latter, the test operates on 0.8 * (x - y) of memory, with y already filled, hence, memory consumed is y + 0.8 * (x - y) = 0.8 * x + 0.2 * y > 0.8 * x. Q.E.D - The probability of a bogus test success increases. Proof: Let the memory consumed by hugepages be greater than 25% of x, with x and y defined as above. The definition of compaction_index is c_index = (x - y)/z where z is the memory consumed by hugepages after trying to increase them again. In check_compaction(), we set the number of hugepages to zero, and then increase them back; the probability that they will be set back to consume at least y amount of memory again is very high (since there is not much delay between the two attempts of changing nr_hugepages). Hence, z >= y > (x/4) (by the 25% assumption). Therefore, c_index = (x - y)/z <= (x - y)/y = x/y - 1 < 4 - 1 = 3 hence, c_index can always be forced to be less than 3, thereby the test succeeding always. Q.E.D Link: https://lkml.kernel.org/r/20240521074358.675031-4-dev.jain@arm.com Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory") Signed-off-by: Dev Jain Cc: Cc: Anshuman Khandual Cc: Shuah Khan Cc: Sri Jayaramappa Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 71 +++++++++++++++++++--------- 1 file changed, 49 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 5e9bd1da9370..e140558e6f53 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -82,13 +82,16 @@ int prereq(void) return -1; } -int check_compaction(unsigned long mem_free, unsigned long hugepage_size) +int check_compaction(unsigned long mem_free, unsigned long hugepage_size, + unsigned long initial_nr_hugepages) { unsigned long nr_hugepages_ul; int fd, ret = -1; int compaction_index = 0; - char initial_nr_hugepages[20] = {0}; char nr_hugepages[20] = {0}; + char init_nr_hugepages[20] = {0}; + + sprintf(init_nr_hugepages, "%lu", initial_nr_hugepages); /* We want to test with 80% of available memory. Else, OOM killer comes in to play */ @@ -102,23 +105,6 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size) goto out; } - if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { - ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - /* Start with the initial condition of 0 huge pages*/ - if (write(fd, "0", sizeof(char)) != sizeof(char)) { - ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - /* Request a large number of huge pages. The Kernel will allocate as much as it can */ if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { @@ -146,8 +132,8 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size) lseek(fd, 0, SEEK_SET); - if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) - != strlen(initial_nr_hugepages)) { + if (write(fd, init_nr_hugepages, strlen(init_nr_hugepages)) + != strlen(init_nr_hugepages)) { ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", strerror(errno)); goto close_fd; @@ -171,6 +157,41 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size) return ret; } +int set_zero_hugepages(unsigned long *initial_nr_hugepages) +{ + int fd, ret = -1; + char nr_hugepages[20] = {0}; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); + if (fd < 0) { + ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto out; + } + if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { + ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + /* Start with the initial condition of 0 huge pages */ + if (write(fd, "0", sizeof(char)) != sizeof(char)) { + ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto close_fd; + } + + *initial_nr_hugepages = strtoul(nr_hugepages, NULL, 10); + ret = 0; + + close_fd: + close(fd); + + out: + return ret; +} int main(int argc, char **argv) { @@ -181,6 +202,7 @@ int main(int argc, char **argv) unsigned long mem_free = 0; unsigned long hugepage_size = 0; long mem_fragmentable_MB = 0; + unsigned long initial_nr_hugepages; ksft_print_header(); @@ -189,6 +211,10 @@ int main(int argc, char **argv) ksft_set_plan(1); + /* Start the test without hugepages reducing mem_free */ + if (set_zero_hugepages(&initial_nr_hugepages)) + ksft_exit_fail(); + lim.rlim_cur = RLIM_INFINITY; lim.rlim_max = RLIM_INFINITY; if (setrlimit(RLIMIT_MEMLOCK, &lim)) @@ -232,7 +258,8 @@ int main(int argc, char **argv) entry = entry->next; } - if (check_compaction(mem_free, hugepage_size) == 0) + if (check_compaction(mem_free, hugepage_size, + initial_nr_hugepages) == 0) ksft_exit_pass(); ksft_exit_fail(); -- cgit v1.2.3 From 1901472fa880e5706f90926cd85a268d2d16bf84 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 May 2024 13:02:19 +1000 Subject: selftests/mm: fix build warnings on ppc64 Fix warnings like: In file included from uffd-unit-tests.c:8: uffd-unit-tests.c: In function `uffd_poison_handle_fault': uffd-common.h:45:33: warning: format `%llu' expects argument of type `long long unsigned int', but argument 3 has type `__u64' {aka `long unsigned int'} [-Wformat=] By switching to unsigned long long for u64 for ppc64 builds. Link: https://lkml.kernel.org/r/20240521030219.57439-1-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_test.c | 1 + tools/testing/selftests/mm/uffd-common.h | 1 + 2 files changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index bd335cf9bc0e..bdeaac67ff9a 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -1,3 +1,4 @@ +#define __SANE_USERSPACE_TYPES__ // Use ll64 #include #include #include diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index cc5629c3d2aa..a70ae10b5f62 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -8,6 +8,7 @@ #define __UFFD_COMMON_H__ #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ // Use ll64 #include #include #include -- cgit v1.2.3 From 4f1b067359ac8364cdb7f9fda41085fa85789d0f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 26 May 2024 08:13:21 -0300 Subject: Revert "perf parse-events: Prefer sysfs/JSON hardware events over legacy" This reverts commit 617824a7f0f73e4de325cf8add58e55b28c12493. This made a simple 'perf record -e cycles:pp make -j199' stop working on the Ampere ARM64 system Linus uses to test ARM64 kernels, as discussed at length in the threads in the Link tags below. The fix provided by Ian wasn't acceptable and work to fix this will take time we don't have at this point, so lets revert this and work on it on the next devel cycle. Reported-by: Linus Torvalds Cc: Adrian Hunter Cc: Bhaskar Chowdhury Cc: Ethan Adams Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tycho Andersen Cc: Yang Jihong Link: https://lore.kernel.org/lkml/CAHk-=wi5Ri=yR2jBVk-4HzTzpoAWOgstr1LEvg_-OXtJvXXJOA@mail.gmail.com Link: https://lore.kernel.org/lkml/CAHk-=wiWvtFyedDNpoV7a8Fq_FpbB+F5KmWK2xPY3QoYseOf_A@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 31 +++++------------ tools/perf/util/parse-events.h | 2 +- tools/perf/util/parse-events.l | 76 +++++++++++++++++++++--------------------- tools/perf/util/parse-events.y | 62 ++++++++++++---------------------- 4 files changed, 68 insertions(+), 103 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 30f958069076..6ed0f9c5581d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1549,7 +1549,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, } int parse_events_multi_pmu_add(struct parse_events_state *parse_state, - const char *event_name, u64 hw_config, + const char *event_name, const struct parse_events_terms *const_parsed_terms, struct list_head **listp, void *loc_) { @@ -1557,8 +1557,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, struct list_head *list = NULL; struct perf_pmu *pmu = NULL; YYLTYPE *loc = loc_; - int ok = 0, core_ok = 0; - const char *tmp; + int ok = 0; + const char *config; struct parse_events_terms parsed_terms; *listp = NULL; @@ -1571,15 +1571,15 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, return ret; } - tmp = strdup(event_name); - if (!tmp) + config = strdup(event_name); + if (!config) goto out_err; if (parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, - tmp, /*num=*/1, /*novalue=*/true, + config, /*num=*/1, /*novalue=*/true, loc, /*loc_val=*/NULL) < 0) { - zfree(&tmp); + zfree(&config); goto out_err; } list_add_tail(&term->list, &parsed_terms.terms); @@ -1610,8 +1610,6 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, pr_debug("%s -> %s/%s/\n", event_name, pmu->name, sb.buf); strbuf_release(&sb); ok++; - if (pmu->is_core) - core_ok++; } } @@ -1628,18 +1626,6 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } } - if (hw_config != PERF_COUNT_HW_MAX && !core_ok) { - /* - * The event wasn't found on core PMUs but it has a hardware - * config version to try. - */ - if (!parse_events_add_numeric(parse_state, list, - PERF_TYPE_HARDWARE, hw_config, - const_parsed_terms, - /*wildcard=*/true)) - ok++; - } - out_err: parse_events_terms__exit(&parsed_terms); if (ok) @@ -1693,8 +1679,7 @@ int parse_events_multi_pmu_add_or_add_pmu(struct parse_events_state *parse_state /* Failure to add, assume event_or_pmu is an event name. */ zfree(listp); - if (!parse_events_multi_pmu_add(parse_state, event_or_pmu, PERF_COUNT_HW_MAX, - const_parsed_terms, listp, loc)) + if (!parse_events_multi_pmu_add(parse_state, event_or_pmu, const_parsed_terms, listp, loc)) return 0; if (asprintf(&help, "Unable to find PMU or event on a PMU of '%s'", event_or_pmu) < 0) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index f2baa69fff98..e13de2c8b706 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -237,7 +237,7 @@ struct evsel *parse_events__add_event(int idx, struct perf_event_attr *attr, struct perf_pmu *pmu); int parse_events_multi_pmu_add(struct parse_events_state *parse_state, - const char *event_name, u64 hw_config, + const char *event_name, const struct parse_events_terms *const_parsed_terms, struct list_head **listp, void *loc); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 99d585d272e0..16045c383ada 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -113,12 +113,12 @@ do { \ yyless(0); \ } while (0) -static int sym(yyscan_t scanner, int config) +static int sym(yyscan_t scanner, int type, int config) { YYSTYPE *yylval = parse_events_get_lval(scanner); - yylval->num = config; - return PE_VALUE_SYM_SW; + yylval->num = (type << 16) + config; + return type == PERF_TYPE_HARDWARE ? PE_VALUE_SYM_HW : PE_VALUE_SYM_SW; } static int tool(yyscan_t scanner, enum perf_tool_event event) @@ -137,13 +137,13 @@ static int term(yyscan_t scanner, enum parse_events__term_type type) return PE_TERM; } -static int hw(yyscan_t scanner, int config) +static int hw_term(yyscan_t scanner, int config) { YYSTYPE *yylval = parse_events_get_lval(scanner); char *text = parse_events_get_text(scanner); - yylval->hardware_event.str = strdup(text); - yylval->hardware_event.num = config; + yylval->hardware_term.str = strdup(text); + yylval->hardware_term.num = PERF_TYPE_HARDWARE + config; return PE_TERM_HW; } @@ -330,16 +330,16 @@ percore { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); } aux-output { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); } aux-sample-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); } metric-id { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); } -cpu-cycles|cycles { return hw(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } -stalled-cycles-frontend|idle-cycles-frontend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } -stalled-cycles-backend|idle-cycles-backend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } -instructions { return hw(yyscanner, PERF_COUNT_HW_INSTRUCTIONS); } -cache-references { return hw(yyscanner, PERF_COUNT_HW_CACHE_REFERENCES); } -cache-misses { return hw(yyscanner, PERF_COUNT_HW_CACHE_MISSES); } -branch-instructions|branches { return hw(yyscanner, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } -branch-misses { return hw(yyscanner, PERF_COUNT_HW_BRANCH_MISSES); } -bus-cycles { return hw(yyscanner, PERF_COUNT_HW_BUS_CYCLES); } -ref-cycles { return hw(yyscanner, PERF_COUNT_HW_REF_CPU_CYCLES); } +cpu-cycles|cycles { return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } +stalled-cycles-frontend|idle-cycles-frontend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } +stalled-cycles-backend|idle-cycles-backend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } +instructions { return hw_term(yyscanner, PERF_COUNT_HW_INSTRUCTIONS); } +cache-references { return hw_term(yyscanner, PERF_COUNT_HW_CACHE_REFERENCES); } +cache-misses { return hw_term(yyscanner, PERF_COUNT_HW_CACHE_MISSES); } +branch-instructions|branches { return hw_term(yyscanner, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +branch-misses { return hw_term(yyscanner, PERF_COUNT_HW_BRANCH_MISSES); } +bus-cycles { return hw_term(yyscanner, PERF_COUNT_HW_BUS_CYCLES); } +ref-cycles { return hw_term(yyscanner, PERF_COUNT_HW_REF_CPU_CYCLES); } r{num_raw_hex} { return str(yyscanner, PE_RAW); } r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } , { return ','; } @@ -383,31 +383,31 @@ r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } <> { BEGIN(INITIAL); } } -cpu-cycles|cycles { return hw(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } -stalled-cycles-frontend|idle-cycles-frontend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } -stalled-cycles-backend|idle-cycles-backend { return hw(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } -instructions { return hw(yyscanner, PERF_COUNT_HW_INSTRUCTIONS); } -cache-references { return hw(yyscanner, PERF_COUNT_HW_CACHE_REFERENCES); } -cache-misses { return hw(yyscanner, PERF_COUNT_HW_CACHE_MISSES); } -branch-instructions|branches { return hw(yyscanner, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } -branch-misses { return hw(yyscanner, PERF_COUNT_HW_BRANCH_MISSES); } -bus-cycles { return hw(yyscanner, PERF_COUNT_HW_BUS_CYCLES); } -ref-cycles { return hw(yyscanner, PERF_COUNT_HW_REF_CPU_CYCLES); } -cpu-clock { return sym(yyscanner, PERF_COUNT_SW_CPU_CLOCK); } -task-clock { return sym(yyscanner, PERF_COUNT_SW_TASK_CLOCK); } -page-faults|faults { return sym(yyscanner, PERF_COUNT_SW_PAGE_FAULTS); } -minor-faults { return sym(yyscanner, PERF_COUNT_SW_PAGE_FAULTS_MIN); } -major-faults { return sym(yyscanner, PERF_COUNT_SW_PAGE_FAULTS_MAJ); } -context-switches|cs { return sym(yyscanner, PERF_COUNT_SW_CONTEXT_SWITCHES); } -cpu-migrations|migrations { return sym(yyscanner, PERF_COUNT_SW_CPU_MIGRATIONS); } -alignment-faults { return sym(yyscanner, PERF_COUNT_SW_ALIGNMENT_FAULTS); } -emulation-faults { return sym(yyscanner, PERF_COUNT_SW_EMULATION_FAULTS); } -dummy { return sym(yyscanner, PERF_COUNT_SW_DUMMY); } +cpu-cycles|cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); } +stalled-cycles-frontend|idle-cycles-frontend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } +stalled-cycles-backend|idle-cycles-backend { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } +instructions { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); } +cache-references { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES); } +cache-misses { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); } +branch-instructions|branches { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +branch-misses { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); } +bus-cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES); } +ref-cycles { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES); } +cpu-clock { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK); } +task-clock { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK); } +page-faults|faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS); } +minor-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN); } +major-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ); } +context-switches|cs { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES); } +cpu-migrations|migrations { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS); } +alignment-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); } +emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); } +dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } duration_time { return tool(yyscanner, PERF_TOOL_DURATION_TIME); } user_time { return tool(yyscanner, PERF_TOOL_USER_TIME); } system_time { return tool(yyscanner, PERF_TOOL_SYSTEM_TIME); } -bpf-output { return sym(yyscanner, PERF_COUNT_SW_BPF_OUTPUT); } -cgroup-switches { return sym(yyscanner, PERF_COUNT_SW_CGROUP_SWITCHES); } +bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } +cgroup-switches { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CGROUP_SWITCHES); } {lc_type} { return str(yyscanner, PE_LEGACY_CACHE); } {lc_type}-{lc_op_result} { return str(yyscanner, PE_LEGACY_CACHE); } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index c94a3994177e..b3c51f06cbdc 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -55,7 +55,7 @@ static void free_list_evsel(struct list_head* list_evsel) %} %token PE_START_EVENTS PE_START_TERMS -%token PE_VALUE PE_VALUE_SYM_SW PE_TERM +%token PE_VALUE PE_VALUE_SYM_HW PE_VALUE_SYM_SW PE_TERM %token PE_VALUE_SYM_TOOL %token PE_EVENT_NAME %token PE_RAW PE_NAME @@ -66,10 +66,12 @@ static void free_list_evsel(struct list_head* list_evsel) %token PE_DRV_CFG_TERM %token PE_TERM_HW %type PE_VALUE +%type PE_VALUE_SYM_HW %type PE_VALUE_SYM_SW %type PE_VALUE_SYM_TOOL %type PE_MODIFIER_EVENT %type PE_TERM +%type value_sym %type PE_RAW %type PE_NAME %type PE_LEGACY_CACHE @@ -85,7 +87,6 @@ static void free_list_evsel(struct list_head* list_evsel) %type opt_pmu_config %destructor { parse_events_terms__delete ($$); } %type event_pmu -%type event_legacy_hardware %type event_legacy_symbol %type event_legacy_cache %type event_legacy_mem @@ -103,8 +104,8 @@ static void free_list_evsel(struct list_head* list_evsel) %destructor { free_list_evsel ($$); } %type tracepoint_name %destructor { free ($$.sys); free ($$.event); } -%type PE_TERM_HW -%destructor { free ($$.str); } +%type PE_TERM_HW +%destructor { free ($$.str); } %union { @@ -119,10 +120,10 @@ static void free_list_evsel(struct list_head* list_evsel) char *sys; char *event; } tracepoint_name; - struct hardware_event { + struct hardware_term { char *str; u64 num; - } hardware_event; + } hardware_term; } %% @@ -265,7 +266,6 @@ PE_EVENT_NAME event_def event_def event_def: event_pmu | - event_legacy_hardware | event_legacy_symbol | event_legacy_cache sep_dc | event_legacy_mem sep_dc | @@ -292,7 +292,7 @@ PE_NAME sep_dc struct list_head *list; int err; - err = parse_events_multi_pmu_add(_parse_state, $1, PERF_COUNT_HW_MAX, NULL, &list, &@1); + err = parse_events_multi_pmu_add(_parse_state, $1, NULL, &list, &@1); if (err < 0) { struct parse_events_state *parse_state = _parse_state; struct parse_events_error *error = parse_state->error; @@ -308,45 +308,24 @@ PE_NAME sep_dc $$ = list; } -event_legacy_hardware: -PE_TERM_HW opt_pmu_config -{ - /* List of created evsels. */ - struct list_head *list = NULL; - int err = parse_events_multi_pmu_add(_parse_state, $1.str, $1.num, $2, &list, &@1); - - free($1.str); - parse_events_terms__delete($2); - if (err) - PE_ABORT(err); - - $$ = list; -} +value_sym: +PE_VALUE_SYM_HW | -PE_TERM_HW sep_dc -{ - struct list_head *list; - int err; - - err = parse_events_multi_pmu_add(_parse_state, $1.str, $1.num, NULL, &list, &@1); - free($1.str); - if (err) - PE_ABORT(err); - $$ = list; -} +PE_VALUE_SYM_SW event_legacy_symbol: -PE_VALUE_SYM_SW '/' event_config '/' +value_sym '/' event_config '/' { struct list_head *list; + int type = $1 >> 16; + int config = $1 & 255; int err; + bool wildcard = (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE); list = alloc_list(); if (!list) YYNOMEM; - err = parse_events_add_numeric(_parse_state, list, - /*type=*/PERF_TYPE_SOFTWARE, /*config=*/$1, - $3, /*wildcard=*/false); + err = parse_events_add_numeric(_parse_state, list, type, config, $3, wildcard); parse_events_terms__delete($3); if (err) { free_list_evsel(list); @@ -355,17 +334,18 @@ PE_VALUE_SYM_SW '/' event_config '/' $$ = list; } | -PE_VALUE_SYM_SW sep_slash_slash_dc +value_sym sep_slash_slash_dc { struct list_head *list; + int type = $1 >> 16; + int config = $1 & 255; + bool wildcard = (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE); int err; list = alloc_list(); if (!list) YYNOMEM; - err = parse_events_add_numeric(_parse_state, list, - /*type=*/PERF_TYPE_SOFTWARE, /*config=*/$1, - /*head_config=*/NULL, /*wildcard=*/false); + err = parse_events_add_numeric(_parse_state, list, type, config, /*head_config=*/NULL, wildcard); if (err) PE_ABORT(err); $$ = list; -- cgit v1.2.3