From 69053101e096c01a4c8e3d2497e3cd5716e43cec Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Tue, 12 Jul 2022 10:23:44 -0500 Subject: ndtest: Cleanup all of blk namespace specific code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the nd_namespace_blk and nd_blk_region infrastructures being removed, the ndtest still has some references to the old code. So the compilation fails as below, ../tools/testing/nvdimm/test/ndtest.c:204:25: error: ‘ND_DEVICE_NAMESPACE_BLK’ undeclared here (not in a function); did you mean ‘ND_DEVICE_NAMESPACE_IO’? 204 | .type = ND_DEVICE_NAMESPACE_BLK, | ^~~~~~~~~~~~~~~~~~~~~~~ | ND_DEVICE_NAMESPACE_IO ../tools/testing/nvdimm/test/ndtest.c: In function ‘ndtest_create_region’: ../tools/testing/nvdimm/test/ndtest.c:630:17: error: ‘ndbr_desc’ undeclared (first use in this function); did you mean ‘ndr_desc’? 630 | ndbr_desc.enable = ndtest_blk_region_enable; | ^~~~~~~~~ | ndr_desc ../tools/testing/nvdimm/test/ndtest.c:630:17: note: each undeclared identifier is reported only once for each function it appears in ../tools/testing/nvdimm/test/ndtest.c:630:36: error: ‘ndtest_blk_region_enable’ undeclared (first use in this function) 630 | ndbr_desc.enable = ndtest_blk_region_enable; | ^~~~~~~~~~~~~~~~~~~~~~~~ ../tools/testing/nvdimm/test/ndtest.c:631:35: error: ‘ndtest_blk_do_io’ undeclared (first use in this function); did you mean ‘ndtest_blk_mmio’? 631 | ndbr_desc.do_io = ndtest_blk_do_io; | ^~~~~~~~~~~~~~~~ | ndtest_blk_mmio The current patch removes the specific code to cleanup all obsolete references. 
Signed-off-by: Shivaprasad G Bhat Link: https://lore.kernel.org/r/165763940218.3501174.7103619358744815702.stgit@ltc-boston123.aus.stglabs.ibm.com Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/ndtest.c | 77 -------------------------------------- 1 file changed, 77 deletions(-) (limited to 'tools') diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c index 4d1a947367f9..01ceb98c15a0 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -134,39 +134,6 @@ static struct ndtest_mapping region1_mapping[] = { }, }; -static struct ndtest_mapping region2_mapping[] = { - { - .dimm = 0, - .position = 0, - .start = 0, - .size = DIMM_SIZE, - }, -}; - -static struct ndtest_mapping region3_mapping[] = { - { - .dimm = 1, - .start = 0, - .size = DIMM_SIZE, - } -}; - -static struct ndtest_mapping region4_mapping[] = { - { - .dimm = 2, - .start = 0, - .size = DIMM_SIZE, - } -}; - -static struct ndtest_mapping region5_mapping[] = { - { - .dimm = 3, - .start = 0, - .size = DIMM_SIZE, - } -}; - static struct ndtest_region bus0_regions[] = { { .type = ND_DEVICE_NAMESPACE_PMEM, @@ -182,34 +149,6 @@ static struct ndtest_region bus0_regions[] = { .size = DIMM_SIZE * 2, .range_index = 2, }, - { - .type = ND_DEVICE_NAMESPACE_BLK, - .num_mappings = ARRAY_SIZE(region2_mapping), - .mapping = region2_mapping, - .size = DIMM_SIZE, - .range_index = 3, - }, - { - .type = ND_DEVICE_NAMESPACE_BLK, - .num_mappings = ARRAY_SIZE(region3_mapping), - .mapping = region3_mapping, - .size = DIMM_SIZE, - .range_index = 4, - }, - { - .type = ND_DEVICE_NAMESPACE_BLK, - .num_mappings = ARRAY_SIZE(region4_mapping), - .mapping = region4_mapping, - .size = DIMM_SIZE, - .range_index = 5, - }, - { - .type = ND_DEVICE_NAMESPACE_BLK, - .num_mappings = ARRAY_SIZE(region5_mapping), - .mapping = region5_mapping, - .size = DIMM_SIZE, - .range_index = 6, - }, }; static struct ndtest_mapping region6_mapping[] = { @@ -501,21 +440,6 @@ static int 
ndtest_create_region(struct ndtest_priv *p, nd_set->altcookie = nd_set->cookie1; ndr_desc->nd_set = nd_set; - if (region->type == ND_DEVICE_NAMESPACE_BLK) { - mappings[0].start = 0; - mappings[0].size = DIMM_SIZE; - mappings[0].nvdimm = p->config->dimms[ndimm].nvdimm; - - ndr_desc->mapping = &mappings[0]; - ndr_desc->num_mappings = 1; - ndr_desc->num_lanes = 1; - ndbr_desc.enable = ndtest_blk_region_enable; - ndbr_desc.do_io = ndtest_blk_do_io; - region->region = nvdimm_blk_region_create(p->bus, ndr_desc); - - goto done; - } - for (i = 0; i < region->num_mappings; i++) { ndimm = region->mapping[i].dimm; mappings[i].start = region->mapping[i].start; @@ -527,7 +451,6 @@ static int ndtest_create_region(struct ndtest_priv *p, ndr_desc->num_mappings = region->num_mappings; region->region = nvdimm_pmem_region_create(p->bus, ndr_desc); -done: if (!region->region) { dev_err(&p->pdev.dev, "Error registering region %pR\n", ndr_desc->res); -- cgit v1.2.3 From d171011e6adad135eaced630dce26cac9a174037 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Mon, 25 Jul 2022 09:59:03 +0800 Subject: selftests: futex: Fix 'the the' typo in comment Replace 'the the' with 'the' in the comment. Signed-off-by: Slark Xiao Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220725015903.5449-1-slark_xiao@163.com --- .../selftests/futex/functional/futex_requeue_pi_signal_restart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index f8c43ce8fe66..c6b8f32990c8 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) /* * If res is non-zero, we either requeued the waiter or hit an * error, break out and handle it. 
If it is zero, then the - * signal may have hit before the the waiter was blocked on f1. + * signal may have hit before the waiter was blocked on f1. * Try again. */ if (res > 0) { -- cgit v1.2.3 From 47ea7417b0744324424405fc1207e266053237a9 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Sun, 31 Jul 2022 17:26:49 -0600 Subject: libbpf: Skip empty sections in bpf_object__init_global_data_maps The GNU assembler generates an empty .bss section. This is a well established behavior in GAS that happens in all supported targets. The LLVM assembler doesn't generate an empty .bss section. bpftool chokes on the empty .bss section. Additionally in bpf_object__elf_collect the sec_desc->data is not initialized when a section is not recognized. In this case, this happens with .comment. So we must check that sec_desc->data is initialized before checking if the size is 0. Signed-off-by: James Hilliard Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220731232649.4668-1-james.hilliard1@gmail.com --- tools/lib/bpf/libbpf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 50d41815f431..77e3797cf75a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1642,6 +1642,10 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) for (sec_idx = 1; sec_idx < obj->efile.sec_cnt; sec_idx++) { sec_desc = &obj->efile.secs[sec_idx]; + /* Skip recognized sections with size 0. 
*/ + if (sec_desc->data && sec_desc->data->d_size == 0) + continue; + switch (sec_desc->sec_type) { case SEC_DATA: sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); -- cgit v1.2.3 From 3045f42a64324d339125a8a1a1763bb9e1e08300 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sun, 31 Jul 2022 19:51:09 -0700 Subject: libbpf: Initialize err in probe_map_create GCC-11 warns about the possibly uninitialized err variable in probe_map_create: libbpf_probes.c: In function 'probe_map_create': libbpf_probes.c:361:38: error: 'err' may be used uninitialized in this function [-Werror=maybe-uninitialized] 361 | return fd < 0 && err == exp_err ? 1 : 0; | ~~~~^~~~~~~~~~ Fixes: 878d8def0603 ("libbpf: Rework feature-probing APIs") Signed-off-by: Florian Fainelli Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220801025109.1206633-1-f.fainelli@gmail.com --- tools/lib/bpf/libbpf_probes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 0b5398786bf3..6d495656f554 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -193,7 +193,7 @@ static int probe_map_create(enum bpf_map_type map_type) LIBBPF_OPTS(bpf_map_create_opts, opts); int key_size, value_size, max_entries; __u32 btf_key_type_id = 0, btf_value_type_id = 0; - int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err; + int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err = 0; key_size = sizeof(__u32); value_size = sizeof(__u32); -- cgit v1.2.3 From d55dfe587bc0670f90564a962615723fe7749ab1 Mon Sep 17 00:00:00 2001 From: Manu Bretelle Date: Mon, 1 Aug 2022 06:24:09 -0700 Subject: bpftool: Remove BPF_OBJ_NAME_LEN restriction when looking up bpf program by name bpftool was limiting the length of names to BPF_OBJ_NAME_LEN in prog_parse_fds. 
Since commit b662000aff84 ("bpftool: Adding support for BTF program names") we can get the full program name from BTF. This patch removes the restriction of name length when running `bpftool prog show name ${name}`. Test: Tested against some internal program names that were longer than `BPF_OBJ_NAME_LEN`, here is a redacted example of what was run to test. # previous behaviour $ sudo bpftool prog show name some_long_program_name Error: can't parse name # with the patch $ sudo ./bpftool prog show name some_long_program_name 123456789: tracing name some_long_program_name tag taghexa gpl .... ... ... ... # too long sudo ./bpftool prog show name $(python3 -c 'print("A"*128)') Error: can't parse name # not too long but no match $ sudo ./bpftool prog show name $(python3 -c 'print("A"*127)') Signed-off-by: Manu Bretelle Signed-off-by: Andrii Nakryiko Tested-by: Jiri Olsa Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220801132409.4147849-1-chantr4@gmail.com --- tools/bpf/bpftool/common.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 067e9ea59e3b..8727765add88 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -722,6 +722,7 @@ print_all_levels(__maybe_unused enum libbpf_print_level level, static int prog_fd_by_nametag(void *nametag, int **fds, bool tag) { + char prog_name[MAX_PROG_FULL_NAME]; unsigned int id = 0; int fd, nb_fds = 0; void *tmp; @@ -754,12 +755,20 @@ static int prog_fd_by_nametag(void *nametag, int **fds, bool tag) goto err_close_fd; } - if ((tag && memcmp(nametag, info.tag, BPF_TAG_SIZE)) || - (!tag && strncmp(nametag, info.name, BPF_OBJ_NAME_LEN))) { + if (tag && memcmp(nametag, info.tag, BPF_TAG_SIZE)) { close(fd); continue; } + if (!tag) { + get_prog_full_name(&info, fd, prog_name, + sizeof(prog_name)); + if (strncmp(nametag, prog_name, sizeof(prog_name))) { + close(fd); + continue; + } + } + if 
(nb_fds > 0) { tmp = realloc(*fds, (nb_fds + 1) * sizeof(int)); if (!tmp) { @@ -820,7 +829,7 @@ int prog_parse_fds(int *argc, char ***argv, int **fds) NEXT_ARGP(); name = **argv; - if (strlen(name) > BPF_OBJ_NAME_LEN - 1) { + if (strlen(name) > MAX_PROG_FULL_NAME - 1) { p_err("can't parse name"); return -1; } -- cgit v1.2.3 From d25f40ff68aa61c838947bb9adee6c6b36e77453 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Wed, 3 Aug 2022 09:14:03 -0600 Subject: libbpf: Ensure functions with always_inline attribute are inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC expects the always_inline attribute to only be set on inline functions, as such we should make all functions with this attribute use the __always_inline macro which makes the function inline and sets the attribute. Fixes errors like: /home/buildroot/bpf-next/tools/testing/selftests/bpf/tools/include/bpf/bpf_tracing.h:439:1: error: ‘always_inline’ function might not be inlinable [-Werror=attributes] 439 | ____##name(unsigned long long *ctx, ##args) | ^~~~ Signed-off-by: James Hilliard Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220803151403.793024-1-james.hilliard1@gmail.com --- tools/lib/bpf/bpf_tracing.h | 14 +++++++------- tools/lib/bpf/usdt.bpf.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 43ca3aff2292..5fdb93da423b 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -426,7 +426,7 @@ struct pt_regs; */ #define BPF_PROG(name, args...) 
\ name(unsigned long long *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args); \ typeof(name(0)) name(unsigned long long *ctx) \ { \ @@ -435,7 +435,7 @@ typeof(name(0)) name(unsigned long long *ctx) \ return ____##name(___bpf_ctx_cast(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args) struct pt_regs; @@ -460,7 +460,7 @@ struct pt_regs; */ #define BPF_KPROBE(name, args...) \ name(struct pt_regs *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ @@ -469,7 +469,7 @@ typeof(name(0)) name(struct pt_regs *ctx) \ return ____##name(___bpf_kprobe_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #define ___bpf_kretprobe_args0() ctx @@ -484,7 +484,7 @@ ____##name(struct pt_regs *ctx, ##args) */ #define BPF_KRETPROBE(name, args...) \ name(struct pt_regs *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ @@ -540,7 +540,7 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) #define BPF_KSYSCALL(name, args...) 
\ name(struct pt_regs *ctx); \ extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig; \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ @@ -555,7 +555,7 @@ typeof(name(0)) name(struct pt_regs *ctx) \ return ____##name(___bpf_syscall_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #define BPF_KPROBE_SYSCALL BPF_KSYSCALL diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 4f2adc0bd6ca..fdfd235e52c4 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -232,7 +232,7 @@ long bpf_usdt_cookie(struct pt_regs *ctx) */ #define BPF_USDT(name, args...) \ name(struct pt_regs *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ @@ -241,7 +241,7 @@ typeof(name(0)) name(struct pt_regs *ctx) \ return ____##name(___bpf_usdt_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #endif /* __USDT_BPF_H__ */ -- cgit v1.2.3 From 5653f55ebd767b4ef47414ee7f852517993eda6f Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 5 Aug 2022 10:14:05 -0700 Subject: selftests/bpf: Clean up sys_nanosleep uses This patch cleans up a few things: * dynptr_fail.c: There is no sys_nanosleep tracepoint. dynptr_fail only tests that the prog load fails, so just SEC("?raw_tp") suffices here. * test_bpf_cookie: There is no sys_nanosleep kprobe. The prog is loaded in userspace through bpf_program__attach_kprobe_opts passing in SYS_NANOSLEEP_KPROBE_NAME, so just SEC("k{ret}probe") suffices here. 
* test_helper_restricted: There is no sys_nanosleep kprobe. test_helper_restricted only tests that the prog load fails, so just SEC("?kprobe") suffices here. There are no functional changes. Suggested-by: Andrii Nakryiko Signed-off-by: Joanne Koong Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220805171405.2272103-1-joannelkoong@gmail.com --- tools/testing/selftests/bpf/progs/dynptr_fail.c | 56 +++++++++++----------- .../testing/selftests/bpf/progs/test_bpf_cookie.c | 4 +- .../selftests/bpf/progs/test_helper_restricted.c | 4 +- 3 files changed, 32 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 0a26c243e6e9..b5e0a87f0a36 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -65,7 +65,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr) /* Every bpf_ringbuf_reserve_dynptr call must have a corresponding * bpf_ringbuf_submit/discard_dynptr call */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int ringbuf_missing_release1(void *ctx) { struct bpf_dynptr ptr; @@ -77,7 +77,7 @@ int ringbuf_missing_release1(void *ctx) return 0; } -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int ringbuf_missing_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -112,7 +112,7 @@ static int missing_release_callback_fn(__u32 index, void *data) } /* Any dynptr initialized within a callback must have bpf_dynptr_put called */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int ringbuf_missing_release_callback(void *ctx) { bpf_loop(10, missing_release_callback_fn, NULL, 0); @@ -120,7 +120,7 @@ int ringbuf_missing_release_callback(void *ctx) } /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int ringbuf_release_uninit_dynptr(void *ctx) { struct bpf_dynptr ptr; @@ -132,7 +132,7 @@ int 
ringbuf_release_uninit_dynptr(void *ctx) } /* A dynptr can't be used after it has been invalidated */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int use_after_invalid(void *ctx) { struct bpf_dynptr ptr; @@ -151,7 +151,7 @@ int use_after_invalid(void *ctx) } /* Can't call non-dynptr ringbuf APIs on a dynptr ringbuf sample */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int ringbuf_invalid_api(void *ctx) { struct bpf_dynptr ptr; @@ -173,7 +173,7 @@ done: } /* Can't add a dynptr to a map */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int add_dynptr_to_map1(void *ctx) { struct bpf_dynptr ptr; @@ -190,7 +190,7 @@ int add_dynptr_to_map1(void *ctx) } /* Can't add a struct with an embedded dynptr to a map */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int add_dynptr_to_map2(void *ctx) { struct test_info x; @@ -207,7 +207,7 @@ int add_dynptr_to_map2(void *ctx) } /* A data slice can't be accessed out of bounds */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int data_slice_out_of_bounds_ringbuf(void *ctx) { struct bpf_dynptr ptr; @@ -227,7 +227,7 @@ done: return 0; } -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int data_slice_out_of_bounds_map_value(void *ctx) { __u32 key = 0, map_val; @@ -247,7 +247,7 @@ int data_slice_out_of_bounds_map_value(void *ctx) } /* A data slice can't be used after it has been released */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int data_slice_use_after_release(void *ctx) { struct bpf_dynptr ptr; @@ -273,7 +273,7 @@ done: } /* A data slice must be first checked for NULL */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int data_slice_missing_null_check1(void *ctx) { struct bpf_dynptr ptr; @@ -293,7 +293,7 @@ int data_slice_missing_null_check1(void *ctx) } /* A data slice can't be dereferenced if it wasn't checked for null */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int data_slice_missing_null_check2(void *ctx) { struct bpf_dynptr ptr; @@ -315,7 +315,7 @@ done: /* Can't pass in a dynptr as an arg to a helper function that doesn't 
take in a * dynptr argument */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_helper1(void *ctx) { struct bpf_dynptr ptr; @@ -329,7 +329,7 @@ int invalid_helper1(void *ctx) } /* A dynptr can't be passed into a helper function at a non-zero offset */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_helper2(void *ctx) { struct bpf_dynptr ptr; @@ -344,7 +344,7 @@ int invalid_helper2(void *ctx) } /* A bpf_dynptr is invalidated if it's been written into */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_write1(void *ctx) { struct bpf_dynptr ptr; @@ -365,7 +365,7 @@ int invalid_write1(void *ctx) * A bpf_dynptr can't be used as a dynptr if it has been written into at a fixed * offset */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_write2(void *ctx) { struct bpf_dynptr ptr; @@ -388,7 +388,7 @@ int invalid_write2(void *ctx) * A bpf_dynptr can't be used as a dynptr if it has been written into at a * non-const offset */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_write3(void *ctx) { struct bpf_dynptr ptr; @@ -419,7 +419,7 @@ static int invalid_write4_callback(__u32 index, void *data) /* If the dynptr is written into in a callback function, it should * be invalidated as a dynptr */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_write4(void *ctx) { struct bpf_dynptr ptr; @@ -436,7 +436,7 @@ int invalid_write4(void *ctx) /* A globally-defined bpf_dynptr can't be used (it must reside as a stack frame) */ struct bpf_dynptr global_dynptr; -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int global(void *ctx) { /* this should fail */ @@ -448,7 +448,7 @@ int global(void *ctx) } /* A direct read should fail */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_read1(void *ctx) { struct bpf_dynptr ptr; @@ -464,7 +464,7 @@ int invalid_read1(void *ctx) } /* A direct read at an offset should fail */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_read2(void *ctx) { struct bpf_dynptr ptr; @@ -479,7 +479,7 @@ 
int invalid_read2(void *ctx) } /* A direct read at an offset into the lower stack slot should fail */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_read3(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -505,7 +505,7 @@ static int invalid_read4_callback(__u32 index, void *data) } /* A direct read within a callback function should fail */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_read4(void *ctx) { struct bpf_dynptr ptr; @@ -520,7 +520,7 @@ int invalid_read4(void *ctx) } /* Initializing a dynptr on an offset should fail */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int invalid_offset(void *ctx) { struct bpf_dynptr ptr; @@ -534,7 +534,7 @@ int invalid_offset(void *ctx) } /* Can't release a dynptr twice */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int release_twice(void *ctx) { struct bpf_dynptr ptr; @@ -560,7 +560,7 @@ static int release_twice_callback_fn(__u32 index, void *data) /* Test that releasing a dynptr twice, where one of the releases happens * within a calback function, fails */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int release_twice_callback(void *ctx) { struct bpf_dynptr ptr; @@ -575,7 +575,7 @@ int release_twice_callback(void *ctx) } /* Reject unsupported local mem types for dynptr_from_mem API */ -SEC("?raw_tp/sys_nanosleep") +SEC("?raw_tp") int dynptr_from_mem_invalid_api(void *ctx) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c index 22d0ac8709b4..5a3a80f751c4 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -28,14 +28,14 @@ static void update(void *ctx, __u64 *res) *res |= bpf_get_attach_cookie(ctx); } -SEC("kprobe/sys_nanosleep") +SEC("kprobe") int handle_kprobe(struct pt_regs *ctx) { update(ctx, &kprobe_res); return 0; } -SEC("kretprobe/sys_nanosleep") +SEC("kretprobe") int handle_kretprobe(struct pt_regs *ctx) { update(ctx, 
&kretprobe_res); diff --git a/tools/testing/selftests/bpf/progs/test_helper_restricted.c b/tools/testing/selftests/bpf/progs/test_helper_restricted.c index 20ef9d433b97..5715c569ec03 100644 --- a/tools/testing/selftests/bpf/progs/test_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/test_helper_restricted.c @@ -72,7 +72,7 @@ int tp_timer(void *ctx) return 0; } -SEC("?kprobe/sys_nanosleep") +SEC("?kprobe") int kprobe_timer(void *ctx) { timer_work(); @@ -104,7 +104,7 @@ int tp_spin_lock(void *ctx) return 0; } -SEC("?kprobe/sys_nanosleep") +SEC("?kprobe") int kprobe_spin_lock(void *ctx) { spin_lock_work(); -- cgit v1.2.3 From e19db6762c18ab1ddf7a3ef4d0023780c24dc1e8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Aug 2022 14:42:02 -0700 Subject: libbpf: Reject legacy 'maps' ELF section Add explicit error message if BPF object file is still using legacy BPF map definitions in SEC("maps"). Before this change, if BPF object file is still using legacy map definition user will see a bit confusing: libbpf: elf: skipping unrecognized data section(4) maps libbpf: prog 'handler': bad map relo against 'server_map' in section 'maps' Now libbpf will be explicit about rejecting "maps" ELF section: libbpf: elf: legacy map definitions in 'maps' section are not supported by libbpf v1.0+ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220803214202.23750-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 77e3797cf75a..d3d94704583f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -591,7 +591,6 @@ struct elf_state { size_t strtabidx; struct elf_sec_desc *secs; int sec_cnt; - int maps_shndx; int btf_maps_shndx; __u32 btf_maps_sec_btf_id; int text_shndx; @@ -1272,7 +1271,6 @@ static struct bpf_object *bpf_object__new(const char *path, */ obj->efile.obj_buf 
= obj_buf; obj->efile.obj_buf_sz = obj_buf_sz; - obj->efile.maps_shndx = -1; obj->efile.btf_maps_shndx = -1; obj->efile.st_ops_shndx = -1; obj->kconfig_map_idx = -1; @@ -3363,7 +3361,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) if (err) return err; } else if (strcmp(name, "maps") == 0) { - obj->efile.maps_shndx = idx; + pr_warn("elf: legacy map definitions in 'maps' section are not supported by libbpf v1.0+\n"); + return -ENOTSUP; } else if (strcmp(name, MAPS_ELF_SEC) == 0) { obj->efile.btf_maps_shndx = idx; } else if (strcmp(name, BTF_ELF_SEC) == 0) { @@ -3895,8 +3894,7 @@ static bool bpf_object__shndx_is_data(const struct bpf_object *obj, static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, int shndx) { - return shndx == obj->efile.maps_shndx || - shndx == obj->efile.btf_maps_shndx; + return shndx == obj->efile.btf_maps_shndx; } static enum libbpf_map_type -- cgit v1.2.3 From 9e32084ef1c33a87a736d6ce3fcb95b60dac9aa1 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Sat, 6 Aug 2022 18:20:21 +0800 Subject: libbpf: Do not require executable permission for shared libraries Currently, resolve_full_path() requires executable permission for both programs and shared libraries. This causes failures on distros like Debian, since the shared libraries are not installed as executable and Linux does not require shared libraries to have executable permissions. Let's remove the executable permission check for shared libraries. 
Reported-by: Goro Fuji Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220806102021.3867130-1-hengqi.chen@gmail.com --- tools/lib/bpf/libbpf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d3d94704583f..f7364ea82ac1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10664,15 +10664,17 @@ static const char *arch_specific_lib_paths(void) static int resolve_full_path(const char *file, char *result, size_t result_sz) { const char *search_paths[3] = {}; - int i; + int i, perm; if (str_has_sfx(file, ".so") || strstr(file, ".so.")) { search_paths[0] = getenv("LD_LIBRARY_PATH"); search_paths[1] = "/usr/lib64:/usr/lib"; search_paths[2] = arch_specific_lib_paths(); + perm = R_OK; } else { search_paths[0] = getenv("PATH"); search_paths[1] = "/usr/bin:/usr/sbin"; + perm = R_OK | X_OK; } for (i = 0; i < ARRAY_SIZE(search_paths); i++) { @@ -10691,8 +10693,8 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) if (!seg_len) continue; snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); - /* ensure it is an executable file/link */ - if (access(result, R_OK | X_OK) < 0) + /* ensure it has required permissions */ + if (access(result, perm) < 0) continue; pr_debug("resolved '%s' to '%s'\n", file, result); return 0; -- cgit v1.2.3 From ca34ce29fc4b0e929cc6aada40829d17ab50fee4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 8 Aug 2022 09:47:23 -0700 Subject: bpf: Improve docstring for BPF_F_USER_BUILD_ID flag Most tools which use bpf_get_stack or bpf_get_stackid symbolicate the stack - meaning the stack of addresses in the target process' address space is transformed into meaningful symbol names. 
The BPF_F_USER_BUILD_ID flag eases this process by finding the build_id of the file-backed vma which the address falls in and translating the address to an offset within the backing file. To be more specific, the offset is a "file offset" from the beginning of the backing file. The symbols in ET_DYN ELF objects have a st_value which is also described as an "offset" - but an offset in the process address space, relative to the base address of the object. It's necessary to translate between the "file offset" and "virtual address offset" during symbolication before they can be directly compared. Failure to do so can lead to confusing bugs, so this patch clarifies language in the documentation in an attempt to keep this from happening. Signed-off-by: Dave Marchevsky Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220808164723.3107500-1-davemarchevsky@fb.com --- include/uapi/linux/bpf.h | 14 ++++++++++++-- tools/include/uapi/linux/bpf.h | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bf9ba1329be..534e33fb1029 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3008,8 +3008,18 @@ union bpf_attr { * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. 
Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..f58d58e1d547 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3008,8 +3008,18 @@ union bpf_attr { * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject -- cgit v1.2.3 From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. 
Especially in time sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to have the possibility to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include functionalities such as Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping. Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is. Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ tools/include/uapi/linux/bpf.h | 13 +++++++++++++ 5 files changed, 42 insertions(+) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20c26aed7896..a627a02cf8ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 534e33fb1029..7d1e2794d83e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. 
+ * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..639437f36928 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2623,6 +2623,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1f961f9982d2..a95eb9fb01ff 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -198,6 +198,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_tai_ns) +{ + /* NMI safe access to clock tai */ + return ktime_get_tai_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { + .func = bpf_ktime_get_tai_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -1617,6 +1629,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case 
BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_tai_ns: + return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f58d58e1d547..e174ad28aeb7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 64e15820b987cc8e5864a8b907dfc17861e6ab5a Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 9 Aug 2022 08:08:03 +0200 Subject: selftests/bpf: Add BPF-helper test for CLOCK_TAI access Add BPF-helper test case for CLOCK_TAI access. 
The added test verifies that: * Timestamps are generated * Timestamps are moving forward * Timestamps are reasonable Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-3-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/time_tai.c | 74 +++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_time_tai.c | 24 ++++++++ 2 files changed, 98 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/time_tai.c create mode 100644 tools/testing/selftests/bpf/progs/test_time_tai.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/time_tai.c b/tools/testing/selftests/bpf/prog_tests/time_tai.c new file mode 100644 index 000000000000..a31119823666 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/time_tai.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022 Linutronix GmbH */ + +#include +#include + +#include "test_time_tai.skel.h" + +#include +#include + +#define TAI_THRESHOLD 1000000000ULL /* 1s */ +#define NSEC_PER_SEC 1000000000ULL + +static __u64 ts_to_ns(const struct timespec *ts) +{ + return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec; +} + +void test_time_tai(void) +{ + struct __sk_buff skb = { + .cb[0] = 0, + .cb[1] = 0, + .tstamp = 0, + }; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + .ctx_out = &skb, + .ctx_size_out = sizeof(skb), + ); + struct test_time_tai *skel; + struct timespec now_tai; + __u64 ts1, ts2, now; + int ret, prog_fd; + + /* Open and load */ + skel = test_time_tai__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tai_open")) + return; + + /* Run test program */ + prog_fd = bpf_program__fd(skel->progs.time_tai); + ret = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(ret, "test_run"); + + /* Retrieve generated TAI timestamps */ + ts1 = skb.tstamp; + ts2 = skb.cb[0] | ((__u64)skb.cb[1] << 32); + + /* TAI 
!= 0 */ + ASSERT_NEQ(ts1, 0, "tai_ts1"); + ASSERT_NEQ(ts2, 0, "tai_ts2"); + + /* TAI is moving forward only */ + ASSERT_GT(ts2, ts1, "tai_forward"); + + /* Check for future */ + ret = clock_gettime(CLOCK_TAI, &now_tai); + ASSERT_EQ(ret, 0, "tai_gettime"); + now = ts_to_ns(&now_tai); + + ASSERT_TRUE(now > ts1, "tai_future_ts1"); + ASSERT_TRUE(now > ts2, "tai_future_ts2"); + + /* Check for reasonable range */ + ASSERT_TRUE(now - ts1 < TAI_THRESHOLD, "tai_range_ts1"); + ASSERT_TRUE(now - ts2 < TAI_THRESHOLD, "tai_range_ts2"); + + test_time_tai__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_time_tai.c b/tools/testing/selftests/bpf/progs/test_time_tai.c new file mode 100644 index 000000000000..7ea0863f3ddb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_time_tai.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022 Linutronix GmbH */ + +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("tc") +int time_tai(struct __sk_buff *skb) +{ + __u64 ts1, ts2; + + /* Get TAI timestamps */ + ts1 = bpf_ktime_get_tai_ns(); + ts2 = bpf_ktime_get_tai_ns(); + + /* Save TAI timestamps (Note: skb->hwtstamp is read-only) */ + skb->tstamp = ts1; + skb->cb[0] = ts2 & 0xffffffff; + skb->cb[1] = ts2 >> 32; + + return 0; +} -- cgit v1.2.3 From d020b2360b350b9f91b1769f9c84fe2d22f643db Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 9 Aug 2022 11:11:09 -0600 Subject: selftests/bpf: Fix vmtest.sh -h to not require root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set the exit trap only after argument parsing is done. This way argument parse failure or `-h` will not require sudo. Reasoning is that it's confusing that a help message would require root access. 
Signed-off-by: Daniel Xu Signed-off-by: Daniel Borkmann Acked-by: Daniel Müller Link: https://lore.kernel.org/bpf/6a802aa37758e5a7e6aa5de294634f5518005e2b.1660064925.git.dxu@dxuuu.xyz --- tools/testing/selftests/bpf/vmtest.sh | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index b86ae4a2e5c5..976ef7585b33 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -307,6 +307,20 @@ update_kconfig() fi } +catch() +{ + local exit_code=$1 + local exit_status_file="${OUTPUT_DIR}/${EXIT_STATUS_FILE}" + # This is just a cleanup and the directory may + # have already been unmounted. So, don't let this + # clobber the error code we intend to return. + unmount_image || true + if [[ -f "${exit_status_file}" ]]; then + exit_code="$(cat ${exit_status_file})" + fi + exit ${exit_code} +} + main() { local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" @@ -353,6 +367,8 @@ main() done shift $((OPTIND -1)) + trap 'catch "$?"' EXIT + if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" else @@ -409,20 +425,4 @@ main() fi } -catch() -{ - local exit_code=$1 - local exit_status_file="${OUTPUT_DIR}/${EXIT_STATUS_FILE}" - # This is just a cleanup and the directory may - # have already been unmounted. So, don't let this - # clobber the error code we intend to return. 
- unmount_image || true - if [[ -f "${exit_status_file}" ]]; then - exit_code="$(cat ${exit_status_file})" - fi - exit ${exit_code} -} - -trap 'catch "$?"' EXIT - main "$@" -- cgit v1.2.3 From a7be0ab1eb1949f3564739784b4360e1233305f6 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 9 Aug 2022 11:11:10 -0600 Subject: selftests/bpf: Fix vmtest.sh getopts optstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before, you could see the following errors: $ ./vmtest.sh -j ./vmtest.sh: option requires an argument -- j ./vmtest.sh: line 357: OPTARG: unbound variable $ ./vmtest.sh -z ./vmtest.sh: illegal option -- z ./vmtest.sh: line 357: OPTARG: unbound variable Fix by adding ':' as first character of optstring. Reason is that getopts requires ':' as the first character for OPTARG to be set in the `?` and `:` error cases. Note that the ':' as the first character of the optstring switches getopts to silent mode. The desire to run in this mode seems to have been there all along, as the script takes care of reporting errors. 
Signed-off-by: Daniel Xu Signed-off-by: Daniel Borkmann Acked-by: Daniel Müller Link: https://lore.kernel.org/bpf/0f93b56198328b6b4da7b4cf4662d05c3edb5fd2.1660064925.git.dxu@dxuuu.xyz --- tools/testing/selftests/bpf/vmtest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 976ef7585b33..a29aa05ebb3e 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -333,7 +333,7 @@ main() local exit_command="poweroff -f" local debug_shell="no" - while getopts 'hskid:j:' opt; do + while getopts ':hskid:j:' opt; do case ${opt} in i) update_image="yes" -- cgit v1.2.3 From dc444be8bae45019396aedd53c745e685a4eb235 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 9 Aug 2022 14:40:55 -0700 Subject: selftests/bpf: add extra test for using dynptr data slice after release Add an additional test, "data_slice_use_after_release2", for ensuring that data slices are correctly invalidated by the verifier after the dynptr whose ref obj id they track is released. In particular, this tests data slice invalidation for dynptrs located at a non-zero offset from the frame pointer. 
Signed-off-by: Joanne Koong Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220809214055.4050604-2-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/dynptr.c | 3 +- tools/testing/selftests/bpf/progs/dynptr_fail.c | 38 ++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 3c7aa82b98e2..bcf80b9f7c27 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -22,7 +22,8 @@ static struct { {"add_dynptr_to_map2", "invalid indirect read from stack"}, {"data_slice_out_of_bounds_ringbuf", "value is outside of the allowed memory range"}, {"data_slice_out_of_bounds_map_value", "value is outside of the allowed memory range"}, - {"data_slice_use_after_release", "invalid mem access 'scalar'"}, + {"data_slice_use_after_release1", "invalid mem access 'scalar'"}, + {"data_slice_use_after_release2", "invalid mem access 'scalar'"}, {"data_slice_missing_null_check1", "invalid mem access 'mem_or_null'"}, {"data_slice_missing_null_check2", "invalid mem access 'mem_or_null'"}, {"invalid_helper1", "invalid indirect read from stack"}, diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index b5e0a87f0a36..b0f08ff024fb 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -248,7 +248,7 @@ int data_slice_out_of_bounds_map_value(void *ctx) /* A data slice can't be used after it has been released */ SEC("?raw_tp") -int data_slice_use_after_release(void *ctx) +int data_slice_use_after_release1(void *ctx) { struct bpf_dynptr ptr; struct sample *sample; @@ -272,6 +272,42 @@ done: return 0; } +/* A data slice can't be used after it has been released. 
+ * + * This tests the case where the data slice tracks a dynptr (ptr2) + * that is at a non-zero offset from the frame pointer (ptr1 is at fp, + * ptr2 is at fp - 16). + */ +SEC("?raw_tp") +int data_slice_use_after_release2(void *ctx) +{ + struct bpf_dynptr ptr1, ptr2; + struct sample *sample; + + bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &ptr1); + bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(*sample), 0, &ptr2); + + sample = bpf_dynptr_data(&ptr2, 0, sizeof(*sample)); + if (!sample) + goto done; + + sample->pid = 23; + + bpf_ringbuf_submit_dynptr(&ptr2, 0); + + /* this should fail */ + sample->pid = 23; + + bpf_ringbuf_submit_dynptr(&ptr1, 0); + + return 0; + +done: + bpf_ringbuf_discard_dynptr(&ptr2, 0); + bpf_ringbuf_discard_dynptr(&ptr1, 0); + return 0; +} + /* A data slice must be first checked for NULL */ SEC("?raw_tp") int data_slice_missing_null_check1(void *ctx) -- cgit v1.2.3 From 3143d10b094596f3e5d5964b2660375e586652a3 Mon Sep 17 00:00:00 2001 From: Shibin Koikkara Reeny Date: Wed, 3 Aug 2022 14:43:54 +0000 Subject: selftests/xsk: Update poll test cases The poll test case was not testing all the functionality of the poll feature in the test suite. This patch updates the poll test case, which now contains 2 test cases to test the RX and the TX poll functionality and 2 additional test cases to check the timeout feature of the poll event. The poll test suite has 4 test cases: 1. TEST_TYPE_RX_POLL: Check if the RX path POLLIN function works as expected. The TX path can use any method to send the traffic. 2. TEST_TYPE_TX_POLL: Check if the TX path POLLOUT function works as expected. The RX path can use any method to receive the traffic. 3. TEST_TYPE_POLL_RXQ_EMPTY: Calling the poll function with parameter POLLIN on an empty RX queue will cause a timeout. If it times out, the test case passes. 4. TEST_TYPE_POLL_TXQ_FULL: When the TX queue is filled and packets are not cleaned by the kernel, invoking the poll function with POLLOUT should trigger a timeout.
Signed-off-by: Shibin Koikkara Reeny Signed-off-by: Daniel Borkmann Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20220803144354.98122-1-shibin.koikkara.reeny@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 166 ++++++++++++++++++++++++------- tools/testing/selftests/bpf/xskxceiver.h | 8 +- 2 files changed, 134 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 74d56d971baf..20b44ab32a06 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -244,6 +244,11 @@ static void gen_udp_hdr(u32 payload, void *pkt, struct ifobject *ifobject, memset32_htonl(pkt + PKT_HDR_SIZE, payload, UDP_PKT_DATA_SIZE); } +static bool is_umem_valid(struct ifobject *ifobj) +{ + return !!ifobj->umem->umem; +} + static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) { udp_hdr->check = 0; @@ -817,12 +822,13 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) return TEST_PASS; } -static int receive_pkts(struct ifobject *ifobj, struct pollfd *fds) +static int receive_pkts(struct test_spec *test, struct pollfd *fds) { - struct timeval tv_end, tv_now, tv_timeout = {RECV_TMOUT, 0}; + struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; + struct pkt_stream *pkt_stream = test->ifobj_rx->pkt_stream; u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkts_sent = 0; - struct pkt_stream *pkt_stream = ifobj->pkt_stream; - struct xsk_socket_info *xsk = ifobj->xsk; + struct xsk_socket_info *xsk = test->ifobj_rx->xsk; + struct ifobject *ifobj = test->ifobj_rx; struct xsk_umem_info *umem = xsk->umem; struct pkt *pkt; int ret; @@ -843,17 +849,28 @@ static int receive_pkts(struct ifobject *ifobj, struct pollfd *fds) } kick_rx(xsk); + if (ifobj->use_poll) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(-ret); + + if (!ret) { + if (!is_umem_valid(test->ifobj_tx)) + return TEST_PASS; 
+ + ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__); + return TEST_FAILURE; - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); - if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&umem->fq)) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret < 0) - exit_with_error(-ret); } - continue; + + if (!(fds->revents & POLLIN)) + continue; } + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) + continue; + if (ifobj->use_fill_ring) { ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); while (ret != rcvd) { @@ -900,13 +917,35 @@ static int receive_pkts(struct ifobject *ifobj, struct pollfd *fds) return TEST_PASS; } -static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb) +static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb, struct pollfd *fds, + bool timeout) { struct xsk_socket_info *xsk = ifobject->xsk; - u32 i, idx, valid_pkts = 0; + bool use_poll = ifobject->use_poll; + u32 i, idx, ret, valid_pkts = 0; + + while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) { + if (use_poll) { + ret = poll(fds, 1, POLL_TMOUT); + if (timeout) { + if (ret < 0) { + ksft_print_msg("ERROR: [%s] Poll error %d\n", + __func__, ret); + return TEST_FAILURE; + } + if (ret == 0) + return TEST_PASS; + break; + } + if (ret <= 0) { + ksft_print_msg("ERROR: [%s] Poll error %d\n", + __func__, ret); + return TEST_FAILURE; + } + } - while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) complete_pkts(xsk, BATCH_SIZE); + } for (i = 0; i < BATCH_SIZE; i++) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); @@ -933,11 +972,27 @@ static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb) xsk_ring_prod__submit(&xsk->tx, i); xsk->outstanding_tx += valid_pkts; - if (complete_pkts(xsk, i)) - return TEST_FAILURE; - usleep(10); - return TEST_PASS; + if (use_poll) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret <= 0) { + if (ret == 0 && timeout) + return TEST_PASS; + + ksft_print_msg("ERROR: [%s] Poll 
error %d\n", __func__, ret); + return TEST_FAILURE; + } + } + + if (!timeout) { + if (complete_pkts(xsk, i)) + return TEST_FAILURE; + + usleep(10); + return TEST_PASS; + } + + return TEST_CONTINUE; } static void wait_for_tx_completion(struct xsk_socket_info *xsk) @@ -948,29 +1003,19 @@ static void wait_for_tx_completion(struct xsk_socket_info *xsk) static int send_pkts(struct test_spec *test, struct ifobject *ifobject) { + bool timeout = !is_umem_valid(test->ifobj_rx); struct pollfd fds = { }; - u32 pkt_cnt = 0; + u32 pkt_cnt = 0, ret; fds.fd = xsk_socket__fd(ifobject->xsk->xsk); fds.events = POLLOUT; while (pkt_cnt < ifobject->pkt_stream->nb_pkts) { - int err; - - if (ifobject->use_poll) { - int ret; - - ret = poll(&fds, 1, POLL_TMOUT); - if (ret <= 0) - continue; - - if (!(fds.revents & POLLOUT)) - continue; - } - - err = __send_pkts(ifobject, &pkt_cnt); - if (err || test->fail) + ret = __send_pkts(ifobject, &pkt_cnt, &fds, timeout); + if ((ret || test->fail) && !timeout) return TEST_FAILURE; + else if (ret == TEST_PASS && timeout) + return ret; } wait_for_tx_completion(ifobject->xsk); @@ -1235,7 +1280,7 @@ static void *worker_testapp_validate_rx(void *arg) pthread_barrier_wait(&barr); - err = receive_pkts(ifobject, &fds); + err = receive_pkts(test, &fds); if (!err && ifobject->validation_func) err = ifobject->validation_func(ifobject); @@ -1251,6 +1296,33 @@ static void *worker_testapp_validate_rx(void *arg) pthread_exit(NULL); } +static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj, + enum test_type type) +{ + pthread_t t0; + + if (pthread_barrier_init(&barr, NULL, 2)) + exit_with_error(errno); + + test->current_step++; + if (type == TEST_TYPE_POLL_RXQ_TMOUT) + pkt_stream_reset(ifobj->pkt_stream); + pkts_in_flight = 0; + + /*Spawn thread */ + pthread_create(&t0, NULL, ifobj->func_ptr, test); + + if (type != TEST_TYPE_POLL_TXQ_TMOUT) + pthread_barrier_wait(&barr); + + if (pthread_barrier_destroy(&barr)) + 
exit_with_error(errno); + + pthread_join(t0, NULL); + + return !!test->fail; +} + static int testapp_validate_traffic(struct test_spec *test) { struct ifobject *ifobj_tx = test->ifobj_tx; @@ -1548,12 +1620,30 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_ pkt_stream_restore_default(test); break; - case TEST_TYPE_POLL: - test->ifobj_tx->use_poll = true; + case TEST_TYPE_RX_POLL: test->ifobj_rx->use_poll = true; - test_spec_set_name(test, "POLL"); + test_spec_set_name(test, "POLL_RX"); testapp_validate_traffic(test); break; + case TEST_TYPE_TX_POLL: + test->ifobj_tx->use_poll = true; + test_spec_set_name(test, "POLL_TX"); + testapp_validate_traffic(test); + break; + case TEST_TYPE_POLL_TXQ_TMOUT: + test_spec_set_name(test, "POLL_TXQ_FULL"); + test->ifobj_tx->use_poll = true; + /* create invalid frame by set umem frame_size and pkt length equal to 2048 */ + test->ifobj_tx->umem->frame_size = 2048; + pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048); + testapp_validate_traffic_single_thread(test, test->ifobj_tx, type); + pkt_stream_restore_default(test); + break; + case TEST_TYPE_POLL_RXQ_TMOUT: + test_spec_set_name(test, "POLL_RXQ_EMPTY"); + test->ifobj_rx->use_poll = true; + testapp_validate_traffic_single_thread(test, test->ifobj_rx, type); + break; case TEST_TYPE_ALIGNED_INV_DESC: test_spec_set_name(test, "ALIGNED_INV_DESC"); testapp_invalid_desc(test); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index 3d17053f98e5..ee97576757a9 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -27,6 +27,7 @@ #define TEST_PASS 0 #define TEST_FAILURE -1 +#define TEST_CONTINUE 1 #define MAX_INTERFACES 2 #define MAX_INTERFACE_NAME_CHARS 7 #define MAX_INTERFACES_NAMESPACE_CHARS 10 @@ -48,7 +49,7 @@ #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 #define POLL_TMOUT 1000 -#define RECV_TMOUT 3 +#define THREAD_TMOUT 3 #define DEFAULT_PKT_CNT (4 
* 1024) #define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) #define UMEM_SIZE (DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE) @@ -68,7 +69,10 @@ enum test_type { TEST_TYPE_RUN_TO_COMPLETION, TEST_TYPE_RUN_TO_COMPLETION_2K_FRAME, TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT, - TEST_TYPE_POLL, + TEST_TYPE_RX_POLL, + TEST_TYPE_TX_POLL, + TEST_TYPE_POLL_RXQ_TMOUT, + TEST_TYPE_POLL_TXQ_TMOUT, TEST_TYPE_UNALIGNED, TEST_TYPE_ALIGNED_INV_DESC, TEST_TYPE_ALIGNED_INV_DESC_2K_FRAME, -- cgit v1.2.3 From f1432cd24c240cedf78c0d026631e3b10052c8e1 Mon Sep 17 00:00:00 2001 From: Alexandre Vicenzi Date: Mon, 8 Aug 2022 20:03:43 +0200 Subject: rtla: Fix tracer name The correct tracer name is timerlat and not timelat. Link: https://lore.kernel.org/linux-trace-devel/20220808180343.22262-1-alexandre.vicenzi@suse.com Signed-off-by: Alexandre Vicenzi Signed-off-by: Steven Rostedt (Google) --- Documentation/tools/rtla/rtla-timerlat-hist.rst | 2 +- tools/tracing/rtla/src/timerlat_hist.c | 2 +- tools/tracing/rtla/src/timerlat_top.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/Documentation/tools/rtla/rtla-timerlat-hist.rst b/Documentation/tools/rtla/rtla-timerlat-hist.rst index e12eae1f3301..6bf7f0ca4556 100644 --- a/Documentation/tools/rtla/rtla-timerlat-hist.rst +++ b/Documentation/tools/rtla/rtla-timerlat-hist.rst @@ -33,7 +33,7 @@ EXAMPLE ======= In the example below, **rtla timerlat hist** is set to run for *10* minutes, in the cpus *0-4*, *skipping zero* only lines. Moreover, **rtla timerlat -hist** will change the priority of the *timelat* threads to run under +hist** will change the priority of the *timerlat* threads to run under *SCHED_DEADLINE* priority, with a *10us* runtime every *1ms* period. 
The *1ms* period is also passed to the *timerlat* tracer:: diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index f3ec628f5e51..4b48af8a8309 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -892,7 +892,7 @@ int timerlat_hist_main(int argc, char *argv[]) return_value = 0; if (trace_is_off(&tool->trace, &record->trace)) { - printf("rtla timelat hit stop tracing\n"); + printf("rtla timerlat hit stop tracing\n"); if (params->trace_output) { printf(" Saving trace to %s\n", params->trace_output); save_trace_to_file(record->trace.inst, params->trace_output); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 35452a1d45e9..334271935222 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -687,7 +687,7 @@ int timerlat_top_main(int argc, char *argv[]) return_value = 0; if (trace_is_off(&top->trace, &record->trace)) { - printf("rtla timelat hit stop tracing\n"); + printf("rtla timerlat hit stop tracing\n"); if (params->trace_output) { printf(" Saving trace to %s\n", params->trace_output); save_trace_to_file(record->trace.inst, params->trace_output); -- cgit v1.2.3 From ff5a55dcdb343e3db9b9fb08795b78544b032773 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 16 Jul 2022 15:47:08 +0200 Subject: tools/rtla: Fix command symlinks "ln -s" stores the next argument directly as the symlink target, so it needs to be a relative path. In this case, just "rtla". 
Link: https://lore.kernel.org/linux-trace-devel/YtLBXMI6Ui4HLIF1@decadent.org.uk Fixes: 0605bf009f18 ("rtla: Add osnoise tool") Fixes: a828cd18bc4a ("rtla: Add timerlat tool and timelart top mode") Signed-off-by: Ben Hutchings Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 1bea2d16d4c1..b8fe10d941ce 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -108,9 +108,9 @@ install: doc_install $(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR) $(STRIP) $(DESTDIR)$(BINDIR)/rtla @test ! -f $(DESTDIR)$(BINDIR)/osnoise || rm $(DESTDIR)$(BINDIR)/osnoise - ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/osnoise + ln -s rtla $(DESTDIR)$(BINDIR)/osnoise @test ! -f $(DESTDIR)$(BINDIR)/timerlat || rm $(DESTDIR)$(BINDIR)/timerlat - ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/timerlat + ln -s rtla $(DESTDIR)$(BINDIR)/timerlat .PHONY: clean tarball clean: doc_clean -- cgit v1.2.3 From 1a7b22ab15ebf643e10e54ae5387afee06e39ad0 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 16 Jul 2022 15:48:34 +0200 Subject: tools/rtla: Build with EXTRA_{C,LD}FLAGS To allow for distributions and other builders to apply hardening policy and other customisation, append EXTRA_CFLAGS and EXTRA_LDFLAGS to the corresponding variables. 
Link: https://lore.kernel.org/linux-trace-devel/YtLBshz0nMQ7530H@decadent.org.uk Signed-off-by: Ben Hutchings Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index b8fe10d941ce..f392708c7a1e 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -30,8 +30,8 @@ WOPTS := -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_A TRACEFS_HEADERS := $$($(PKG_CONFIG) --cflags libtracefs) -CFLAGS := -O -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(MOPTS) $(WOPTS) $(TRACEFS_HEADERS) -LDFLAGS := -ggdb +CFLAGS := -O -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(MOPTS) $(WOPTS) $(TRACEFS_HEADERS) $(EXTRA_CFLAGS) +LDFLAGS := -ggdb $(EXTRA_LDFLAGS) LIBS := $$($(PKG_CONFIG) --libs libtracefs) SRC := $(wildcard src/*.c) -- cgit v1.2.3 From 20aec89aac7761e3c096004f5c819aacc86fc542 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 10 Aug 2022 11:39:18 -0400 Subject: rtla: Consolidate and show all necessary libraries that failed for building When building rtla tools, if the necessary libraries are not installed (libtraceevent and libtracefs), show the ones that are missing in one consolidated output, and also show how to install them (at least for Fedora). 
Link: https://lore.kernel.org/all/CAHk-=wh+e1qcCnEYJ3JRDVLNCYbJ=0u+Ts5bOYZnY3mX_k-hFA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220810113918.5d19ce59@gandalf.local.home Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/Makefile | 62 ++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index f392708c7a1e..22e28b76f800 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -61,40 +61,50 @@ endif LIBTRACEEVENT_MIN_VERSION = 1.5 LIBTRACEFS_MIN_VERSION = 1.3 +.PHONY: all warnings show_warnings +all: warnings rtla + TEST_LIBTRACEEVENT = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEEVENT_MIN_VERSION) libtraceevent > /dev/null 2>&1 || echo n") ifeq ("$(TEST_LIBTRACEEVENT)", "n") -.PHONY: warning_traceevent -warning_traceevent: - @echo "********************************************" - @echo "** NOTICE: libtraceevent version $(LIBTRACEEVENT_MIN_VERSION) or higher not found" - @echo "**" - @echo "** Consider installing the latest libtraceevent from your" - @echo "** distribution, e.g., 'dnf install libtraceevent' on Fedora," - @echo "** or from source:" - @echo "**" - @echo "** https://git.kernel.org/pub/scm/libs/libtrace/libtraceevent.git/ " - @echo "**" - @echo "********************************************" +WARNINGS = show_warnings +MISSING_LIBS += echo "** libtraceevent version $(LIBTRACEEVENT_MIN_VERSION) or higher"; +MISSING_PACKAGES += "libtraceevent-devel" +MISSING_SOURCE += echo "** https://git.kernel.org/pub/scm/libs/libtrace/libtraceevent.git/ "; endif TEST_LIBTRACEFS = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEFS_MIN_VERSION) libtracefs > /dev/null 2>&1 || echo n") ifeq ("$(TEST_LIBTRACEFS)", "n") -.PHONY: warning_tracefs -warning_tracefs: - @echo "********************************************" - @echo "** NOTICE: libtracefs version 
$(LIBTRACEFS_MIN_VERSION) or higher not found" - @echo "**" - @echo "** Consider installing the latest libtracefs from your" - @echo "** distribution, e.g., 'dnf install libtracefs' on Fedora," - @echo "** or from source:" - @echo "**" - @echo "** https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/ " - @echo "**" - @echo "********************************************" +WARNINGS = show_warnings +MISSING_LIBS += echo "** libtracefs version $(LIBTRACEFS_MIN_VERSION) or higher"; +MISSING_PACKAGES += "libtracefs-devel" +MISSING_SOURCE += echo "** https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/ "; endif -.PHONY: all -all: rtla +define show_dependencies + @echo "********************************************"; \ + echo "** NOTICE: Failed build dependencies"; \ + echo "**"; \ + echo "** Required Libraries:"; \ + $(MISSING_LIBS) \ + echo "**"; \ + echo "** Consider installing the latest libtracefs from your"; \ + echo "** distribution, e.g., 'dnf install $(MISSING_PACKAGES)' on Fedora,"; \ + echo "** or from source:"; \ + echo "**"; \ + $(MISSING_SOURCE) \ + echo "**"; \ + echo "********************************************" +endef + +show_warnings: + $(call show_dependencies); + +ifneq ("$(WARNINGS)", "") +ERROR_OUT = $(error Please add the necessary dependencies) + +warnings: $(WARNINGS) + $(ERROR_OUT) +endif rtla: $(OBJ) $(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS) -- cgit v1.2.3 From e338945816754a1c362f606b8e2029f2c023e51c Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 10 Aug 2022 08:59:05 +0200 Subject: selftests/bpf: add destructive kfunc test Add a test checking that programs calling destructive kfuncs can only do so if they have CAP_SYS_BOOT capabilities. 
Signed-off-by: Artem Savkov Link: https://lore.kernel.org/r/20220810065905.475418-4-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 5 +++ .../testing/selftests/bpf/prog_tests/kfunc_call.c | 36 ++++++++++++++++++++++ .../selftests/bpf/progs/kfunc_call_destructive.c | 14 +++++++++ 3 files changed, 55 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_destructive.c (limited to 'tools') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index cbc9cd5058cb..afa7125252f6 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -695,6 +695,10 @@ noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) { } +noinline void bpf_kfunc_call_test_destructive(void) +{ +} + __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); @@ -719,6 +723,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_SET8_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index c00eb974eb85..351fafa006fb 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -5,6 +5,9 @@ #include "kfunc_call_test.lskel.h" #include "kfunc_call_test_subprog.skel.h" #include "kfunc_call_test_subprog.lskel.h" +#include "kfunc_call_destructive.skel.h" + +#include "cap_helpers.h" static void test_main(void) { @@ -86,6 +89,36 @@ static void test_subprog_lskel(void) kfunc_call_test_subprog_lskel__destroy(skel); } +static int test_destructive_open_and_load(void) +{ + struct kfunc_call_destructive *skel; + int err; + + skel = kfunc_call_destructive__open(); + if 
(!ASSERT_OK_PTR(skel, "prog_open")) + return -1; + + err = kfunc_call_destructive__load(skel); + + kfunc_call_destructive__destroy(skel); + + return err; +} + +static void test_destructive(void) +{ + __u64 save_caps = 0; + + ASSERT_OK(test_destructive_open_and_load(), "succesful_load"); + + if (!ASSERT_OK(cap_disable_effective(1ULL << CAP_SYS_BOOT, &save_caps), "drop_caps")) + return; + + ASSERT_EQ(test_destructive_open_and_load(), -13, "no_caps_failure"); + + cap_enable_effective(save_caps, NULL); +} + void test_kfunc_call(void) { if (test__start_subtest("main")) @@ -96,4 +129,7 @@ void test_kfunc_call(void) if (test__start_subtest("subprog_lskel")) test_subprog_lskel(); + + if (test__start_subtest("destructive")) + test_destructive(); } diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_destructive.c b/tools/testing/selftests/bpf/progs/kfunc_call_destructive.c new file mode 100644 index 000000000000..767472bc5a97 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_destructive.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +extern void bpf_kfunc_call_test_destructive(void) __ksym; + +SEC("tc") +int kfunc_destructive_test(void) +{ + bpf_kfunc_call_test_destructive(); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From d7c5802faff6e7f50d18db40fdcb7e50590177f5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 10 Aug 2022 11:34:25 -0700 Subject: libbpf: preserve errno across pr_warn/pr_info/pr_debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As suggested in [0], make sure that libbpf_print saves and restores errno and as such guarantees that no matter what actual print callback user installs, macros like pr_warn/pr_info/pr_debug are completely transparent as far as errno goes. 
While libbpf code is pretty careful about not clobbering important errno values accidentally with pr_warn(), it's a trivial change to make sure that pr_warn can be used anywhere without a risk of clobbering errno. No functional changes, just future proofing. [0] https://github.com/libbpf/libbpf/pull/536 Signed-off-by: Andrii Nakryiko Acked-by: Daniel Müller Link: https://lore.kernel.org/r/20220810183425.1998735-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f7364ea82ac1..917d975bd4c6 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -223,13 +223,18 @@ __printf(2, 3) void libbpf_print(enum libbpf_print_level level, const char *format, ...) { va_list args; + int old_errno; if (!__libbpf_pr) return; + old_errno = errno; + va_start(args, format); __libbpf_pr(level, format, args); va_end(args); + + errno = old_errno; } static void pr_perm_msg(int err) -- cgit v1.2.3 From 10b62d6a38f7c92e9f41983bb7d7669c9fa6e287 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 11 Aug 2022 11:40:20 +0800 Subject: libbpf: Add names for auxiliary maps The bpftool self-created maps can appear in final map show output due to deferred removal in kernel. These maps don't have a name, which would make users confused about where it comes from. With a libbpf_ prefix name, users could know who created these maps. It also could make some tests (like test_offload.py, which skip base maps without names as a workaround) filter them out. Kernel adds bpf prog/map name support in the same merge commit fadad670a8ab ("Merge branch 'bpf-extend-info'"). So we can also use kernel_supports(NULL, FEAT_PROG_NAME) to check if kernel supports map name. As discussed [1], Let's make bpf_map_create accept non-null name string, and silently ignore the name if kernel doesn't support. 
[1] https://lore.kernel.org/bpf/CAEf4BzYL1TQwo1231s83pjTdFPk9XWWhfZC5=KzkU-VO0k=0Ug@mail.gmail.com/ Signed-off-by: Hangbin Liu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220811034020.529685-1-liuhangbin@gmail.com --- tools/lib/bpf/bpf.c | 2 +- tools/lib/bpf/libbpf.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index efcc06dafbd9..6a96e665dc5d 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -183,7 +183,7 @@ int bpf_map_create(enum bpf_map_type map_type, return libbpf_err(-EINVAL); attr.map_type = map_type; - if (map_name) + if (map_name && kernel_supports(NULL, FEAT_PROG_NAME)) libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); attr.key_size = key_size; attr.value_size = value_size; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 917d975bd4c6..3f01f5cd8a4c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4437,7 +4437,7 @@ static int probe_kern_global_data(void) }; int ret, map, insn_cnt = ARRAY_SIZE(insns); - map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL); + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, NULL); if (map < 0) { ret = -errno; cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); @@ -4570,7 +4570,7 @@ static int probe_kern_array_mmap(void) LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE); int fd; - fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), sizeof(int), 1, &opts); + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts); return probe_fd(fd); } @@ -4617,7 +4617,7 @@ static int probe_prog_bind_map(void) }; int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); - map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL); + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, NULL); if (map < 0) { ret = -errno; cp = 
libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); -- cgit v1.2.3 From 27e23836ce22a3e5d89712ef832ab72e47ce9f43 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Wed, 10 Aug 2022 20:07:10 +0000 Subject: selftests/bpf: Add lru_bug to s390x deny list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lru_bug BPF selftest is failing execution on s390x machines. The failure is due to program attachment failing in turn, similar to a bunch of other tests. Those other tests have already been deny-listed and with this change we do the same for the lru_bug test, adding it to the corresponding file. Fixes: de7b9927105b ("selftests/bpf: Add test for prealloc_lru_pop bug") Signed-off-by: Daniel Müller Acked-by: Mykola Lysenko Link: https://lore.kernel.org/r/20220810200710.1300299-1-deso@posteo.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index e33cab34d22f..db9810611788 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -65,3 +65,4 @@ send_signal # intermittently fails to receive signa select_reuseport # intermittently fails on new s390x setup xdp_synproxy # JIT does not support calling kernel function (kfunc) unpriv_bpf_disabled # fentry +lru_bug # prog 'printk': failed to auto-attach: -524 -- cgit v1.2.3 From f889a2e89ea5b4db5cf09765ee5e310be43c7b6f Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Fri, 12 Aug 2022 17:16:32 +1000 Subject: selftests/powerpc: Add missing PMU selftests to .gitignores Some recently added selftests don't have their binaries in .gitignores, so add them. I also alphabetically sorted sampling_tests/.gitignore while I was in there. 
Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220812071632.56095-1-ruscur@russell.cc --- .../powerpc/pmu/event_code_tests/.gitignore | 20 ++++++++++++++++++++ .../selftests/powerpc/pmu/sampling_tests/.gitignore | 18 ++++++++++++++---- 2 files changed, 34 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/.gitignore (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/.gitignore b/tools/testing/selftests/powerpc/pmu/event_code_tests/.gitignore new file mode 100644 index 000000000000..5710683da525 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/.gitignore @@ -0,0 +1,20 @@ +blacklisted_events_test +event_alternatives_tests_p10 +event_alternatives_tests_p9 +generic_events_valid_test +group_constraint_cache_test +group_constraint_l2l3_sel_test +group_constraint_mmcra_sample_test +group_constraint_pmc56_test +group_constraint_pmc_count_test +group_constraint_radix_scope_qual_test +group_constraint_repeat_test +group_constraint_thresh_cmp_test +group_constraint_thresh_ctl_test +group_constraint_thresh_sel_test +group_constraint_unit_test +group_pmc56_exclude_constraints_test +hw_cache_event_type_test +invalid_event_code_test +reserved_bits_mmcra_sample_elig_mode_test +reserved_bits_mmcra_thresh_ctl_test diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/.gitignore b/tools/testing/selftests/powerpc/pmu/sampling_tests/.gitignore index 0fce5a694684..f93b4c7c3a8a 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/.gitignore +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/.gitignore @@ -1,11 +1,21 @@ -mmcr0_exceptionbits_test +bhrb_filter_map_test +bhrb_no_crash_wo_pmu_test +intr_regs_no_crash_wo_pmu_test mmcr0_cc56run_test -mmcr0_pmccext_test -mmcr0_pmcjce_test +mmcr0_exceptionbits_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test +mmcr0_pmccext_test +mmcr0_pmcjce_test mmcr1_comb_test 
-mmcr2_l2l3_test +mmcr1_sel_unit_cache_test mmcr2_fcs_fch_test +mmcr2_l2l3_test mmcr3_src_test +mmcra_bhrb_any_test +mmcra_bhrb_cond_test +mmcra_bhrb_disable_no_branch_test +mmcra_bhrb_disable_test +mmcra_bhrb_ind_call_test +mmcra_thresh_cmp_test mmcra_thresh_marked_sample_test -- cgit v1.2.3 From 54c939773b2d2c2e6676743c180cb2049bb3a40a Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 12 Aug 2022 16:37:25 +0100 Subject: bpftool: Fix a typo in a comment This is the wrong library name: libcap, not libpcap. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220812153727.224500-1-quentin@isovalent.com --- tools/bpf/bpftool/feature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 7ecabf7947fb..36cf0f1517c9 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1147,7 +1147,7 @@ exit_free: return res; #else /* Detection assumes user has specific privileges. - * We do not use libpcap so let's approximate, and restrict usage to + * We do not use libcap so let's approximate, and restrict usage to * root user only. */ if (geteuid()) { -- cgit v1.2.3 From 4961d0772578e8737afe61370743f3bc22867111 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 12 Aug 2022 16:37:27 +0100 Subject: bpf: Clear up confusion in bpf_skb_adjust_room()'s documentation Adding or removing room space _below_ layers 2 or 3, as the description mentions, is ambiguous. This was written with a mental image of the packet with layer 2 at the top, layer 3 under it, and so on. But it has led users to believe that it was on lower layers (before the beginning of the L2 and L3 headers respectively). Let's make it more explicit, and specify between which layers the room space is adjusted. 
Reported-by: Rumen Telbizov Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220812153727.224500-3-quentin@isovalent.com --- include/uapi/linux/bpf.h | 6 ++++-- tools/include/uapi/linux/bpf.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7d1e2794d83e..934a2a8beb87 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2573,10 +2573,12 @@ union bpf_attr { * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer - * (room space is added or removed below the layer 2 header). + * (room space is added or removed between the layer 2 and + * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). + * (room space is added or removed between the layer 3 and + * layer 4 headers). * * The following flags are supported at this time: * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e174ad28aeb7..1d6085e15fc8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2573,10 +2573,12 @@ union bpf_attr { * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer - * (room space is added or removed below the layer 2 header). + * (room space is added or removed between the layer 2 and + * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). + * (room space is added or removed between the layer 3 and + * layer 4 headers). * * The following flags are supported at this time: * -- cgit v1.2.3 From f15f39fabed2248311607445ddfa6dba63abebb9 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 11 Aug 2022 21:34:33 +0800 Subject: tools: hv: Remove an extraneous "the" There are two "the" in the text. 
Remove one. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20220811133433.10175-1-wangborong@cdjrlc.com Signed-off-by: Wei Liu --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 1e6fd6ca513b..c97c12e95ecb 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -44,7 +44,7 @@ /* * KVP protocol: The user mode component first registers with the - * the kernel component. Subsequently, the kernel component requests, data + * kernel component. Subsequently, the kernel component requests, data * for the specified keys. In response to this message the user mode component * fills in the value corresponding to the specified key. We overload the * sequence field in the cn_msg header to define our KVP message types. -- cgit v1.2.3 From f1227dc7d0411ee9a9faaa1e80cfd9d6e5d6d63e Mon Sep 17 00:00:00 2001 From: Guillaume Tucker Date: Wed, 3 Aug 2022 22:13:54 +0200 Subject: selftests/landlock: fix broken include of linux/landlock.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert part of the earlier changes to fix the kselftest build when using a sub-directory from the top of the tree as this broke the landlock test build as a side-effect when building with "make -C tools/testing/selftests/landlock". 
Reported-by: Mickaël Salaün Fixes: a917dd94b832 ("selftests/landlock: drop deprecated headers dependency") Fixes: f2745dc0ba3d ("selftests: stop using KSFT_KHDR_INSTALL") Signed-off-by: Guillaume Tucker Signed-off-by: Shuah Khan --- tools/testing/selftests/landlock/Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile index a6959df28eb0..02868ac3bc71 100644 --- a/tools/testing/selftests/landlock/Makefile +++ b/tools/testing/selftests/landlock/Makefile @@ -9,10 +9,13 @@ TEST_GEN_PROGS := $(src_test:.c=) TEST_GEN_PROGS_EXTENDED := true OVERRIDE_TARGETS := 1 +top_srcdir := ../../../.. include ../lib.mk +khdr_dir = $(top_srcdir)/usr/include + $(OUTPUT)/true: true.c $(LINK.c) $< $(LDLIBS) -o $@ -static -$(OUTPUT)/%_test: %_test.c ../kselftest_harness.h common.h - $(LINK.c) $< $(LDLIBS) -o $@ -lcap +$(OUTPUT)/%_test: %_test.c $(khdr_dir)/linux/landlock.h ../kselftest_harness.h common.h + $(LINK.c) $< $(LDLIBS) -o $@ -lcap -I$(khdr_dir) -- cgit v1.2.3 From cea558855c39b7f1f02ff50dcf701ca6596bc964 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 15 Aug 2022 17:22:05 +0100 Subject: bpftool: Clear errno after libcap's checks When bpftool is linked against libcap, the library runs a "constructor" function to compute the number of capabilities of the running kernel [0], at the beginning of the execution of the program. As part of this, it performs multiple calls to prctl(). Some of these may fail, and set errno to a non-zero value: # strace -e prctl ./bpftool version prctl(PR_CAPBSET_READ, CAP_MAC_OVERRIDE) = 1 prctl(PR_CAPBSET_READ, 0x30 /* CAP_??? */) = -1 EINVAL (Invalid argument) prctl(PR_CAPBSET_READ, CAP_CHECKPOINT_RESTORE) = 1 prctl(PR_CAPBSET_READ, 0x2c /* CAP_??? */) = -1 EINVAL (Invalid argument) prctl(PR_CAPBSET_READ, 0x2a /* CAP_??? */) = -1 EINVAL (Invalid argument) prctl(PR_CAPBSET_READ, 0x29 /* CAP_??? 
*/) = -1 EINVAL (Invalid argument) ** fprintf added at the top of main(): we have errno == 1 ./bpftool v7.0.0 using libbpf v1.0 features: libbfd, libbpf_strict, skeletons +++ exited with 0 +++ This has been addressed in libcap 2.63 [1], but until this version is available everywhere, we can fix it on bpftool side. Let's clean errno at the beginning of the main() function, to make sure that these checks do not interfere with the batch mode, where we error out if errno is set after a bpftool command. [0] https://git.kernel.org/pub/scm/libs/libcap/libcap.git/tree/libcap/cap_alloc.c?h=libcap-2.65#n20 [1] https://git.kernel.org/pub/scm/libs/libcap/libcap.git/commit/?id=f25a1b7e69f7b33e6afb58b3e38f3450b7d2d9a0 Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220815162205.45043-1-quentin@isovalent.com --- tools/bpf/bpftool/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 451cefc2d0da..ccd7457f92bf 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -435,6 +435,16 @@ int main(int argc, char **argv) setlinebuf(stdout); +#ifdef USE_LIBCAP + /* Libcap < 2.63 hooks before main() to compute the number of + * capabilities of the running kernel, and doing so it calls prctl() + * which may fail and set errno to non-zero. + * Let's reset errno to make sure this does not interfere with the + * batch mode. + */ + errno = 0; +#endif + last_do_help = do_help; pretty_output = false; json_output = false; -- cgit v1.2.3 From e81fbd4c1ba7b128a198c2843665e1186db449b6 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 11 Aug 2022 15:55:25 -0600 Subject: selftests/bpf: Add existing connection bpf_*_ct_lookup() test Add a test where we do a conntrack lookup on an existing connection. This is nice because it's a more realistic test than artificially creating a ct entry and looking it up afterwards. 
Signed-off-by: Daniel Xu Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/de5a617832f38f8b5631cc87e2a836da7c94d497.1660254747.git.dxu@dxuuu.xyz --- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 59 +++++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_bpf_nf.c | 18 ++++++++ 2 files changed, 77 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index 7a74a1579076..88a2c0bdefec 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -24,10 +24,34 @@ enum { TEST_TC_BPF, }; +#define TIMEOUT_MS 3000 + +static int connect_to_server(int srv_fd) +{ + int fd = -1; + + fd = socket(AF_INET, SOCK_STREAM, 0); + if (!ASSERT_GE(fd, 0, "socket")) + goto out; + + if (!ASSERT_EQ(connect_fd_to_fd(fd, srv_fd, TIMEOUT_MS), 0, "connect_fd_to_fd")) { + close(fd); + fd = -1; + } +out: + return fd; +} + static void test_bpf_nf_ct(int mode) { + const char *iptables = "iptables -t raw %s PREROUTING -j CT"; + int srv_fd = -1, client_fd = -1, srv_client_fd = -1; + struct sockaddr_in peer_addr = {}; struct test_bpf_nf *skel; int prog_fd, err; + socklen_t len; + u16 srv_port; + char cmd[64]; LIBBPF_OPTS(bpf_test_run_opts, topts, .data_in = &pkt_v4, .data_size_in = sizeof(pkt_v4), @@ -38,6 +62,32 @@ static void test_bpf_nf_ct(int mode) if (!ASSERT_OK_PTR(skel, "test_bpf_nf__open_and_load")) return; + /* Enable connection tracking */ + snprintf(cmd, sizeof(cmd), iptables, "-A"); + if (!ASSERT_OK(system(cmd), "iptables")) + goto end; + + srv_port = (mode == TEST_XDP) ? 
5005 : 5006; + srv_fd = start_server(AF_INET, SOCK_STREAM, "127.0.0.1", srv_port, TIMEOUT_MS); + if (!ASSERT_GE(srv_fd, 0, "start_server")) + goto end; + + client_fd = connect_to_server(srv_fd); + if (!ASSERT_GE(client_fd, 0, "connect_to_server")) + goto end; + + len = sizeof(peer_addr); + srv_client_fd = accept(srv_fd, (struct sockaddr *)&peer_addr, &len); + if (!ASSERT_GE(srv_client_fd, 0, "accept")) + goto end; + if (!ASSERT_EQ(len, sizeof(struct sockaddr_in), "sockaddr len")) + goto end; + + skel->bss->saddr = peer_addr.sin_addr.s_addr; + skel->bss->sport = peer_addr.sin_port; + skel->bss->daddr = peer_addr.sin_addr.s_addr; + skel->bss->dport = htons(srv_port); + if (mode == TEST_XDP) prog_fd = bpf_program__fd(skel->progs.nf_xdp_ct_test); else @@ -63,7 +113,16 @@ static void test_bpf_nf_ct(int mode) ASSERT_LE(skel->bss->test_delta_timeout, 10, "Test for max ct timeout update"); /* expected status is IPS_SEEN_REPLY */ ASSERT_EQ(skel->bss->test_status, 2, "Test for ct status update "); + ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup"); end: + if (srv_client_fd != -1) + close(srv_client_fd); + if (client_fd != -1) + close(client_fd); + if (srv_fd != -1) + close(srv_fd); + snprintf(cmd, sizeof(cmd), iptables, "-D"); + system(cmd); test_bpf_nf__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index 196cd8dfe42a..84e0fd479794 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -23,6 +23,11 @@ int test_insert_entry = -EAFNOSUPPORT; int test_succ_lookup = -ENOENT; u32 test_delta_timeout = 0; u32 test_status = 0; +__be32 saddr = 0; +__be16 sport = 0; +__be32 daddr = 0; +__be16 dport = 0; +int test_exist_lookup = -ENOENT; struct nf_conn; @@ -160,6 +165,19 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, } test_alloc_entry = 0; } + + bpf_tuple.ipv4.saddr = saddr; + 
bpf_tuple.ipv4.daddr = daddr; + bpf_tuple.ipv4.sport = sport; + bpf_tuple.ipv4.dport = dport; + ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, + sizeof(opts_def)); + if (ct) { + test_exist_lookup = 0; + bpf_ct_release(ct); + } else { + test_exist_lookup = opts_def.error; + } } SEC("xdp") -- cgit v1.2.3 From 99799de2cba2d399acf65f49a986b3d5cf0732ab Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 11 Aug 2022 15:55:26 -0600 Subject: selftests/bpf: Add connmark read test Test that the prog can read from the connection mark. This test is nice because it ensures progs can interact with netfilter subsystem correctly. Signed-off-by: Daniel Xu Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/d3bc620a491e4c626c20d80631063922cbe13e2b.1660254747.git.dxu@dxuuu.xyz --- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 3 ++- tools/testing/selftests/bpf/progs/test_bpf_nf.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index 88a2c0bdefec..544bf90ac2a7 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -44,7 +44,7 @@ out: static void test_bpf_nf_ct(int mode) { - const char *iptables = "iptables -t raw %s PREROUTING -j CT"; + const char *iptables = "iptables -t raw %s PREROUTING -j CONNMARK --set-mark 42/0"; int srv_fd = -1, client_fd = -1, srv_client_fd = -1; struct sockaddr_in peer_addr = {}; struct test_bpf_nf *skel; @@ -114,6 +114,7 @@ static void test_bpf_nf_ct(int mode) /* expected status is IPS_SEEN_REPLY */ ASSERT_EQ(skel->bss->test_status, 2, "Test for ct status update "); ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup"); + ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark"); end: if (srv_client_fd != -1) close(srv_client_fd); 
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index 84e0fd479794..2722441850cc 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -28,6 +28,7 @@ __be16 sport = 0; __be32 daddr = 0; __be16 dport = 0; int test_exist_lookup = -ENOENT; +u32 test_exist_lookup_mark = 0; struct nf_conn; @@ -174,6 +175,8 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, sizeof(opts_def)); if (ct) { test_exist_lookup = 0; + if (ct->mark == 42) + test_exist_lookup_mark = 43; bpf_ct_release(ct); } else { test_exist_lookup = opts_def.error; -- cgit v1.2.3 From 8308bf207ce6963adb42791cfb260dc6552b6665 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 11 Aug 2022 15:55:27 -0600 Subject: selftests/bpf: Update CI kconfig The previous selftest changes require two kconfig changes in bpf-ci. Signed-off-by: Daniel Xu Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/2c27c6ebf7a03954915f83560653752450389564.1660254747.git.dxu@dxuuu.xyz --- tools/testing/selftests/bpf/config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index fabf0c014349..3fc46f9cfb22 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -50,9 +50,11 @@ CONFIG_NET_SCHED=y CONFIG_NETDEVSIM=m CONFIG_NETFILTER=y CONFIG_NETFILTER_SYNPROXY=y +CONFIG_NETFILTER_XT_CONNMARK=y CONFIG_NETFILTER_XT_MATCH_STATE=y CONFIG_NETFILTER_XT_TARGET_CT=y CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_MARK=y CONFIG_NF_DEFRAG_IPV4=y CONFIG_NF_DEFRAG_IPV6=y CONFIG_RC_CORE=y -- cgit v1.2.3 From 7f203bc89eb66d6afde7eae91347fc0352090cc3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Jul 2022 13:10:16 -1000 Subject: cgroup: Replace cgroup->ancestor_ids[] with ->ancestors[] Every cgroup knows all its ancestors 
through its ->ancestor_ids[]. There's no advantage to remembering the IDs instead of the pointers directly and this makes the array useless for finding an actual ancestor cgroup forcing cgroup_ancestor() to iteratively walk up the hierarchy instead. Let's replace cgroup->ancestor_ids[] with ->ancestors[] and remove the walking-up from cgroup_ancestor(). While at it, improve comments around cgroup_root->cgrp_ancestor_storage. This patch shouldn't cause user-visible behavior differences. v2: Update cgroup_ancestor() to use ->ancestors[]. v3: cgroup_root->cgrp_ancestor_storage's type is updated to match cgroup->ancestors[]. Better comments. Signed-off-by: Tejun Heo Acked-by: Namhyung Kim --- include/linux/cgroup-defs.h | 16 ++++++++++------ include/linux/cgroup.h | 8 +++----- kernel/cgroup/cgroup.c | 7 +++---- net/netfilter/nft_socket.c | 9 +++++---- tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +- 5 files changed, 22 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4bcf56b3491c..1283993d7ea8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -384,7 +384,7 @@ struct cgroup { /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with - * ancestor_ids[] can determine whether a given cgroup is a + * ancestors[] can determine whether a given cgroup is a * descendant of another without traversing the hierarchy. */ int level; @@ -504,8 +504,8 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; - /* ids of the ancestors at each level including self */ - u64 ancestor_ids[]; + /* All ancestors including self */ + struct cgroup *ancestors[]; }; /* @@ -522,11 +522,15 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; - /* The root cgroup. Root is destroyed on its release. */ + /* + * The root cgroup. 
The containing cgroup_root will be destroyed on its + * release. cgrp->ancestors[0] will be used overflowing into the + * following field. cgrp_ancestor_storage must immediately follow. + */ struct cgroup cgrp; - /* for cgrp->ancestor_ids[0] */ - u64 cgrp_ancestor_id_storage; + /* must follow cgrp for cgrp->ancestors[0], see above */ + struct cgroup *cgrp_ancestor_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..4d143729b246 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -574,7 +574,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); + return cgrp->ancestors[ancestor->level] == ancestor; } /** @@ -591,11 +591,9 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { - if (cgrp->level < ancestor_level) + if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; - while (cgrp && cgrp->level > ancestor_level) - cgrp = cgroup_parent(cgrp); - return cgrp; + return cgrp->ancestors[ancestor_level]; } /** diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ffaccd6373f1..627ff0f07da7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2049,7 +2049,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); - root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); + root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -5400,8 +5400,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = 
kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), - GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -5453,7 +5452,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); + cgrp->ancestors[tcgrp->level] = tcgrp; if (tcgrp != cgrp) { tcgrp->nr_descendants++; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index a7de29137618..49a5348a6a14 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -40,16 +40,17 @@ static noinline bool nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level) { struct cgroup *cgrp; + u64 cgid; if (!sk_fullsock(sk)) return false; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - if (level > cgrp->level) + cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level); + if (!cgrp) return false; - memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64)); - + cgid = cgroup_id(cgrp); + memcpy(dest, &cgid, sizeof(u64)); return true; } #endif diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c index 292c430768b5..bd6a420acc8f 100644 --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -68,7 +68,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) break; // convert cgroup-id to a map index - cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]); + cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id); elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); if (!elem) continue; -- cgit v1.2.3 From 1f235777c3a4ab115162fe7d45b82be534b9ae2e Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sat, 13 Aug 2022 08:09:36 +0800 Subject: libbpf: Making bpf_prog_load() ignore name if kernel doesn't support Similar with commit 10b62d6a38f7 
("libbpf: Add names for auxiliary maps"), let's make bpf_prog_load() also ignore name if kernel doesn't support program name. To achieve this, we need to call sys_bpf_prog_load() directly in probe_kern_prog_name() to avoid a circular dependency. sys_bpf_prog_load() also needs to be exported in the libbpf_internal.h file. Signed-off-by: Hangbin Liu Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220813000936.6464-1-liuhangbin@gmail.com --- tools/lib/bpf/bpf.c | 6 ++---- tools/lib/bpf/libbpf.c | 13 +++++++++++-- tools/lib/bpf/libbpf_internal.h | 3 +++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 6a96e665dc5d..575867d69496 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -84,9 +84,7 @@ static inline int sys_bpf_fd(enum bpf_cmd cmd, union bpf_attr *attr, return ensure_good_fd(fd); } -#define PROG_LOAD_ATTEMPTS 5 - -static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) { int fd; @@ -263,7 +261,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0); attr.kern_version = OPTS_GET(opts, kern_version, 0); - if (prog_name) + if (prog_name && kernel_supports(NULL, FEAT_PROG_NAME)) libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); attr.license = ptr_to_u64(license); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3f01f5cd8a4c..aa05a99b913d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4415,14 +4415,23 @@ static int probe_fd(int fd) static int probe_kern_prog_name(void) { + const size_t attr_sz = offsetofend(union bpf_attr, prog_name); struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - int ret, insn_cnt = ARRAY_SIZE(insns); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); +
attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.license = ptr_to_u64("GPL"); + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)ARRAY_SIZE(insns); + libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name)); /* make sure loading with name works */ - ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "test", "GPL", insns, insn_cnt, NULL); + ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS); return probe_fd(ret); } diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 4135ae0a2bc3..377642ff51fc 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -573,4 +573,7 @@ static inline bool is_pow_of_2(size_t x) return x && (x & (x - 1)) == 0; } +#define PROG_LOAD_ATTEMPTS 5 +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts); + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ -- cgit v1.2.3 From 5f4d1fd5b5d3506759b5d9cf20bb5fb5b8bdcab1 Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Fri, 12 Aug 2022 11:07:13 -0700 Subject: selftests/sgx: Ignore OpenSSL 3.0 deprecated functions warning OpenSSL 3.0 deprecates some of the functions used in the SGX selftests, causing build errors on new distros. For now ignore the warnings until support for the functions is no longer available and mark FIXME so that it can be clear this should be removed at some point. Signed-off-by: Kristen Carlson Accardi Reviewed-by: Jarkko Sakkinen Signed-off-by: Shuah Khan --- tools/testing/selftests/sgx/sigstruct.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index 50c5ab1aa6fa..a07896a46364 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -17,6 +17,12 @@ #include "defines.h" #include "main.h" +/* + * FIXME: OpenSSL 3.0 has deprecated some functions. For now just ignore + * the warnings. 
+ */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + struct q1q2_ctx { BN_CTX *bn_ctx; BIGNUM *m; -- cgit v1.2.3 From 93d7c52a6eb93e58e4569bd4de95ba3b19e3cf20 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Sat, 6 Aug 2022 22:05:30 +0530 Subject: selftests/net: Refactor xfrm_fill_key() to use array of structs A TODO in net/ipsec.c asks to refactor the code in xfrm_fill_key() to use set/map to avoid manually comparing each algorithm with the "name" parameter passed to the function as an argument. This patch refactors the code to create an array of structs where each struct contains the algorithm name and its corresponding key length. Signed-off-by: Gautam Menghani Signed-off-by: Steffen Klassert --- tools/testing/selftests/net/ipsec.c | 104 ++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index cc10c10c5ed9..9a8229abfa02 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -58,6 +58,8 @@ #define VETH_FMT "ktst-%d" #define VETH_LEN 12 +#define XFRM_ALGO_NR_KEYS 29 + static int nsfd_parent = -1; static int nsfd_childa = -1; static int nsfd_childb = -1; @@ -75,6 +77,43 @@ const unsigned int ping_timeout = 300; const unsigned int ping_count = 100; const unsigned int ping_success = 80; +struct xfrm_key_entry { + char algo_name[35]; + int key_len; +}; + +struct xfrm_key_entry xfrm_key_entries[] = { + {"digest_null", 0}, + {"ecb(cipher_null)", 0}, + {"cbc(des)", 64}, + {"hmac(md5)", 128}, + {"cmac(aes)", 128}, + {"xcbc(aes)", 128}, + {"cbc(cast5)", 128}, + {"cbc(serpent)", 128}, + {"hmac(sha1)", 160}, + {"hmac(rmd160)", 160}, + {"cbc(des3_ede)", 192}, + {"hmac(sha256)", 256}, + {"cbc(aes)", 256}, + {"cbc(camellia)", 256}, + {"cbc(twofish)", 256}, + {"rfc3686(ctr(aes))", 288}, + {"hmac(sha384)", 384}, + {"cbc(blowfish)", 448}, + {"hmac(sha512)", 512}, + 
{"rfc4106(gcm(aes))-128", 160}, + {"rfc4543(gcm(aes))-128", 160}, + {"rfc4309(ccm(aes))-128", 152}, + {"rfc4106(gcm(aes))-192", 224}, + {"rfc4543(gcm(aes))-192", 224}, + {"rfc4309(ccm(aes))-192", 216}, + {"rfc4106(gcm(aes))-256", 288}, + {"rfc4543(gcm(aes))-256", 288}, + {"rfc4309(ccm(aes))-256", 280}, + {"rfc7539(chacha20,poly1305)-128", 0} +}; + static void randomize_buffer(void *buf, size_t buflen) { int *p = (int *)buf; @@ -767,65 +806,12 @@ static int do_ping(int cmd_fd, char *buf, size_t buf_len, struct in_addr from, static int xfrm_fill_key(char *name, char *buf, size_t buf_len, unsigned int *key_len) { - /* TODO: use set/map instead */ - if (strncmp(name, "digest_null", ALGO_LEN) == 0) - *key_len = 0; - else if (strncmp(name, "ecb(cipher_null)", ALGO_LEN) == 0) - *key_len = 0; - else if (strncmp(name, "cbc(des)", ALGO_LEN) == 0) - *key_len = 64; - else if (strncmp(name, "hmac(md5)", ALGO_LEN) == 0) - *key_len = 128; - else if (strncmp(name, "cmac(aes)", ALGO_LEN) == 0) - *key_len = 128; - else if (strncmp(name, "xcbc(aes)", ALGO_LEN) == 0) - *key_len = 128; - else if (strncmp(name, "cbc(cast5)", ALGO_LEN) == 0) - *key_len = 128; - else if (strncmp(name, "cbc(serpent)", ALGO_LEN) == 0) - *key_len = 128; - else if (strncmp(name, "hmac(sha1)", ALGO_LEN) == 0) - *key_len = 160; - else if (strncmp(name, "hmac(rmd160)", ALGO_LEN) == 0) - *key_len = 160; - else if (strncmp(name, "cbc(des3_ede)", ALGO_LEN) == 0) - *key_len = 192; - else if (strncmp(name, "hmac(sha256)", ALGO_LEN) == 0) - *key_len = 256; - else if (strncmp(name, "cbc(aes)", ALGO_LEN) == 0) - *key_len = 256; - else if (strncmp(name, "cbc(camellia)", ALGO_LEN) == 0) - *key_len = 256; - else if (strncmp(name, "cbc(twofish)", ALGO_LEN) == 0) - *key_len = 256; - else if (strncmp(name, "rfc3686(ctr(aes))", ALGO_LEN) == 0) - *key_len = 288; - else if (strncmp(name, "hmac(sha384)", ALGO_LEN) == 0) - *key_len = 384; - else if (strncmp(name, "cbc(blowfish)", ALGO_LEN) == 0) - *key_len = 448; - else if 
(strncmp(name, "hmac(sha512)", ALGO_LEN) == 0) - *key_len = 512; - else if (strncmp(name, "rfc4106(gcm(aes))-128", ALGO_LEN) == 0) - *key_len = 160; - else if (strncmp(name, "rfc4543(gcm(aes))-128", ALGO_LEN) == 0) - *key_len = 160; - else if (strncmp(name, "rfc4309(ccm(aes))-128", ALGO_LEN) == 0) - *key_len = 152; - else if (strncmp(name, "rfc4106(gcm(aes))-192", ALGO_LEN) == 0) - *key_len = 224; - else if (strncmp(name, "rfc4543(gcm(aes))-192", ALGO_LEN) == 0) - *key_len = 224; - else if (strncmp(name, "rfc4309(ccm(aes))-192", ALGO_LEN) == 0) - *key_len = 216; - else if (strncmp(name, "rfc4106(gcm(aes))-256", ALGO_LEN) == 0) - *key_len = 288; - else if (strncmp(name, "rfc4543(gcm(aes))-256", ALGO_LEN) == 0) - *key_len = 288; - else if (strncmp(name, "rfc4309(ccm(aes))-256", ALGO_LEN) == 0) - *key_len = 280; - else if (strncmp(name, "rfc7539(chacha20,poly1305)-128", ALGO_LEN) == 0) - *key_len = 0; + int i; + + for (i = 0; i < XFRM_ALGO_NR_KEYS; i++) { + if (strncmp(name, xfrm_key_entries[i].algo_name, ALGO_LEN) == 0) + *key_len = xfrm_key_entries[i].key_len; + } if (*key_len > buf_len) { printk("Can't pack a key - too big for buffer"); -- cgit v1.2.3 From 807662cac66af0dfca60ce1cf784063da6ec2f65 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Tue, 16 Aug 2022 07:52:31 +0200 Subject: selftests/bpf: Fix attach point for non-x86 arches in test_progs/lsm Use SYS_PREFIX macro from bpf_misc.h instead of hard-coded '__x64_' prefix for sys_setdomainname attach point in lsm test. 
Signed-off-by: Artem Savkov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220816055231.717006-1-asavkov@redhat.com --- tools/testing/selftests/bpf/DENYLIST.s390x | 2 +- tools/testing/selftests/bpf/progs/lsm.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index e33cab34d22f..9d8de15e725e 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -43,7 +43,7 @@ test_bpffs # bpffs test failed 255 test_bprm_opts # failed to auto-attach program 'secure_exec': -524 (trampoline) test_ima # failed to auto-attach program 'ima': -524 (trampoline) test_local_storage # failed to auto-attach program 'unlink_hook': -524 (trampoline) -test_lsm # failed to find kernel BTF type ID of '__x64_sys_setdomainname': -3 (?) +test_lsm # attach unexpected error: -524 (trampoline) test_overhead # attach_fentry unexpected error: -524 (trampoline) test_profiler # unknown func bpf_probe_read_str#45 (overlapping) timer # failed to auto-attach program 'test1': -524 (trampoline) diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c index 33694ef8acfa..d8d8af623bc2 100644 --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -4,6 +4,7 @@ * Copyright 2020 Google LLC. 
*/ +#include "bpf_misc.h" #include "vmlinux.h" #include #include @@ -160,7 +161,7 @@ int BPF_PROG(test_task_free, struct task_struct *task) int copy_test = 0; -SEC("fentry.s/__x64_sys_setdomainname") +SEC("fentry.s/" SYS_PREFIX "sys_setdomainname") int BPF_PROG(test_sys_setdomainname, struct pt_regs *regs) { void *ptr = (void *)PT_REGS_PARM1(regs); -- cgit v1.2.3 From d5810139cca39cf2854728b465f8bada4a445302 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 15 Aug 2022 11:20:27 -0500 Subject: selftests/bpf: Add tests verifying bpf lsm userns_create hook The LSM hook userns_create was introduced to provide LSM's an opportunity to block or allow unprivileged user namespace creation. This test serves two purposes: it provides a test eBPF implementation, and tests the hook successfully blocks or allows user namespace creation. This tests 3 cases: 1. Unattached bpf program does not block unpriv user namespace creation. 2. Attached bpf program allows user namespace creation given CAP_SYS_ADMIN privileges. 3. Attached bpf program denies user namespace creation for a user without CAP_SYS_ADMIN. 
Acked-by: KP Singh Signed-off-by: Frederick Lawler Signed-off-by: Paul Moore --- .../selftests/bpf/prog_tests/deny_namespace.c | 102 +++++++++++++++++++++ .../selftests/bpf/progs/test_deny_namespace.c | 33 +++++++ 2 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/deny_namespace.c create mode 100644 tools/testing/selftests/bpf/progs/test_deny_namespace.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/deny_namespace.c b/tools/testing/selftests/bpf/prog_tests/deny_namespace.c new file mode 100644 index 000000000000..1bc6241b755b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/deny_namespace.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include "test_deny_namespace.skel.h" +#include +#include "cap_helpers.h" +#include + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +/* negative return value -> some internal error + * positive return value -> userns creation failed + * 0 -> userns creation succeeded + */ +static int create_user_ns(void) +{ + pid_t pid; + + pid = fork(); + if (pid < 0) + return -1; + + if (pid == 0) { + if (unshare(CLONE_NEWUSER)) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + return wait_for_pid(pid); +} + +static void test_userns_create_bpf(void) +{ + __u32 cap_mask = 1ULL << CAP_SYS_ADMIN; + __u64 old_caps = 0; + + cap_enable_effective(cap_mask, &old_caps); + + ASSERT_OK(create_user_ns(), "priv new user ns"); + + cap_disable_effective(cap_mask, &old_caps); + + ASSERT_EQ(create_user_ns(), EPERM, "unpriv new user ns"); + + if (cap_mask & old_caps) + cap_enable_effective(cap_mask, NULL); +} + +static void test_unpriv_userns_create_no_bpf(void) +{ + __u32 cap_mask = 1ULL << CAP_SYS_ADMIN; + __u64 old_caps = 0; + + 
cap_disable_effective(cap_mask, &old_caps); + + ASSERT_OK(create_user_ns(), "no-bpf unpriv new user ns"); + + if (cap_mask & old_caps) + cap_enable_effective(cap_mask, NULL); +} + +void test_deny_namespace(void) +{ + struct test_deny_namespace *skel = NULL; + int err; + + if (test__start_subtest("unpriv_userns_create_no_bpf")) + test_unpriv_userns_create_no_bpf(); + + skel = test_deny_namespace__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel load")) + goto close_prog; + + err = test_deny_namespace__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto close_prog; + + if (test__start_subtest("userns_create_bpf")) + test_userns_create_bpf(); + + test_deny_namespace__detach(skel); + +close_prog: + test_deny_namespace__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_deny_namespace.c b/tools/testing/selftests/bpf/progs/test_deny_namespace.c new file mode 100644 index 000000000000..09ad5a4ebd1f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_deny_namespace.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +struct kernel_cap_struct { + __u32 cap[_LINUX_CAPABILITY_U32S_3]; +} __attribute__((preserve_access_index)); + +struct cred { + struct kernel_cap_struct cap_effective; +} __attribute__((preserve_access_index)); + +char _license[] SEC("license") = "GPL"; + +SEC("lsm.s/userns_create") +int BPF_PROG(test_userns_create, const struct cred *cred, int ret) +{ + struct kernel_cap_struct caps = cred->cap_effective; + int cap_index = CAP_TO_INDEX(CAP_SYS_ADMIN); + __u32 cap_mask = CAP_TO_MASK(CAP_SYS_ADMIN); + + if (ret) + return 0; + + ret = -EPERM; + if (caps.cap[cap_index] & cap_mask) + return 0; + + return -EPERM; +} -- cgit v1.2.3 From b71b7bfeac38c7a21c423ddafb29aa6258949df8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Aug 2022 14:15:21 +0200 Subject: testing: selftests: nft_flowtable.sh: use random netns names "ns1" is a too generic name, use a random suffix to 
avoid errors when such a netns exists. This also allows running multiple instances of the script in parallel. Signed-off-by: Florian Westphal --- tools/testing/selftests/netfilter/nft_flowtable.sh | 246 +++++++++++---------- 1 file changed, 128 insertions(+), 118 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index d4ffebb989f8..c336e6c148d1 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -14,6 +14,11 @@ # nft_flowtable.sh -o8000 -l1500 -r2000 # +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +nsr1="nsr1-$sfx" +nsr2="nsr2-$sfx" # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -36,18 +41,17 @@ checktool (){ checktool "nft --version" "run test without nft tool" checktool "ip -Version" "run test without ip tool" checktool "which nc" "run test without nc (netcat)" -checktool "ip netns add nsr1" "create net namespace" +checktool "ip netns add $nsr1" "create net namespace $nsr1" -ip netns add ns1 -ip netns add ns2 - -ip netns add nsr2 +ip netns add $ns1 +ip netns add $ns2 +ip netns add $nsr2 cleanup() { - for i in 1 2; do - ip netns del ns$i - ip netns del nsr$i - done + ip netns del $ns1 + ip netns del $ns2 + ip netns del $nsr1 + ip netns del $nsr2 rm -f "$ns1in" "$ns1out" rm -f "$ns2in" "$ns2out" @@ -59,22 +63,21 @@ trap cleanup EXIT sysctl -q net.netfilter.nf_log_all_netns=1 -ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1 -ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2 +ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1 +ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2 -ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2 +ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2 for dev in lo veth0 veth1; do - for i in 1 2; do - ip -net nsr$i link set $dev up - done + ip -net
$nsr1 link set $dev up + ip -net $nsr2 link set $dev up done -ip -net nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net nsr1 addr add dead:1::1/64 dev veth0 +ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net $nsr1 addr add dead:1::1/64 dev veth0 -ip -net nsr2 addr add 10.0.2.1/24 dev veth1 -ip -net nsr2 addr add dead:2::1/64 dev veth1 +ip -net $nsr2 addr add 10.0.2.1/24 dev veth1 +ip -net $nsr2 addr add dead:2::1/64 dev veth1 # set different MTUs so we need to push packets coming from ns1 (large MTU) # to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1), @@ -106,49 +109,56 @@ do esac done -if ! ip -net nsr1 link set veth0 mtu $omtu; then +if ! ip -net $nsr1 link set veth0 mtu $omtu; then exit 1 fi -ip -net ns1 link set eth0 mtu $omtu +ip -net $ns1 link set eth0 mtu $omtu -if ! ip -net nsr2 link set veth1 mtu $rmtu; then +if ! ip -net $nsr2 link set veth1 mtu $rmtu; then exit 1 fi -ip -net ns2 link set eth0 mtu $rmtu +ip -net $ns2 link set eth0 mtu $rmtu # transfer-net between nsr1 and nsr2. # these addresses are not used for connections. -ip -net nsr1 addr add 192.168.10.1/24 dev veth1 -ip -net nsr1 addr add fee1:2::1/64 dev veth1 - -ip -net nsr2 addr add 192.168.10.2/24 dev veth0 -ip -net nsr2 addr add fee1:2::2/64 dev veth0 - -for i in 1 2; do - ip netns exec nsr$i sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null - ip netns exec nsr$i sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null - - ip -net ns$i link set lo up - ip -net ns$i link set eth0 up - ip -net ns$i addr add 10.0.$i.99/24 dev eth0 - ip -net ns$i route add default via 10.0.$i.1 - ip -net ns$i addr add dead:$i::99/64 dev eth0 - ip -net ns$i route add default via dead:$i::1 - if ! 
ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then +ip -net $nsr1 addr add 192.168.10.1/24 dev veth1 +ip -net $nsr1 addr add fee1:2::1/64 dev veth1 + +ip -net $nsr2 addr add 192.168.10.2/24 dev veth0 +ip -net $nsr2 addr add fee1:2::2/64 dev veth0 + +for i in 0 1; do + ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null + ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null +done + +for ns in $ns1 $ns2;do + ip -net $ns link set lo up + ip -net $ns link set eth0 up + + if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then echo "ERROR: Check Originator/Responder values (problem during address addition)" exit 1 fi - # don't set ip DF bit for first two tests - ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null + ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null done -ip -net nsr1 route add default via 192.168.10.2 -ip -net nsr2 route add default via 192.168.10.1 +ip -net $ns1 addr add 10.0.1.99/24 dev eth0 +ip -net $ns2 addr add 10.0.2.99/24 dev eth0 +ip -net $ns1 route add default via 10.0.1.1 +ip -net $ns2 route add default via 10.0.2.1 +ip -net $ns1 addr add dead:1::99/64 dev eth0 +ip -net $ns2 addr add dead:2::99/64 dev eth0 +ip -net $ns1 route add default via dead:1::1 +ip -net $ns2 route add default via dead:2::1 + +ip -net $nsr1 route add default via 192.168.10.2 +ip -net $nsr2 route add default via 192.168.10.1 -ip netns exec nsr1 nft -f - < /dev/null; then - echo "ERROR: ns1 cannot reach ns2" 1>&2 +if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then + echo "ERROR: $ns1 cannot reach ns2" 1>&2 exit 1 fi -if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then - echo "ERROR: ns2 cannot reach ns1" 1>&2 +if ! 
ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then + echo "ERROR: $ns2 cannot reach $ns1" 1>&2 exit 1 fi if [ $ret -eq 0 ];then - echo "PASS: netns routing/connectivity: ns1 can reach ns2" + echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" fi ns1in=$(mktemp) @@ -312,24 +322,24 @@ make_file "$ns2in" # First test: # No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. -if test_tcp_forwarding ns1 ns2; then +if test_tcp_forwarding $ns1 $ns2; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 - ip netns exec nsr1 nft list ruleset + ip netns exec $nsr1 nft list ruleset ret=1 fi # delete default route, i.e. ns2 won't be able to reach ns1 and # will depend on ns1 being masqueraded in nsr1. # expect ns1 has nsr1 address. -ip -net ns2 route del default via 10.0.2.1 -ip -net ns2 route del default via dead:2::1 -ip -net ns2 route add 192.168.10.1 via 10.0.2.1 +ip -net $ns2 route del default via 10.0.2.1 +ip -net $ns2 route del default via dead:2::1 +ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 # Second test: # Same, but with NAT enabled. -ip netns exec nsr1 nft -f - <&2 - ip netns exec nsr1 nft list ruleset + ip netns exec $nsr1 nft list ruleset ret=1 fi # Third test: # Same as second test, but with PMTU discovery enabled. -handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2) +handle=$(ip netns exec $nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2) -if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then +if ! 
ip netns exec $nsr1 nft delete rule inet filter forward $handle; then echo "FAIL: Could not delete large-packet accept rule" exit 1 fi -ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null +ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -if test_tcp_forwarding_nat ns1 ns2; then +if test_tcp_forwarding_nat $ns1 $ns2; then echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery" else echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 - ip netns exec nsr1 nft list ruleset + ip netns exec $nsr1 nft list ruleset fi # Another test: # Add bridge interface br0 to Router1, with NAT enabled. -ip -net nsr1 link add name br0 type bridge -ip -net nsr1 addr flush dev veth0 -ip -net nsr1 link set up dev veth0 -ip -net nsr1 link set veth0 master br0 -ip -net nsr1 addr add 10.0.1.1/24 dev br0 -ip -net nsr1 addr add dead:1::1/64 dev br0 -ip -net nsr1 link set up dev br0 +ip -net $nsr1 link add name br0 type bridge +ip -net $nsr1 addr flush dev veth0 +ip -net $nsr1 link set up dev veth0 +ip -net $nsr1 link set veth0 master br0 +ip -net $nsr1 addr add 10.0.1.1/24 dev br0 +ip -net $nsr1 addr add dead:1::1/64 dev br0 +ip -net $nsr1 link set up dev br0 -ip netns exec nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null +ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null # br0 with NAT enabled. -ip netns exec nsr1 nft -f - <&2 - ip netns exec nsr1 nft list ruleset + ip netns exec $nsr1 nft list ruleset ret=1 fi # Another test: # Add bridge interface br0 to Router1, with NAT and VLAN. 
-ip -net nsr1 link set veth0 nomaster -ip -net nsr1 link set down dev veth0 -ip -net nsr1 link add link veth0 name veth0.10 type vlan id 10 -ip -net nsr1 link set up dev veth0 -ip -net nsr1 link set up dev veth0.10 -ip -net nsr1 link set veth0.10 master br0 - -ip -net ns1 addr flush dev eth0 -ip -net ns1 link add link eth0 name eth0.10 type vlan id 10 -ip -net ns1 link set eth0 up -ip -net ns1 link set eth0.10 up -ip -net ns1 addr add 10.0.1.99/24 dev eth0.10 -ip -net ns1 route add default via 10.0.1.1 -ip -net ns1 addr add dead:1::99/64 dev eth0.10 - -if test_tcp_forwarding_nat ns1 ns2; then +ip -net $nsr1 link set veth0 nomaster +ip -net $nsr1 link set down dev veth0 +ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10 +ip -net $nsr1 link set up dev veth0 +ip -net $nsr1 link set up dev veth0.10 +ip -net $nsr1 link set veth0.10 master br0 + +ip -net $ns1 addr flush dev eth0 +ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10 +ip -net $ns1 link set eth0 up +ip -net $ns1 link set eth0.10 up +ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 +ip -net $ns1 route add default via 10.0.1.1 +ip -net $ns1 addr add dead:1::99/64 dev eth0.10 + +if test_tcp_forwarding_nat $ns1 $ns2; then echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN" else echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 - ip netns exec nsr1 nft list ruleset + ip netns exec $nsr1 nft list ruleset ret=1 fi # restore test topology (remove bridge and VLAN) -ip -net nsr1 link set veth0 nomaster -ip -net nsr1 link set veth0 down -ip -net nsr1 link set veth0.10 down -ip -net nsr1 link delete veth0.10 type vlan -ip -net nsr1 link delete br0 type bridge -ip -net ns1 addr flush dev eth0.10 -ip -net ns1 link set eth0.10 down -ip -net ns1 link set eth0 down -ip -net ns1 link delete eth0.10 type vlan +ip -net $nsr1 link set veth0 nomaster +ip -net $nsr1 link set veth0 down +ip -net $nsr1 link set veth0.10 down +ip -net $nsr1 link delete veth0.10 type vlan +ip -net 
$nsr1 link delete br0 type bridge +ip -net $ns1 addr flush dev eth0.10 +ip -net $ns1 link set eth0.10 down +ip -net $ns1 link set eth0 down +ip -net $ns1 link delete eth0.10 type vlan # restore address in ns1 and nsr1 -ip -net ns1 link set eth0 up -ip -net ns1 addr add 10.0.1.99/24 dev eth0 -ip -net ns1 route add default via 10.0.1.1 -ip -net ns1 addr add dead:1::99/64 dev eth0 -ip -net ns1 route add default via dead:1::1 -ip -net nsr1 addr add 10.0.1.1/24 dev veth0 -ip -net nsr1 addr add dead:1::1/64 dev veth0 -ip -net nsr1 link set up dev veth0 +ip -net $ns1 link set eth0 up +ip -net $ns1 addr add 10.0.1.99/24 dev eth0 +ip -net $ns1 route add default via 10.0.1.1 +ip -net $ns1 addr add dead:1::99/64 dev eth0 +ip -net $ns1 route add default via dead:1::1 +ip -net $nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net $nsr1 addr add dead:1::1/64 dev veth0 +ip -net $nsr1 link set up dev veth0 KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1) KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1) @@ -480,23 +490,23 @@ do_esp() { } -do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 +do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 -do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 +do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 -ip netns exec nsr1 nft delete table ip nat +ip netns exec $nsr1 nft delete table ip nat # restore default routes -ip -net ns2 route del 192.168.10.1 via 10.0.2.1 -ip -net ns2 route add default via 10.0.2.1 -ip -net ns2 route add default via dead:2::1 +ip -net $ns2 route del 192.168.10.1 via 10.0.2.1 +ip -net $ns2 route add default via 10.0.2.1 +ip -net $ns2 route add default via dead:2::1 -if test_tcp_forwarding ns1 ns2; then +if test_tcp_forwarding $ns1 $ns2; then echo "PASS: ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" - ip netns exec nsr1 nft list ruleset 1>&2 - ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2 + ip netns 
exec $nsr1 nft list ruleset 1>&2 + ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2 fi exit $ret -- cgit v1.2.3 From c8550b9077d271b9b4fbe5a9a260eb021f371c4f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Aug 2022 14:15:22 +0200 Subject: testing: selftests: nft_flowtable.sh: rework test to detect offload failure This test fails on current kernel releases because the flowtable path now calls dst_check from packet path and will then remove the offload. Test script has two purposes: 1. check that file (random content) can be sent to other netns (and vv) 2. check that the flow is offloaded (rather than handled by classic forwarding path). Since dst_check is in place, 2) fails because the nftables ruleset in router namespace 1 intentionally blocks traffic under the assumption that packets are not passed via classic path at all. Rework this: Instead of blocking traffic, create two named counters, one for original and one for reverse direction. The first three test cases are handled by classic forwarding path (path mtu discovery is disabled and packets exceed MTU). But all other tests enable PMTUD, so the originator and responder are expected to lower packet size and flowtable is expected to do the packet forwarding. For those tests, check that the packet counters (which are only incremented for packets that are passed up to classic forward path) are significantly lower than the file size transferred. I've tested that the counter-checks fail as expected when the 'flow add' statement is removed from the ruleset. 
Signed-off-by: Florian Westphal --- tools/testing/selftests/netfilter/nft_flowtable.sh | 141 ++++++++++++--------- 1 file changed, 84 insertions(+), 57 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index c336e6c148d1..7060bae04ec8 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -24,8 +24,7 @@ nsr2="nsr2-$sfx" ksft_skip=4 ret=0 -ns1in="" -ns2in="" +nsin="" ns1out="" ns2out="" @@ -53,8 +52,7 @@ cleanup() { ip netns del $nsr1 ip netns del $nsr2 - rm -f "$ns1in" "$ns1out" - rm -f "$ns2in" "$ns2out" + rm -f "$nsin" "$ns1out" "$ns2out" [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns } @@ -165,36 +163,20 @@ table inet filter { devices = { veth0, veth1 } } + counter routed_orig { } + counter routed_repl { } + chain forward { type filter hook forward priority 0; policy drop; # flow offloaded? Tag ct with mark 1, so we can detect when it fails. - meta oif "veth1" tcp dport 12345 flow offload @f1 counter - - # use packet size to trigger 'should be offloaded by now'. - # otherwise, if 'flow offload' expression never offloads, the - # test will pass. - tcp dport 12345 meta length gt 200 ct mark set 1 counter - - # this turns off flow offloading internally, so expect packets again - tcp flags fin,rst ct mark set 0 accept - - # this allows large packets from responder, we need this as long - # as PMTUd is off. - # This rule is deleted for the last test, when we expect PMTUd - # to kick in and ensure all packets meet mtu requirements. - meta length gt $lmtu accept comment something-to-grep-for + meta oif "veth1" tcp dport 12345 ct mark set 1 flow add @f1 counter name routed_orig accept - # next line blocks connection w.o. working offload. - # we only do this for reverse dir, because we expect packets to - # enter slow path due to MTU mismatch of veth0 and veth1. 
- tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop + # count packets supposedly offloaded as per direction. + ct mark 1 counter name ct direction map { original : routed_orig, reply : routed_repl } accept ct state established,related accept - # for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed) - meta length lt 200 oif "veth1" tcp dport 12345 counter accept - meta nfproto ipv4 meta l4proto icmp accept meta nfproto ipv6 meta l4proto icmpv6 accept } @@ -221,16 +203,16 @@ if [ $ret -eq 0 ];then echo "PASS: netns routing/connectivity: $ns1 can reach $ns2" fi -ns1in=$(mktemp) +nsin=$(mktemp) ns1out=$(mktemp) -ns2in=$(mktemp) ns2out=$(mktemp) make_file() { name=$1 - SIZE=$((RANDOM % (1024 * 8))) + SIZE=$((RANDOM % (1024 * 128))) + SIZE=$((SIZE + (1024 * 8))) TSIZE=$((SIZE * 1024)) dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null @@ -241,6 +223,38 @@ make_file() dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null } +check_counters() +{ + local what=$1 + local ok=1 + + local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets) + local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets) + + local orig_cnt=${orig#*bytes} + local repl_cnt=${repl#*bytes} + + local fs=$(du -sb $nsin) + local max_orig=${fs%%/*} + local max_repl=$((max_orig/4)) + + if [ $orig_cnt -gt $max_orig ];then + echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2 + ret=1 + ok=0 + fi + + if [ $repl_cnt -gt $max_repl ];then + echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2 + ret=1 + ok=0 + fi + + if [ $ok -eq 1 ]; then + echo "PASS: $what" + fi +} + check_transfer() { in=$1 @@ -265,11 +279,11 @@ test_tcp_forwarding_ip() local dstport=$4 local lret=0 - ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" & + ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" & lpid=$! 
sleep 1 - ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$ns1in" > "$ns1out" & + ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" & cpid=$! sleep 3 @@ -284,11 +298,11 @@ test_tcp_forwarding_ip() wait - if ! check_transfer "$ns1in" "$ns2out" "ns1 -> ns2"; then + if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then lret=1 fi - if ! check_transfer "$ns2in" "$ns1out" "ns1 <- ns2"; then + if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then lret=1 fi @@ -305,23 +319,40 @@ test_tcp_forwarding() test_tcp_forwarding_nat() { local lret + local pmtu test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 lret=$? + pmtu=$3 + what=$4 + if [ $lret -eq 0 ] ; then + if [ $pmtu -eq 1 ] ;then + check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what" + else + echo "PASS: flow offload for ns1/ns2 with masquerade $what" + fi + test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 lret=$? + if [ $pmtu -eq 1 ] ;then + check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what" + elif [ $lret -eq 0 ] ; then + echo "PASS: flow offload for ns1/ns2 with dnat $what" + fi fi return $lret } -make_file "$ns1in" -make_file "$ns2in" +make_file "$nsin" # First test: # No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. +# Due to MTU mismatch in both directions, all packets (except small packets like pure +# acks) have to be handled by normal forwarding path. Therefore, packet counters +# are not checked. if test_tcp_forwarding $ns1 $ns2; then echo "PASS: flow offloaded for ns1/ns2" else @@ -338,7 +369,8 @@ ip -net $ns2 route del default via dead:2::1 ip -net $ns2 route add 192.168.10.1 via 10.0.2.1 # Second test: -# Same, but with NAT enabled. +# Same, but with NAT enabled. Same as in first test: we expect normal forward path +# to handle most packets. ip netns exec $nsr1 nft -f - <&2 ip netns exec $nsr1 nft list ruleset ret=1 fi # Third test: -# Same as second test, but with PMTU discovery enabled. 
-handle=$(ip netns exec $nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2) - -if ! ip netns exec $nsr1 nft delete rule inet filter forward $handle; then - echo "FAIL: Could not delete large-packet accept rule" - exit 1 -fi - +# Same as second test, but with PMTU discovery enabled. This +# means that we expect the fastpath to handle packets as soon +# as the endpoints adjust the packet size. ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null -if test_tcp_forwarding_nat $ns1 $ns2; then - echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery" -else +# reset counters. +# With pmtu in-place we'll also check that nft counters +# are lower than file size and packets were forwarded via flowtable layer. +# For earlier tests (large mtus), packets cannot be handled via flowtable +# (except pure acks and other small packets). +ip netns exec $nsr1 nft reset counters table inet filter >/dev/null + +if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 ip netns exec $nsr1 nft list ruleset fi @@ -408,14 +438,13 @@ table ip nat { } EOF -if test_tcp_forwarding_nat $ns1 $ns2; then - echo "PASS: flow offloaded for ns1/ns2 with bridge NAT" -else +if ! test_tcp_forwarding_nat $ns1 $ns2 1 "on bridge"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2 ip netns exec $nsr1 nft list ruleset ret=1 fi + # Another test: # Add bridge interface br0 to Router1, with NAT and VLAN. ip -net $nsr1 link set veth0 nomaster @@ -433,9 +462,7 @@ ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10 ip -net $ns1 route add default via 10.0.1.1 ip -net $ns1 addr add dead:1::99/64 dev eth0.10 -if test_tcp_forwarding_nat $ns1 $ns2; then - echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN" -else +if ! 
test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 ip netns exec $nsr1 nft list ruleset ret=1 @@ -502,7 +529,7 @@ ip -net $ns2 route add default via 10.0.2.1 ip -net $ns2 route add default via dead:2::1 if test_tcp_forwarding $ns1 $ns2; then - echo "PASS: ipsec tunnel mode for ns1/ns2" + check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" ip netns exec $nsr1 nft list ruleset 1>&2 -- cgit v1.2.3 From 43cb8cbadffa21e88a65dd1129c86f5552d6c42e Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 16 Aug 2022 16:40:11 -0700 Subject: libbpf: Allows disabling auto attach Adds libbpf APIs for disabling auto-attach for individual functions. This is motivated by the use case of cgroup iter [1]. Some iter types require their parameters to be non-zero, therefore applying auto-attach on them will fail. With these two new APIs, users who want to use auto-attach and these types of iters can disable auto-attach on the program and perform manual attach. 
[1] https://lore.kernel.org/bpf/CAEf4BzZ+a2uDo_t6kGBziqdz--m2gh2_EUwkGLDtMd65uwxUjA@mail.gmail.com/ Signed-off-by: Hao Luo Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220816234012.910255-1-haoluo@google.com --- tools/lib/bpf/libbpf.c | 15 ++++++++++++++- tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 2 ++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index aa05a99b913d..0159a43c7efd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -417,6 +417,7 @@ struct bpf_program { int fd; bool autoload; + bool autoattach; bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; @@ -755,6 +756,8 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->autoload = true; } + prog->autoattach = true; + /* inherit object's log_level */ prog->log_level = obj->log_level; @@ -8314,6 +8317,16 @@ int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) return 0; } +bool bpf_program__autoattach(const struct bpf_program *prog) +{ + return prog->autoattach; +} + +void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach) +{ + prog->autoattach = autoattach; +} + const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) { return prog->insns; @@ -12346,7 +12359,7 @@ int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) struct bpf_program *prog = *s->progs[i].prog; struct bpf_link **link = s->progs[i].link; - if (!prog->autoload) + if (!prog->autoload || !prog->autoattach) continue; /* auto-attaching not supported for this program */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 61493c4cddac..88a1ac34b12a 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -260,6 +260,8 @@ LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); LIBBPF_API const char *bpf_program__section_name(const struct 
bpf_program *prog); LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); +LIBBPF_API bool bpf_program__autoattach(const struct bpf_program *prog); +LIBBPF_API void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach); struct bpf_insn; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 119e6e1ea7f1..2b928dc21af0 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -358,6 +358,8 @@ LIBBPF_1.0.0 { bpf_obj_get_opts; bpf_prog_query_opts; bpf_program__attach_ksyscall; + bpf_program__autoattach; + bpf_program__set_autoattach; btf__add_enum64; btf__add_enum64_value; libbpf_bpf_attach_type_str; -- cgit v1.2.3 From 738a2f2f9130f98f92ccb3efd94d4879c0a0990c Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 16 Aug 2022 16:40:12 -0700 Subject: selftests/bpf: Tests libbpf autoattach APIs Adds test for libbpf APIs that toggle bpf program auto-attaching. 
Signed-off-by: Hao Luo Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220816234012.910255-2-haoluo@google.com --- .../testing/selftests/bpf/prog_tests/autoattach.c | 30 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/test_autoattach.c | 23 +++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/autoattach.c create mode 100644 tools/testing/selftests/bpf/progs/test_autoattach.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/autoattach.c b/tools/testing/selftests/bpf/prog_tests/autoattach.c new file mode 100644 index 000000000000..dc5e01d279bd --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/autoattach.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include +#include "test_autoattach.skel.h" + +void test_autoattach(void) +{ + struct test_autoattach *skel; + + skel = test_autoattach__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + goto cleanup; + + /* disable auto-attach for prog2 */ + bpf_program__set_autoattach(skel->progs.prog2, false); + ASSERT_TRUE(bpf_program__autoattach(skel->progs.prog1), "autoattach_prog1"); + ASSERT_FALSE(bpf_program__autoattach(skel->progs.prog2), "autoattach_prog2"); + if (!ASSERT_OK(test_autoattach__attach(skel), "skel_attach")) + goto cleanup; + + usleep(1); + + ASSERT_TRUE(skel->bss->prog1_called, "attached_prog1"); + ASSERT_FALSE(skel->bss->prog2_called, "attached_prog2"); + +cleanup: + test_autoattach__destroy(skel); +} + diff --git a/tools/testing/selftests/bpf/progs/test_autoattach.c b/tools/testing/selftests/bpf/progs/test_autoattach.c new file mode 100644 index 000000000000..11a44493ebce --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_autoattach.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include "vmlinux.h" +#include + +bool prog1_called = false; +bool prog2_called = false; + 
+SEC("raw_tp/sys_enter") +int prog1(const void *ctx) +{ + prog1_called = true; + return 0; +} + +SEC("raw_tp/sys_exit") +int prog2(const void *ctx) +{ + prog2_called = true; + return 0; +} + -- cgit v1.2.3 From d4e6d684f3bea46a2fc195765c77a3b26bcb080e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Aug 2022 17:19:26 -0700 Subject: libbpf: Fix potential NULL dereference when parsing ELF Fix if condition filtering empty ELF sections to prevent NULL dereference. Fixes: 47ea7417b074 ("libbpf: Skip empty sections in bpf_object__init_global_data_maps") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20220816001929.369487-2-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0159a43c7efd..146d35526b87 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1649,7 +1649,7 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) sec_desc = &obj->efile.secs[sec_idx]; /* Skip recognized sections with size 0. */ - if (sec_desc->data && sec_desc->data->d_size == 0) + if (!sec_desc->data || sec_desc->data->d_size == 0) continue; switch (sec_desc->sec_type) { -- cgit v1.2.3 From 813847a31447feba6119df4ee77a7c0c7a77fc72 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Aug 2022 17:19:27 -0700 Subject: libbpf: Streamline bpf_attr and perf_event_attr initialization Make sure that entire libbpf code base is initializing bpf_attr and perf_event_attr with memset(0). Also for bpf_attr make sure we clear and pass to kernel only relevant parts of bpf_attr. bpf_attr is a huge union of independent sub-command attributes, so there is no need to clear and pass entire union bpf_attr, which over time grows quite a lot and for most commands this growth is completely irrelevant. 
Few cases where we were relying on compiler initialization of BPF UAPI structs (like bpf_prog_info, bpf_map_info, etc) with `= {};` were switched to memset(0) pattern for future-proofing. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20220816001929.369487-3-andrii@kernel.org --- tools/lib/bpf/bpf.c | 173 +++++++++++++++++++++++++----------------- tools/lib/bpf/libbpf.c | 43 +++++++---- tools/lib/bpf/netlink.c | 3 +- tools/lib/bpf/skel_internal.h | 10 ++- 4 files changed, 138 insertions(+), 91 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 575867d69496..e3a0bd7efa2f 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -105,7 +105,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) */ int probe_memcg_account(void) { - const size_t prog_load_attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); struct bpf_insn insns[] = { BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), BPF_EXIT_INSN(), @@ -115,13 +115,13 @@ int probe_memcg_account(void) int prog_fd; /* attempt loading freplace trying to use custom BTF */ - memset(&attr, 0, prog_load_attr_sz); + memset(&attr, 0, attr_sz); attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; attr.insns = ptr_to_u64(insns); attr.insn_cnt = insn_cnt; attr.license = ptr_to_u64("GPL"); - prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, prog_load_attr_sz); + prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz); if (prog_fd >= 0) { close(prog_fd); return 1; @@ -232,6 +232,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, const struct bpf_prog_load_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, fd_array); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; @@ -251,7 +252,7 @@ int 
bpf_prog_load(enum bpf_prog_type prog_type, if (attempts == 0) attempts = PROG_LOAD_ATTEMPTS; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.prog_type = prog_type; attr.expected_attach_type = OPTS_GET(opts, expected_attach_type, 0); @@ -314,7 +315,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, attr.log_level = log_level; } - fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); if (fd >= 0) return fd; @@ -354,7 +355,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, break; } - fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); if (fd >= 0) goto done; } @@ -368,7 +369,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, attr.log_size = log_size; attr.log_level = 1; - fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); } done: /* free() doesn't affect errno, so we don't need to restore it */ @@ -380,127 +381,136 @@ done: int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); attr.flags = flags; - ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_elem(int fd, const void *key, void *value) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); - ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_elem_flags(int fd, const void *key, 
void *value, __u64 flags) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); attr.flags = flags; - ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); - ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); attr.flags = flags; - ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_delete_elem(int fd, const void *key) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); - ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, 
sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.flags = flags; - ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_get_next_key(int fd, const void *key, void *next_key) { + const size_t attr_sz = offsetofend(union bpf_attr, next_key); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.next_key = ptr_to_u64(next_key); - ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_freeze(int fd) { + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; - ret = sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); return libbpf_err_errno(ret); } @@ -509,13 +519,14 @@ static int bpf_map_batch_common(int cmd, int fd, void *in_batch, __u32 *count, const struct bpf_map_batch_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, batch); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_map_batch_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.batch.map_fd = fd; attr.batch.in_batch = ptr_to_u64(in_batch); attr.batch.out_batch = ptr_to_u64(out_batch); @@ -525,7 +536,7 @@ static int bpf_map_batch_common(int cmd, int fd, void *in_batch, attr.batch.elem_flags = OPTS_GET(opts, elem_flags, 0); attr.batch.flags = OPTS_GET(opts, flags, 0); - ret = sys_bpf(cmd, &attr, sizeof(attr)); + ret = sys_bpf(cmd, &attr, attr_sz); *count = attr.batch.count; return libbpf_err_errno(ret); @@ -564,14 +575,15 @@ int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *co int bpf_obj_pin(int fd, const char 
*pathname) { + const size_t attr_sz = offsetofend(union bpf_attr, file_flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.pathname = ptr_to_u64((void *)pathname); attr.bpf_fd = fd; - ret = sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); + ret = sys_bpf(BPF_OBJ_PIN, &attr, attr_sz); return libbpf_err_errno(ret); } @@ -582,17 +594,18 @@ int bpf_obj_get(const char *pathname) int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, file_flags); union bpf_attr attr; int fd; if (!OPTS_VALID(opts, bpf_obj_get_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.pathname = ptr_to_u64((void *)pathname); attr.file_flags = OPTS_GET(opts, file_flags, 0); - fd = sys_bpf_fd(BPF_OBJ_GET, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_OBJ_GET, &attr, attr_sz); return libbpf_err_errno(fd); } @@ -610,20 +623,21 @@ int bpf_prog_attach_opts(int prog_fd, int target_fd, enum bpf_attach_type type, const struct bpf_prog_attach_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_prog_attach_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; attr.attach_flags = OPTS_GET(opts, flags, 0); attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); - ret = sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_ATTACH, &attr, attr_sz); return libbpf_err_errno(ret); } @@ -634,28 +648,30 @@ int bpf_prog_attach_xattr(int prog_fd, int target_fd, int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.target_fd = target_fd; 
attr.attach_type = type; - ret = sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) { + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; - ret = sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); return libbpf_err_errno(ret); } @@ -663,6 +679,7 @@ int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, const struct bpf_link_create_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, link_create); __u32 target_btf_id, iter_info_len; union bpf_attr attr; int fd, err; @@ -681,7 +698,7 @@ int bpf_link_create(int prog_fd, int target_fd, return libbpf_err(-EINVAL); } - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.link_create.prog_fd = prog_fd; attr.link_create.target_fd = target_fd; attr.link_create.attach_type = attach_type; @@ -725,7 +742,7 @@ int bpf_link_create(int prog_fd, int target_fd, break; } proceed: - fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, attr_sz); if (fd >= 0) return fd; /* we'll get EINVAL if LINK_CREATE doesn't support attaching fentry @@ -761,44 +778,47 @@ proceed: int bpf_link_detach(int link_fd) { + const size_t attr_sz = offsetofend(union bpf_attr, link_detach); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.link_detach.link_fd = link_fd; - ret = sys_bpf(BPF_LINK_DETACH, &attr, sizeof(attr)); + ret = sys_bpf(BPF_LINK_DETACH, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_link_update(int link_fd, int new_prog_fd, const struct bpf_link_update_opts *opts) { + const size_t attr_sz = 
offsetofend(union bpf_attr, link_update); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_link_update_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.link_update.link_fd = link_fd; attr.link_update.new_prog_fd = new_prog_fd; attr.link_update.flags = OPTS_GET(opts, flags, 0); attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); - ret = sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr)); + ret = sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_iter_create(int link_fd) { + const size_t attr_sz = offsetofend(union bpf_attr, iter_create); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.iter_create.link_fd = link_fd; - fd = sys_bpf_fd(BPF_ITER_CREATE, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_ITER_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); } @@ -806,13 +826,14 @@ int bpf_prog_query_opts(int target_fd, enum bpf_attach_type type, struct bpf_prog_query_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, query); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_prog_query_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.query.target_fd = target_fd; attr.query.attach_type = type; @@ -821,7 +842,7 @@ int bpf_prog_query_opts(int target_fd, attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); - ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_QUERY, &attr, attr_sz); OPTS_SET(opts, attach_flags, attr.query.attach_flags); OPTS_SET(opts, prog_cnt, attr.query.prog_cnt); @@ -850,13 +871,14 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, test); union bpf_attr attr; int 
ret; if (!OPTS_VALID(opts, bpf_test_run_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.test.prog_fd = prog_fd; attr.test.batch_size = OPTS_GET(opts, batch_size, 0); attr.test.cpu = OPTS_GET(opts, cpu, 0); @@ -872,7 +894,7 @@ int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) attr.test.data_in = ptr_to_u64(OPTS_GET(opts, data_in, NULL)); attr.test.data_out = ptr_to_u64(OPTS_GET(opts, data_out, NULL)); - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, attr_sz); OPTS_SET(opts, data_size_out, attr.test.data_size_out); OPTS_SET(opts, ctx_size_out, attr.test.ctx_size_out); @@ -884,13 +906,14 @@ int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int err; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.start_id = start_id; - err = sys_bpf(cmd, &attr, sizeof(attr)); + err = sys_bpf(cmd, &attr, attr_sz); if (!err) *next_id = attr.next_id; @@ -919,80 +942,84 @@ int bpf_link_get_next_id(__u32 start_id, __u32 *next_id) int bpf_prog_get_fd_by_id(__u32 id) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.prog_id = id; - fd = sys_bpf_fd(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_PROG_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } int bpf_map_get_fd_by_id(__u32 id) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_id = id; - fd = sys_bpf_fd(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } int 
bpf_btf_get_fd_by_id(__u32 id) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.btf_id = id; - fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } int bpf_link_get_fd_by_id(__u32 id) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.link_id = id; - fd = sys_bpf_fd(BPF_LINK_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_LINK_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) { + const size_t attr_sz = offsetofend(union bpf_attr, info); union bpf_attr attr; int err; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.info.bpf_fd = bpf_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); - err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); - + err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); if (!err) *info_len = attr.info.info_len; - return libbpf_err_errno(err); } int bpf_raw_tracepoint_open(const char *name, int prog_fd) { + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.raw_tracepoint.name = ptr_to_u64(name); attr.raw_tracepoint.prog_fd = prog_fd; - fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); return libbpf_err_errno(fd); } @@ -1048,16 +1075,18 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr) { - union bpf_attr attr = {}; + const size_t attr_sz = offsetofend(union bpf_attr, task_fd_query); + union bpf_attr attr; 
int err; + memset(&attr, 0, attr_sz); attr.task_fd_query.pid = pid; attr.task_fd_query.fd = fd; attr.task_fd_query.flags = flags; attr.task_fd_query.buf = ptr_to_u64(buf); attr.task_fd_query.buf_len = *buf_len; - err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr)); + err = sys_bpf(BPF_TASK_FD_QUERY, &attr, attr_sz); *buf_len = attr.task_fd_query.buf_len; *prog_id = attr.task_fd_query.prog_id; @@ -1070,30 +1099,32 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, int bpf_enable_stats(enum bpf_stats_type type) { + const size_t attr_sz = offsetofend(union bpf_attr, enable_stats); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.enable_stats.type = type; - fd = sys_bpf_fd(BPF_ENABLE_STATS, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_ENABLE_STATS, &attr, attr_sz); return libbpf_err_errno(fd); } int bpf_prog_bind_map(int prog_fd, int map_fd, const struct bpf_prog_bind_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, prog_bind_map); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_prog_bind_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.prog_bind_map.prog_fd = prog_fd; attr.prog_bind_map.map_fd = map_fd; attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0); - ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, attr_sz); return libbpf_err_errno(ret); } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 146d35526b87..21fc3fc7f44c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4287,11 +4287,12 @@ int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) int bpf_map__reuse_fd(struct bpf_map *map, int fd) { - struct bpf_map_info info = {}; + struct bpf_map_info info; __u32 len = sizeof(info), name_len; int new_fd, err; char *new_name; + memset(&info, 0, len); err = bpf_obj_get_info_by_fd(fd, &info, &len); if (err && errno == 
EINVAL) err = bpf_get_map_info_from_fdinfo(fd, &info); @@ -4833,13 +4834,12 @@ bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) { - struct bpf_map_info map_info = {}; + struct bpf_map_info map_info; char msg[STRERR_BUFSIZE]; - __u32 map_info_len; + __u32 map_info_len = sizeof(map_info); int err; - map_info_len = sizeof(map_info); - + memset(&map_info, 0, map_info_len); err = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); if (err && errno == EINVAL) err = bpf_get_map_info_from_fdinfo(map_fd, &map_info); @@ -9007,11 +9007,12 @@ int libbpf_find_vmlinux_btf_id(const char *name, static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) { - struct bpf_prog_info info = {}; + struct bpf_prog_info info; __u32 info_len = sizeof(info); struct btf *btf; int err; + memset(&info, 0, info_len); err = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_len); if (err) { pr_warn("failed bpf_obj_get_info_by_fd for FD %d: %d\n", @@ -9839,13 +9840,16 @@ static int determine_uprobe_retprobe_bit(void) static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, uint64_t offset, int pid, size_t ref_ctr_off) { - struct perf_event_attr attr = {}; + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; char errmsg[STRERR_BUFSIZE]; int type, pfd; if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) return -EINVAL; + memset(&attr, 0, attr_sz); + type = uprobe ? 
determine_uprobe_perf_type() : determine_kprobe_perf_type(); if (type < 0) { @@ -9866,7 +9870,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, } attr.config |= 1 << bit; } - attr.size = sizeof(attr); + attr.size = attr_sz; attr.type = type; attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT; attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */ @@ -9965,7 +9969,8 @@ static int determine_kprobe_perf_type_legacy(const char *probe_name, bool retpro static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, const char *kfunc_name, size_t offset, int pid) { - struct perf_event_attr attr = {}; + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; char errmsg[STRERR_BUFSIZE]; int type, pfd, err; @@ -9984,7 +9989,9 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); goto err_clean_legacy; } - attr.size = sizeof(attr); + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; attr.config = type; attr.type = PERF_TYPE_TRACEPOINT; @@ -10441,6 +10448,7 @@ static int determine_uprobe_perf_type_legacy(const char *probe_name, bool retpro static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, const char *binary_path, size_t offset, int pid) { + const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; int type, pfd, err; @@ -10458,8 +10466,8 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, goto err_clean_legacy; } - memset(&attr, 0, sizeof(attr)); - attr.size = sizeof(attr); + memset(&attr, 0, attr_sz); + attr.size = attr_sz; attr.config = type; attr.type = PERF_TYPE_TRACEPOINT; @@ -10998,7 +11006,8 @@ static int determine_tracepoint_id(const char *tp_category, static int perf_event_open_tracepoint(const char *tp_category, const char *tp_name) { - struct perf_event_attr attr = {}; + const size_t attr_sz 
= sizeof(struct perf_event_attr); + struct perf_event_attr attr; char errmsg[STRERR_BUFSIZE]; int tp_id, pfd, err; @@ -11010,8 +11019,9 @@ static int perf_event_open_tracepoint(const char *tp_category, return tp_id; } + memset(&attr, 0, attr_sz); attr.type = PERF_TYPE_TRACEPOINT; - attr.size = sizeof(attr); + attr.size = attr_sz; attr.config = tp_id; pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */, @@ -11631,12 +11641,15 @@ struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, void *ctx, const struct perf_buffer_opts *opts) { + const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_buffer_params p = {}; - struct perf_event_attr attr = {}; + struct perf_event_attr attr; if (!OPTS_VALID(opts, perf_buffer_opts)) return libbpf_err_ptr(-EINVAL); + memset(&attr, 0, attr_sz); + attr.size = attr_sz; attr.config = PERF_COUNT_SW_BPF_OUTPUT; attr.type = PERF_TYPE_SOFTWARE; attr.sample_type = PERF_SAMPLE_RAW; diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 6c013168032d..35104580870c 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -587,11 +587,12 @@ static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) { - struct bpf_prog_info info = {}; + struct bpf_prog_info info; __u32 info_len = sizeof(info); char name[256]; int len, ret; + memset(&info, 0, info_len); ret = bpf_obj_get_info_by_fd(fd, &info, &info_len); if (ret < 0) return ret; diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index bd6f4505e7b1..365d769e0357 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -285,6 +285,8 @@ static inline int skel_link_create(int prog_fd, int target_fd, static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) { + const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); int 
map_fd = -1, prog_fd = -1, key = 0, err; union bpf_attr attr; @@ -302,7 +304,7 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) goto out; } - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, prog_load_attr_sz); attr.prog_type = BPF_PROG_TYPE_SYSCALL; attr.insns = (long) opts->insns; attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); @@ -313,18 +315,18 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) attr.log_size = opts->ctx->log_size; attr.log_buf = opts->ctx->log_buf; attr.prog_flags = BPF_F_SLEEPABLE; - err = prog_fd = skel_sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + err = prog_fd = skel_sys_bpf(BPF_PROG_LOAD, &attr, prog_load_attr_sz); if (prog_fd < 0) { opts->errstr = "failed to load loader prog"; set_err; goto out; } - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, test_run_attr_sz); attr.test.prog_fd = prog_fd; attr.test.ctx_in = (long) opts->ctx; attr.test.ctx_size_in = opts->ctx->sz; - err = skel_sys_bpf(BPF_PROG_RUN, &attr, sizeof(attr)); + err = skel_sys_bpf(BPF_PROG_RUN, &attr, test_run_attr_sz); if (err < 0 || (int)attr.test.retval < 0) { opts->errstr = "failed to execute loader prog"; if (err < 0) { -- cgit v1.2.3 From abf84b64e36b175c9c4dd4ecbad2af4329c00041 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Aug 2022 17:19:28 -0700 Subject: libbpf: Clean up deprecated and legacy aliases Remove three missed deprecated APIs that were aliased to new APIs: bpf_object__unload, bpf_prog_attach_xattr and btf__load. Also move legacy API libbpf_find_kernel_btf (aliased to btf__load_vmlinux_btf) into libbpf_legacy.h. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20220816001929.369487-4-andrii@kernel.org --- tools/lib/bpf/bpf.c | 5 ----- tools/lib/bpf/btf.c | 2 -- tools/lib/bpf/btf.h | 1 - tools/lib/bpf/libbpf.c | 2 -- tools/lib/bpf/libbpf_legacy.h | 2 ++ 5 files changed, 2 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index e3a0bd7efa2f..1d49a0352836 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -641,11 +641,6 @@ int bpf_prog_attach_opts(int prog_fd, int target_fd, return libbpf_err_errno(ret); } -__attribute__((alias("bpf_prog_attach_opts"))) -int bpf_prog_attach_xattr(int prog_fd, int target_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts); - int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 2d14f1a52d7a..361131518d63 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1225,8 +1225,6 @@ int btf__load_into_kernel(struct btf *btf) return btf_load_into_kernel(btf, NULL, 0, 0); } -int btf__load(struct btf *) __attribute__((alias("btf__load_into_kernel"))); - int btf__fd(const struct btf *btf) { return btf->fd; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 583760df83b4..ae543144ee30 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -116,7 +116,6 @@ LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_b LIBBPF_API struct btf *btf__load_vmlinux_btf(void); LIBBPF_API struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf); -LIBBPF_API struct btf *libbpf_find_kernel_btf(void); LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c 
index 21fc3fc7f44c..3ad139285fad 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7263,8 +7263,6 @@ static int bpf_object_unload(struct bpf_object *obj) return 0; } -int bpf_object__unload(struct bpf_object *obj) __attribute__((alias("bpf_object_unload"))); - static int bpf_object__sanitize_maps(struct bpf_object *obj) { struct bpf_map *m; diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h index 5b7e0155db6a..1e1be467bede 100644 --- a/tools/lib/bpf/libbpf_legacy.h +++ b/tools/lib/bpf/libbpf_legacy.h @@ -125,6 +125,8 @@ struct bpf_map; struct btf; struct btf_ext; +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); LIBBPF_API enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map); -- cgit v1.2.3 From df78da27260c915039b348b164bbc53fa372ba70 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Aug 2022 17:19:29 -0700 Subject: selftests/bpf: Few fixes for selftests/bpf built in release mode Fix few issues found when building and running test_progs in release mode. First, potentially uninitialized idx variable in xskxceiver, force-initialize to zero to satisfy compiler. Few instances of defining uprobe trigger functions break in release mode unless marked as noinline, due to being static. Add noinline to make sure everything works. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20220816001929.369487-5-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/attach_probe.c | 6 +++--- tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 2 +- tools/testing/selftests/bpf/prog_tests/task_pt_regs.c | 2 +- tools/testing/selftests/bpf/xskxceiver.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 0b899d2d8ea7..9566d9d2f6ee 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -6,19 +6,19 @@ volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes"))); /* uprobe attach point */ -static void trigger_func(void) +static noinline void trigger_func(void) { asm volatile (""); } /* attach point for byname uprobe */ -static void trigger_func2(void) +static noinline void trigger_func2(void) { asm volatile (""); } /* attach point for byname sleepable uprobe */ -static void trigger_func3(void) +static noinline void trigger_func3(void) { asm volatile (""); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 2974b44f80fa..2be2d61954bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -13,7 +13,7 @@ #include "kprobe_multi.skel.h" /* uprobe attach point */ -static void trigger_func(void) +static noinline void trigger_func(void) { asm volatile (""); } diff --git a/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c b/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c index 61935e7e056a..f000734a3d1f 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c +++ b/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c @@ 
-4,7 +4,7 @@ #include "test_task_pt_regs.skel.h" /* uprobe attach point */ -static void trigger_func(void) +static noinline void trigger_func(void) { asm volatile (""); } diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 20b44ab32a06..14b4737b223c 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -922,7 +922,7 @@ static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb, struct pollfd *fd { struct xsk_socket_info *xsk = ifobject->xsk; bool use_poll = ifobject->use_poll; - u32 i, idx, ret, valid_pkts = 0; + u32 i, idx = 0, ret, valid_pkts = 0; while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) { if (use_poll) { -- cgit v1.2.3 From 0db7058e8e23e6bbab1b4747ecabd1784c34f50b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 24 May 2022 11:01:18 +0200 Subject: x86/clear_user: Make it faster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on a patch by Mark Hemment and incorporating very sane suggestions from Linus. The point here is to have the default case with FSRM - which is supposed to be the majority of x86 hw out there - if not now then soon - be directly inlined into the instruction stream so that no function call overhead is taking place. Drop the early clobbers from the @size and @addr operands as those are not needed anymore since we have single instruction alternatives. The benchmarks I ran would show very small improvements and a PF benchmark would even show weird things like slowdowns with higher core counts. So for a ~6m running the git test suite, the function gets called under 700K times, all from padzero(): <...>-2536 [006] ..... 261.208801: padzero: to: 0x55b0663ed214, size: 3564, cycles: 21900 <...>-2536 [006] ..... 261.208819: padzero: to: 0x7f061adca078, size: 3976, cycles: 17160 <...>-2537 [008] ..... 
261.211027: padzero: to: 0x5572d019e240, size: 3520, cycles: 23850 <...>-2537 [008] ..... 261.211049: padzero: to: 0x7f1288dc9078, size: 3976, cycles: 15900 ... which is around 1%-ish of the total time and which is consistent with the benchmark numbers. So Mel gave me the idea to simply measure how fast the function becomes. I.e.: start = rdtsc_ordered(); ret = __clear_user(to, n); end = rdtsc_ordered(); Computing the mean average of all the samples collected during the test suite run then shows some improvement: clear_user_original: Amean: 9219.71 (Sum: 6340154910, samples: 687674) fsrm: Amean: 8030.63 (Sum: 5522277720, samples: 687652) That's on Zen3. The situation looks a lot more confusing on Intel: Icelake: clear_user_original: Amean: 19679.4 (Sum: 13652560764, samples: 693750) Amean: 19743.7 (Sum: 13693470604, samples: 693562) (I ran it twice just to be sure.) ERMS: Amean: 20374.3 (Sum: 13910601024, samples: 682752) Amean: 20453.7 (Sum: 14186223606, samples: 693576) FSRM: Amean: 20458.2 (Sum: 13918381386, samples: 680331) The original microbenchmark which people were complaining about: for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=65536; done 2>&1 | grep copied 32207011840 bytes (32 GB, 30 GiB) copied, 1 s, 32.2 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.93069 s, 35.6 GB/s 37597741056 bytes (38 GB, 35 GiB) copied, 1 s, 37.6 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.78017 s, 38.6 GB/s 62020124672 bytes (62 GB, 58 GiB) copied, 2 s, 31.0 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 2.13716 s, 32.2 GB/s 60010004480 bytes (60 GB, 56 GiB) copied, 1 s, 60.0 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.14129 s, 60.2 GB/s 53212086272 bytes (53 GB, 50 GiB) copied, 1 s, 53.2 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.28398 s, 53.5 GB/s 55698259968 bytes (56 GB, 52 GiB) copied, 1 s, 55.7 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.22507 s, 56.1 GB/s 55306092544 bytes (55 GB, 52 GiB) copied, 1 s, 55.3 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.23647 s, 55.6 GB/s 54387539968 bytes (54 GB, 51 GiB) copied, 1 s, 54.4 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.25693 s, 54.7 GB/s 50566529024 bytes (51 GB, 47 GiB) copied, 1 s, 50.6 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.35096 s, 50.9 GB/s 58308165632 bytes (58 GB, 54 GiB) copied, 1 s, 58.3 GB/s 68719476736 bytes (69 GB, 64 GiB) copied, 1.17394 s, 58.5 GB/s Now the same thing with smaller buffers: for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=8192; done 2>&1 | grep copied 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28485 s, 30.2 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276112 s, 31.1 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.29136 s, 29.5 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.283803 s, 30.3 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.306503 s, 28.0 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.349169 s, 24.6 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276912 s, 31.0 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.265356 s, 32.4 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28464 s, 30.2 GB/s 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.242998 s, 35.3 GB/s is also not conclusive because it all depends on the buffer sizes, their alignments and when the microcode detects that cachelines can be aggregated properly and copied in bigger sizes. 
Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/CAHk-=wh=Mu_EYhtOmPn6AxoQZyEh-4fo2Zx3G7rBv1g7vwoKiw@mail.gmail.com --- arch/x86/include/asm/uaccess.h | 5 +- arch/x86/include/asm/uaccess_64.h | 45 +++++++++++++ arch/x86/lib/clear_page_64.S | 138 ++++++++++++++++++++++++++++++++++++++ arch/x86/lib/usercopy_64.c | 40 ----------- tools/objtool/check.c | 3 + 5 files changed, 188 insertions(+), 43 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 913e593a3b45..c46207946e05 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -502,9 +502,6 @@ strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); -unsigned long __must_check clear_user(void __user *mem, unsigned long len); -unsigned long __must_check __clear_user(void __user *mem, unsigned long len); - #ifdef CONFIG_ARCH_HAS_COPY_MC unsigned long __must_check copy_mc_to_kernel(void *to, const void *from, unsigned len); @@ -526,6 +523,8 @@ extern struct movsl_mask { #define ARCH_HAS_NOCACHE_UACCESS 1 #ifdef CONFIG_X86_32 +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); # include #else # include diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 45697e04d771..d13d71af5cf6 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -79,4 +79,49 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) kasan_check_write(dst, size); return __copy_user_flushcache(dst, src, size); } + +/* + * Zero Userspace. 
+ */ + +__must_check unsigned long +clear_user_original(void __user *addr, unsigned long len); +__must_check unsigned long +clear_user_rep_good(void __user *addr, unsigned long len); +__must_check unsigned long +clear_user_erms(void __user *addr, unsigned long len); + +static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) +{ + might_fault(); + stac(); + + /* + * No memory constraint because it doesn't change any memory gcc + * knows about. + */ + asm volatile( + "1:\n\t" + ALTERNATIVE_3("rep stosb", + "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM), + "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS), + "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT + : "a" (0) + /* rep_good clobbers %rdx */ + : "rdx"); + + clac(); + + return size; +} + +static __always_inline unsigned long clear_user(void __user *to, unsigned long n) +{ + if (access_ok(to, n)) + return __clear_user(to, n); + return n; +} #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index fe59b8ac4fcc..ecbfb4dd3b01 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ #include +#include #include /* @@ -50,3 +51,140 @@ SYM_FUNC_START(clear_page_erms) RET SYM_FUNC_END(clear_page_erms) EXPORT_SYMBOL_GPL(clear_page_erms) + +/* + * Default clear user-space. + * Input: + * rdi destination + * rcx count + * + * Output: + * rcx: uncleared bytes or 0 if successful. + */ +SYM_FUNC_START(clear_user_original) + /* + * Copy only the lower 32 bits of size as that is enough to handle the rest bytes, + * i.e., no need for a 'q' suffix and thus a REX prefix. 
+ */ + mov %ecx,%eax + shr $3,%rcx + jz .Lrest_bytes + + # do the qwords first + .p2align 4 +.Lqwords: + movq $0,(%rdi) + lea 8(%rdi),%rdi + dec %rcx + jnz .Lqwords + +.Lrest_bytes: + and $7, %eax + jz .Lexit + + # now do the rest bytes +.Lbytes: + movb $0,(%rdi) + inc %rdi + dec %eax + jnz .Lbytes + +.Lexit: + /* + * %rax still needs to be cleared in the exception case because this function is called + * from inline asm and the compiler expects %rax to be zero when exiting the inline asm, + * in case it might reuse it somewhere. + */ + xor %eax,%eax + RET + +.Lqwords_exception: + # convert remaining qwords back into bytes to return to caller + shl $3, %rcx + and $7, %eax + add %rax,%rcx + jmp .Lexit + +.Lbytes_exception: + mov %eax,%ecx + jmp .Lexit + + _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception) + _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception) +SYM_FUNC_END(clear_user_original) +EXPORT_SYMBOL(clear_user_original) + +/* + * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is + * present. + * Input: + * rdi destination + * rcx count + * + * Output: + * rcx: uncleared bytes or 0 if successful. 
+ */ +SYM_FUNC_START(clear_user_rep_good) + # call the original thing for less than a cacheline + cmp $64, %rcx + jb clear_user_original + +.Lprep: + # copy lower 32-bits for rest bytes + mov %ecx, %edx + shr $3, %rcx + jz .Lrep_good_rest_bytes + +.Lrep_good_qwords: + rep stosq + +.Lrep_good_rest_bytes: + and $7, %edx + jz .Lrep_good_exit + +.Lrep_good_bytes: + mov %edx, %ecx + rep stosb + +.Lrep_good_exit: + # see .Lexit comment above + xor %eax, %eax + RET + +.Lrep_good_qwords_exception: + # convert remaining qwords back into bytes to return to caller + shl $3, %rcx + and $7, %edx + add %rdx, %rcx + jmp .Lrep_good_exit + + _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception) + _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit) +SYM_FUNC_END(clear_user_rep_good) +EXPORT_SYMBOL(clear_user_rep_good) + +/* + * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present. + * Input: + * rdi destination + * rcx count + * + * Output: + * rcx: uncleared bytes or 0 if successful. 
+ * + */ +SYM_FUNC_START(clear_user_erms) + # call the original thing for less than a cacheline + cmp $64, %rcx + jb clear_user_original + +.Lerms_bytes: + rep stosb + +.Lerms_exit: + xorl %eax,%eax + RET + + _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit) +SYM_FUNC_END(clear_user_erms) +EXPORT_SYMBOL(clear_user_erms) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0ae6cf804197..6c1f8ac5e721 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -14,46 +14,6 @@ * Zero Userspace */ -unsigned long __clear_user(void __user *addr, unsigned long size) -{ - long __d0; - might_fault(); - /* no memory constraint because it doesn't change any memory gcc knows - about */ - stac(); - asm volatile( - " testq %[size8],%[size8]\n" - " jz 4f\n" - " .align 16\n" - "0: movq $0,(%[dst])\n" - " addq $8,%[dst]\n" - " decl %%ecx ; jnz 0b\n" - "4: movq %[size1],%%rcx\n" - " testl %%ecx,%%ecx\n" - " jz 2f\n" - "1: movb $0,(%[dst])\n" - " incq %[dst]\n" - " decl %%ecx ; jnz 1b\n" - "2:\n" - - _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1]) - _ASM_EXTABLE_UA(1b, 2b) - - : [size8] "=&c"(size), [dst] "=&D" (__d0) - : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); - clac(); - return size; -} -EXPORT_SYMBOL(__clear_user); - -unsigned long clear_user(void __user *to, unsigned long n) -{ - if (access_ok(to, n)) - return __clear_user(to, n); - return n; -} -EXPORT_SYMBOL(clear_user); - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0cec74da7ffe..4b2e11726f4e 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1071,6 +1071,9 @@ static const char *uaccess_safe_builtin[] = { "copy_mc_fragile_handle_tail", "copy_mc_enhanced_fast_string", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ + "clear_user_erms", + "clear_user_rep_good", + "clear_user_original", NULL }; -- cgit v1.2.3 From 
31123c0360e01ee0389aee3a7b2ad32f13136662 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:47 -0700 Subject: selftests/bpf: bpf_setsockopt tests This patch adds tests to exercise optnames that are allowed in bpf_setsockopt(). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061847.4182339-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/setget_sockopt.c | 125 ++++++ .../testing/selftests/bpf/progs/bpf_tracing_net.h | 31 +- tools/testing/selftests/bpf/progs/setget_sockopt.c | 451 +++++++++++++++++++++ 3 files changed, 606 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/setget_sockopt.c create mode 100644 tools/testing/selftests/bpf/progs/setget_sockopt.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c new file mode 100644 index 000000000000..018611e6b248 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) Meta Platforms, Inc. and affiliates. 
*/ + +#define _GNU_SOURCE +#include +#include +#include + +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "network_helpers.h" + +#include "setget_sockopt.skel.h" + +#define CG_NAME "/setget-sockopt-test" + +static const char addr4_str[] = "127.0.0.1"; +static const char addr6_str[] = "::1"; +static struct setget_sockopt *skel; +static int cg_fd; + +static int create_netns(void) +{ + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns")) + return -1; + + if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up")) + return -1; + + if (!ASSERT_OK(system("ip link add dev binddevtest1 type veth peer name binddevtest2"), + "add veth")) + return -1; + + if (!ASSERT_OK(system("ip link set dev binddevtest1 up"), + "bring veth up")) + return -1; + + return 0; +} + +static void test_tcp(int family) +{ + struct setget_sockopt__bss *bss = skel->bss; + int sfd, cfd; + + memset(bss, 0, sizeof(*bss)); + + sfd = start_server(family, SOCK_STREAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + return; + + cfd = connect_to_fd(sfd, 0); + if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) { + close(sfd); + return; + } + close(sfd); + close(cfd); + + ASSERT_EQ(bss->nr_listen, 1, "nr_listen"); + ASSERT_EQ(bss->nr_connect, 1, "nr_connect"); + ASSERT_EQ(bss->nr_active, 1, "nr_active"); + ASSERT_EQ(bss->nr_passive, 1, "nr_passive"); + ASSERT_EQ(bss->nr_socket_post_create, 2, "nr_socket_post_create"); + ASSERT_EQ(bss->nr_binddev, 2, "nr_bind"); +} + +static void test_udp(int family) +{ + struct setget_sockopt__bss *bss = skel->bss; + int sfd; + + memset(bss, 0, sizeof(*bss)); + + sfd = start_server(family, SOCK_DGRAM, + family == AF_INET6 ? 
addr6_str : addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + return; + close(sfd); + + ASSERT_GE(bss->nr_socket_post_create, 1, "nr_socket_post_create"); + ASSERT_EQ(bss->nr_binddev, 1, "nr_bind"); +} + +void test_setget_sockopt(void) +{ + cg_fd = test__join_cgroup(CG_NAME); + if (cg_fd < 0) + return; + + if (create_netns()) + goto done; + + skel = setget_sockopt__open(); + if (!ASSERT_OK_PTR(skel, "open skel")) + goto done; + + strcpy(skel->rodata->veth, "binddevtest1"); + skel->rodata->veth_ifindex = if_nametoindex("binddevtest1"); + if (!ASSERT_GT(skel->rodata->veth_ifindex, 0, "if_nametoindex")) + goto done; + + if (!ASSERT_OK(setget_sockopt__load(skel), "load skel")) + goto done; + + skel->links.skops_sockopt = + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd); + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup")) + goto done; + + skel->links.socket_post_create = + bpf_program__attach_cgroup(skel->progs.socket_post_create, cg_fd); + if (!ASSERT_OK_PTR(skel->links.socket_post_create, "attach_cgroup")) + goto done; + + test_tcp(AF_INET6); + test_tcp(AF_INET); + test_udp(AF_INET6); + test_udp(AF_INET); + +done: + setget_sockopt__destroy(skel); + close(cg_fd); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 98dd2c4815f0..5ebc6dabef84 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -6,13 +6,40 @@ #define AF_INET6 10 #define SOL_SOCKET 1 +#define SO_REUSEADDR 2 #define SO_SNDBUF 7 -#define __SO_ACCEPTCON (1 << 16) +#define SO_RCVBUF 8 +#define SO_KEEPALIVE 9 #define SO_PRIORITY 12 +#define SO_REUSEPORT 15 +#define SO_RCVLOWAT 18 +#define SO_BINDTODEVICE 25 +#define SO_MARK 36 +#define SO_MAX_PACING_RATE 47 +#define SO_BINDTOIFINDEX 62 +#define SO_TXREHASH 74 +#define __SO_ACCEPTCON (1 << 16) + +#define IP_TOS 1 + +#define IPV6_TCLASS 67 +#define IPV6_AUTOFLOWLABEL 70 
#define SOL_TCP 6 +#define TCP_NODELAY 1 +#define TCP_MAXSEG 2 +#define TCP_KEEPIDLE 4 +#define TCP_KEEPINTVL 5 +#define TCP_KEEPCNT 6 +#define TCP_SYNCNT 7 +#define TCP_WINDOW_CLAMP 10 #define TCP_CONGESTION 13 +#define TCP_THIN_LINEAR_TIMEOUTS 16 +#define TCP_USER_TIMEOUT 18 +#define TCP_NOTSENT_LOWAT 25 +#define TCP_SAVE_SYN 27 #define TCP_CA_NAME_MAX 16 +#define TCP_NAGLE_OFF 1 #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 @@ -49,6 +76,8 @@ #define sk_state __sk_common.skc_state #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr +#define sk_flags __sk_common.skc_flags +#define sk_reuse __sk_common.skc_reuse #define s6_addr32 in6_u.u6_addr32 diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c new file mode 100644 index 000000000000..4a4cb44a4a15 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -0,0 +1,451 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) Meta Platforms, Inc. and affiliates. 
*/
+
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+extern unsigned long CONFIG_HZ __kconfig;
+
+const volatile char veth[IFNAMSIZ];
+const volatile int veth_ifindex;
+
+int nr_listen;
+int nr_passive;
+int nr_active;
+int nr_connect;
+int nr_binddev;
+int nr_socket_post_create;
+
+/* One table entry per socket option under test. */
+struct sockopt_test {
+	int opt;		/* option name; 0 terminates a table */
+	int new;		/* value written by the int test */
+	int restore;		/* if non-zero, written back instead of the old value */
+	int expected;		/* value the int test expects to read back */
+	int tcp_expected;	/* if non-zero, expected value on SOCK_STREAM sockets */
+	unsigned int flip:1;	/* run the flip (toggle) test instead of the int test */
+};
+
+static const char cubic_cc[] = "cubic";
+static const char reno_cc[] = "reno";
+
+static const struct sockopt_test sol_socket_tests[] = {
+	{ .opt = SO_REUSEADDR, .flip = 1, },
+	{ .opt = SO_SNDBUF, .new = 8123, .expected = 8123 * 2, },
+	{ .opt = SO_RCVBUF, .new = 8123, .expected = 8123 * 2, },
+	{ .opt = SO_KEEPALIVE, .flip = 1, },
+	{ .opt = SO_PRIORITY, .new = 0xeb9f, .expected = 0xeb9f, },
+	{ .opt = SO_REUSEPORT, .flip = 1, },
+	{ .opt = SO_RCVLOWAT, .new = 8123, .expected = 8123, },
+	{ .opt = SO_MARK, .new = 0xeb9f, .expected = 0xeb9f, },
+	{ .opt = SO_MAX_PACING_RATE, .new = 0xeb9f, .expected = 0xeb9f, },
+	{ .opt = SO_TXREHASH, .flip = 1, },
+	{ .opt = 0, },
+};
+
+static const struct sockopt_test sol_tcp_tests[] = {
+	{ .opt = TCP_NODELAY, .flip = 1, },
+	{ .opt = TCP_MAXSEG, .new = 1314, .expected = 1314, },
+	{ .opt = TCP_KEEPIDLE, .new = 123, .expected = 123, .restore = 321, },
+	{ .opt = TCP_KEEPINTVL, .new = 123, .expected = 123, .restore = 321, },
+	{ .opt = TCP_KEEPCNT, .new = 123, .expected = 123, .restore = 124, },
+	{ .opt = TCP_SYNCNT, .new = 123, .expected = 123, .restore = 124, },
+	{ .opt = TCP_WINDOW_CLAMP, .new = 8123, .expected = 8123, .restore = 8124, },
+	{ .opt = TCP_CONGESTION, },
+	{ .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, },
+	{ .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, },
+	{ .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, },
+	{ .opt = TCP_SAVE_SYN, .new = 1, .expected = 1, },
+	{
.opt = 0, }, +}; + +static const struct sockopt_test sol_ip_tests[] = { + { .opt = IP_TOS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, }, + { .opt = 0, }, +}; + +static const struct sockopt_test sol_ipv6_tests[] = { + { .opt = IPV6_TCLASS, .new = 0xe1, .expected = 0xe1, .tcp_expected = 0xe0, }, + { .opt = IPV6_AUTOFLOWLABEL, .flip = 1, }, + { .opt = 0, }, +}; + +struct loop_ctx { + void *ctx; + struct sock *sk; +}; + +static int __bpf_getsockopt(void *ctx, struct sock *sk, + int level, int opt, int *optval, + int optlen) +{ + if (level == SOL_SOCKET) { + switch (opt) { + case SO_REUSEADDR: + *optval = !!BPF_CORE_READ_BITFIELD(sk, sk_reuse); + break; + case SO_KEEPALIVE: + *optval = !!(sk->sk_flags & (1UL << 3)); + break; + case SO_RCVLOWAT: + *optval = sk->sk_rcvlowat; + break; + case SO_MAX_PACING_RATE: + *optval = sk->sk_max_pacing_rate; + break; + default: + return bpf_getsockopt(ctx, level, opt, optval, optlen); + } + return 0; + } + + if (level == IPPROTO_TCP) { + struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk); + + if (!tp) + return -1; + + switch (opt) { + case TCP_NODELAY: + *optval = !!(BPF_CORE_READ_BITFIELD(tp, nonagle) & TCP_NAGLE_OFF); + break; + case TCP_MAXSEG: + *optval = tp->rx_opt.user_mss; + break; + case TCP_KEEPIDLE: + *optval = tp->keepalive_time / CONFIG_HZ; + break; + case TCP_SYNCNT: + *optval = tp->inet_conn.icsk_syn_retries; + break; + case TCP_KEEPINTVL: + *optval = tp->keepalive_intvl / CONFIG_HZ; + break; + case TCP_KEEPCNT: + *optval = tp->keepalive_probes; + break; + case TCP_WINDOW_CLAMP: + *optval = tp->window_clamp; + break; + case TCP_THIN_LINEAR_TIMEOUTS: + *optval = !!BPF_CORE_READ_BITFIELD(tp, thin_lto); + break; + case TCP_USER_TIMEOUT: + *optval = tp->inet_conn.icsk_user_timeout; + break; + case TCP_NOTSENT_LOWAT: + *optval = tp->notsent_lowat; + break; + case TCP_SAVE_SYN: + *optval = BPF_CORE_READ_BITFIELD(tp, save_syn); + break; + default: + return bpf_getsockopt(ctx, level, opt, optval, optlen); + } + return 0; + 
} + + if (level == IPPROTO_IPV6) { + switch (opt) { + case IPV6_AUTOFLOWLABEL: { + __u16 proto = sk->sk_protocol; + struct inet_sock *inet_sk; + + if (proto == IPPROTO_TCP) + inet_sk = (struct inet_sock *)bpf_skc_to_tcp_sock(sk); + else + inet_sk = (struct inet_sock *)bpf_skc_to_udp6_sock(sk); + + if (!inet_sk) + return -1; + + *optval = !!inet_sk->pinet6->autoflowlabel; + break; + } + default: + return bpf_getsockopt(ctx, level, opt, optval, optlen); + } + return 0; + } + + return bpf_getsockopt(ctx, level, opt, optval, optlen); +} + +static int bpf_test_sockopt_flip(void *ctx, struct sock *sk, + const struct sockopt_test *t, + int level) +{ + int old, tmp, new, opt = t->opt; + + opt = t->opt; + + if (__bpf_getsockopt(ctx, sk, level, opt, &old, sizeof(old))) + return 1; + /* kernel initialized txrehash to 255 */ + if (level == SOL_SOCKET && opt == SO_TXREHASH && old != 0 && old != 1) + old = 1; + + new = !old; + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new))) + return 1; + if (__bpf_getsockopt(ctx, sk, level, opt, &tmp, sizeof(tmp)) || + tmp != new) + return 1; + + if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old))) + return 1; + + return 0; +} + +static int bpf_test_sockopt_int(void *ctx, struct sock *sk, + const struct sockopt_test *t, + int level) +{ + int old, tmp, new, expected, opt; + + opt = t->opt; + new = t->new; + if (sk->sk_type == SOCK_STREAM && t->tcp_expected) + expected = t->tcp_expected; + else + expected = t->expected; + + if (__bpf_getsockopt(ctx, sk, level, opt, &old, sizeof(old)) || + old == new) + return 1; + + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new))) + return 1; + if (__bpf_getsockopt(ctx, sk, level, opt, &tmp, sizeof(tmp)) || + tmp != expected) + return 1; + + if (t->restore) + old = t->restore; + if (bpf_setsockopt(ctx, level, opt, &old, sizeof(old))) + return 1; + + return 0; +} + +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc) +{ + const struct sockopt_test *t; + + if (i >= 
ARRAY_SIZE(sol_socket_tests))
+		return 1;
+
+	t = &sol_socket_tests[i];
+	if (!t->opt)
+		return 1;
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, SOL_SOCKET);
+
+	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
+}
+
+static int bpf_test_ip_sockopt(__u32 i, struct loop_ctx *lc)
+{
+	const struct sockopt_test *t;
+
+	if (i >= ARRAY_SIZE(sol_ip_tests))
+		return 1;
+
+	t = &sol_ip_tests[i];
+	if (!t->opt)
+		return 1;
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IP);
+
+	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IP);
+}
+
+static int bpf_test_ipv6_sockopt(__u32 i, struct loop_ctx *lc)
+{
+	const struct sockopt_test *t;
+
+	if (i >= ARRAY_SIZE(sol_ipv6_tests))
+		return 1;
+
+	t = &sol_ipv6_tests[i];
+	if (!t->opt)
+		return 1;
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(lc->ctx, lc->sk, t, IPPROTO_IPV6);
+
+	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, IPPROTO_IPV6);
+}
+
+/* Run entry @i of sol_tcp_tests against the socket in @lc.
+ *
+ * Returns 0 when the entry passed and 1 when @i is out of range, the entry
+ * is the terminating { .opt = 0 } element, or any get/set step failed.
+ * TCP_CONGESTION carries a string value and is handled inline; every other
+ * entry goes through the shared flip/int helpers.
+ */
+static int bpf_test_tcp_sockopt(__u32 i, struct loop_ctx *lc)
+{
+	const struct sockopt_test *t;
+	struct sock *sk;
+	void *ctx;
+
+	if (i >= ARRAY_SIZE(sol_tcp_tests))
+		return 1;
+
+	t = &sol_tcp_tests[i];
+	if (!t->opt)
+		return 1;
+
+	ctx = lc->ctx;
+	sk = lc->sk;
+
+	if (t->opt == TCP_CONGESTION) {
+		char old_cc[16], tmp_cc[16];
+		const char *new_cc;
+		int new_cc_len;
+
+		if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
+			return 1;
+		/* Switch to whichever of cubic/reno is not in use now.  The
+		 * length handed to bpf_setsockopt() must be the size of the
+		 * name array: sizeof(new_cc) is only the size of a pointer.
+		 */
+		if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc)) {
+			new_cc = reno_cc;
+			new_cc_len = sizeof(reno_cc);
+		} else {
+			new_cc = cubic_cc;
+			new_cc_len = sizeof(cubic_cc);
+		}
+		if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, (void *)new_cc,
+				   new_cc_len))
+			return 1;
+		if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, tmp_cc, sizeof(tmp_cc)))
+			return 1;
+		if (bpf_strncmp(tmp_cc, sizeof(tmp_cc), new_cc))
+			return 1;
+		/* Put the original congestion control back. */
+		if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc)))
+			return 1;
+		return 0;
+	}
+
+	if (t->flip)
+		return bpf_test_sockopt_flip(ctx, sk, t, IPPROTO_TCP);
+
+	return bpf_test_sockopt_int(ctx, sk, t, IPPROTO_TCP);
+}
+
+static int bpf_test_sockopt(void *ctx, struct sock *sk)
+{
+	struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
+	__u16 family, proto;
+	int n;
+
+	family = sk->sk_family;
+	proto = sk->sk_protocol;
+
+	n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
+	if (n != ARRAY_SIZE(sol_socket_tests))
+		return -1;
+
+	if (proto == IPPROTO_TCP) {
+		n = bpf_loop(ARRAY_SIZE(sol_tcp_tests), bpf_test_tcp_sockopt, &lc, 0);
+		if (n != ARRAY_SIZE(sol_tcp_tests))
+			return -1;
+	}
+
+	if (family == AF_INET) {
+		n = bpf_loop(ARRAY_SIZE(sol_ip_tests), bpf_test_ip_sockopt, &lc, 0);
+		if (n != ARRAY_SIZE(sol_ip_tests))
+			return -1;
+	} else {
+		n = bpf_loop(ARRAY_SIZE(sol_ipv6_tests), bpf_test_ipv6_sockopt, &lc, 0);
+		if (n != ARRAY_SIZE(sol_ipv6_tests))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int binddev_test(void *ctx)
+{
+	const char empty_ifname[] = "";
+	int ifindex, zero = 0;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			   (void *)veth, sizeof(veth)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != veth_ifindex)
+		return -1;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			   (void *)empty_ifname, sizeof(empty_ifname)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != 0)
+		return -1;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   (void *)&veth_ifindex, sizeof(int)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != veth_ifindex)
+		return -1;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &zero, sizeof(int)))
+		return -1;
+	if (bpf_getsockopt(ctx, SOL_SOCKET, SO_BINDTOIFINDEX,
+			   &ifindex, sizeof(int)) ||
+	    ifindex != 0)
+		return -1;
+
+	return 0;
+}
+
+SEC("lsm_cgroup/socket_post_create")
+int BPF_PROG(socket_post_create, struct socket *sock, int family,
+	     int type, int protocol, int kern)
+{
+	struct sock *sk
= sock->sk; + + if (!sk) + return 1; + + nr_socket_post_create += !bpf_test_sockopt(sk, sk); + nr_binddev += !binddev_test(sk); + + return 1; +} + +SEC("sockops") +int skops_sockopt(struct bpf_sock_ops *skops) +{ + struct bpf_sock *bpf_sk = skops->sk; + struct sock *sk; + + if (!bpf_sk) + return 1; + + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk); + if (!sk) + return 1; + + switch (skops->op) { + case BPF_SOCK_OPS_TCP_LISTEN_CB: + nr_listen += !bpf_test_sockopt(skops, sk); + break; + case BPF_SOCK_OPS_TCP_CONNECT_CB: + nr_connect += !bpf_test_sockopt(skops, sk); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + nr_active += !bpf_test_sockopt(skops, sk); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + nr_passive += !bpf_test_sockopt(skops, sk); + break; + } + + return 1; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 2cd87cea78425e6b019e34e969dc008ce560acbf Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Aug 2022 17:28:25 +0200 Subject: selftests: mlxsw: Add ingress RIF configuration test for 802.1D bridge Before layer 2 forwarding, the device classifies an incoming packet to a FID. After classification, the FID is known, but also all the attributes of the FID, such as the router interface (RIF) via which a packet that needs to be routed will ingress the router block. For VLAN-unaware bridges (802.1D), the FID classification is done according to {Port, VID}. When a RIF is added on top of a FID, all the existing {Port, VID}->FID mappings should be updated by the software with the new RIF. In addition, when a new mapping is added for FID which already has a RIF, the correct RIF should be used for it. Add a test to verify that packets can be routed after {Port, VID}->FID classification, regardless of the order of the configuration. 
# ./ingress_rif_conf_1d.sh TEST: Add RIF for existing {port, VID}->FID mapping [ OK ] TEST: Add {port, VID}->FID mapping for FID with a RIF [ OK ] Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Signed-off-by: Jakub Kicinski --- .../drivers/net/mlxsw/ingress_rif_conf_1d.sh | 264 +++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1d.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1d.sh b/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1d.sh new file mode 100755 index 000000000000..df2b09966886 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1d.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test routing over bridge and verify that the order of configuration does not +# impact switch behavior. Verify that RIF is added correctly for existing +# mappings and that new mappings use the correct RIF. 
+ +# +-------------------+ +--------------------+ +# | H1 | | H2 | +# | | | | +# | $h1.10 + | | + $h2.10 | +# | 192.0.2.1/28 | | | | 192.0.2.3/28 | +# | | | | | | +# | $h1 + | | + $h2 | +# +----------------|--+ +--|-----------------+ +# | | +# +----------------|-------------------------|-----------------+ +# | SW | | | +# | +--------------|-------------------------|---------------+ | +# | | $swp1 + + $swp2 | | +# | | | | | | +# | | $swp1.10 + + $swp2.10 | | +# | | | | +# | | br0 | | +# | | 192.0.2.2/28 | | +# | +--------------------------------------------------------+ | +# | | +# | $swp3.10 + | +# | 192.0.2.17/28 | | +# | | | +# | $swp3 + | +# +---------------|--------------------------------------------+ +# | +# +---------------|--+ +# | $h3 + | +# | | | +# | $h3.10 + | +# | 192.0.2.18/28 | +# | | +# | H3 | +# +------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + port_vid_map_rif + rif_port_vid_map +" + +NUM_NETIFS=6 +source $lib_dir/lib.sh +source $lib_dir/tc_common.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 + vlan_create $h1 10 v$h1 192.0.2.1/28 + + ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + + vlan_destroy $h1 10 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + vlan_create $h2 10 v$h2 192.0.2.3/28 +} + +h2_destroy() +{ + vlan_destroy $h2 10 + simple_if_fini $h2 +} + +h3_create() +{ + simple_if_init $h3 + vlan_create $h3 10 v$h3 192.0.2.18/28 + + ip route add 192.0.2.0/28 vrf v$h3 nexthop via 192.0.2.17 +} + +h3_destroy() +{ + ip route del 192.0.2.0/28 vrf v$h3 nexthop via 192.0.2.17 + + vlan_destroy $h3 10 + simple_if_fini $h3 +} + +switch_create() +{ + ip link set dev $swp1 up + + ip link add dev br0 type bridge mcast_snooping 0 + + # By default, a link-local address is generated when netdevice becomes + # up. Adding an address to the bridge will cause creating a RIF for it. 
+ # Prevent generating link-local address to be able to control when the + # RIF is added. + sysctl_set net.ipv6.conf.br0.addr_gen_mode 1 + ip link set dev br0 up + + ip link set dev $swp2 up + vlan_create $swp2 10 + ip link set dev $swp2.10 master br0 + + ip link set dev $swp3 up + vlan_create $swp3 10 "" 192.0.2.17/28 + tc qdisc add dev $swp3 clsact + + # Replace neighbor to avoid 1 packet which is forwarded in software due + # to "unresolved neigh". + ip neigh replace dev $swp3.10 192.0.2.18 lladdr $(mac_get $h3.10) +} + +switch_destroy() +{ + tc qdisc del dev $swp3 clsact + vlan_destroy $swp3 10 + ip link set dev $swp3 down + + ip link set dev $swp2.10 nomaster + vlan_destroy $swp2 10 + ip link set dev $swp2 down + + ip link set dev br0 down + sysctl_restore net.ipv6.conf.br0.addr_gen_mode + ip link del dev br0 + + ip link set dev $swp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + h3_create + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + h3_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +bridge_rif_add() +{ + rifs_occ_t0=$(devlink_resource_occ_get rifs) + __addr_add_del br0 add 192.0.2.2/28 + rifs_occ_t1=$(devlink_resource_occ_get rifs) + + expected_rifs=$((rifs_occ_t0 + 1)) + + [[ $expected_rifs -eq $rifs_occ_t1 ]] + check_err $? "Expected $expected_rifs RIFs, $rifs_occ_t1 are used" + + sleep 1 +} + +bridge_rif_del() +{ + __addr_add_del br0 del 192.0.2.2/28 +} + +port_vid_map_rif() +{ + RET=0 + + # First add {port, VID}->FID for $swp1.10, then add a RIF and verify + # that packets can be routed via the existing mapping. + vlan_create $swp1 10 + ip link set dev $swp1.10 master br0 + bridge_rif_add + + # The hardware matches on the first ethertype which is not VLAN, + # so the protocol should be IP. 
+ tc filter add dev $swp3 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.18 action pass + + ping_do $h1.10 192.0.2.18 + check_err $? "Ping failed" + + tc_check_at_least_x_packets "dev $swp3 egress" 101 10 + check_err $? "Packets were not routed in hardware" + + log_test "Add RIF for existing {port, VID}->FID mapping" + + tc filter del dev $swp3 egress + + bridge_rif_del + ip link set dev $swp1.10 nomaster + vlan_destroy $swp1 10 +} + +rif_port_vid_map() +{ + RET=0 + + # First add an address to the bridge, which will create a RIF on top of + # it, then add a new {port, VID}->FID mapping and verify that packets + # can be routed via the new mapping. + bridge_rif_add + vlan_create $swp1 10 + ip link set dev $swp1.10 master br0 + + # The hardware matches on the first ethertype which is not VLAN, + # so the protocol should be IP. + tc filter add dev $swp3 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.18 action pass + + ping_do $h1.10 192.0.2.18 + check_err $? "Ping failed" + + tc_check_at_least_x_packets "dev $swp3 egress" 101 10 + check_err $? "Packets were not routed in hardware" + + log_test "Add {port, VID}->FID mapping for FID with a RIF" + + tc filter del dev $swp3 egress + + ip link set dev $swp1.10 nomaster + vlan_destroy $swp1 10 + bridge_rif_del +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 3a5ddc886847d4cd84bd2ce7bbbfdb3fd5845678 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Aug 2022 17:28:26 +0200 Subject: selftests: mlxsw: Add ingress RIF configuration test for 802.1Q bridge Before layer 2 forwarding, the device classifies an incoming packet to a FID. After classification, the FID is known, but also all the attributes of the FID, such as the router interface (RIF) via which a packet that needs to be routed will ingress the router block. For VLAN-aware bridges (802.1Q), the FID classification is done according to VID. 
When a RIF is added on top of a FID, the existing VID->FID mapping should be updated by the software with the new RIF. We never map multiple VLANs to the same FID using VID->FID, so we cannot create VID->FID for FID which already has a RIF using 802.1Q. Anyway, verify that packets can be routed via port which is added after the FID already has a RIF. Add a test to verify that packets can be routed after VID->FID classification, regardless of the order of the configuration. # ./ingress_rif_conf_1q.sh TEST: Add RIF for existing VID->FID mapping [ OK ] TEST: Add port to VID->FID mapping for FID with a RIF [ OK ] Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Signed-off-by: Jakub Kicinski --- .../drivers/net/mlxsw/ingress_rif_conf_1q.sh | 264 +++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1q.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1q.sh b/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1q.sh new file mode 100755 index 000000000000..577293bab88b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_1q.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test routing over bridge and verify that the order of configuration does not +# impact switch behavior. Verify that RIF is added correctly for existing +# mapping and that packets can be routed via port which is added after the FID +# already has a RIF. 
+ +# +-------------------+ +--------------------+ +# | H1 | | H2 | +# | | | | +# | $h1.10 + | | + $h2.10 | +# | 192.0.2.1/28 | | | | 192.0.2.3/28 | +# | | | | | | +# | $h1 + | | + $h2 | +# +----------------|--+ +--|-----------------+ +# | | +# +----------------|-------------------------|-----------------+ +# | SW | | | +# | +--------------|-------------------------|---------------+ | +# | | $swp1 + + $swp2 | | +# | | | | +# | | br0 | | +# | +--------------------------------------------------------+ | +# | | | +# | br0.10 | +# | 192.0.2.2/28 | +# | | +# | | +# | $swp3 + | +# | 192.0.2.17/28 | | +# +----------------|-------------------------------------------+ +# | +# +----------------|--+ +# | $h3 + | +# | 192.0.2.18/28 | +# | | +# | H3 | +# +-------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + vid_map_rif + rif_vid_map +" + +NUM_NETIFS=6 +source $lib_dir/lib.sh +source $lib_dir/tc_common.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 + vlan_create $h1 10 v$h1 192.0.2.1/28 + + ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + + vlan_destroy $h1 10 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + vlan_create $h2 10 v$h2 192.0.2.3/28 +} + +h2_destroy() +{ + vlan_destroy $h2 10 + simple_if_fini $h2 +} + +h3_create() +{ + simple_if_init $h3 192.0.2.18/28 + ip route add 192.0.2.0/28 vrf v$h3 nexthop via 192.0.2.17 +} + +h3_destroy() +{ + ip route del 192.0.2.0/28 vrf v$h3 nexthop via 192.0.2.17 + simple_if_fini $h3 192.0.2.18/28 +} + +switch_create() +{ + ip link set dev $swp1 up + + ip link add dev br0 type bridge vlan_filtering 1 mcast_snooping 0 + + # By default, a link-local address is generated when netdevice becomes + # up. Adding an address to the bridge will cause creating a RIF for it. + # Prevent generating link-local address to be able to control when the + # RIF is added. 
+ sysctl_set net.ipv6.conf.br0.addr_gen_mode 1 + ip link set dev br0 up + + ip link set dev $swp2 up + ip link set dev $swp2 master br0 + bridge vlan add vid 10 dev $swp2 + + ip link set dev $swp3 up + __addr_add_del $swp3 add 192.0.2.17/28 + tc qdisc add dev $swp3 clsact + + # Replace neighbor to avoid 1 packet which is forwarded in software due + # to "unresolved neigh". + ip neigh replace dev $swp3 192.0.2.18 lladdr $(mac_get $h3) +} + +switch_destroy() +{ + tc qdisc del dev $swp3 clsact + __addr_add_del $swp3 del 192.0.2.17/28 + ip link set dev $swp3 down + + bridge vlan del vid 10 dev $swp2 + ip link set dev $swp2 nomaster + ip link set dev $swp2 down + + ip link set dev br0 down + sysctl_restore net.ipv6.conf.br0.addr_gen_mode + ip link del dev br0 + + ip link set dev $swp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + h3_create + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + h3_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +bridge_rif_add() +{ + rifs_occ_t0=$(devlink_resource_occ_get rifs) + vlan_create br0 10 "" 192.0.2.2/28 + rifs_occ_t1=$(devlink_resource_occ_get rifs) + + expected_rifs=$((rifs_occ_t0 + 1)) + + [[ $expected_rifs -eq $rifs_occ_t1 ]] + check_err $? "Expected $expected_rifs RIFs, $rifs_occ_t1 are used" + + sleep 1 +} + +bridge_rif_del() +{ + vlan_destroy br0 10 +} + +vid_map_rif() +{ + RET=0 + + # First add VID->FID for vlan 10, then add a RIF and verify that + # packets can be routed via the existing mapping. + bridge vlan add vid 10 dev br0 self + ip link set dev $swp1 master br0 + bridge vlan add vid 10 dev $swp1 + + bridge_rif_add + + tc filter add dev $swp3 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.18 action pass + + ping_do $h1.10 192.0.2.18 + check_err $? 
"Ping failed" + + tc_check_at_least_x_packets "dev $swp3 egress" 101 10 + check_err $? "Packets were not routed in hardware" + + log_test "Add RIF for existing VID->FID mapping" + + tc filter del dev $swp3 egress + + bridge_rif_del + + bridge vlan del vid 10 dev $swp1 + ip link set dev $swp1 nomaster + bridge vlan del vid 10 dev br0 self +} + +rif_vid_map() +{ + RET=0 + + # Using 802.1Q, there is only one VID->FID map for each VID. That means + # that we cannot really check adding a new map for existing FID with a + # RIF. Verify that packets can be routed via port which is added after + # the FID already has a RIF, although in practice there is no new + # mapping in the hardware. + bridge vlan add vid 10 dev br0 self + bridge_rif_add + + ip link set dev $swp1 master br0 + bridge vlan add vid 10 dev $swp1 + + tc filter add dev $swp3 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.18 action pass + + ping_do $h1.10 192.0.2.18 + check_err $? "Ping failed" + + tc_check_at_least_x_packets "dev $swp3 egress" 101 10 + check_err $? "Packets were not routed in hardware" + + log_test "Add port to VID->FID mapping for FID with a RIF" + + tc filter del dev $swp3 egress + + bridge vlan del vid 10 dev $swp1 + ip link set dev $swp1 nomaster + + bridge_rif_del + bridge vlan del vid 10 dev br0 self +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From cbeb6e1195d1f293a11534f5eeb7455af3c8f9d5 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Aug 2022 17:28:27 +0200 Subject: selftests: mlxsw: Add ingress RIF configuration test for VXLAN Before layer 2 forwarding, the device classifies an incoming packet to a FID. After classification, the FID is known, but also all the attributes of the FID, such as the router interface (RIF) via which a packet that needs to be routed will ingress the router block. For VXLAN decapsulation, the FID classification is done according to the VNI. 
When a RIF is added on top of a FID, the existing VNI->FID mapping should be updated by the software with the new RIF. In addition, when a new mapping is added for FID which already has a RIF, the correct RIF should be used for it. Add a test to verify that packets can be routed after decapsulation which is done after VNI->FID classification, regardless of the order of the configuration. # ./ingress_rif_conf_vxlan.sh TEST: Add RIF for existing VNI->FID mapping [ OK ] TEST: Add VNI->FID mapping for FID with a RIF [ OK ] Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Signed-off-by: Jakub Kicinski --- .../drivers/net/mlxsw/ingress_rif_conf_vxlan.sh | 311 +++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_vxlan.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_vxlan.sh new file mode 100755 index 000000000000..90450216a10d --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/ingress_rif_conf_vxlan.sh @@ -0,0 +1,311 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test routing after VXLAN decapsulation and verify that the order of +# configuration does not impact switch behavior. Verify that RIF is added +# correctly for existing mapping and that new mapping uses the correct RIF. 
+ +# +---------------------------+ +# | H1 | +# | + $h1 | +# | | 192.0.2.1/28 | +# +----|----------------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | +--|--------------------------------------------------------------------+ | +# | | + $swp1 br1 | | +# | | vid 10 pvid untagged | | +# | | | | +# | | | | +# | | + vx4001 | | +# | | local 192.0.2.17 | | +# | | remote 192.0.2.18 | | +# | | id 104001 | | +# | | dstport $VXPORT | | +# | | vid 4001 pvid untagged | | +# | | | | +# | +----------------------------------+------------------------------------+ | +# | | | +# | +----------------------------------|------------------------------------+ | +# | | | | | +# | | +-------------------------------+---------------------------------+ | | +# | | | | | | +# | | + vlan10 vlan4001 + | | +# | | 192.0.2.2/28 | | +# | | | | +# | | vrf-green | | +# | +-----------------------------------------------------------------------+ | +# | | +# | + $rp1 +lo | +# | | 198.51.100.1/24 192.0.2.17/32 | +# +----|----------------------------------------------------------------------+ +# | +# +----|--------------------------------------------------------+ +# | | v$rp2 | +# | + $rp2 | +# | 198.51.100.2/24 | +# | | +# +-------------------------------------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + vni_fid_map_rif + rif_vni_fid_map +" + +NUM_NETIFS=4 +source $lib_dir/lib.sh +source $lib_dir/tc_common.sh +source $lib_dir/devlink_lib.sh + +: ${VXPORT:=4789} +export VXPORT + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 +} + +switch_create() +{ + ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \ + mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. 
+ ip link set dev br1 address $(mac_get $swp1) + ip link set dev br1 up + + ip link set dev $rp1 up + ip address add dev $rp1 198.51.100.1/24 + + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + bridge vlan add vid 10 dev $swp1 pvid untagged + + tc qdisc add dev $swp1 clsact + + ip link add name vx4001 type vxlan id 104001 \ + local 192.0.2.17 dstport $VXPORT \ + nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx4001 up + + ip link set dev vx4001 master br1 + + ip address add 192.0.2.17/32 dev lo + + # Create SVIs. + vrf_create "vrf-green" + ip link set dev vrf-green up + + ip link add link br1 name vlan10 up master vrf-green type vlan id 10 + + # Replace neighbor to avoid 1 packet which is forwarded in software due + # to "unresolved neigh". + ip neigh replace dev vlan10 192.0.2.1 lladdr $(mac_get $h1) + + ip address add 192.0.2.2/28 dev vlan10 + + bridge vlan add vid 10 dev br1 self + bridge vlan add vid 4001 dev br1 self + + sysctl_set net.ipv4.conf.all.rp_filter 0 +} + +switch_destroy() +{ + sysctl_restore net.ipv4.conf.all.rp_filter + + bridge vlan del vid 4001 dev br1 self + bridge vlan del vid 10 dev br1 self + + ip link del dev vlan10 + + vrf_destroy "vrf-green" + + ip address del 192.0.2.17/32 dev lo + + tc qdisc del dev $swp1 clsact + + bridge vlan del vid 10 dev $swp1 + ip link set dev $swp1 down + ip link set dev $swp1 nomaster + + ip link set dev vx4001 nomaster + + ip link set dev vx4001 down + ip link del dev vx4001 + + ip address del dev $rp1 198.51.100.1/24 + ip link set dev $rp1 down + + ip link set dev br1 down + ip link del dev br1 +} + +vrp2_create() +{ + simple_if_init $rp2 198.51.100.2/24 + + ip route add 192.0.2.17/32 vrf v$rp2 nexthop via 198.51.100.1 +} + +vrp2_destroy() +{ + ip route del 192.0.2.17/32 vrf v$rp2 nexthop via 198.51.100.1 + + simple_if_fini $rp2 198.51.100.2/24 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + rp1=${NETIFS[p3]} + rp2=${NETIFS[p4]} + + vrf_prepare + forwarding_enable 
+ + h1_create + switch_create + + vrp2_create +} + +cleanup() +{ + pre_cleanup + + vrp2_destroy + + switch_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +payload_get() +{ + local dest_mac=$(mac_get vlan4001) + local src_mac=$(mac_get $rp1) + + p=$(: + )"08:"$( : VXLAN flags + )"00:00:00:"$( : VXLAN reserved + )"01:96:41:"$( : VXLAN VNI : 104001 + )"00:"$( : VXLAN reserved + )"$dest_mac:"$( : ETH daddr + )"$src_mac:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:54:"$( : IP total length + )"3f:49:"$( : IP identification + )"00:00:"$( : IP flags + frag off + )"3f:"$( : IP TTL + )"01:"$( : IP proto + )"50:21:"$( : IP header csum + )"c6:33:64:0a:"$( : IP saddr: 198.51.100.10 + )"c0:00:02:01:"$( : IP daddr: 192.0.2.1 + ) + echo $p +} + +vlan_rif_add() +{ + rifs_occ_t0=$(devlink_resource_occ_get rifs) + + ip link add link br1 name vlan4001 up master vrf-green \ + type vlan id 4001 + + rifs_occ_t1=$(devlink_resource_occ_get rifs) + expected_rifs=$((rifs_occ_t0 + 1)) + + [[ $expected_rifs -eq $rifs_occ_t1 ]] + check_err $? "Expected $expected_rifs RIFs, $rifs_occ_t1 are used" +} + +vlan_rif_del() +{ + ip link del dev vlan4001 +} + +vni_fid_map_rif() +{ + local rp1_mac=$(mac_get $rp1) + + RET=0 + + # First add VNI->FID mapping to the FID of VLAN 4001 + bridge vlan add vid 4001 dev vx4001 pvid untagged + + # Add a RIF to the FID with VNI->FID mapping + vlan_rif_add + + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.1 action pass + + payload=$(payload_get) + ip vrf exec v$rp2 $MZ $rp2 -c 10 -d 1msec -b $rp1_mac \ + -B 192.0.2.17 -A 192.0.2.18 \ + -t udp sp=12345,dp=$VXPORT,p=$payload -q + + tc_check_at_least_x_packets "dev $swp1 egress" 101 10 + check_err $? 
"Packets were not routed in hardware" + + log_test "Add RIF for existing VNI->FID mapping" + + tc filter del dev $swp1 egress + + bridge vlan del vid 4001 dev vx4001 pvid untagged + vlan_rif_del +} + +rif_vni_fid_map() +{ + local rp1_mac=$(mac_get $rp1) + + RET=0 + + # First add a RIF to the FID of VLAN 4001 + vlan_rif_add + + # Add VNI->FID mapping to FID with a RIF + bridge vlan add vid 4001 dev vx4001 pvid untagged + + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.1 action pass + + payload=$(payload_get) + ip vrf exec v$rp2 $MZ $rp2 -c 10 -d 1msec -b $rp1_mac \ + -B 192.0.2.17 -A 192.0.2.18 \ + -t udp sp=12345,dp=$VXPORT,p=$payload -q + + tc_check_at_least_x_packets "dev $swp1 egress" 101 10 + check_err $? "Packets were not routed in hardware" + + log_test "Add VNI->FID mapping for FID with a RIF" + + tc filter del dev $swp1 egress + + bridge vlan del vid 4001 dev vx4001 pvid untagged + vlan_rif_del +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 1623d5719fdff46620eb55c5d4f0cf8af1afcdb4 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Aug 2022 17:28:28 +0200 Subject: selftests: mlxsw: Add egress VID classification test After routing, the device always consults a table that determines the packet's egress VID based on {egress RIF, egress local port}. In the unified bridge model, it is up to software to maintain this table via REIV register. The table needs to be updated in the following flows: 1. When a RIF is set on a FID, for each FID's {Port, VID} mapping, a new {RIF, Port}->VID mapping should be created. 2. When a {Port, VID} is mapped to a FID and the FID already has a RIF, a new {RIF, Port}->VID mapping should be created. Add a test to verify that packets get the correct VID after routing, regardless of the order of the configuration. 
# ./egress_vid_classification.sh TEST: Add RIF for existing {port, VID}->FID mapping [ OK ] TEST: Add {port, VID}->FID mapping for FID with a RIF [ OK ] Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Signed-off-by: Jakub Kicinski --- .../drivers/net/mlxsw/egress_vid_classification.sh | 273 +++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/egress_vid_classification.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/egress_vid_classification.sh b/tools/testing/selftests/drivers/net/mlxsw/egress_vid_classification.sh new file mode 100755 index 000000000000..0cf9e47e3209 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/egress_vid_classification.sh @@ -0,0 +1,273 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test VLAN classification after routing and verify that the order of +# configuration does not impact switch behavior. Verify that {RIF, Port}->VID +# mapping is added correctly for existing {Port, VID}->FID mapping and that +# {RIF, Port}->VID mapping is added correctly for new {Port, VID}->FID mapping. 
+ +# +-------------------+ +--------------------+ +# | H1 | | H2 | +# | | | | +# | $h1.10 + | | + $h2.10 | +# | 192.0.2.1/28 | | | | 192.0.2.3/28 | +# | | | | | | +# | $h1 + | | + $h2 | +# +----------------|--+ +--|-----------------+ +# | | +# +----------------|-------------------------|-----------------+ +# | SW | | | +# | +--------------|-------------------------|---------------+ | +# | | $swp1 + + $swp2 | | +# | | | | | | +# | | $swp1.10 + + $swp2.10 | | +# | | | | +# | | br0 | | +# | | 192.0.2.2/28 | | +# | +--------------------------------------------------------+ | +# | | +# | $swp3.20 + | +# | 192.0.2.17/28 | | +# | | | +# | $swp3 + | +# +---------------|--------------------------------------------+ +# | +# +---------------|--+ +# | $h3 + | +# | | | +# | $h3.20 + | +# | 192.0.2.18/28 | +# | | +# | H3 | +# +------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + port_vid_map_rif + rif_port_vid_map +" + +NUM_NETIFS=6 +source $lib_dir/lib.sh +source $lib_dir/tc_common.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 + vlan_create $h1 10 v$h1 192.0.2.1/28 + + ip route add 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip route del 192.0.2.16/28 vrf v$h1 nexthop via 192.0.2.2 + + vlan_destroy $h1 10 + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + vlan_create $h2 10 v$h2 192.0.2.3/28 +} + +h2_destroy() +{ + vlan_destroy $h2 10 + simple_if_fini $h2 +} + +h3_create() +{ + simple_if_init $h3 + vlan_create $h3 20 v$h3 192.0.2.18/28 + + ip route add 192.0.2.0/28 vrf v$h3 nexthop via 192.0.2.17 +} + +h3_destroy() +{ + ip route del 192.0.2.0/28 vrf v$h3 nexthop via 192.0.2.17 + + vlan_destroy $h3 20 + simple_if_fini $h3 +} + +switch_create() +{ + ip link set dev $swp1 up + tc qdisc add dev $swp1 clsact + + ip link add dev br0 type bridge mcast_snooping 0 + + # By default, a link-local address is generated when netdevice becomes + # up. 
Adding an address to the bridge will cause creating a RIF for it. + # Prevent generating link-local address to be able to control when the + # RIF is added. + sysctl_set net.ipv6.conf.br0.addr_gen_mode 1 + ip link set dev br0 up + + ip link set dev $swp2 up + vlan_create $swp2 10 + ip link set dev $swp2.10 master br0 + + ip link set dev $swp3 up + vlan_create $swp3 20 "" 192.0.2.17/28 + + # Replace neighbor to avoid 1 packet which is forwarded in software due + # to "unresolved neigh". + ip neigh replace dev $swp3.20 192.0.2.18 lladdr $(mac_get $h3.20) +} + +switch_destroy() +{ + vlan_destroy $swp3 20 + ip link set dev $swp3 down + + ip link set dev $swp2.10 nomaster + vlan_destroy $swp2 10 + ip link set dev $swp2 down + + ip link set dev br0 down + sysctl_restore net.ipv6.conf.br0.addr_gen_mode + ip link del dev br0 + + tc qdisc del dev $swp1 clsact + ip link set dev $swp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + h3_create + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + h3_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +bridge_rif_add() +{ + rifs_occ_t0=$(devlink_resource_occ_get rifs) + __addr_add_del br0 add 192.0.2.2/28 + rifs_occ_t1=$(devlink_resource_occ_get rifs) + + expected_rifs=$((rifs_occ_t0 + 1)) + + [[ $expected_rifs -eq $rifs_occ_t1 ]] + check_err $? "Expected $expected_rifs RIFs, $rifs_occ_t1 are used" + + sleep 1 +} + +bridge_rif_del() +{ + __addr_add_del br0 del 192.0.2.2/28 +} + +port_vid_map_rif() +{ + RET=0 + + # First add {port, VID}->FID for swp1.10, then add a RIF and verify that + # packets get the correct VID after routing. + vlan_create $swp1 10 + ip link set dev $swp1.10 master br0 + bridge_rif_add + + # Replace neighbor to avoid 1 packet which is forwarded in software due + # to "unresolved neigh". 
+ ip neigh replace dev br0 192.0.2.1 lladdr $(mac_get $h1.10) + + # The hardware matches on the first ethertype which is not VLAN, + # so the protocol should be IP. + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.1 action pass + + ping_do $h1.10 192.0.2.18 + check_err $? "Ping failed" + + tc_check_at_least_x_packets "dev $swp1 egress" 101 10 + check_err $? "Packets were not routed in hardware" + + log_test "Add RIF for existing {port, VID}->FID mapping" + + tc filter del dev $swp1 egress + + bridge_rif_del + ip link set dev $swp1.10 nomaster + vlan_destroy $swp1 10 +} + +rif_port_vid_map() +{ + RET=0 + + # First add an address to the bridge, which will create a RIF on top of + # it, then add a new {port, VID}->FID mapping and verify that packets + # get the correct VID after routing. + bridge_rif_add + vlan_create $swp1 10 + ip link set dev $swp1.10 master br0 + + # Replace neighbor to avoid 1 packet which is forwarded in software due + # to "unresolved neigh". + ip neigh replace dev br0 192.0.2.1 lladdr $(mac_get $h1.10) + + # The hardware matches on the first ethertype which is not VLAN, + # so the protocol should be IP. + tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \ + flower skip_sw dst_ip 192.0.2.1 action pass + + ping_do $h1.10 192.0.2.18 + check_err $? "Ping failed" + + tc_check_at_least_x_packets "dev $swp1 egress" 101 10 + check_err $? 
"Packets were not routed in hardware" + + log_test "Add {port, VID}->FID mapping for FID with a RIF" + + tc filter del dev $swp1 egress + + ip link set dev $swp1.10 nomaster + vlan_destroy $swp1 10 + bridge_rif_del +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From e27e5bea956ce4d3eb15112de5fa5a3b77c2f488 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2022 14:39:27 -0700 Subject: x86/ibt, objtool: Add IBT_NOSEAL() Add a macro which prevents a function from getting sealed if there are no compile-time references to it. Signed-off-by: Josh Poimboeuf Message-Id: <20220818213927.e44fmxkoq4yj6ybn@treble> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/ibt.h | 11 +++++++++++ tools/objtool/check.c | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 689880eca9ba..9b08082a5d9f 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -31,6 +31,16 @@ #define __noendbr __attribute__((nocf_check)) +/* + * Create a dummy function pointer reference to prevent objtool from marking + * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by + * apply_ibt_endbr()). + */ +#define IBT_NOSEAL(fname) \ + ".pushsection .discard.ibt_endbr_noseal\n\t" \ + _ASM_PTR fname "\n\t" \ + ".popsection\n\t" + static inline __attribute_const__ u32 gen_endbr(void) { u32 endbr; @@ -84,6 +94,7 @@ extern __noendbr void ibt_restore(u64 save); #ifndef __ASSEMBLY__ #define ASM_ENDBR +#define IBT_NOSEAL(name) #define __noendbr diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0cec74da7ffe..91678252a9b6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -4096,7 +4096,8 @@ static int validate_ibt(struct objtool_file *file) * These sections can reference text addresses, but not with * the intent to indirect branch to them. 
*/ - if (!strncmp(sec->name, ".discard", 8) || + if ((!strncmp(sec->name, ".discard", 8) && + strcmp(sec->name, ".discard.ibt_endbr_noseal")) || !strncmp(sec->name, ".debug", 6) || !strcmp(sec->name, ".altinstructions") || !strcmp(sec->name, ".ibt_endbr_seal") || -- cgit v1.2.3 From 67ef8664cc5b113f6c49b01d2a0e4cbc589623dd Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Jul 2022 23:48:37 +0000 Subject: KVM: selftests: Fix KVM_EXCEPTION_MAGIC build with Clang Change KVM_EXCEPTION_MAGIC to use the all-caps "ULL", rather than lower case. This fixes a build failure with Clang: In file included from x86_64/hyperv_features.c:13: include/x86_64/processor.h:825:9: error: unexpected token in argument list return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); ^ include/x86_64/processor.h:802:15: note: expanded from macro 'kvm_asm_safe' asm volatile(KVM_ASM_SAFE(insn) \ ^ include/x86_64/processor.h:785:2: note: expanded from macro 'KVM_ASM_SAFE' "mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \ ^ :1:18: note: instantiated into assembly here mov $0xabacadabaull, %r9 ^ Fixes: 3b23054cd3f5 ("KVM: selftests: Add x86-64 support for exception fixup") Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Message-Id: <20220722234838.2160385-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 45edf45821d0..51c6661aca77 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -754,7 +754,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); /* If a toddler were to say "abracadabra". 
*/ -#define KVM_EXCEPTION_MAGIC 0xabacadabaull +#define KVM_EXCEPTION_MAGIC 0xabacadabaULL /* * KVM selftest exception fixup uses registers to coordinate with the exception -- cgit v1.2.3 From 372d07084593dc7a399bf9bee815711b1fb1bcf2 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Jul 2022 23:48:38 +0000 Subject: KVM: selftests: Fix ambiguous mov in KVM_ASM_SAFE() Change the mov in KVM_ASM_SAFE() that zeroes @vector to a movb to make it unambiguous. This fixes a build failure with Clang since, unlike the GNU assembler, the LLVM integrated assembler rejects ambiguous X86 instructions that don't have suffixes: In file included from x86_64/hyperv_features.c:13: include/x86_64/processor.h:825:9: error: ambiguous instructions require an explicit suffix (could be 'movb', 'movw', 'movl', or 'movq') return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); ^ include/x86_64/processor.h:802:15: note: expanded from macro 'kvm_asm_safe' asm volatile(KVM_ASM_SAFE(insn) \ ^ include/x86_64/processor.h:788:16: note: expanded from macro 'KVM_ASM_SAFE' "1: " insn "\n\t" \ ^ :5:2: note: instantiated into assembly here mov $0, 15(%rsp) ^ It seems like this change could introduce undesirable behavior in the future, e.g. if someone used a type larger than a u8 for @vector, since KVM_ASM_SAFE() will only zero the bottom byte. I tried changing the type of @vector to an int to see what would happen. GCC failed to compile due to a size mismatch between `movb` and `%eax`. Clang succeeded in compiling, but the generated code looked correct, so perhaps it will not be an issue. That being said it seems like there could be a better solution to this issue that does not assume @vector is a u8. 
Fixes: 3b23054cd3f5 ("KVM: selftests: Add x86-64 support for exception fixup") Signed-off-by: David Matlack Reviewed-by: Sean Christopherson Message-Id: <20220722234838.2160385-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 51c6661aca77..0cbc71b7af50 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -786,7 +786,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, "lea 1f(%%rip), %%r10\n\t" \ "lea 2f(%%rip), %%r11\n\t" \ "1: " insn "\n\t" \ - "mov $0, %[vector]\n\t" \ + "movb $0, %[vector]\n\t" \ "jmp 3f\n\t" \ "2:\n\t" \ "mov %%r9b, %[vector]\n\t" \ -- cgit v1.2.3 From e989bc3d0f3f93aab7c5018affc3f87b74716b37 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 14 Jun 2022 07:33:48 -0700 Subject: perf cpumap: Const map for max() Allows max() to be used with 'const struct perf_cpu_maps *'. Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Athira Jajeev Cc: Colin Ian King Cc: Dave Marchevsky Cc: German Gomez Cc: Gustavo A. R. 
Silva Cc: Ingo Molnar Cc: James Clark Cc: Kees Kook Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Song Liu Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220614143353.1559597-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 2 +- tools/lib/perf/include/perf/cpumap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 384d5e076ee4..6cd0be7c1bb4 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -309,7 +309,7 @@ bool perf_cpu_map__has(const struct perf_cpu_map *cpus, struct perf_cpu cpu) return perf_cpu_map__idx(cpus, cpu) != -1; } -struct perf_cpu perf_cpu_map__max(struct perf_cpu_map *map) +struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map) { struct perf_cpu result = { .cpu = -1 diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index 24de795b09bb..03aceb72a783 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -23,7 +23,7 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map); LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map); -LIBPERF_API struct perf_cpu perf_cpu_map__max(struct perf_cpu_map *map); +LIBPERF_API struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map); LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_cpu cpu); #define perf_cpu_map__for_each_cpu(cpu, idx, cpus) \ -- cgit v1.2.3 From 35ae6f09d8fd02add781e452a6d2ba6ea3a5482e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 14 Jun 2022 07:33:49 -0700 Subject: perf cpumap: Synthetic events and const/static Make the cpumap arguments const to make it clearer they are in rather than 
out arguments. Make two functions static and remove external declarations. Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Athira Jajeev Cc: Colin Ian King Cc: Dave Marchevsky Cc: German Gomez Cc: Gustavo A. R. Silva Cc: Ingo Molnar Cc: James Clark Cc: Kees Kook Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Song Liu Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220614143353.1559597-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.h | 4 ---- tools/perf/util/synthetic-events.c | 20 +++++++++++--------- tools/perf/util/synthetic-events.h | 2 +- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index a7b0931d5137..4e0d8dd3b7a0 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -463,10 +463,6 @@ size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FIL int kallsyms__get_function_start(const char *kallsyms_filename, const char *symbol_name, u64 *addr); -void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int *max); -void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf_cpu_map *map, - u16 type, int max); - void event_attr_init(struct perf_event_attr *attr); int perf_event_paranoid(void); diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 2ae59c03ae77..b3e03a4c6652 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1185,7 +1185,7 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool, } static void synthesize_cpus(struct cpu_map_entries *cpus, - struct perf_cpu_map *map) + const struct perf_cpu_map *map) { int i, map_nr = perf_cpu_map__nr(map); @@ -1196,7 +1196,7 @@ static void synthesize_cpus(struct cpu_map_entries *cpus, } static void 
synthesize_mask(struct perf_record_record_cpu_map *mask, - struct perf_cpu_map *map, int max) + const struct perf_cpu_map *map, int max) { int i; @@ -1207,12 +1207,12 @@ static void synthesize_mask(struct perf_record_record_cpu_map *mask, set_bit(perf_cpu_map__cpu(map, i).cpu, mask->mask); } -static size_t cpus_size(struct perf_cpu_map *map) +static size_t cpus_size(const struct perf_cpu_map *map) { return sizeof(struct cpu_map_entries) + perf_cpu_map__nr(map) * sizeof(u16); } -static size_t mask_size(struct perf_cpu_map *map, int *max) +static size_t mask_size(const struct perf_cpu_map *map, int *max) { int i; @@ -1229,7 +1229,8 @@ static size_t mask_size(struct perf_cpu_map *map, int *max) return sizeof(struct perf_record_record_cpu_map) + BITS_TO_LONGS(*max) * sizeof(long); } -void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int *max) +static void *cpu_map_data__alloc(const struct perf_cpu_map *map, size_t *size, + u16 *type, int *max) { size_t size_cpus, size_mask; bool is_dummy = perf_cpu_map__empty(map); @@ -1263,8 +1264,9 @@ void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int return zalloc(*size); } -void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf_cpu_map *map, - u16 type, int max) +static void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, + const struct perf_cpu_map *map, + u16 type, int max) { data->type = type; @@ -1279,7 +1281,7 @@ void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf } } -static struct perf_record_cpu_map *cpu_map_event__new(struct perf_cpu_map *map) +static struct perf_record_cpu_map *cpu_map_event__new(const struct perf_cpu_map *map) { size_t size = sizeof(struct perf_record_cpu_map); struct perf_record_cpu_map *event; @@ -1299,7 +1301,7 @@ static struct perf_record_cpu_map *cpu_map_event__new(struct perf_cpu_map *map) } int perf_event__synthesize_cpu_map(struct perf_tool *tool, - struct perf_cpu_map 
*map, + const struct perf_cpu_map *map, perf_event__handler_t process, struct machine *machine) { diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h index 81cb3d6af0b9..53737d1619a4 100644 --- a/tools/perf/util/synthetic-events.h +++ b/tools/perf/util/synthetic-events.h @@ -46,7 +46,7 @@ typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *e int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process); int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_cpu_map(struct perf_tool *tool, struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_cpu_map(struct perf_tool *tool, const struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -- cgit v1.2.3 From 28526478ccae88680645405f4e849d9ed4fbce7f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 14 Jun 2022 07:33:50 -0700 Subject: perf cpumap: Compute mask size in constant time perf_cpu_map__max() computes the cpumap's maximum value, no need to iterate over all values. Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Athira Jajeev Cc: Colin Ian King Cc: Dave Marchevsky Cc: German Gomez Cc: Gustavo A. R. 
Silva Cc: Ingo Molnar Cc: James Clark Cc: Kees Kook Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Song Liu Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220614143353.1559597-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b3e03a4c6652..3ae7c0f54157 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1214,18 +1214,7 @@ static size_t cpus_size(const struct perf_cpu_map *map) static size_t mask_size(const struct perf_cpu_map *map, int *max) { - int i; - - *max = 0; - - for (i = 0; i < perf_cpu_map__nr(map); i++) { - /* bit position of the cpu is + 1 */ - int bit = perf_cpu_map__cpu(map, i).cpu + 1; - - if (bit > *max) - *max = bit; - } - + *max = perf_cpu_map__max(map).cpu; return sizeof(struct perf_record_record_cpu_map) + BITS_TO_LONGS(*max) * sizeof(long); } -- cgit v1.2.3 From 7b3e31869081771c63c3d006347ad06738f843b5 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 18 Aug 2022 09:45:53 +0800 Subject: objtool: Use arch_jump_destination() in read_intra_function_calls() Use arch_jump_destination() instead of the open-coded 'offset + len + immediate' that is x86 specific. Avoids future trouble with other architectures. 
Signed-off-by: Chen Zhongjin Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220818014553.220261-1-chenzhongjin@huawei.com --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0cec74da7ffe..b012d987a658 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2233,7 +2233,7 @@ static int read_intra_function_calls(struct objtool_file *file) */ insn->type = INSN_JUMP_UNCONDITIONAL; - dest_off = insn->offset + insn->len + insn->immediate; + dest_off = arch_jump_destination(insn); insn->jump_dest = find_insn(file, insn->sec, dest_off); if (!insn->jump_dest) { WARN_FUNC("can't find call dest at %s+0x%lx", -- cgit v1.2.3 From 3c6f3900808c483b0bbb2c351f995c7b880dae14 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2022 09:26:57 -0700 Subject: objtool: Remove "ANNOTATE_NOENDBR on ENDBR" warning This warning isn't very useful: why would you put ANNOTATE_NOENDBR on ENDBR, and if you did, what's the harm? And thus far it's only found one non-bug, where the '__end_entry_SYSENTER_compat' label happens to land on the ENDBR from entry_SYSCALL_compat: vmlinux.o: warning: objtool: entry_SYSCALL_compat+0x0: ANNOTATE_NOENDBR on ENDBR .. which is fine. Just remove the warning. 
Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/142341a5dafdfc788e4c95b9e226a6eefc9b626e.1660839773.git.jpoimboe@kernel.org --- tools/objtool/check.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b012d987a658..8b8c8f74a775 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2102,9 +2102,6 @@ static int read_noendbr_hints(struct objtool_file *file) return -1; } - if (insn->type == INSN_ENDBR) - WARN_FUNC("ANNOTATE_NOENDBR on ENDBR", insn->sec, insn->offset); - insn->noendbr = 1; } -- cgit v1.2.3 From b2f10cd4e805eb647773df273eb1a6ff9e6ea45d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 14 Jun 2022 07:33:51 -0700 Subject: perf cpumap: Fix alignment for masks in event encoding A mask encoding of a cpu map is laid out as: u16 nr u16 long_size unsigned long mask[]; However, the mask may be 8-byte aligned meaning there is a 4-byte pad after long_size. This means 32-bit and 64-bit builds see the mask as being at different offsets. On top of this the structure is in the byte data[] encoded as: u16 type char data[] This means the mask's struct isn't the required 4 or 8 byte aligned, but is offset by 2. Consequently the long reads and writes are causing undefined behavior as the alignment is broken. Fix the mask struct by creating explicit 32 and 64-bit variants, use a union to avoid data[] and casts; the struct must be packed so the layout matches the existing perf.data layout. Taking an address of a member of a packed struct breaks alignment so pass the packed perf_record_cpu_map_data to functions, so they can access variables with the right alignment. As the 64-bit version has 4 bytes of padding, optimize writing to only write the 32-bit version. Committer notes: Disable warnings about 'packed' that break the build in some arches like riscv64, but just around that specific struct. 
Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Athira Jajeev Cc: Colin Ian King Cc: Dave Marchevsky Cc: German Gomez Cc: Gustavo A. R. Silva Cc: Ingo Molnar Cc: James Clark Cc: Kees Kook Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Song Liu Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220614143353.1559597-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/event.h | 47 ++++++++++++++++++++-- tools/perf/tests/cpumap.c | 19 +++++---- tools/perf/util/cpumap.c | 80 +++++++++++++++++++++++++++++-------- tools/perf/util/cpumap.h | 4 +- tools/perf/util/session.c | 30 +++++++------- tools/perf/util/synthetic-events.c | 34 +++++++++------- 6 files changed, 154 insertions(+), 60 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 556bb06798f2..57f54781f5ed 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -6,6 +6,7 @@ #include #include #include +#include #include /* pid_t */ #define event_contains(obj, mem) ((obj).header.size > offsetof(typeof(obj), mem)) @@ -153,22 +154,60 @@ enum { PERF_CPU_MAP__MASK = 1, }; +/* + * Array encoding of a perf_cpu_map where nr is the number of entries in cpu[] + * and each entry is a value for a CPU in the map. + */ struct cpu_map_entries { __u16 nr; __u16 cpu[]; }; -struct perf_record_record_cpu_map { +/* Bitmap encoding of a perf_cpu_map where bitmap entries are 32-bit. */ +struct perf_record_mask_cpu_map32 { + /* Number of mask values. */ __u16 nr; + /* Constant 4. */ __u16 long_size; - unsigned long mask[]; + /* Bitmap data. */ + __u32 mask[]; }; -struct perf_record_cpu_map_data { +/* Bitmap encoding of a perf_cpu_map where bitmap entries are 64-bit. */ +struct perf_record_mask_cpu_map64 { + /* Number of mask values. */ + __u16 nr; + /* Constant 8. 
*/ + __u16 long_size; + /* Legacy padding. */ + char __pad[4]; + /* Bitmap data. */ + __u64 mask[]; +}; + +/* + * 'struct perf_record_cpu_map_data' is packed as unfortunately an earlier + * version had unaligned data and we wish to retain file format compatibility. + * -irogers + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" + +struct __packed perf_record_cpu_map_data { __u16 type; - char data[]; + union { + /* Used when type == PERF_CPU_MAP__CPUS. */ + struct cpu_map_entries cpus_data; + /* Used when type == PERF_CPU_MAP__MASK and long_size == 4. */ + struct perf_record_mask_cpu_map32 mask32_data; + /* Used when type == PERF_CPU_MAP__MASK and long_size == 8. */ + struct perf_record_mask_cpu_map64 mask64_data; + }; }; +#pragma GCC diagnostic pop + struct perf_record_cpu_map { struct perf_event_header header; struct perf_record_cpu_map_data data; diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index f94929ebb54b..7ea150cdc137 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -17,21 +17,23 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused, struct machine *machine __maybe_unused) { struct perf_record_cpu_map *map_event = &event->cpu_map; - struct perf_record_record_cpu_map *mask; struct perf_record_cpu_map_data *data; struct perf_cpu_map *map; int i; + unsigned int long_size; data = &map_event->data; TEST_ASSERT_VAL("wrong type", data->type == PERF_CPU_MAP__MASK); - mask = (struct perf_record_record_cpu_map *)data->data; + long_size = data->mask32_data.long_size; - TEST_ASSERT_VAL("wrong nr", mask->nr == 1); + TEST_ASSERT_VAL("wrong long_size", long_size == 4 || long_size == 8); + + TEST_ASSERT_VAL("wrong nr", data->mask32_data.nr == 1); for (i = 0; i < 20; i++) { - TEST_ASSERT_VAL("wrong cpu", test_bit(i, mask->mask)); + TEST_ASSERT_VAL("wrong cpu", perf_record_cpu_map_data__test_bit(i, data)); } map = cpu_map__new_data(data); @@ 
-51,7 +53,6 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, struct machine *machine __maybe_unused) { struct perf_record_cpu_map *map_event = &event->cpu_map; - struct cpu_map_entries *cpus; struct perf_record_cpu_map_data *data; struct perf_cpu_map *map; @@ -59,11 +60,9 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong type", data->type == PERF_CPU_MAP__CPUS); - cpus = (struct cpu_map_entries *)data->data; - - TEST_ASSERT_VAL("wrong nr", cpus->nr == 2); - TEST_ASSERT_VAL("wrong cpu", cpus->cpu[0] == 1); - TEST_ASSERT_VAL("wrong cpu", cpus->cpu[1] == 256); + TEST_ASSERT_VAL("wrong nr", data->cpus_data.nr == 2); + TEST_ASSERT_VAL("wrong cpu", data->cpus_data.cpu[0] == 1); + TEST_ASSERT_VAL("wrong cpu", data->cpus_data.cpu[1] == 256); map = cpu_map__new_data(data); TEST_ASSERT_VAL("wrong nr", perf_cpu_map__nr(map) == 2); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 12b2243222b0..ae43fb88f444 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -22,54 +22,102 @@ static int max_node_num; */ static int *cpunode_map; -static struct perf_cpu_map *cpu_map__from_entries(struct cpu_map_entries *cpus) +bool perf_record_cpu_map_data__test_bit(int i, + const struct perf_record_cpu_map_data *data) +{ + int bit_word32 = i / 32; + __u32 bit_mask32 = 1U << (i & 31); + int bit_word64 = i / 64; + __u64 bit_mask64 = ((__u64)1) << (i & 63); + + return (data->mask32_data.long_size == 4) + ? 
(bit_word32 < data->mask32_data.nr) && + (data->mask32_data.mask[bit_word32] & bit_mask32) != 0 + : (bit_word64 < data->mask64_data.nr) && + (data->mask64_data.mask[bit_word64] & bit_mask64) != 0; +} + +/* Read ith mask value from data into the given 64-bit sized bitmap */ +static void perf_record_cpu_map_data__read_one_mask(const struct perf_record_cpu_map_data *data, + int i, unsigned long *bitmap) +{ +#if __SIZEOF_LONG__ == 8 + if (data->mask32_data.long_size == 4) + bitmap[0] = data->mask32_data.mask[i]; + else + bitmap[0] = data->mask64_data.mask[i]; +#else + if (data->mask32_data.long_size == 4) { + bitmap[0] = data->mask32_data.mask[i]; + bitmap[1] = 0; + } else { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bitmap[0] = (unsigned long)(data->mask64_data.mask[i] >> 32); + bitmap[1] = (unsigned long)data->mask64_data.mask[i]; +#else + bitmap[0] = (unsigned long)data->mask64_data.mask[i]; + bitmap[1] = (unsigned long)(data->mask64_data.mask[i] >> 32); +#endif + } +#endif +} +static struct perf_cpu_map *cpu_map__from_entries(const struct perf_record_cpu_map_data *data) { struct perf_cpu_map *map; - map = perf_cpu_map__empty_new(cpus->nr); + map = perf_cpu_map__empty_new(data->cpus_data.nr); if (map) { unsigned i; - for (i = 0; i < cpus->nr; i++) { + for (i = 0; i < data->cpus_data.nr; i++) { /* * Special treatment for -1, which is not real cpu number, * and we need to use (int) -1 to initialize map[i], * otherwise it would become 65535. 
*/ - if (cpus->cpu[i] == (u16) -1) + if (data->cpus_data.cpu[i] == (u16) -1) map->map[i].cpu = -1; else - map->map[i].cpu = (int) cpus->cpu[i]; + map->map[i].cpu = (int) data->cpus_data.cpu[i]; } } return map; } -static struct perf_cpu_map *cpu_map__from_mask(struct perf_record_record_cpu_map *mask) +static struct perf_cpu_map *cpu_map__from_mask(const struct perf_record_cpu_map_data *data) { + DECLARE_BITMAP(local_copy, 64); + int weight = 0, mask_nr = data->mask32_data.nr; struct perf_cpu_map *map; - int nr, nbits = mask->nr * mask->long_size * BITS_PER_BYTE; - nr = bitmap_weight(mask->mask, nbits); + for (int i = 0; i < mask_nr; i++) { + perf_record_cpu_map_data__read_one_mask(data, i, local_copy); + weight += bitmap_weight(local_copy, 64); + } + + map = perf_cpu_map__empty_new(weight); + if (!map) + return NULL; - map = perf_cpu_map__empty_new(nr); - if (map) { - int cpu, i = 0; + for (int i = 0, j = 0; i < mask_nr; i++) { + int cpus_per_i = (i * data->mask32_data.long_size * BITS_PER_BYTE); + int cpu; - for_each_set_bit(cpu, mask->mask, nbits) - map->map[i++].cpu = cpu; + perf_record_cpu_map_data__read_one_mask(data, i, local_copy); + for_each_set_bit(cpu, local_copy, 64) + map->map[j++].cpu = cpu + cpus_per_i; } return map; } -struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data) +struct perf_cpu_map *cpu_map__new_data(const struct perf_record_cpu_map_data *data) { if (data->type == PERF_CPU_MAP__CPUS) - return cpu_map__from_entries((struct cpu_map_entries *)data->data); + return cpu_map__from_entries(data); else - return cpu_map__from_mask((struct perf_record_record_cpu_map *)data->data); + return cpu_map__from_mask(data); } size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp) diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 703ae6d3386e..fa8a5acdcae1 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -37,9 +37,11 @@ struct cpu_aggr_map { struct perf_record_cpu_map_data; +bool 
perf_record_cpu_map_data__test_bit(int i, const struct perf_record_cpu_map_data *data); + struct perf_cpu_map *perf_cpu_map__empty_new(int nr); -struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data); +struct perf_cpu_map *cpu_map__new_data(const struct perf_record_cpu_map_data *data); size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size); size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size); size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 98e16659a149..61bb9675e044 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -916,30 +916,30 @@ static void perf_event__cpu_map_swap(union perf_event *event, bool sample_id_all __maybe_unused) { struct perf_record_cpu_map_data *data = &event->cpu_map.data; - struct cpu_map_entries *cpus; - struct perf_record_record_cpu_map *mask; - unsigned i; data->type = bswap_16(data->type); switch (data->type) { case PERF_CPU_MAP__CPUS: - cpus = (struct cpu_map_entries *)data->data; - - cpus->nr = bswap_16(cpus->nr); + data->cpus_data.nr = bswap_16(data->cpus_data.nr); - for (i = 0; i < cpus->nr; i++) - cpus->cpu[i] = bswap_16(cpus->cpu[i]); + for (unsigned i = 0; i < data->cpus_data.nr; i++) + data->cpus_data.cpu[i] = bswap_16(data->cpus_data.cpu[i]); break; case PERF_CPU_MAP__MASK: - mask = (struct perf_record_record_cpu_map *)data->data; - - mask->nr = bswap_16(mask->nr); - mask->long_size = bswap_16(mask->long_size); + data->mask32_data.long_size = bswap_16(data->mask32_data.long_size); - switch (mask->long_size) { - case 4: mem_bswap_32(&mask->mask, mask->nr); break; - case 8: mem_bswap_64(&mask->mask, mask->nr); break; + switch (data->mask32_data.long_size) { + case 4: + data->mask32_data.nr = bswap_16(data->mask32_data.nr); + for (unsigned i = 0; i < data->mask32_data.nr; i++) + data->mask32_data.mask[i] = bswap_32(data->mask32_data.mask[i]); + break; + case 
8: + data->mask64_data.nr = bswap_16(data->mask64_data.nr); + for (unsigned i = 0; i < data->mask64_data.nr; i++) + data->mask64_data.mask[i] = bswap_64(data->mask64_data.mask[i]); + break; default: pr_err("cpu_map swap: unsupported long size\n"); } diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 3ae7c0f54157..59747c440bd5 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1184,27 +1184,33 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool, return err; } -static void synthesize_cpus(struct cpu_map_entries *cpus, +static void synthesize_cpus(struct perf_record_cpu_map_data *data, const struct perf_cpu_map *map) { int i, map_nr = perf_cpu_map__nr(map); - cpus->nr = map_nr; + data->cpus_data.nr = map_nr; for (i = 0; i < map_nr; i++) - cpus->cpu[i] = perf_cpu_map__cpu(map, i).cpu; + data->cpus_data.cpu[i] = perf_cpu_map__cpu(map, i).cpu; } -static void synthesize_mask(struct perf_record_record_cpu_map *mask, +static void synthesize_mask(struct perf_record_cpu_map_data *data, const struct perf_cpu_map *map, int max) { - int i; + int idx; + struct perf_cpu cpu; + + /* Due to padding, the 4bytes per entry mask variant is always smaller. 
*/ + data->mask32_data.nr = BITS_TO_U32(max); + data->mask32_data.long_size = 4; - mask->nr = BITS_TO_LONGS(max); - mask->long_size = sizeof(long); + perf_cpu_map__for_each_cpu(cpu, idx, map) { + int bit_word = cpu.cpu / 32; + __u32 bit_mask = 1U << (cpu.cpu & 31); - for (i = 0; i < perf_cpu_map__nr(map); i++) - set_bit(perf_cpu_map__cpu(map, i).cpu, mask->mask); + data->mask32_data.mask[bit_word] |= bit_mask; + } } static size_t cpus_size(const struct perf_cpu_map *map) @@ -1215,7 +1221,7 @@ static size_t cpus_size(const struct perf_cpu_map *map) static size_t mask_size(const struct perf_cpu_map *map, int *max) { *max = perf_cpu_map__max(map).cpu; - return sizeof(struct perf_record_record_cpu_map) + BITS_TO_LONGS(*max) * sizeof(long); + return sizeof(struct perf_record_mask_cpu_map32) + BITS_TO_U32(*max) * sizeof(__u32); } static void *cpu_map_data__alloc(const struct perf_cpu_map *map, size_t *size, @@ -1248,7 +1254,7 @@ static void *cpu_map_data__alloc(const struct perf_cpu_map *map, size_t *size, *type = PERF_CPU_MAP__MASK; } - *size += sizeof(struct perf_record_cpu_map_data); + *size += sizeof(__u16); /* For perf_record_cpu_map_data.type. 
*/ *size = PERF_ALIGN(*size, sizeof(u64)); return zalloc(*size); } @@ -1261,10 +1267,10 @@ static void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, switch (type) { case PERF_CPU_MAP__CPUS: - synthesize_cpus((struct cpu_map_entries *) data->data, map); + synthesize_cpus(data, map); break; case PERF_CPU_MAP__MASK: - synthesize_mask((struct perf_record_record_cpu_map *)data->data, map, max); + synthesize_mask(data, map, max); default: break; } @@ -1272,7 +1278,7 @@ static void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, static struct perf_record_cpu_map *cpu_map_event__new(const struct perf_cpu_map *map) { - size_t size = sizeof(struct perf_record_cpu_map); + size_t size = sizeof(struct perf_event_header); struct perf_record_cpu_map *event; int max; u16 type; -- cgit v1.2.3 From cf1258ac37afe80dbf277add5f1464477b46c9f0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 19 Jun 2021 10:09:08 -0300 Subject: perf beauty: Update copy of linux/socket.h with the kernel sources To pick the changes in: 7fa875b8e53c288d ("net: copy from user before calling __copy_msghdr") ebe73a284f4de8c5 ("net: Allow custom iter handler in msghdr") 7c701d92b2b5e517 ("skbuff: carry external ubuf_info in msghdr") c04245328dd7e915 ("net: make __sys_accept4_file() static") That don't result in any changes in the tables generated from that header. This silences this perf build warning: Warning: Kernel ABI header at 'tools/perf/trace/beauty/include/linux/socket.h' differs from latest version at 'include/linux/socket.h' diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Cc: David Ahern Cc: David S. 
Miller Cc: Dylan Yudaken Cc: Jakub Kicinski Cc: Jens Axboe Cc: Pavel Begunkov Cc: Yajun Deng Link: https://lore.kernel.org/lkml/YvzYs+F+Xzq8Hvvp@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/linux/socket.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 17311ad9f9af..de3701a2a212 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -14,6 +14,8 @@ struct file; struct pid; struct cred; struct socket; +struct sock; +struct sk_buff; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) @@ -69,6 +71,9 @@ struct msghdr { unsigned int msg_flags; /* flags on received message */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ + struct ubuf_info *msg_ubuf; + int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); }; struct user_msghdr { @@ -416,10 +421,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); -extern int __copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec __user **uiov, size_t *nsegs); +extern int __copy_msghdr(struct msghdr *kmsg, + struct user_msghdr *umsg, + struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, @@ -428,10 +432,6 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); -extern int 
__sys_accept4_file(struct file *file, unsigned file_flags, - struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags, - unsigned long nofile); extern struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); -- cgit v1.2.3 From 7f7f86a7bdd694bfb214479afb6a1f7266bb4d22 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 7 Aug 2020 08:45:47 -0300 Subject: tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes in: 2b1299322016731d ("x86/speculation: Add RSB VM Exit protections") 4af184ee8b2c0a69 ("tools/power turbostat: dump secondary Turbo-Ratio-Limit") 4ad3278df6fe2b08 ("x86/speculation: Disable RRSBA behavior") d7caac991feeef1b ("x86/cpu/amd: Add Spectral Chicken") 6ad0ad2bf8a67e27 ("x86/bugs: Report Intel retbleed vulnerability") c59a1f106f5cd484 ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") 465932db25f36648 ("x86/cpu: Add new VMX feature, Tertiary VM-Execution control") 027bbb884be006b0 ("KVM: x86/speculation: Disable Fill buffer clear within guests") 51802186158c74a0 ("x86/speculation/mmio: Enumerate Processor MMIO Stale Data bug") Addressing these tools/perf build warnings: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' That makes the beautification scripts to pick some new entries: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after --- before 2022-08-17 09:05:13.938246475 -0300 +++ after 2022-08-17 09:05:22.221455851 -0300 @@ -161,6 +161,7 @@ [0x0000048f] = "IA32_VMX_TRUE_EXIT_CTLS", [0x00000490] = "IA32_VMX_TRUE_ENTRY_CTLS", [0x00000491] = 
"IA32_VMX_VMFUNC", + [0x00000492] = "IA32_VMX_PROCBASED_CTLS3", [0x000004c1] = "IA32_PMC0", [0x000004d0] = "IA32_MCG_EXT_CTL", [0x00000560] = "IA32_RTIT_OUTPUT_BASE", @@ -212,6 +213,7 @@ [0x0000064D] = "PLATFORM_ENERGY_STATUS", [0x0000064e] = "PPERF", [0x0000064f] = "PERF_LIMIT_REASONS", + [0x00000650] = "SECONDARY_TURBO_RATIO_LIMIT", [0x00000658] = "PKG_WEIGHTED_CORE_C0_RES", [0x00000659] = "PKG_ANY_CORE_C0_RES", [0x0000065A] = "PKG_ANY_GFXE_C0_RES", $ Now one can trace systemwide asking to see backtraces to where those MSRs are being read/written, see this example with a previous update: # perf trace -e msr:*_msr/max-stack=32/ --filter="msr>=IA32_U_CET && msr<=IA32_INT_SSP_TAB" ^C# If we use -v (verbose mode) we can see what it does behind the scenes: # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr>=IA32_U_CET && msr<=IA32_INT_SSP_TAB" Using CPUID AuthenticAMD-25-21-0 0x6a0 0x6a8 New filter for msr:read_msr: (msr>=0x6a0 && msr<=0x6a8) && (common_pid != 597499 && common_pid != 3313) 0x6a0 0x6a8 New filter for msr:write_msr: (msr>=0x6a0 && msr<=0x6a8) && (common_pid != 597499 && common_pid != 3313) mmap size 528384B ^C# Example with a frequent msr: # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr==IA32_SPEC_CTRL" --max-events 2 Using CPUID AuthenticAMD-25-21-0 0x48 New filter for msr:read_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841) 0x48 New filter for msr:write_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841) mmap size 528384B Looking at the vmlinux_path (8 entries long) symsrc__init: build id mismatch for vmlinux. 
Using /proc/kcore for kernel data Using /proc/kallsyms for symbols 0.000 Timer/2525383 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6) do_trace_write_msr ([kernel.kallsyms]) do_trace_write_msr ([kernel.kallsyms]) __switch_to_xtra ([kernel.kallsyms]) __switch_to ([kernel.kallsyms]) __schedule ([kernel.kallsyms]) schedule ([kernel.kallsyms]) futex_wait_queue_me ([kernel.kallsyms]) futex_wait ([kernel.kallsyms]) do_futex ([kernel.kallsyms]) __x64_sys_futex ([kernel.kallsyms]) do_syscall_64 ([kernel.kallsyms]) entry_SYSCALL_64_after_hwframe ([kernel.kallsyms]) __futex_abstimed_wait_common64 (/usr/lib64/libpthread-2.33.so) 0.030 :0/0 msr:write_msr(msr: IA32_SPEC_CTRL, val: 2) do_trace_write_msr ([kernel.kallsyms]) do_trace_write_msr ([kernel.kallsyms]) __switch_to_xtra ([kernel.kallsyms]) __switch_to ([kernel.kallsyms]) __schedule ([kernel.kallsyms]) schedule_idle ([kernel.kallsyms]) do_idle ([kernel.kallsyms]) cpu_startup_entry ([kernel.kallsyms]) secondary_startup_64_no_verify ([kernel.kallsyms]) # Cc: Adrian Hunter Cc: Daniel Sneddon Cc: Ian Rogers Cc: Jiri Olsa Cc: Len Brown Cc: Like Xu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Robert Hoo Signed-off-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/lkml/YvzbT24m2o5U%2F7+q@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index e057e039173c..6674bdb096f3 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -235,6 +235,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + 
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -392,6 +398,7 @@ #define MSR_TURBO_ACTIVATION_RATIO 0x0000064C #define MSR_PLATFORM_ENERGY_STATUS 0x0000064D +#define MSR_SECONDARY_TURBO_RATIO_LIMIT 0x00000650 #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 @@ -1022,6 +1029,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 -- cgit v1.2.3 From fabe0c61d842637b722344bcd49bfb1b76e2cc68 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 17 Dec 2020 14:58:51 -0300 Subject: tools headers UAPI: Sync linux/fscrypt.h with the kernel sources To pick the changes from: 6b2a51ff03bf0c54 ("fscrypt: Add HCTR2 support for filename encryption") That don't result in any changes in tooling, just causes this to be rebuilt: CC /tmp/build/perf-urgent/trace/beauty/sync_file_range.o LD /tmp/build/perf-urgent/trace/beauty/perf-in.o addressing this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/fscrypt.h' differs from latest version at 'include/uapi/linux/fscrypt.h' diff -u tools/include/uapi/linux/fscrypt.h include/uapi/linux/fscrypt.h Cc: Adrian Hunter Cc: Herbert Xu Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Nathan Huckleberry Link: https://lore.kernel.org/lkml/Yvzl8C7O1b+hf9GS@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fscrypt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/fscrypt.h b/tools/include/uapi/linux/fscrypt.h index 9f4428be3e36..a756b29afcc2 100644 --- a/tools/include/uapi/linux/fscrypt.h +++ b/tools/include/uapi/linux/fscrypt.h @@ -27,7 +27,8 @@ #define FSCRYPT_MODE_AES_128_CBC 5 #define 
FSCRYPT_MODE_AES_128_CTS 6 #define FSCRYPT_MODE_ADIANTUM 9 -/* If adding a mode number > 9, update FSCRYPT_MODE_MAX in fscrypt_private.h */ +#define FSCRYPT_MODE_AES_256_HCTR2 10 +/* If adding a mode number > 10, update FSCRYPT_MODE_MAX in fscrypt_private.h */ /* * Legacy policy version; ad-hoc KDF and no key verification. -- cgit v1.2.3 From 62ed93d1996b3aaeadda59b25ac5b70be59b8a61 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: tools headers cpufeatures: Sync with the kernel sources To pick the changes from: 2b1299322016731d ("x86/speculation: Add RSB VM Exit protections") 28a99e95f55c6185 ("x86/amd: Use IBPB for firmware calls") 4ad3278df6fe2b08 ("x86/speculation: Disable RRSBA behavior") 26aae8ccbc197223 ("x86/cpu/amd: Enumerate BTC_NO") 9756bba28470722d ("x86/speculation: Fill RSB on vmexit for IBRS") 3ebc170068885b6f ("x86/bugs: Add retbleed=ibpb") 2dbb887e875b1de3 ("x86/entry: Add kernel IBRS implementation") 6b80b59b35557065 ("x86/bugs: Report AMD retbleed vulnerability") a149180fbcf336e9 ("x86: Add magic AMD return-thunk") 15e67227c49a5783 ("x86: Undo return-thunk damage") a883d624aed463c8 ("x86/cpufeatures: Move RETPOLINE flags to word 11") aae99a7c9ab371b2 ("x86/cpufeatures: Introduce x2AVIC CPUID bit") 6f33a9daff9f0790 ("x86: Fix comment for X86_FEATURE_ZEN") 51802186158c74a0 ("x86/speculation/mmio: Enumerate Processor MMIO Stale Data bug") This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Adrian Hunter Cc: Alexandre Chartre Cc: Andrew Cooper Cc: Borislav Petkov Cc: Daniel Sneddon Cc: Dave Hansen Cc: Ian Rogers Cc: Jiri Olsa Cc: Josh 
Poimboeuf Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Suravee Suthikulpanit Cc: Wyes Karny Link: https://lore.kernel.org/lkml/Yvznmu5oHv0ZDN2w@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 8323ac5b7eee..235dc85c91c3 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ +#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -303,7 +303,7 @@ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -354,6 +354,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define 
X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ @@ -457,5 +458,6 @@ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ -- cgit v1.2.3 From 54cd4cde7c1edb869603073167cabab0b760fff6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Nov 2021 11:08:31 -0300 Subject: tools headers UAPI: Sync drm/i915_drm.h with the kernel sources To pick up the changes in: a913bde810fc464d ("drm/i915: Update i915 uapi documentation") 525e93f6317a08a0 ("drm/i915/uapi: add NEEDS_CPU_ACCESS hint") 141f733bb3abb000 ("drm/i915/uapi: expose the avail tracking") 3f4309cbdc849637 ("drm/i915/uapi: add probed_cpu_visible_size") a50794f26f52c66c ("uapi/drm/i915: Document memory residency and Flat-CCS capability of obj") That don't add any new ioctl, so no changes in tooling. 
This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/drm/i915_drm.h' differs from latest version at 'include/uapi/drm/i915_drm.h' diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Matthew Auld Cc: Matt Roper Cc: Namhyung Kim Cc: Niranjana Vishwanathapura Cc: Ramalingam C Link: http://lore.kernel.org/lkml/Yvzrp9RFIeEkb5fI@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 387 +++++++++++++++++++++++++++++--------- 1 file changed, 300 insertions(+), 87 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index b28ff5d88145..520ad2691a99 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -751,14 +751,27 @@ typedef struct drm_i915_irq_wait { /* Must be kept compact -- no holes and well documented */ -typedef struct drm_i915_getparam { +/** + * struct drm_i915_getparam - Driver parameter query structure. + */ +struct drm_i915_getparam { + /** @param: Driver parameter to query. */ __s32 param; - /* + + /** + * @value: Address of memory where queried value should be put. + * * WARNING: Using pointers instead of fixed-size u64 means we need to write * compat32 code. Don't repeat this mistake. */ int __user *value; -} drm_i915_getparam_t; +}; + +/** + * typedef drm_i915_getparam_t - Driver parameter query structure. + * See struct drm_i915_getparam. + */ +typedef struct drm_i915_getparam drm_i915_getparam_t; /* Ioctl to set kernel params: */ @@ -1239,76 +1252,119 @@ struct drm_i915_gem_exec_object2 { __u64 rsvd2; }; +/** + * struct drm_i915_gem_exec_fence - An input or output fence for the execbuf + * ioctl. + * + * The request will wait for input fence to signal before submission. + * + * The returned output fence will be signaled after the completion of the + * request. 
+ */ struct drm_i915_gem_exec_fence { - /** - * User's handle for a drm_syncobj to wait on or signal. - */ + /** @handle: User's handle for a drm_syncobj to wait on or signal. */ __u32 handle; + /** + * @flags: Supported flags are: + * + * I915_EXEC_FENCE_WAIT: + * Wait for the input fence before request submission. + * + * I915_EXEC_FENCE_SIGNAL: + * Return request completion fence as output + */ + __u32 flags; #define I915_EXEC_FENCE_WAIT (1<<0) #define I915_EXEC_FENCE_SIGNAL (1<<1) #define __I915_EXEC_FENCE_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_SIGNAL << 1)) - __u32 flags; }; -/* - * See drm_i915_gem_execbuffer_ext_timeline_fences. - */ -#define DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES 0 - -/* +/** + * struct drm_i915_gem_execbuffer_ext_timeline_fences - Timeline fences + * for execbuf ioctl. + * * This structure describes an array of drm_syncobj and associated points for * timeline variants of drm_syncobj. It is invalid to append this structure to * the execbuf if I915_EXEC_FENCE_ARRAY is set. */ struct drm_i915_gem_execbuffer_ext_timeline_fences { +#define DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES 0 + /** @base: Extension link. See struct i915_user_extension. */ struct i915_user_extension base; /** - * Number of element in the handles_ptr & value_ptr arrays. + * @fence_count: Number of elements in the @handles_ptr & @value_ptr + * arrays. */ __u64 fence_count; /** - * Pointer to an array of struct drm_i915_gem_exec_fence of length - * fence_count. + * @handles_ptr: Pointer to an array of struct drm_i915_gem_exec_fence + * of length @fence_count. */ __u64 handles_ptr; /** - * Pointer to an array of u64 values of length fence_count. Values - * must be 0 for a binary drm_syncobj. A Value of 0 for a timeline - * drm_syncobj is invalid as it turns a drm_syncobj into a binary one. + * @values_ptr: Pointer to an array of u64 values of length + * @fence_count. + * Values must be 0 for a binary drm_syncobj. 
A Value of 0 for a + * timeline drm_syncobj is invalid as it turns a drm_syncobj into a + * binary one. */ __u64 values_ptr; }; +/** + * struct drm_i915_gem_execbuffer2 - Structure for DRM_I915_GEM_EXECBUFFER2 + * ioctl. + */ struct drm_i915_gem_execbuffer2 { - /** - * List of gem_exec_object2 structs - */ + /** @buffers_ptr: Pointer to a list of gem_exec_object2 structs */ __u64 buffers_ptr; + + /** @buffer_count: Number of elements in @buffers_ptr array */ __u32 buffer_count; - /** Offset in the batchbuffer to start execution from. */ + /** + * @batch_start_offset: Offset in the batchbuffer to start execution + * from. + */ __u32 batch_start_offset; - /** Bytes used in batchbuffer from batch_start_offset */ + + /** + * @batch_len: Length in bytes of the batch buffer, starting from the + * @batch_start_offset. If 0, length is assumed to be the batch buffer + * object size. + */ __u32 batch_len; + + /** @DR1: deprecated */ __u32 DR1; + + /** @DR4: deprecated */ __u32 DR4; + + /** @num_cliprects: See @cliprects_ptr */ __u32 num_cliprects; + /** - * This is a struct drm_clip_rect *cliprects if I915_EXEC_FENCE_ARRAY - * & I915_EXEC_USE_EXTENSIONS are not set. + * @cliprects_ptr: Kernel clipping was a DRI1 misfeature. + * + * It is invalid to use this field if I915_EXEC_FENCE_ARRAY or + * I915_EXEC_USE_EXTENSIONS flags are not set. * * If I915_EXEC_FENCE_ARRAY is set, then this is a pointer to an array - * of struct drm_i915_gem_exec_fence and num_cliprects is the length - * of the array. + * of &drm_i915_gem_exec_fence and @num_cliprects is the length of the + * array. * * If I915_EXEC_USE_EXTENSIONS is set, then this is a pointer to a - * single struct i915_user_extension and num_cliprects is 0. + * single &i915_user_extension and num_cliprects is 0. 
*/ __u64 cliprects_ptr; + + /** @flags: Execbuf flags */ + __u64 flags; #define I915_EXEC_RING_MASK (0x3f) #define I915_EXEC_DEFAULT (0<<0) #define I915_EXEC_RENDER (1<<0) @@ -1326,10 +1382,6 @@ struct drm_i915_gem_execbuffer2 { #define I915_EXEC_CONSTANTS_REL_GENERAL (0<<6) /* default */ #define I915_EXEC_CONSTANTS_ABSOLUTE (1<<6) #define I915_EXEC_CONSTANTS_REL_SURFACE (2<<6) /* gen4/5 only */ - __u64 flags; - __u64 rsvd1; /* now used for context info */ - __u64 rsvd2; -}; /** Resets the SO write offset registers for transform feedback on gen7. */ #define I915_EXEC_GEN7_SOL_RESET (1<<8) @@ -1432,9 +1484,23 @@ struct drm_i915_gem_execbuffer2 { * drm_i915_gem_execbuffer_ext enum. */ #define I915_EXEC_USE_EXTENSIONS (1 << 21) - #define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_USE_EXTENSIONS << 1)) + /** @rsvd1: Context id */ + __u64 rsvd1; + + /** + * @rsvd2: in and out sync_file file descriptors. + * + * When I915_EXEC_FENCE_IN or I915_EXEC_FENCE_SUBMIT flag is set, the + * lower 32 bits of this field will have the in sync_file fd (input). + * + * When I915_EXEC_FENCE_OUT flag is set, the upper 32 bits of this + * field will have the out sync_file fd (output). + */ + __u64 rsvd2; +}; + #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff) #define i915_execbuffer2_set_context_id(eb2, context) \ (eb2).rsvd1 = context & I915_EXEC_CONTEXT_ID_MASK @@ -1814,19 +1880,58 @@ struct drm_i915_gem_context_create { __u32 pad; }; +/** + * struct drm_i915_gem_context_create_ext - Structure for creating contexts. + */ struct drm_i915_gem_context_create_ext { - __u32 ctx_id; /* output: id of new context*/ + /** @ctx_id: Id of the created context (output) */ + __u32 ctx_id; + + /** + * @flags: Supported flags are: + * + * I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS: + * + * Extensions may be appended to this structure and driver must check + * for those. See @extensions. + * + * I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE + * + * Created context will have single timeline. 
+ */ __u32 flags; #define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS (1u << 0) #define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE (1u << 1) #define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \ (-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1)) + + /** + * @extensions: Zero-terminated chain of extensions. + * + * I915_CONTEXT_CREATE_EXT_SETPARAM: + * Context parameter to set or query during context creation. + * See struct drm_i915_gem_context_create_ext_setparam. + * + * I915_CONTEXT_CREATE_EXT_CLONE: + * This extension has been removed. On the off chance someone somewhere + * has attempted to use it, never re-use this extension number. + */ __u64 extensions; +#define I915_CONTEXT_CREATE_EXT_SETPARAM 0 +#define I915_CONTEXT_CREATE_EXT_CLONE 1 }; +/** + * struct drm_i915_gem_context_param - Context parameter to set or query. + */ struct drm_i915_gem_context_param { + /** @ctx_id: Context id */ __u32 ctx_id; + + /** @size: Size of the parameter @value */ __u32 size; + + /** @param: Parameter to set or query */ __u64 param; #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1 /* I915_CONTEXT_PARAM_NO_ZEROMAP has been removed. On the off chance @@ -1973,6 +2078,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_PROTECTED_CONTENT 0xd /* Must be kept compact -- no holes and well documented */ + /** @value: Context parameter value to be set or queried */ __u64 value; }; @@ -2371,23 +2477,29 @@ struct i915_context_param_engines { struct i915_engine_class_instance engines[N__]; \ } __attribute__((packed)) name__ +/** + * struct drm_i915_gem_context_create_ext_setparam - Context parameter + * to set or query during context creation. + */ struct drm_i915_gem_context_create_ext_setparam { -#define I915_CONTEXT_CREATE_EXT_SETPARAM 0 + /** @base: Extension link. See struct i915_user_extension. */ struct i915_user_extension base; + + /** + * @param: Context parameter to set or query. + * See struct drm_i915_gem_context_param. 
+ */ struct drm_i915_gem_context_param param; }; -/* This API has been removed. On the off chance someone somewhere has - * attempted to use it, never re-use this extension number. - */ -#define I915_CONTEXT_CREATE_EXT_CLONE 1 - struct drm_i915_gem_context_destroy { __u32 ctx_id; __u32 pad; }; -/* +/** + * struct drm_i915_gem_vm_control - Structure to create or destroy VM. + * * DRM_I915_GEM_VM_CREATE - * * Create a new virtual memory address space (ppGTT) for use within a context @@ -2397,20 +2509,23 @@ struct drm_i915_gem_context_destroy { * The id of new VM (bound to the fd) for use with I915_CONTEXT_PARAM_VM is * returned in the outparam @id. * - * No flags are defined, with all bits reserved and must be zero. - * * An extension chain maybe provided, starting with @extensions, and terminated * by the @next_extension being 0. Currently, no extensions are defined. * * DRM_I915_GEM_VM_DESTROY - * - * Destroys a previously created VM id, specified in @id. + * Destroys a previously created VM id, specified in @vm_id. * * No extensions or flags are allowed currently, and so must be zero. */ struct drm_i915_gem_vm_control { + /** @extensions: Zero-terminated chain of extensions. */ __u64 extensions; + + /** @flags: reserved for future usage, currently MBZ */ __u32 flags; + + /** @vm_id: Id of the VM created or to be destroyed */ __u32 vm_id; }; @@ -3207,36 +3322,6 @@ struct drm_i915_gem_memory_class_instance { * struct drm_i915_memory_region_info - Describes one region as known to the * driver. * - * Note that we reserve some stuff here for potential future work. As an example - * we might want expose the capabilities for a given region, which could include - * things like if the region is CPU mappable/accessible, what are the supported - * mapping types etc. - * - * Note that to extend struct drm_i915_memory_region_info and struct - * drm_i915_query_memory_regions in the future the plan is to do the following: - * - * .. 
code-block:: C - * - * struct drm_i915_memory_region_info { - * struct drm_i915_gem_memory_class_instance region; - * union { - * __u32 rsvd0; - * __u32 new_thing1; - * }; - * ... - * union { - * __u64 rsvd1[8]; - * struct { - * __u64 new_thing2; - * __u64 new_thing3; - * ... - * }; - * }; - * }; - * - * With this things should remain source compatible between versions for - * userspace, even as we add new fields. - * * Note this is using both struct drm_i915_query_item and struct drm_i915_query. * For this new query we are adding the new query id DRM_I915_QUERY_MEMORY_REGIONS * at &drm_i915_query_item.query_id. @@ -3248,14 +3333,81 @@ struct drm_i915_memory_region_info { /** @rsvd0: MBZ */ __u32 rsvd0; - /** @probed_size: Memory probed by the driver (-1 = unknown) */ + /** + * @probed_size: Memory probed by the driver + * + * Note that it should not be possible to ever encounter a zero value + * here, also note that no current region type will ever return -1 here. + * Although for future region types, this might be a possibility. The + * same applies to the other size fields. + */ __u64 probed_size; - /** @unallocated_size: Estimate of memory remaining (-1 = unknown) */ + /** + * @unallocated_size: Estimate of memory remaining + * + * Requires CAP_PERFMON or CAP_SYS_ADMIN to get reliable accounting. + * Without this (or if this is an older kernel) the value here will + * always equal the @probed_size. Note this is only currently tracked + * for I915_MEMORY_CLASS_DEVICE regions (for other types the value here + * will always equal the @probed_size). + */ __u64 unallocated_size; - /** @rsvd1: MBZ */ - __u64 rsvd1[8]; + union { + /** @rsvd1: MBZ */ + __u64 rsvd1[8]; + struct { + /** + * @probed_cpu_visible_size: Memory probed by the driver + * that is CPU accessible. + * + * This will be always be <= @probed_size, and the + * remainder (if there is any) will not be CPU + * accessible. 
+ * + * On systems without small BAR, the @probed_size will + * always equal the @probed_cpu_visible_size, since all + * of it will be CPU accessible. + * + * Note this is only tracked for + * I915_MEMORY_CLASS_DEVICE regions (for other types the + * value here will always equal the @probed_size). + * + * Note that if the value returned here is zero, then + * this must be an old kernel which lacks the relevant + * small-bar uAPI support (including + * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS), but on + * such systems we should never actually end up with a + * small BAR configuration, assuming we are able to load + * the kernel module. Hence it should be safe to treat + * this the same as when @probed_cpu_visible_size == + * @probed_size. + */ + __u64 probed_cpu_visible_size; + + /** + * @unallocated_cpu_visible_size: Estimate of CPU + * visible memory remaining. + * + * Note this is only tracked for + * I915_MEMORY_CLASS_DEVICE regions (for other types the + * value here will always equal the + * @probed_cpu_visible_size). + * + * Requires CAP_PERFMON or CAP_SYS_ADMIN to get reliable + * accounting. Without this the value here will always + * equal the @probed_cpu_visible_size. Note this is only + * currently tracked for I915_MEMORY_CLASS_DEVICE + * regions (for other types the value here will also + * always equal the @probed_cpu_visible_size). + * + * If this is an older kernel the value here will be + * zero, see also @probed_cpu_visible_size. + */ + __u64 unallocated_cpu_visible_size; + }; + }; }; /** @@ -3329,11 +3481,11 @@ struct drm_i915_query_memory_regions { * struct drm_i915_gem_create_ext - Existing gem_create behaviour, with added * extension support using struct i915_user_extension. * - * Note that in the future we want to have our buffer flags here, at least for - * the stuff that is immutable. 
Previously we would have two ioctls, one to - * create the object with gem_create, and another to apply various parameters, - * however this creates some ambiguity for the params which are considered - * immutable. Also in general we're phasing out the various SET/GET ioctls. + * Note that new buffer flags should be added here, at least for the stuff that + * is immutable. Previously we would have two ioctls, one to create the object + * with gem_create, and another to apply various parameters, however this + * creates some ambiguity for the params which are considered immutable. Also in + * general we're phasing out the various SET/GET ioctls. */ struct drm_i915_gem_create_ext { /** @@ -3341,7 +3493,6 @@ struct drm_i915_gem_create_ext { * * The (page-aligned) allocated size for the object will be returned. * - * * DG2 64K min page size implications: * * On discrete platforms, starting from DG2, we have to contend with GTT @@ -3353,7 +3504,9 @@ struct drm_i915_gem_create_ext { * * Note that the returned size here will always reflect any required * rounding up done by the kernel, i.e 4K will now become 64K on devices - * such as DG2. + * such as DG2. The kernel will always select the largest minimum + * page-size for the set of possible placements as the value to use when + * rounding up the @size. * * Special DG2 GTT address alignment requirement: * @@ -3377,14 +3530,58 @@ struct drm_i915_gem_create_ext { * is deemed to be a good compromise. */ __u64 size; + /** * @handle: Returned handle for the object. * * Object handles are nonzero. */ __u32 handle; - /** @flags: MBZ */ + + /** + * @flags: Optional flags. + * + * Supported values: + * + * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS - Signal to the kernel that + * the object will need to be accessed via the CPU. 
+ * + * Only valid when placing objects in I915_MEMORY_CLASS_DEVICE, and only + * strictly required on configurations where some subset of the device + * memory is directly visible/mappable through the CPU (which we also + * call small BAR), like on some DG2+ systems. Note that this is quite + * undesirable, but due to various factors like the client CPU, BIOS etc + * it's something we can expect to see in the wild. See + * &drm_i915_memory_region_info.probed_cpu_visible_size for how to + * determine if this system applies. + * + * Note that one of the placements MUST be I915_MEMORY_CLASS_SYSTEM, to + * ensure the kernel can always spill the allocation to system memory, + * if the object can't be allocated in the mappable part of + * I915_MEMORY_CLASS_DEVICE. + * + * Also note that since the kernel only supports flat-CCS on objects + * that can *only* be placed in I915_MEMORY_CLASS_DEVICE, we therefore + * don't support I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS together with + * flat-CCS. + * + * Without this hint, the kernel will assume that non-mappable + * I915_MEMORY_CLASS_DEVICE is preferred for this object. Note that the + * kernel can still migrate the object to the mappable part, as a last + * resort, if userspace ever CPU faults this object, but this might be + * expensive, and so ideally should be avoided. + * + * On older kernels which lack the relevant small-bar uAPI support (see + * also &drm_i915_memory_region_info.probed_cpu_visible_size), + * usage of the flag will result in an error, but it should NEVER be + * possible to end up with a small BAR configuration, assuming we can + * also successfully load the i915 kernel module. In such cases the + * entire I915_MEMORY_CLASS_DEVICE region will be CPU accessible, and as + * such there are zero restrictions on where the object can be placed. + */ +#define I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS (1 << 0) __u32 flags; + /** * @extensions: The chain of extensions to apply to this object. 
* @@ -3443,6 +3640,22 @@ struct drm_i915_gem_create_ext { * At which point we get the object handle in &drm_i915_gem_create_ext.handle, * along with the final object size in &drm_i915_gem_create_ext.size, which * should account for any rounding up, if required. + * + * Note that userspace has no means of knowing the current backing region + * for objects where @num_regions is larger than one. The kernel will only + * ensure that the priority order of the @regions array is honoured, either + * when initially placing the object, or when moving memory around due to + * memory pressure + * + * On Flat-CCS capable HW, compression is supported for the objects residing + * in I915_MEMORY_CLASS_DEVICE. When such objects (compressed) have other + * memory class in @regions and migrated (by i915, due to memory + * constraints) to the non I915_MEMORY_CLASS_DEVICE region, then i915 needs to + * decompress the content. But i915 doesn't have the required information to + * decompress the userspace compressed objects. + * + * So i915 supports Flat-CCS, on the objects which can reside only on + * I915_MEMORY_CLASS_DEVICE regions. */ struct drm_i915_gem_create_ext_memory_regions { /** @base: Extension link. See struct i915_user_extension. 
*/ -- cgit v1.2.3 From bf465ca8090a1aa4f88d87e003302158c772c3de Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: tools headers UAPI: Sync linux/kvm.h with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To pick the changes in: 8a061562e2f2b32b ("RISC-V: KVM: Add extensible CSR emulation framework") f5ecfee944934757 ("KVM: s390: resetting the Topology-Change-Report") 450a563924ae9437 ("KVM: stats: Fix value for KVM_STATS_UNIT_MAX for boolean stats") 1b870fa5573e260b ("kvm: stats: tell userspace which values are boolean") db1c875e0539518e ("KVM: s390: add KVM_S390_ZPCI_OP to manage guest zPCI devices") 94dfc73e7cf4a31d ("treewide: uapi: Replace zero-length arrays with flexible-array members") 084cc29f8bbb034c ("KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis") 2f4073e08f4cc5a4 ("KVM: VMX: Enable Notify VM exit") ed2351174e38ad4f ("KVM: x86: Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault") e9bf3acb23f0a6e1 ("KVM: s390: Add KVM_CAP_S390_PROTECTED_DUMP") 8aba09588d2af37c ("KVM: s390: Add CPU dump functionality") 0460eb35b443f73f ("KVM: s390: Add configuration dump functionality") fe9a93e07ba4f29d ("KVM: s390: pv: Add query dump information") 35d02493dba1ae63 ("KVM: s390: pv: Add query interface") c24a950ec7d60c4d ("KVM, SEV: Add KVM_EXIT_SHUTDOWN metadata for SEV-ES") ffbb61d09fc56c85 ("KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl.") 661a20fab7d156cf ("KVM: x86/xen: Advertise and document KVM_XEN_HVM_CONFIG_EVTCHN_SEND") fde0451be8fb3208 ("KVM: x86/xen: Support per-vCPU event channel upcall via local APIC") 28d1629f751c4a5f ("KVM: x86/xen: Kernel acceleration for XENVER_version") 536395260582be74 ("KVM: x86/xen: handle PV timers oneshot mode") 942c2490c23f2800 ("KVM: x86/xen: Add KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID") 2fd6df2f2b47d430 ("KVM: x86/xen: intercept EVTCHNOP_send from guests") 35025735a79eaa89 ("KVM: x86/xen: 
Support direct injection of event channel events") That just rebuilds perf, as these patches add just an ioctl that is S390 specific and may clash with other arches, so are so far being excluded in the harvester script: $ tools/perf/trace/beauty/kvm_ioctl.sh > before $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > after $ diff -u before after $ grep 390 tools/perf/trace/beauty/kvm_ioctl.sh egrep -v " ((ARM|PPC|S390)_|[GS]ET_(DEBUGREGS|PIT2|XSAVE|TSC_KHZ)|CREATE_SPAPR_TCE_64)" | \ $ This is also by now used by tools/testing/selftests/kvm/, a simple test build succeeded. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Anup Patel Cc: Ben Gardon Cc: Chenyi Qiang Cc: Christian Borntraeger Cc: David Woodhouse Cc: Gustavo A. R. Silva Cc: Janosch Frank Cc: João Martins Cc: Matthew Rosato Cc: Oliver Upton Cc: Paolo Bonzini Cc: Peter Gonda Cc: Pierre Morel Cc: Tao Xu Link: https://lore.kernel.org/lkml/YvzuryClcn%2FvA0Gn@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 108 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index cb6e3846d27b..eed0315a77a6 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -270,6 +270,8 @@ struct kvm_xen_exit { #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 #define KVM_EXIT_RISCV_SBI 35 +#define KVM_EXIT_RISCV_CSR 36 +#define KVM_EXIT_NOTIFY 37 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. 
*/ @@ -496,6 +498,18 @@ struct kvm_run { unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + /* KVM_EXIT_RISCV_CSR */ + struct { + unsigned long csr_num; + unsigned long new_value; + unsigned long write_mask; + unsigned long ret_value; + } riscv_csr; + /* KVM_EXIT_NOTIFY */ + struct { +#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; /* Fix the size of the union. */ char padding[256]; }; @@ -1157,6 +1171,12 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_TSC_CONTROL 214 #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 +#define KVM_CAP_S390_PROTECTED_DUMP 217 +#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 +#define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 +#define KVM_CAP_S390_ZPCI_OP 221 +#define KVM_CAP_S390_CPU_TOPOLOGY 222 #ifdef KVM_CAP_IRQ_ROUTING @@ -1660,6 +1680,55 @@ struct kvm_s390_pv_unp { __u64 tweak; }; +enum pv_cmd_dmp_id { + KVM_PV_DUMP_INIT, + KVM_PV_DUMP_CONFIG_STOR_STATE, + KVM_PV_DUMP_COMPLETE, + KVM_PV_DUMP_CPU, +}; + +struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + __u64 reserved[4]; +}; + +enum pv_cmd_info_id { + KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, +}; + +struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; +}; + +struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; +}; + +struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; +}; + +struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + union { + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; +}; + enum pv_cmd_id { KVM_PV_ENABLE, KVM_PV_DISABLE, @@ -1668,6 +1737,8 @@ enum pv_cmd_id { KVM_PV_VERIFY, KVM_PV_PREP_RESET, KVM_PV_UNSHARE_ALL, + KVM_PV_INFO, + KVM_PV_DUMP, }; 
struct kvm_pv_cmd { @@ -2119,4 +2190,41 @@ struct kvm_stats_desc { /* Available with KVM_CAP_XSAVE2 */ #define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) +/* Available with KVM_CAP_S390_PROTECTED_DUMP */ +#define KVM_S390_PV_CPU_COMMAND _IOWR(KVMIO, 0xd0, struct kvm_pv_cmd) + +/* Available with KVM_CAP_X86_NOTIFY_VMEXIT */ +#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0) +#define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1) + +/* Available with KVM_CAP_S390_ZPCI_OP */ +#define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 25f308951703be599f82c44229b6f74c4ad86ed4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Aug 2020 08:52:32 -0300 Subject: tools headers kvm s390: Sync headers with the kernel sources To pick the changes in: f5ecfee944934757 ("KVM: s390: resetting the Topology-Change-Report") None of them trigger any changes in tooling, this time this is just to silence these perf build warnings: Warning: Kernel ABI header at 'tools/arch/s390/include/uapi/asm/kvm.h' differs from latest version at 'arch/s390/include/uapi/asm/kvm.h' diff -u tools/arch/s390/include/uapi/asm/kvm.h arch/s390/include/uapi/asm/kvm.h Cc: Janosch Frank Cc: Pierre Morel Link: 
http://lore.kernel.org/lkml/YvzwMXzaIzOU4WAY@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/s390/include/uapi/asm/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 7a6b14874d65..a73cf01a1606 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 -- cgit v1.2.3 From 898d24034605d2a0b16f6ca349d2e74124b5e043 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 14 Apr 2020 09:12:55 -0300 Subject: tools include UAPI: Sync linux/vhost.h with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To get the changes in: f345a0143b4dd1cf ("vhost-vdpa: uAPI to suspend the device") Silencing this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/vhost.h' differs from latest version at 'include/uapi/linux/vhost.h' diff -u tools/include/uapi/linux/vhost.h include/uapi/linux/vhost.h To pick up these changes and support them: $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > before $ cp include/uapi/linux/vhost.h tools/include/uapi/linux/vhost.h $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > after $ diff -u before after --- before 2022-08-18 09:46:12.355958316 -0300 +++ after 2022-08-18 09:46:19.701182822 -0300 @@ -29,6 +29,7 @@ [0x75] = "VDPA_SET_VRING_ENABLE", [0x77] = "VDPA_SET_CONFIG_CALL", [0x7C] = "VDPA_SET_GROUP_ASID", + [0x7D] = "VDPA_SUSPEND", }; = { [0x00] = "GET_FEATURES", $ For instance, see how those 'cmd' ioctl arguments get translated, now VDPA_SUSPEND will be as well: # perf trace -a -e ioctl --max-events=10 0.000 ( 0.011 ms): pipewire/2261 ioctl(fd: 
60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1) = 0 21.353 ( 0.014 ms): pipewire/2261 ioctl(fd: 60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1) = 0 25.766 ( 0.014 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740) = 0 25.845 ( 0.034 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70) = 0 25.916 ( 0.011 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0) = 0 25.941 ( 0.025 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ATOMIC, arg: 0x7ffe4a22c840) = 0 32.915 ( 0.009 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_RMFB, arg: 0x7ffe4a22cf9c) = 0 42.522 ( 0.013 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740) = 0 42.579 ( 0.031 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70) = 0 42.644 ( 0.010 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0) = 0 # Cc: Adrian Hunter Cc: Eugenio Pérez Cc: Ian Rogers Cc: Jiri Olsa Cc: Michael S. Tsirkin Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Yv6Kb4OESuNJuH6X@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/vhost.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h index cab645d4a645..f9f115a7c75b 100644 --- a/tools/include/uapi/linux/vhost.h +++ b/tools/include/uapi/linux/vhost.h @@ -171,4 +171,13 @@ #define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ struct vhost_vring_state) +/* Suspend a device so it does not process virtqueue requests anymore + * + * After the return of ioctl the device must preserve all the necessary state + * (the virtqueue vring base plus the possible device specific states) that is + * required for restoring in the future. The device must not change its + * configuration after that point. 
+ */ +#define VHOST_VDPA_SUSPEND _IO(VHOST_VIRTIO, 0x7D) + #endif -- cgit v1.2.3 From eea085d11449bc6514dca9850cdd3a996ec1217e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 2 Mar 2021 17:20:08 -0300 Subject: tools headers UAPI: Sync KVM's vmx.h header with the kernel sources To pick the changes in: 2f4073e08f4cc5a4 ("KVM: VMX: Enable Notify VM exit") That makes 'perf kvm-stat' aware of this new NOTIFY exit reason, thus addressing the following perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/vmx.h' differs from latest version at 'arch/x86/include/uapi/asm/vmx.h' diff -u tools/arch/x86/include/uapi/asm/vmx.h arch/x86/include/uapi/asm/vmx.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Tao Xu Link: http://lore.kernel.org/lkml/Yv6LavXMZ+njijpq@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/vmx.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..a5faf6d88f1b 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -91,6 +91,7 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +154,8 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } -- cgit v1.2.3 From e5bc0deae57615324ca843827873b39a34acc82e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 10 Sep 2021 11:46:54 -0300 Subject: tools headers UAPI: Sync x86's asm/kvm.h with the 
kernel sources To pick the changes in: 43bb9e000ea4c621 ("KVM: x86: Tweak name of MONITOR/MWAIT #UD quirk to make it #UD specific") 94dfc73e7cf4a31d ("treewide: uapi: Replace zero-length arrays with flexible-array members") bfbcc81bb82cbbad ("KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" behavior") b172862241b48499 ("KVM: x86: PIT: Preserve state of speaker port data bit") ed2351174e38ad4f ("KVM: x86: Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault") That just rebuilds kvm-stat.c on x86, no change in functionality. This silences these perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/kvm.h' differs from latest version at 'arch/x86/include/uapi/asm/kvm.h' diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Cc: Chenyi Qiang Cc: Sean Christopherson Cc: Gustavo A. R. Silva Cc: Paolo Bonzini Cc: Paul Durrant Link: https://lore.kernel.org/lkml/Yv6OMPKYqYSbUxwZ@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/kvm.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index ec53c9fa1da9..46de10a809ec 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -306,7 +306,8 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 struct kvm_pit_state2 { struct kvm_pit_channel_state channels[3]; @@ -325,6 +326,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +361,10 @@ struct kvm_vcpu_events { 
__u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; @@ -434,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 -- cgit v1.2.3 From 65ba872a6971c11ceb342c3330f059289c0e6bdb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Aug 2022 17:36:41 -0700 Subject: tools headers UAPI: Sync linux/perf_event.h with the kernel sources To pick the trivial change in: 119a784c81270eb8 ("perf/core: Add a new read format to get a number of lost samples") Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220819003644.508916-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index e2b77fbca91e..581ed4bdc062 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -301,6 +301,7 @@ enum { * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } && !PERF_FORMAT_GROUP * * { u64 nr; @@ -308,6 +309,7 @@ enum { * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 value; * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } cntr[nr]; * } && PERF_FORMAT_GROUP * }; @@ -317,8 +319,9 @@ enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, 
PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ -- cgit v1.2.3 From 89e3106fa25fb1b626a7123dba870159d453e785 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Aug 2022 17:36:42 -0700 Subject: libperf: Handle read format in perf_evsel__read() The perf_counts_values should be increased to read the new lost data. Also adjust values after read according the read format. This supports PERF_FORMAT_GROUP which has a different data format but it's only available for leader events. Currently it doesn't have an API to read sibling (member) events in the group. But users may read the sibling event directly. Also reading from mmap would be disabled when the read format has ID or LOST bit as it's not exposed via mmap. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220819003644.508916-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/evsel.c | 79 ++++++++++++++++++++++++++++++++++++- tools/lib/perf/include/perf/event.h | 3 +- tools/lib/perf/include/perf/evsel.h | 4 +- 3 files changed, 83 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 952f3520d5c2..8ce5bbd09666 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -305,6 +305,9 @@ int perf_evsel__read_size(struct perf_evsel *evsel) if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); + if (read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + if (read_format & PERF_FORMAT_GROUP) { nr = evsel->nr_members; size += sizeof(u64); @@ -314,24 +317,98 @@ int perf_evsel__read_size(struct perf_evsel *evsel) return size; } +/* This only reads values for the leader */ +static int perf_evsel__read_group(struct perf_evsel *evsel, int cpu_map_idx, + int thread, struct 
perf_counts_values *count) +{ + size_t size = perf_evsel__read_size(evsel); + int *fd = FD(evsel, cpu_map_idx, thread); + u64 read_format = evsel->attr.read_format; + u64 *data; + int idx = 1; + + if (fd == NULL || *fd < 0) + return -EINVAL; + + data = calloc(1, size); + if (data == NULL) + return -ENOMEM; + + if (readn(*fd, data, size) <= 0) { + free(data); + return -errno; + } + + /* + * This reads only the leader event intentionally since we don't have + * perf counts values for sibling events. + */ + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + count->ena = data[idx++]; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + count->run = data[idx++]; + + /* value is always available */ + count->val = data[idx++]; + if (read_format & PERF_FORMAT_ID) + count->id = data[idx++]; + if (read_format & PERF_FORMAT_LOST) + count->lost = data[idx++]; + + free(data); + return 0; +} + +/* + * The perf read format is very flexible. It needs to set the proper + * values according to the read format. 
+ */ +static void perf_evsel__adjust_values(struct perf_evsel *evsel, u64 *buf, + struct perf_counts_values *count) +{ + u64 read_format = evsel->attr.read_format; + int n = 0; + + count->val = buf[n++]; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + count->ena = buf[n++]; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + count->run = buf[n++]; + + if (read_format & PERF_FORMAT_ID) + count->id = buf[n++]; + + if (read_format & PERF_FORMAT_LOST) + count->lost = buf[n++]; +} + int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int thread, struct perf_counts_values *count) { size_t size = perf_evsel__read_size(evsel); int *fd = FD(evsel, cpu_map_idx, thread); + u64 read_format = evsel->attr.read_format; + struct perf_counts_values buf; memset(count, 0, sizeof(*count)); if (fd == NULL || *fd < 0) return -EINVAL; + if (read_format & PERF_FORMAT_GROUP) + return perf_evsel__read_group(evsel, cpu_map_idx, thread, count); + if (MMAP(evsel, cpu_map_idx, thread) && + !(read_format & (PERF_FORMAT_ID | PERF_FORMAT_LOST)) && !perf_mmap__read_self(MMAP(evsel, cpu_map_idx, thread), count)) return 0; - if (readn(*fd, count->values, size) <= 0) + if (readn(*fd, buf.values, size) <= 0) return -errno; + perf_evsel__adjust_values(evsel, buf.values, count); return 0; } diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 57f54781f5ed..93bf93a59c99 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -77,7 +77,7 @@ struct perf_record_lost_samples { }; /* - * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID + * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_LOST */ struct perf_record_read { struct perf_event_header header; @@ -86,6 +86,7 @@ struct perf_record_read { __u64 time_enabled; __u64 time_running; __u64 id; + __u64 lost; }; struct perf_record_throttle { diff --git a/tools/lib/perf/include/perf/evsel.h b/tools/lib/perf/include/perf/evsel.h 
index 699c0ed97d34..6f92204075c2 100644 --- a/tools/lib/perf/include/perf/evsel.h +++ b/tools/lib/perf/include/perf/evsel.h @@ -18,8 +18,10 @@ struct perf_counts_values { uint64_t val; uint64_t ena; uint64_t run; + uint64_t id; + uint64_t lost; }; - uint64_t values[3]; + uint64_t values[5]; }; }; -- cgit v1.2.3 From 6d395a513556e61dc22c6abdf9b419deb46f1908 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Aug 2022 17:36:43 -0700 Subject: libperf: Add a test case for read formats It checks various combinations of the read format settings and verifies that each value is returned in the proper position. The test uses task-clock software events to guarantee it's always active and sets enabled/running time. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220819003644.508916-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/tests/test-evsel.c | 161 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) (limited to 'tools') diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index 89be89afb24d..a11fc51bfb68 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include +#include #include #include #include +#include #include #include "tests.h" @@ -189,6 +192,163 @@ static int test_stat_user_read(int event) return 0; } +static int test_stat_read_format_single(struct perf_event_attr *attr, struct perf_thread_map *threads) +{ + struct perf_evsel *evsel; + struct perf_counts_values counts; + volatile int count = 0x100000; + int err; + + evsel = perf_evsel__new(attr); + __T("failed to create evsel", evsel); + + /* skip old kernels that don't support the format */ + err = perf_evsel__open(evsel, NULL, threads); + if (err < 0) + return 0; + + while (count--) ; + + memset(&counts, -1, sizeof(counts)); + 
perf_evsel__read(evsel, 0, 0, &counts); + + __T("failed to read value", counts.val); + if (attr->read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + __T("failed to read TOTAL_TIME_ENABLED", counts.ena); + if (attr->read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + __T("failed to read TOTAL_TIME_RUNNING", counts.run); + if (attr->read_format & PERF_FORMAT_ID) + __T("failed to read ID", counts.id); + if (attr->read_format & PERF_FORMAT_LOST) + __T("failed to read LOST", counts.lost == 0); + + perf_evsel__close(evsel); + perf_evsel__delete(evsel); + return 0; +} + +static int test_stat_read_format_group(struct perf_event_attr *attr, struct perf_thread_map *threads) +{ + struct perf_evsel *leader, *member; + struct perf_counts_values counts; + volatile int count = 0x100000; + int err; + + attr->read_format |= PERF_FORMAT_GROUP; + leader = perf_evsel__new(attr); + __T("failed to create leader", leader); + + attr->read_format &= ~PERF_FORMAT_GROUP; + member = perf_evsel__new(attr); + __T("failed to create member", member); + + member->leader = leader; + leader->nr_members = 2; + + /* skip old kernels that don't support the format */ + err = perf_evsel__open(leader, NULL, threads); + if (err < 0) + return 0; + err = perf_evsel__open(member, NULL, threads); + if (err < 0) + return 0; + + while (count--) ; + + memset(&counts, -1, sizeof(counts)); + perf_evsel__read(leader, 0, 0, &counts); + + __T("failed to read leader value", counts.val); + if (attr->read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + __T("failed to read leader TOTAL_TIME_ENABLED", counts.ena); + if (attr->read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + __T("failed to read leader TOTAL_TIME_RUNNING", counts.run); + if (attr->read_format & PERF_FORMAT_ID) + __T("failed to read leader ID", counts.id); + if (attr->read_format & PERF_FORMAT_LOST) + __T("failed to read leader LOST", counts.lost == 0); + + memset(&counts, -1, sizeof(counts)); + perf_evsel__read(member, 0, 0, &counts); + + __T("failed to read member 
value", counts.val); + if (attr->read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + __T("failed to read member TOTAL_TIME_ENABLED", counts.ena); + if (attr->read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + __T("failed to read member TOTAL_TIME_RUNNING", counts.run); + if (attr->read_format & PERF_FORMAT_ID) + __T("failed to read member ID", counts.id); + if (attr->read_format & PERF_FORMAT_LOST) + __T("failed to read member LOST", counts.lost == 0); + + perf_evsel__close(member); + perf_evsel__close(leader); + perf_evsel__delete(member); + perf_evsel__delete(leader); + return 0; +} + +static int test_stat_read_format(void) +{ + struct perf_thread_map *threads; + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_TASK_CLOCK, + }; + int err, i; + +#define FMT(_fmt) PERF_FORMAT_ ## _fmt +#define FMT_TIME (FMT(TOTAL_TIME_ENABLED) | FMT(TOTAL_TIME_RUNNING)) + + uint64_t test_formats [] = { + 0, + FMT_TIME, + FMT(ID), + FMT(LOST), + FMT_TIME | FMT(ID), + FMT_TIME | FMT(LOST), + FMT_TIME | FMT(ID) | FMT(LOST), + FMT(ID) | FMT(LOST), + }; + +#undef FMT +#undef FMT_TIME + + threads = perf_thread_map__new_dummy(); + __T("failed to create threads", threads); + + perf_thread_map__set_pid(threads, 0, 0); + + for (i = 0; i < (int)ARRAY_SIZE(test_formats); i++) { + attr.read_format = test_formats[i]; + __T_VERBOSE("testing single read with read_format: %lx\n", + (unsigned long)test_formats[i]); + + err = test_stat_read_format_single(&attr, threads); + __T("failed to read single format", err == 0); + } + + perf_thread_map__put(threads); + + threads = perf_thread_map__new_array(2, NULL); + __T("failed to create threads", threads); + + perf_thread_map__set_pid(threads, 0, 0); + perf_thread_map__set_pid(threads, 1, 0); + + for (i = 0; i < (int)ARRAY_SIZE(test_formats); i++) { + attr.read_format = test_formats[i]; + __T_VERBOSE("testing group read with read_format: %lx\n", + (unsigned long)test_formats[i]); + + err = test_stat_read_format_group(&attr, 
threads); + __T("failed to read group format", err == 0); + } + + perf_thread_map__put(threads); + return 0; +} + int test_evsel(int argc, char **argv) { __T_START; @@ -200,6 +360,7 @@ int test_evsel(int argc, char **argv) test_stat_thread_enable(); test_stat_user_read(PERF_COUNT_HW_INSTRUCTIONS); test_stat_user_read(PERF_COUNT_HW_CPU_CYCLES); + test_stat_read_format(); __T_END; return tests_failed == 0 ? 0 : -1; -- cgit v1.2.3 From f52679b78877f17e95a317e18a4c9c46cc3d845a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 18 Aug 2022 17:36:44 -0700 Subject: perf tools: Support reading PERF_FORMAT_LOST The recent kernel added lost count can be read from either read(2) or ring buffer data with PERF_SAMPLE_READ. As it's a variable length data we need to access it according to the format info. But for perf tools use cases, PERF_FORMAT_ID is always set. So we can only check PERF_FORMAT_LOST bit to determine the data format. Add sample_read_value_size() and next_sample_read_value() helpers to make it a bit easier to access. Use them in all places where it reads the struct sample_read_value. 
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220819003644.508916-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sample-parsing.c | 14 ++++++--- tools/perf/util/event.h | 21 ++++++++++++- tools/perf/util/evsel.c | 29 +++++++++++------- .../util/scripting-engines/trace-event-python.c | 19 ++++++++---- tools/perf/util/session.c | 35 +++++++++++++--------- tools/perf/util/synthetic-events.c | 32 +++++++++++++++----- 6 files changed, 108 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 07f2411b0ad4..20930dd48ee0 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -86,10 +86,15 @@ static bool samples_same(const struct perf_sample *s1, COMP(read.time_running); /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ if (read_format & PERF_FORMAT_GROUP) { - for (i = 0; i < s1->read.group.nr; i++) - MCOMP(read.group.values[i]); + for (i = 0; i < s1->read.group.nr; i++) { + /* FIXME: check values without LOST */ + if (read_format & PERF_FORMAT_LOST) + MCOMP(read.group.values[i]); + } } else { COMP(read.one.id); + if (read_format & PERF_FORMAT_LOST) + COMP(read.one.lost); } } @@ -263,7 +268,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .data = (void *)aux_data, }, }; - struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},}; + struct sample_read_value values[] = {{1, 5, 0}, {9, 3, 0}, {2, 7, 0}, {6, 4, 1},}; struct perf_sample sample_out, sample_out_endian; size_t i, sz, bufsz; int err, ret = -1; @@ -286,6 +291,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) } else { sample.read.one.value = 0x08789faeb786aa87ULL; sample.read.one.id = 99; + sample.read.one.lost = 1; } sz = perf_event__sample_event_size(&sample, sample_type, read_format); @@ -370,7 +376,7 @@ 
out_free: */ static int test__sample_parsing(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - const u64 rf[] = {4, 5, 6, 7, 12, 13, 14, 15}; + const u64 rf[] = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 28, 29, 30, 31}; u64 sample_type; u64 sample_regs; size_t i; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 4e0d8dd3b7a0..12eae6917022 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -65,7 +65,8 @@ struct stack_dump { struct sample_read_value { u64 value; - u64 id; + u64 id; /* only if PERF_FORMAT_ID */ + u64 lost; /* only if PERF_FORMAT_LOST */ }; struct sample_read { @@ -80,6 +81,24 @@ struct sample_read { }; }; +static inline size_t sample_read_value_size(u64 read_format) +{ + /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ + if (read_format & PERF_FORMAT_LOST) + return sizeof(struct sample_read_value); + else + return offsetof(struct sample_read_value, lost); +} + +static inline struct sample_read_value * +next_sample_read_value(struct sample_read_value *v, u64 read_format) +{ + return (void *)v + sample_read_value_size(read_format); +} + +#define sample_read_group__for_each(v, nr, rf) \ + for (int __i = 0; __i < (int)nr; v = next_sample_read_value(v, rf), __i++) + struct ip_callchain { u64 nr; u64 ips[]; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 4852089e1d79..18c3eb864d55 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1541,7 +1541,7 @@ static int evsel__read_one(struct evsel *evsel, int cpu_map_idx, int thread) } static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, - u64 val, u64 ena, u64 run) + u64 val, u64 ena, u64 run, u64 lost) { struct perf_counts_values *count; @@ -1550,6 +1550,7 @@ static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, count->val = val; count->ena = ena; count->run = run; + count->lost = lost; perf_counts__set_loaded(counter->counts, cpu_map_idx, thread, true); } @@ 
-1558,7 +1559,7 @@ static int evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int { u64 read_format = leader->core.attr.read_format; struct sample_read_value *v; - u64 nr, ena = 0, run = 0, i; + u64 nr, ena = 0, run = 0, lost = 0; nr = *data++; @@ -1571,18 +1572,18 @@ static int evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) run = *data++; - v = (struct sample_read_value *) data; - - evsel__set_count(leader, cpu_map_idx, thread, v[0].value, ena, run); - - for (i = 1; i < nr; i++) { + v = (void *)data; + sample_read_group__for_each(v, nr, read_format) { struct evsel *counter; - counter = evlist__id2evsel(leader->evlist, v[i].id); + counter = evlist__id2evsel(leader->evlist, v->id); if (!counter) return -EINVAL; - evsel__set_count(counter, cpu_map_idx, thread, v[i].value, ena, run); + if (read_format & PERF_FORMAT_LOST) + lost = v->lost; + + evsel__set_count(counter, cpu_map_idx, thread, v->value, ena, run, lost); } return 0; @@ -2475,8 +2476,8 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, if (data->read.group.nr > max_group_nr) return -EFAULT; - sz = data->read.group.nr * - sizeof(struct sample_read_value); + + sz = data->read.group.nr * sample_read_value_size(read_format); OVERFLOW_CHECK(array, sz, max_size); data->read.group.values = (struct sample_read_value *)array; @@ -2485,6 +2486,12 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, OVERFLOW_CHECK_u64(array); data->read.one.id = *array; array++; + + if (read_format & PERF_FORMAT_LOST) { + OVERFLOW_CHECK_u64(array); + data->read.one.lost = *array; + array++; + } } } diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 9ef2406e0ede..1f2040f36d4e 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -642,15 +642,19 @@ exit: 
return pylist; } -static PyObject *get_sample_value_as_tuple(struct sample_read_value *value) +static PyObject *get_sample_value_as_tuple(struct sample_read_value *value, + u64 read_format) { PyObject *t; - t = PyTuple_New(2); + t = PyTuple_New(3); if (!t) Py_FatalError("couldn't create Python tuple"); PyTuple_SetItem(t, 0, PyLong_FromUnsignedLongLong(value->id)); PyTuple_SetItem(t, 1, PyLong_FromUnsignedLongLong(value->value)); + if (read_format & PERF_FORMAT_LOST) + PyTuple_SetItem(t, 2, PyLong_FromUnsignedLongLong(value->lost)); + return t; } @@ -681,12 +685,17 @@ static void set_sample_read_in_dict(PyObject *dict_sample, Py_FatalError("couldn't create Python list"); if (read_format & PERF_FORMAT_GROUP) { - for (i = 0; i < sample->read.group.nr; i++) { - PyObject *t = get_sample_value_as_tuple(&sample->read.group.values[i]); + struct sample_read_value *v = sample->read.group.values; + + i = 0; + sample_read_group__for_each(v, sample->read.group.nr, read_format) { + PyObject *t = get_sample_value_as_tuple(v, read_format); PyList_SET_ITEM(values, i, t); + i++; } } else { - PyObject *t = get_sample_value_as_tuple(&sample->read.one); + PyObject *t = get_sample_value_as_tuple(&sample->read.one, + read_format); PyList_SET_ITEM(values, 0, t); } pydict_set_item_string_decref(dict_sample, "values", values); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 61bb9675e044..192c9274f7ad 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1283,21 +1283,25 @@ static void sample_read__printf(struct perf_sample *sample, u64 read_format) sample->read.time_running); if (read_format & PERF_FORMAT_GROUP) { - u64 i; + struct sample_read_value *value = sample->read.group.values; printf(".... 
group nr %" PRIu64 "\n", sample->read.group.nr); - for (i = 0; i < sample->read.group.nr; i++) { - struct sample_read_value *value; - - value = &sample->read.group.values[i]; + sample_read_group__for_each(value, sample->read.group.nr, read_format) { printf("..... id %016" PRIx64 - ", value %016" PRIx64 "\n", + ", value %016" PRIx64, value->id, value->value); + if (read_format & PERF_FORMAT_LOST) + printf(", lost %" PRIu64, value->lost); + printf("\n"); } - } else - printf("..... id %016" PRIx64 ", value %016" PRIx64 "\n", + } else { + printf("..... id %016" PRIx64 ", value %016" PRIx64, sample->read.one.id, sample->read.one.value); + if (read_format & PERF_FORMAT_LOST) + printf(", lost %" PRIu64, sample->read.one.lost); + printf("\n"); + } } static void dump_event(struct evlist *evlist, union perf_event *event, @@ -1411,6 +1415,9 @@ static void dump_read(struct evsel *evsel, union perf_event *event) if (read_format & PERF_FORMAT_ID) printf("... id : %" PRI_lu64 "\n", read_event->id); + + if (read_format & PERF_FORMAT_LOST) + printf("... lost : %" PRI_lu64 "\n", read_event->lost); } static struct machine *machines__find_for_cpumode(struct machines *machines, @@ -1479,14 +1486,14 @@ static int deliver_sample_group(struct evlist *evlist, struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, - struct machine *machine) + struct machine *machine, + u64 read_format) { int ret = -EINVAL; - u64 i; + struct sample_read_value *v = sample->read.group.values; - for (i = 0; i < sample->read.group.nr; i++) { - ret = deliver_sample_value(evlist, tool, event, sample, - &sample->read.group.values[i], + sample_read_group__for_each(v, sample->read.group.nr, read_format) { + ret = deliver_sample_value(evlist, tool, event, sample, v, machine); if (ret) break; @@ -1510,7 +1517,7 @@ static int evlist__deliver_sample(struct evlist *evlist, struct perf_tool *tool, /* For PERF_SAMPLE_READ we have either single or group mode. 
*/ if (read_format & PERF_FORMAT_GROUP) return deliver_sample_group(evlist, tool, event, sample, - machine); + machine, read_format); else return deliver_sample_value(evlist, tool, event, sample, &sample->read.one, machine); diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 59747c440bd5..812424dbf2d5 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1429,11 +1429,12 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, result += sizeof(u64); /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ if (read_format & PERF_FORMAT_GROUP) { - sz = sample->read.group.nr * - sizeof(struct sample_read_value); - result += sz; + sz = sample_read_value_size(read_format); + result += sz * sample->read.group.nr; } else { result += sizeof(u64); + if (read_format & PERF_FORMAT_LOST) + result += sizeof(u64); } } @@ -1518,6 +1519,20 @@ void __weak arch_perf_synthesize_sample_weight(const struct perf_sample *data, *array = data->weight; } +static __u64 *copy_read_group_values(__u64 *array, __u64 read_format, + const struct perf_sample *sample) +{ + size_t sz = sample_read_value_size(read_format); + struct sample_read_value *v = sample->read.group.values; + + sample_read_group__for_each(v, sample->read.group.nr, read_format) { + /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ + memcpy(array, v, sz); + array = (void *)array + sz; + } + return array; +} + int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_format, const struct perf_sample *sample) { @@ -1599,13 +1614,16 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo /* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */ if (read_format & PERF_FORMAT_GROUP) { - sz = sample->read.group.nr * - sizeof(struct sample_read_value); - memcpy(array, sample->read.group.values, sz); - array = (void *)array + sz; + array = copy_read_group_values(array, read_format, + sample); } 
else { *array = sample->read.one.id; array++; + + if (read_format & PERF_FORMAT_LOST) { + *array = sample->read.one.lost; + array++; + } } } -- cgit v1.2.3 From e918cd231ee6f1dc969e71718ed11c71e98f5c4c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 17 Aug 2022 22:32:42 +0100 Subject: selftests/bpf: Fix spelling mistake. There is a spelling mistake in an ASSERT_OK literal string. Fix it. Signed-off-by: Colin Ian King Acked-by: Mykola Lysenko Link: https://lore.kernel.org/r/20220817213242.101277-1-colin.i.king@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/kfunc_call.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 351fafa006fb..eede7c304f86 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -109,7 +109,7 @@ static void test_destructive(void) { __u64 save_caps = 0; - ASSERT_OK(test_destructive_open_and_load(), "succesful_load"); + ASSERT_OK(test_destructive_open_and_load(), "successful_load"); if (!ASSERT_OK(cap_disable_effective(1ULL << CAP_SYS_BOOT, &save_caps), "drop_caps")) return; -- cgit v1.2.3 From b979f005d9b1ebdba565e85f5228dda6fe7a30e4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 19 Aug 2022 12:21:55 -0700 Subject: selftest/bpf: Add setget_sockopt to DENYLIST.s390x Trampoline is not supported in s390. 
Fixes: 31123c0360e0 ("selftests/bpf: bpf_setsockopt tests") Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220819192155.91713-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 9d8de15e725e..a708c3dcc154 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -65,3 +65,4 @@ send_signal # intermittently fails to receive signa select_reuseport # intermittently fails on new s390x setup xdp_synproxy # JIT does not support calling kernel function (kfunc) unpriv_bpf_disabled # fentry +setget_sockopt # attach unexpected error: -524 (trampoline) -- cgit v1.2.3 From b690842d12fd6687c326663d69d5732de00c00f6 Mon Sep 17 00:00:00 2001 From: Matthias May Date: Wed, 17 Aug 2022 09:36:49 +0200 Subject: selftests/net: test l2 tunnel TOS/TTL inheriting There are currently 3 ip tunnels that are capable of carrying L2 traffic: gretap, vxlan and geneve. They are all capable of inheriting the TOS/TTL for the outer IP-header from the inner frame. Add a test that verifies that these fields are correctly inherited. 
These tests failed before the following commits: b09ab9c92e50 ("ip6_tunnel: allow to inherit from VLAN encapsulated IP") 3f8a8447fd0b ("ip6_gre: use actual protocol to select xmit") 41337f52b967 ("ip6_gre: set DSCP for non-IP") 7ae29fd1be43 ("ip_tunnel: allow to inherit from VLAN encapsulated IP") 7074732c8fae ("ip_tunnels: allow VXLAN/GENEVE to inherit TOS/TTL from VLAN") ca2bb69514a8 ("geneve: do not use RT_TOS for IPv6 flowlabel") b4ab94d6adaa ("geneve: fix TOS inheriting for ipv4") Signed-off-by: Matthias May Link: https://lore.kernel.org/r/20220817073649.26117-1-matthias.may@westermo.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/l2_tos_ttl_inherit.sh | 390 ++++++++++++++++++++++ 2 files changed, 391 insertions(+) create mode 100755 tools/testing/selftests/net/l2_tos_ttl_inherit.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index c0ee2955fe54..11a288b67e2f 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -42,6 +42,7 @@ TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh TEST_PROGS += arp_ndisc_untracked_subnets.sh TEST_PROGS += stress_reuseport_listen.sh +TEST_PROGS += l2_tos_ttl_inherit.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest diff --git a/tools/testing/selftests/net/l2_tos_ttl_inherit.sh b/tools/testing/selftests/net/l2_tos_ttl_inherit.sh new file mode 100755 index 000000000000..dca1e6f777a8 --- /dev/null +++ b/tools/testing/selftests/net/l2_tos_ttl_inherit.sh @@ -0,0 +1,390 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Author: Matthias May +# +# This script evaluates ip tunnels that are capable of carrying L2 traffic +# if they inherit or set the inheritable fields. +# Namely these tunnels are: 'gretap', 'vxlan' and 'geneve'. 
+# Checked inheritable fields are: TOS and TTL. +# The outer tunnel protocol of 'IPv4' or 'IPv6' is verified. +# As payload, frames of type 'IPv4', 'IPv6' and 'other'(ARP) are verified. +# In addition this script also checks if forcing a specific field in the +# outer header is working. + +if [ "$(id -u)" != "0" ]; then + echo "Please run as root." + exit 0 +fi +if ! which tcpdump > /dev/null 2>&1; then + echo "No tcpdump found. Required for this test." + exit 0 +fi + +expected_tos="0x00" +expected_ttl="0" +failed=false + +get_random_tos() { + # Get a random hex tos value between 0x00 and 0xfc, a multiple of 4 + echo "0x$(tr -dc '0-9a-f' < /dev/urandom | head -c 1)\ +$(tr -dc '048c' < /dev/urandom | head -c 1)" +} +get_random_ttl() { + # Get a random dec value between 0 and 255 + printf "%d" "0x$(tr -dc '0-9a-f' < /dev/urandom | head -c 2)" +} +get_field() { + # Expects to get the 'head -n 1' of a captured frame by tcpdump. + # Parses this first line and returns the specified field. + local field="$1" + local input="$2" + local found=false + input="$(echo "$input" | tr -d '(),')" + for input_field in $input; do + if $found; then + echo "$input_field" + return + fi + # The next field that we iterate over is the looked for value + if [ "$input_field" = "$field" ]; then + found=true + fi + done + echo "0" +} +setup() { + local type="$1" + local outer="$2" + local inner="$3" + local tos_ttl="$4" + local vlan="$5" + local test_tos="0x00" + local test_ttl="0" + local ns="ip netns exec testing" + + # We don't want a test-tos of 0x00, + # because this is the value that we get when no tos is set. 
+ expected_tos="$(get_random_tos)" + while [ "$expected_tos" = "0x00" ]; do + expected_tos="$(get_random_tos)" + done + if [ "$tos_ttl" = "random" ]; then + test_tos="$expected_tos" + tos="fixed $test_tos" + elif [ "$tos_ttl" = "inherit" ]; then + test_tos="$tos_ttl" + tos="inherit $expected_tos" + fi + + # We don't want a test-ttl of 64 or 0, + # because 64 is when no ttl is set and 0 is not a valid ttl. + expected_ttl="$(get_random_ttl)" + while [ "$expected_ttl" = "64" ] || [ "$expected_ttl" = "0" ]; do + expected_ttl="$(get_random_ttl)" + done + + if [ "$tos_ttl" = "random" ]; then + test_ttl="$expected_ttl" + ttl="fixed $test_ttl" + elif [ "$tos_ttl" = "inherit" ]; then + test_ttl="$tos_ttl" + ttl="inherit $expected_ttl" + fi + printf "│%7s │%6s │%6s │%13s │%13s │%6s │" \ + "$type" "$outer" "$inner" "$tos" "$ttl" "$vlan" + + # Create 'testing' netns, veth pair and connect main ns with testing ns + ip netns add testing + ip link add type veth + ip link set veth1 netns testing + ip link set veth0 up + $ns ip link set veth1 up + ip addr flush dev veth0 + $ns ip addr flush dev veth1 + + local local_addr1="" + local local_addr2="" + if [ "$type" = "gre" ] || [ "$type" = "vxlan" ]; then + if [ "$outer" = "4" ]; then + local_addr1="local 198.18.0.1" + local_addr2="local 198.18.0.2" + elif [ "$outer" = "6" ]; then + local_addr1="local fdd1:ced0:5d88:3fce::1" + local_addr2="local fdd1:ced0:5d88:3fce::2" + fi + fi + local vxlan="" + if [ "$type" = "vxlan" ]; then + vxlan="vni 100 dstport 4789" + fi + local geneve="" + if [ "$type" = "geneve" ]; then + geneve="vni 100" + fi + # Create tunnel and assign outer IPv4/IPv6 addresses + if [ "$outer" = "4" ]; then + if [ "$type" = "gre" ]; then + type="gretap" + fi + ip addr add 198.18.0.1/24 dev veth0 + $ns ip addr add 198.18.0.2/24 dev veth1 + ip link add name tep0 type $type $local_addr1 remote \ + 198.18.0.2 tos $test_tos ttl $test_ttl $vxlan $geneve + $ns ip link add name tep1 type $type $local_addr2 remote \ + 198.18.0.1 
tos $test_tos ttl $test_ttl $vxlan $geneve + elif [ "$outer" = "6" ]; then + if [ "$type" = "gre" ]; then + type="ip6gretap" + fi + ip addr add fdd1:ced0:5d88:3fce::1/64 dev veth0 + $ns ip addr add fdd1:ced0:5d88:3fce::2/64 dev veth1 + ip link add name tep0 type $type $local_addr1 \ + remote fdd1:ced0:5d88:3fce::2 tos $test_tos ttl $test_ttl \ + $vxlan $geneve + $ns ip link add name tep1 type $type $local_addr2 \ + remote fdd1:ced0:5d88:3fce::1 tos $test_tos ttl $test_ttl \ + $vxlan $geneve + fi + + # Bring L2-tunnel link up and create VLAN on top + ip link set tep0 up + $ns ip link set tep1 up + ip addr flush dev tep0 + $ns ip addr flush dev tep1 + local parent + if $vlan; then + parent="vlan99-" + ip link add link tep0 name ${parent}0 type vlan id 99 + $ns ip link add link tep1 name ${parent}1 type vlan id 99 + ip link set ${parent}0 up + $ns ip link set ${parent}1 up + ip addr flush dev ${parent}0 + $ns ip addr flush dev ${parent}1 + else + parent="tep" + fi + + # Assign inner IPv4/IPv6 addresses + if [ "$inner" = "4" ] || [ "$inner" = "other" ]; then + ip addr add 198.19.0.1/24 brd + dev ${parent}0 + $ns ip addr add 198.19.0.2/24 brd + dev ${parent}1 + elif [ "$inner" = "6" ]; then + ip addr add fdd4:96cf:4eae:443b::1/64 dev ${parent}0 + $ns ip addr add fdd4:96cf:4eae:443b::2/64 dev ${parent}1 + fi +} + +verify() { + local outer="$1" + local inner="$2" + local tos_ttl="$3" + local vlan="$4" + + local ping_pid out captured_tos captured_ttl result + + local ping_dst + if [ "$inner" = "4" ]; then + ping_dst="198.19.0.2" + elif [ "$inner" = "6" ]; then + ping_dst="fdd4:96cf:4eae:443b::2" + elif [ "$inner" = "other" ]; then + ping_dst="198.19.0.3" # Generates ARPs which are not IPv4/IPv6 + fi + if [ "$tos_ttl" = "inherit" ]; then + ping -i 0.1 $ping_dst -Q "$expected_tos" -t "$expected_ttl" \ + 2>/dev/null 1>&2 & ping_pid="$!" + else + ping -i 0.1 $ping_dst 2>/dev/null 1>&2 & ping_pid="$!" 
+ fi + local tunnel_type_offset tunnel_type_proto req_proto_offset req_offset + if [ "$type" = "gre" ]; then + tunnel_type_proto="0x2f" + elif [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + tunnel_type_proto="0x11" + fi + if [ "$outer" = "4" ]; then + tunnel_type_offset="9" + if [ "$inner" = "4" ]; then + req_proto_offset="47" + req_offset="58" + if [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + req_proto_offset="$((req_proto_offset + 12))" + req_offset="$((req_offset + 12))" + fi + if $vlan; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + out="$(tcpdump --immediate-mode -p -c 1 -v -i veth0 -n \ + ip[$tunnel_type_offset] = $tunnel_type_proto and \ + ip[$req_proto_offset] = 0x01 and \ + ip[$req_offset] = 0x08 2>/dev/null | head -n 1)" + elif [ "$inner" = "6" ]; then + req_proto_offset="44" + req_offset="78" + if [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + req_proto_offset="$((req_proto_offset + 12))" + req_offset="$((req_offset + 12))" + fi + if $vlan; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + out="$(tcpdump --immediate-mode -p -c 1 -v -i veth0 -n \ + ip[$tunnel_type_offset] = $tunnel_type_proto and \ + ip[$req_proto_offset] = 0x3a and \ + ip[$req_offset] = 0x80 2>/dev/null | head -n 1)" + elif [ "$inner" = "other" ]; then + req_proto_offset="36" + req_offset="45" + if [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + req_proto_offset="$((req_proto_offset + 12))" + req_offset="$((req_offset + 12))" + fi + if $vlan; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + if [ "$tos_ttl" = "inherit" ]; then + expected_tos="0x00" + expected_ttl="64" + fi + out="$(tcpdump --immediate-mode -p -c 1 -v -i veth0 -n \ + ip[$tunnel_type_offset] = $tunnel_type_proto and \ + ip[$req_proto_offset] = 0x08 and \ + ip[$((req_proto_offset + 1))] = 0x06 and \ + ip[$req_offset] = 0x01 2>/dev/null | head -n 1)" + fi 
+ elif [ "$outer" = "6" ]; then + if [ "$type" = "gre" ]; then + tunnel_type_offset="40" + elif [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + tunnel_type_offset="6" + fi + if [ "$inner" = "4" ]; then + local req_proto_offset="75" + local req_offset="86" + if [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + if $vlan; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + out="$(tcpdump --immediate-mode -p -c 1 -v -i veth0 -n \ + ip6[$tunnel_type_offset] = $tunnel_type_proto and \ + ip6[$req_proto_offset] = 0x01 and \ + ip6[$req_offset] = 0x08 2>/dev/null | head -n 1)" + elif [ "$inner" = "6" ]; then + local req_proto_offset="72" + local req_offset="106" + if [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + if $vlan; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + out="$(tcpdump --immediate-mode -p -c 1 -v -i veth0 -n \ + ip6[$tunnel_type_offset] = $tunnel_type_proto and \ + ip6[$req_proto_offset] = 0x3a and \ + ip6[$req_offset] = 0x80 2>/dev/null | head -n 1)" + elif [ "$inner" = "other" ]; then + local req_proto_offset="64" + local req_offset="73" + if [ "$type" = "vxlan" ] || [ "$type" = "geneve" ]; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + if $vlan; then + req_proto_offset="$((req_proto_offset + 4))" + req_offset="$((req_offset + 4))" + fi + if [ "$tos_ttl" = "inherit" ]; then + expected_tos="0x00" + expected_ttl="64" + fi + out="$(tcpdump --immediate-mode -p -c 1 -v -i veth0 -n \ + ip6[$tunnel_type_offset] = $tunnel_type_proto and \ + ip6[$req_proto_offset] = 0x08 and \ + ip6[$((req_proto_offset + 1))] = 0x06 and \ + ip6[$req_offset] = 0x01 2>/dev/null | head -n 1)" + fi + fi + kill -9 $ping_pid + wait $ping_pid 2>/dev/null + 
result="FAIL" + if [ "$outer" = "4" ]; then + captured_ttl="$(get_field "ttl" "$out")" + captured_tos="$(printf "0x%02x" "$(get_field "tos" "$out")")" + if [ "$captured_tos" = "$expected_tos" ] && + [ "$captured_ttl" = "$expected_ttl" ]; then + result="OK" + fi + elif [ "$outer" = "6" ]; then + captured_ttl="$(get_field "hlim" "$out")" + captured_tos="$(printf "0x%02x" "$(get_field "class" "$out")")" + if [ "$captured_tos" = "$expected_tos" ] && + [ "$captured_ttl" = "$expected_ttl" ]; then + result="OK" + fi + fi + + printf "%7s │\n" "$result" + if [ "$result" = "FAIL" ]; then + failed=true + if [ "$captured_tos" != "$expected_tos" ]; then + printf "│%43s%27s │\n" \ + "Expected TOS value: $expected_tos" \ + "Captured TOS value: $captured_tos" + fi + if [ "$captured_ttl" != "$expected_ttl" ]; then + printf "│%43s%27s │\n" \ + "Expected TTL value: $expected_ttl" \ + "Captured TTL value: $captured_ttl" + fi + printf "│%71s│\n" " " + fi +} + +cleanup() { + ip link del veth0 2>/dev/null + ip netns del testing 2>/dev/null + ip link del tep0 2>/dev/null +} + +printf "┌────────┬───────┬───────┬──────────────┬" +printf "──────────────┬───────┬────────┐\n" +for type in gre vxlan geneve; do + if ! 
$(modprobe "$type" 2>/dev/null); then + continue + fi + for outer in 4 6; do + printf "├────────┼───────┼───────┼──────────────┼" + printf "──────────────┼───────┼────────┤\n" + printf "│ Type │ outer | inner │ tos │" + printf " ttl │ vlan │ result │\n" + for inner in 4 6 other; do + printf "├────────┼───────┼───────┼──────────────┼" + printf "──────────────┼───────┼────────┤\n" + for tos_ttl in inherit random; do + for vlan in false true; do + setup "$type" "$outer" "$inner" \ + "$tos_ttl" "$vlan" + verify "$outer" "$inner" "$tos_ttl" \ + "$vlan" + cleanup + done + done + done + done +done +printf "└────────┴───────┴───────┴──────────────┴" +printf "──────────────┴───────┴────────┘\n" + +if $failed; then + exit 1 +fi -- cgit v1.2.3 From bdbf0617bbc3641af158d1aeffeebb1505f76263 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 19 Aug 2022 12:19:28 -0700 Subject: selftests/vm: fix inability to build any vm tests When we stopped using KSFT_KHDR_INSTALL, a side effect is we also changed the value of `top_srcdir`. This can be seen by looking at the code removed by commit 49de12ba06ef ("selftests: drop KSFT_KHDR_INSTALL make target"). (Note though that this commit didn't break this, technically the one before it did since that's the one that stopped KSFT_KHDR_INSTALL from being used, even though the code was still there.) Previously lib.mk reconfigured `top_srcdir` when KSFT_KHDR_INSTALL was being used. Now, that's no longer the case. As a result, the path to gup_test.h in vm/Makefile was wrong, and since it's a dependency of all of the vm binaries none of them could be built. Instead, we'd get an "error" like: make[1]: *** No rule to make target '/[...]/tools/testing/selftests/vm/compaction_test', needed by 'all'. Stop. So, modify lib.mk so it once again sets top_srcdir to the root of the kernel tree. 
Fixes: f2745dc0ba3d ("selftests: stop using KSFT_KHDR_INSTALL") Signed-off-by: Axel Rasmussen Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 947fc72413e9..d44c72b3abe3 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -40,6 +40,7 @@ ifeq (0,$(MAKELEVEL)) endif endif selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST)))) +top_srcdir = $(selfdir)/../../.. # The following are built by lib.mk common compile rules. # TEST_CUSTOM_PROGS should be used by tests that require -- cgit v1.2.3 From a0a12c3ed057af57552bf6c0aeaca6835693df04 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 19 Aug 2022 12:06:40 -0700 Subject: asm goto: eradicate CC_HAS_ASM_GOTO GCC has supported asm goto since 4.5, and Clang has since version 9.0.0. The minimum supported versions of these tools for the build according to Documentation/process/changes.rst are 5.1 and 11.0.0 respectively. Remove the feature detection script, Kconfig option, and clean up some fallback code that is no longer supported. The removed script was also testing for a GCC specific bug that was fixed in the 4.7 release. Also remove workarounds for bpftrace using clang older than 9.0.0, since other BPF backend fixes are required at this point. 
Link: https://lore.kernel.org/lkml/CAK7LNATSr=BXKfkdW8f-H5VT_w=xBpT2ZQcZ7rm6JfkdE+QnmA@mail.gmail.com/ Link: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=48637 Acked-by: Borislav Petkov Suggested-by: Masahiro Yamada Suggested-by: Alexei Starovoitov Signed-off-by: Nick Desaulniers Reviewed-by: Ingo Molnar Reviewed-by: Nathan Chancellor Reviewed-by: Alexandre Belloni Signed-off-by: Linus Torvalds --- Documentation/kbuild/kconfig-language.rst | 4 ++-- arch/Kconfig | 3 +-- arch/um/include/asm/cpufeature.h | 15 --------------- arch/x86/Makefile | 4 ---- arch/x86/include/asm/cpufeature.h | 15 --------------- arch/x86/include/asm/rmwcc.h | 6 +++--- arch/x86/kvm/emulate.c | 2 +- init/Kconfig | 4 ---- scripts/gcc-goto.sh | 22 ---------------------- tools/arch/x86/include/asm/rmwcc.h | 21 --------------------- 10 files changed, 7 insertions(+), 89 deletions(-) delete mode 100755 scripts/gcc-goto.sh (limited to 'tools') diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst index 7fb398649f51..858ed5d80def 100644 --- a/Documentation/kbuild/kconfig-language.rst +++ b/Documentation/kbuild/kconfig-language.rst @@ -525,8 +525,8 @@ followed by a test macro:: If you need to expose a compiler capability to makefiles and/or C source files, `CC_HAS_` is the recommended prefix for the config option:: - config CC_HAS_ASM_GOTO - def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) + config CC_HAS_FOO + def_bool $(success,$(srctree)/scripts/cc-check-foo.sh $(CC)) Build as module only ~~~~~~~~~~~~~~~~~~~~ diff --git a/arch/Kconfig b/arch/Kconfig index f330410da63a..5dbf11a5ba4e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -53,7 +53,6 @@ config KPROBES config JUMP_LABEL bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL - depends on CC_HAS_ASM_GOTO select OBJTOOL if HAVE_JUMP_LABEL_HACK help This option enables a transparent branch optimization that @@ -1361,7 +1360,7 @@ config HAVE_PREEMPT_DYNAMIC_CALL config 
HAVE_PREEMPT_DYNAMIC_KEY bool - depends on HAVE_ARCH_JUMP_LABEL && CC_HAS_ASM_GOTO + depends on HAVE_ARCH_JUMP_LABEL select HAVE_PREEMPT_DYNAMIC help An architecture should select this if it can handle the preemption diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h index 19cd7ed6ec3c..4b6d1b526bc1 100644 --- a/arch/um/include/asm/cpufeature.h +++ b/arch/um/include/asm/cpufeature.h @@ -65,20 +65,6 @@ extern void setup_clear_cpu_cap(unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO) - -/* - * Workaround for the sake of BPF compilation which utilizes kernel - * headers, but clang does not support ASM GOTO and fails the build. - */ -#ifndef __BPF_TRACING__ -#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments" -#endif - -#define static_cpu_has(bit) boot_cpu_has(bit) - -#else - /* * Static testing of CPU features. Used the same as boot_cpu_has(). It * statically patches the target code for additional performance. Use @@ -137,7 +123,6 @@ t_no: boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) -#endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7854685c5f25..bafbd905e6e7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -286,10 +286,6 @@ vdso_install: archprepare: checkbin checkbin: -ifndef CONFIG_CC_HAS_ASM_GOTO - @echo Compiler lacks asm-goto support. - @exit 1 -endif ifdef CONFIG_RETPOLINE ifeq ($(RETPOLINE_CFLAGS),) @echo "You are building kernel with non-retpoline compiler." 
>&2 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ea34cc31b047..1a85e1fb0922 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -155,20 +155,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO) - -/* - * Workaround for the sake of BPF compilation which utilizes kernel - * headers, but clang does not support ASM GOTO and fails the build. - */ -#ifndef __BPF_TRACING__ -#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments" -#endif - -#define static_cpu_has(bit) boot_cpu_has(bit) - -#else - /* * Static testing of CPU features. Used the same as boot_cpu_has(). It * statically patches the target code for additional performance. Use @@ -208,7 +194,6 @@ t_no: boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) -#endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 8a9eba191516..7fa611216417 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -11,7 +11,7 @@ #define __CLOBBERS_MEM(clb...) 
"memory", ## clb -#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO) +#ifndef __GCC_ASM_FLAG_OUTPUTS__ /* Use asm goto */ @@ -27,7 +27,7 @@ cc_label: c = true; \ c; \ }) -#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) */ /* Use flags output or a set instruction */ @@ -40,7 +40,7 @@ cc_label: c = true; \ c; \ }) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) */ #define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f092c54d1a2f..d5ec3a2ed5a4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -479,7 +479,7 @@ FOP_END; /* * XXX: inoutclob user must know where the argument is being expanded. - * Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault. + * Using asm goto would allow us to remove _fault. */ #define asm_safe(insn, inoutclob...) 
\ ({ \ diff --git a/init/Kconfig b/init/Kconfig index 80fe60fa77fb..532362fcfe31 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -70,11 +70,7 @@ config CC_CAN_LINK_STATIC default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag) -static) if 64BIT default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag) -static) -config CC_HAS_ASM_GOTO - def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) - config CC_HAS_ASM_GOTO_OUTPUT - depends on CC_HAS_ASM_GOTO def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) config CC_HAS_ASM_GOTO_TIED_OUTPUT diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh deleted file mode 100755 index 8b980fb2270a..000000000000 --- a/scripts/gcc-goto.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Test for gcc 'asm goto' support -# Copyright (C) 2010, Jason Baron - -cat << "END" | $@ -x c - -fno-PIE -c -o /dev/null -int main(void) -{ -#if defined(__arm__) || defined(__aarch64__) - /* - * Not related to asm goto, but used by jump label - * and broken on some ARM GCC versions (see GCC Bug 48637). - */ - static struct { int dummy; int state; } tp; - asm (".long %c0" :: "i" (&tp.state)); -#endif - -entry: - asm goto ("" :::: entry); - return 0; -} -END diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h index fee7983a90b4..11ff975242ca 100644 --- a/tools/arch/x86/include/asm/rmwcc.h +++ b/tools/arch/x86/include/asm/rmwcc.h @@ -2,8 +2,6 @@ #ifndef _TOOLS_LINUX_ASM_X86_RMWcc #define _TOOLS_LINUX_ASM_X86_RMWcc -#ifdef CONFIG_CC_HAS_ASM_GOTO - #define __GEN_RMWcc(fullop, var, cc, ...) 
\ do { \ asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ @@ -20,23 +18,4 @@ cc_label: \ #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) -#else /* !CONFIG_CC_HAS_ASM_GOTO */ - -#define __GEN_RMWcc(fullop, var, cc, ...) \ -do { \ - char c; \ - asm volatile (fullop "; set" cc " %1" \ - : "+m" (var), "=qm" (c) \ - : __VA_ARGS__ : "memory"); \ - return c != 0; \ -} while (0) - -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) - -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) - -#endif /* CONFIG_CC_HAS_ASM_GOTO */ - #endif /* _TOOLS_LINUX_ASM_X86_RMWcc */ -- cgit v1.2.3 From c814bf958926ff45a9c1e899bd001006ab6cfbae Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 16 Aug 2022 10:51:06 +0000 Subject: powerpc/selftests: Use timersub() for gettimeofday() Use timersub() function to simplify the code. Reported-by: Zeal Robot Signed-off-by: ye xingchen Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220816105106.82666-1-ye.xingchen@zte.com.cn --- tools/testing/selftests/powerpc/benchmarks/gettimeofday.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c index 6b415683357b..580fcac0a09f 100644 --- a/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c +++ b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c @@ -12,7 +12,7 @@ static int test_gettimeofday(void) { int i; - struct timeval tv_start, tv_end; + struct timeval tv_start, tv_end, tv_diff; gettimeofday(&tv_start, NULL); @@ -20,7 +20,9 @@ static int test_gettimeofday(void) gettimeofday(&tv_end, NULL); } - printf("time = %.6f\n", tv_end.tv_sec - tv_start.tv_sec + (tv_end.tv_usec - tv_start.tv_usec) * 1e-6); + timersub(&tv_end, &tv_start, &tv_diff); /* timersub(a, b, res) stores a - b, so elapsed time is end - start */ + + printf("time = %.6f\n", 
tv_diff.tv_sec + (tv_diff.tv_usec) * 1e-6); return 0; } -- cgit v1.2.3 From cfd2b5c1106fa20254d9f24970232cdf24860005 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 22 Aug 2022 17:25:57 +0800 Subject: perf tools: Fix compile error for x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a0a12c3ed057 ("asm goto: eradicate CC_HAS_ASM_GOTO") eradicates CC_HAS_ASM_GOTO, and in the process also causes the perf tool on x86 to use asm_volatile_goto when compiling __GEN_RMWcc. However, asm_volatile_goto is not declared in the perf tool headers, which causes a compilation error: In file included from tools/arch/x86/include/asm/atomic.h:7, from tools/include/asm/atomic.h:6, from tools/include/linux/atomic.h:5, from tools/include/linux/refcount.h:41, from tools/lib/perf/include/internal/cpumap.h:5, from tools/perf/util/cpumap.h:7, from tools/perf/util/env.h:7, from tools/perf/util/header.h:12, from pmu-events/pmu-events.c:9: tools/arch/x86/include/asm/atomic.h: In function ‘atomic_dec_and_test’: tools/arch/x86/include/asm/rmwcc.h:7:2: error: implicit declaration of function ‘asm_volatile_goto’ [-Werror=implicit-function-declaration] asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ ^~~~~~~~~~~~~~~~~ Define asm_volatile_goto in compiler_types.h if not declared, like the main kernel header files do. Fixes: a0a12c3ed057 ("asm goto: eradicate CC_HAS_ASM_GOTO") Signed-off-by: Yang Jihong Tested-by: Arnaldo Carvalho de Melo Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- tools/include/linux/compiler_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index 24ae3054f304..1bdd834bdd57 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -36,4 +36,8 @@ #include #endif +#ifndef asm_volatile_goto +#define asm_volatile_goto(x...)
asm goto(x) +#endif + #endif /* __LINUX_COMPILER_TYPES_H */ -- cgit v1.2.3 From c078290a2b7618473a7d0a05334cc91fe0ac2949 Mon Sep 17 00:00:00 2001 From: Jonathan Toppins Date: Fri, 19 Aug 2022 11:15:12 -0400 Subject: selftests: include bonding tests into the kselftest infra This creates a test collection in drivers/net/bonding for bonding specific kernel selftests. The first test is a reproducer that provisions a bond and given the specific order in how the ip-link(8) commands are issued the bond never transmits an LACPDU frame on any of its slaves. Signed-off-by: Jonathan Toppins Acked-by: Jay Vosburgh Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + .../testing/selftests/drivers/net/bonding/Makefile | 6 ++ .../drivers/net/bonding/bond-break-lacpdu-tx.sh | 81 ++++++++++++++++++++++ tools/testing/selftests/drivers/net/bonding/config | 1 + .../testing/selftests/drivers/net/bonding/settings | 1 + 6 files changed, 91 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/bonding/Makefile create mode 100755 tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh create mode 100644 tools/testing/selftests/drivers/net/bonding/config create mode 100644 tools/testing/selftests/drivers/net/bonding/settings (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index f512b430c7cb..274b2c1e506e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3679,6 +3679,7 @@ F: Documentation/networking/bonding.rst F: drivers/net/bonding/ F: include/net/bond* F: include/uapi/linux/if_bonding.h +F: tools/testing/selftests/drivers/net/bonding/ BOSCH SENSORTEC BMA400 ACCELEROMETER IIO DRIVER M: Dan Robertson diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 10b34bb03bc1..c2064a35688b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -12,6 +12,7 @@ TARGETS += cpu-hotplug TARGETS += damon TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice +TARGETS += 
drivers/net/bonding TARGETS += efivarfs TARGETS += exec TARGETS += filesystems diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile new file mode 100644 index 000000000000..ab6c54b12098 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for net selftests + +TEST_PROGS := bond-break-lacpdu-tx.sh + +include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh b/tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh new file mode 100755 index 000000000000..47ab90596acb --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh @@ -0,0 +1,81 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Regression Test: +# Verify LACPDUs get transmitted after setting the MAC address of +# the bond. +# +# https://bugzilla.redhat.com/show_bug.cgi?id=2020773 +# +# +---------+ +# | fab-br0 | +# +---------+ +# | +# +---------+ +# | fbond | +# +---------+ +# | | +# +------+ +------+ +# |veth1 | |veth2 | +# +------+ +------+ +# +# We use veths instead of physical interfaces + +set -e +tmp=$(mktemp -q dump.XXXXXX) +cleanup() { + ip link del fab-br0 >/dev/null 2>&1 || : + ip link del fbond >/dev/null 2>&1 || : + ip link del veth1-bond >/dev/null 2>&1 || : + ip link del veth2-bond >/dev/null 2>&1 || : + modprobe -r bonding >/dev/null 2>&1 || : + rm -f -- ${tmp} +} + +trap cleanup 0 1 2 +cleanup +sleep 1 + +# create the bridge +ip link add fab-br0 address 52:54:00:3B:7C:A6 mtu 1500 type bridge \ + forward_delay 15 + +# create the bond +ip link add fbond type bond mode 4 miimon 200 xmit_hash_policy 1 \ + ad_actor_sys_prio 65535 lacp_rate fast + +# set bond address +ip link set fbond address 52:54:00:3B:7C:A6 +ip link set fbond up + +# set again bond sysfs parameters +ip link set fbond type bond ad_actor_sys_prio 65535 + +# create veths +ip link add name 
veth1-bond type veth peer name veth1-end +ip link add name veth2-bond type veth peer name veth2-end + +# add ports +ip link set fbond master fab-br0 +ip link set veth1-bond down master fbond +ip link set veth2-bond down master fbond + +# bring up +ip link set veth1-end up +ip link set veth2-end up +ip link set fab-br0 up +ip link set fbond up +ip addr add dev fab-br0 10.0.0.3 + +tcpdump -n -i veth1-end -e ether proto 0x8809 >${tmp} 2>&1 & +sleep 15 +pkill tcpdump >/dev/null 2>&1 +rc=0 +num=$(grep "packets captured" ${tmp} | awk '{print $1}') +if test "$num" -gt 0; then + echo "PASS, captured ${num}" +else + echo "FAIL" + rc=1 +fi +exit $rc diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config new file mode 100644 index 000000000000..dc1c22de3c92 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -0,0 +1 @@ +CONFIG_BONDING=y diff --git a/tools/testing/selftests/drivers/net/bonding/settings b/tools/testing/selftests/drivers/net/bonding/settings new file mode 100644 index 000000000000..867e118223cd --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/settings @@ -0,0 +1 @@ +timeout=60 -- cgit v1.2.3 From b1346338fbaefac1b796a50478f8e8070b54e9e4 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Fri, 19 Aug 2022 05:43:50 +0000 Subject: vsock_test: POLLIN + SO_RCVLOWAT test This adds test to check, that when poll() returns POLLIN, POLLRDNORM bits, next read call won't block. 
Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- tools/testing/vsock/vsock_test.c | 108 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'tools') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index dc577461afc2..bb6d691cb30d 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "timeout.h" #include "control.h" @@ -596,6 +597,108 @@ static void test_seqpacket_invalid_rec_buffer_server(const struct test_opts *opt close(fd); } +#define RCVLOWAT_BUF_SIZE 128 + +static void test_stream_poll_rcvlowat_server(const struct test_opts *opts) +{ + int fd; + int i; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte. */ + send_byte(fd, 1, 0); + + control_writeln("SRVSENT"); + + /* Wait until client is ready to receive rest of data. */ + control_expectln("CLNSENT"); + + for (i = 0; i < RCVLOWAT_BUF_SIZE - 1; i++) + send_byte(fd, 1, 0); + + /* Keep socket in active state. */ + control_expectln("POLLDONE"); + + close(fd); +} + +static void test_stream_poll_rcvlowat_client(const struct test_opts *opts) +{ + int lowat_val = RCVLOWAT_BUF_SIZE; /* SO_RCVLOWAT takes an int; a wider type is misread on 64-bit big-endian */ + char buf[RCVLOWAT_BUF_SIZE]; + struct pollfd fds; + ssize_t read_res; + short poll_flags; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, + &lowat_val, sizeof(lowat_val))) { + perror("setsockopt"); + exit(EXIT_FAILURE); + } + + control_expectln("SRVSENT"); + + /* At this point, server sent 1 byte. */ + fds.fd = fd; + poll_flags = POLLIN | POLLRDNORM; + fds.events = poll_flags; + + /* Try to wait for 1 sec. 
*/ + if (poll(&fds, 1, 1000) < 0) { + perror("poll"); + exit(EXIT_FAILURE); + } + + /* poll() must return nothing. */ + if (fds.revents) { + fprintf(stderr, "Unexpected poll result %hx\n", + fds.revents); + exit(EXIT_FAILURE); + } + + /* Tell server to send rest of data. */ + control_writeln("CLNSENT"); + + /* Poll for data. */ + if (poll(&fds, 1, 10000) < 0) { + perror("poll"); + exit(EXIT_FAILURE); + } + + /* Only these two bits are expected. */ + if (fds.revents != poll_flags) { + fprintf(stderr, "Unexpected poll result %hx\n", + fds.revents); + exit(EXIT_FAILURE); + } + + /* Use MSG_DONTWAIT, if call is going to wait, EAGAIN + * will be returned. + */ + read_res = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); + if (read_res != RCVLOWAT_BUF_SIZE) { + fprintf(stderr, "Unexpected recv result %zi\n", + read_res); + exit(EXIT_FAILURE); + } + + control_writeln("POLLDONE"); + + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -646,6 +749,11 @@ static struct test_case test_cases[] = { .run_client = test_seqpacket_invalid_rec_buffer_client, .run_server = test_seqpacket_invalid_rec_buffer_server, }, + { + .name = "SOCK_STREAM poll() + SO_RCVLOWAT", + .run_client = test_stream_poll_rcvlowat_client, + .run_server = test_stream_poll_rcvlowat_server, + }, {}, }; -- cgit v1.2.3 From 91350fe152930c0d61a362af68272526490efea5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:17 +0300 Subject: bpf, flow_dissector: Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode for bpf progs Currently, attaching BPF_PROG_TYPE_FLOW_DISSECTOR programs completely replaces the flow-dissector logic with custom dissection logic. This forces implementors to write programs that handle dissection for any flows expected in the namespace. It makes sense for flow-dissector BPF programs to just augment the dissector with custom logic (e.g. 
dissecting certain flows or custom protocols), while enjoying the broad capabilities of the standard dissector for any other traffic. Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode. Flow-dissector BPF programs may return this to indicate no dissection was made, and fallback to the standard dissector is requested. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-3-shmulik.ladkani@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ net/core/flow_dissector.c | 3 +++ tools/include/uapi/linux/bpf.h | 5 +++++ 3 files changed, 13 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 934a2a8beb87..7f87012b012e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5861,6 +5861,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. 
+ */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a01817fb4ef4..990429c69ccd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1022,11 +1022,14 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } +dissect_continue: rcu_read_unlock(); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1d6085e15fc8..f38814fbb618 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5861,6 +5861,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { -- cgit v1.2.3 From 5deedfbee84278da3b76fb7176dc3742f56eb370 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:18 +0300 Subject: bpf, test_run: Propagate bpf_flow_dissect's retval to user's bpf_attr.test.retval Formerly, a boolean denoting whether bpf_flow_dissect returned BPF_OK was set into 'bpf_attr.test.retval'. Augment this, so users can check the actual return code of the dissector program under test. Existing prog_tests/flow_dissector*.c tests were correspondingly changed to check against each test's expected retval. Also, tests' resulting 'flow_keys' are verified only in case the expected retval is BPF_OK. This allows adding new tests that expect non BPF_OK. 
Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-4-shmulik.ladkani@gmail.com --- net/bpf/test_run.c | 2 +- .../selftests/bpf/prog_tests/flow_dissector.c | 23 +++++++++++++++++++++- .../bpf/prog_tests/flow_dissector_load_bytes.c | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 51c479433517..25d8ecf105aa 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1445,7 +1445,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, bpf_test_timer_enter(&t); do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, - size, flags) == BPF_OK; + size, flags); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 0c1661ea996e..8fa3c454995e 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -100,6 +100,7 @@ struct test { } pkt; struct bpf_flow_keys keys; __u32 flags; + __u32 retval; }; #define VLAN_HLEN 4 @@ -126,6 +127,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "ipv6", @@ -146,6 +148,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "802.1q-ipv4", @@ -168,6 +171,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "802.1ad-ipv6", @@ -191,6 +195,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "ipv4-frag", @@ -217,6 +222,7 @@ struct test tests[] = { .dport = 8080, }, .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + .retval = BPF_OK, }, { .name = "ipv4-no-frag", @@ -239,6 +245,7 @@ struct test tests[] = { .is_frag = true, 
.is_first_frag = true, }, + .retval = BPF_OK, }, { .name = "ipv6-frag", @@ -265,6 +272,7 @@ struct test tests[] = { .dport = 8080, }, .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG, + .retval = BPF_OK, }, { .name = "ipv6-no-frag", @@ -287,6 +295,7 @@ struct test tests[] = { .is_frag = true, .is_first_frag = true, }, + .retval = BPF_OK, }, { .name = "ipv6-flow-label", @@ -309,6 +318,7 @@ struct test tests[] = { .dport = 8080, .flow_label = __bpf_constant_htonl(0xbeeef), }, + .retval = BPF_OK, }, { .name = "ipv6-no-flow-label", @@ -331,6 +341,7 @@ struct test tests[] = { .flow_label = __bpf_constant_htonl(0xbeeef), }, .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, + .retval = BPF_OK, }, { .name = "ipip-encap", @@ -359,6 +370,7 @@ struct test tests[] = { .sport = 80, .dport = 8080, }, + .retval = BPF_OK, }, { .name = "ipip-no-encap", @@ -386,6 +398,7 @@ struct test tests[] = { .is_encap = true, }, .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP, + .retval = BPF_OK, }, }; @@ -503,6 +516,10 @@ static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); + /* check the stored flow_keys only if BPF_OK expected */ + if (tests[i].retval != BPF_OK) + continue; + err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); ASSERT_OK(err, "bpf_map_lookup_elem"); @@ -588,7 +605,11 @@ void test_flow_dissector(void) err = bpf_prog_test_run_opts(prog_fd, &topts); ASSERT_OK(err, "test_run"); - ASSERT_EQ(topts.retval, 1, "test_run retval"); + ASSERT_EQ(topts.retval, tests[i].retval, "test_run retval"); + + /* check the resulting flow_keys only if BPF_OK returned */ + if (topts.retval != BPF_OK) + continue; ASSERT_EQ(topts.data_size_out, sizeof(flow_keys), "test_run data_size_out"); CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c 
b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c index 36afb409c25f..c7a47b57ac91 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c @@ -44,7 +44,7 @@ void serial_test_flow_dissector_load_bytes(void) ASSERT_OK(err, "test_run"); ASSERT_EQ(topts.data_size_out, sizeof(flow_keys), "test_run data_size_out"); - ASSERT_EQ(topts.retval, 1, "test_run retval"); + ASSERT_EQ(topts.retval, BPF_OK, "test_run retval"); if (fd >= -1) close(fd); -- cgit v1.2.3 From d6513727c2af39a8cffb0d9b07376e51a85f347f Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:19 +0300 Subject: bpf, selftests: Test BPF_FLOW_DISSECTOR_CONTINUE The dissector program returns BPF_FLOW_DISSECTOR_CONTINUE (and avoids setting skb->flow_keys or last_dissection map) in case it encounters IP packets whose (outer) source address is 127.0.0.127. Additional test is added to prog_tests/flow_dissector.c which sets this address as test's pkt.iph.saddr, with the expected retval of BPF_FLOW_DISSECTOR_CONTINUE. Also, legacy test_flow_dissector.sh was similarly augmented. 
Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-5-shmulik.ladkani@gmail.com --- .../selftests/bpf/prog_tests/flow_dissector.c | 21 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_flow.c | 15 +++++++++++++++ tools/testing/selftests/bpf/test_flow_dissector.sh | 8 ++++++++ 3 files changed, 44 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 8fa3c454995e..7acca37a3d2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -8,6 +8,8 @@ #include "bpf_flow.skel.h" +#define FLOW_CONTINUE_SADDR 0x7f00007f /* 127.0.0.127 */ + #ifndef IP_MF #define IP_MF 0x2000 #endif @@ -400,6 +402,25 @@ struct test tests[] = { .flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP, .retval = BPF_OK, }, + { + .name = "ipip-encap-dissector-continue", + .pkt.ipip = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_IPIP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .iph.saddr = __bpf_constant_htonl(FLOW_CONTINUE_SADDR), + .iph_inner.ihl = 5, + .iph_inner.protocol = IPPROTO_TCP, + .iph_inner.tot_len = + __bpf_constant_htons(MAGIC_BYTES) - + sizeof(struct iphdr), + .tcp.doff = 5, + .tcp.source = 99, + .tcp.dest = 9090, + }, + .retval = BPF_FLOW_DISSECTOR_CONTINUE, + }, }; static int create_tap(const char *ifname) diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index f266c757b3df..a20c5ed5e454 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -22,6 +22,8 @@ #define PROG(F) PROG_(F, _##F) #define PROG_(NUM, NAME) SEC("flow_dissector") int flow_dissector_##NUM +#define FLOW_CONTINUE_SADDR 0x7f00007f /* 
127.0.0.127 */ + /* These are the identifiers of the BPF programs that will be used in tail * calls. Name is limited to 16 characters, with the terminating character and * bpf_func_ above, we have only 6 to work with, anything after will be cropped. @@ -143,6 +145,19 @@ int _dissect(struct __sk_buff *skb) { struct bpf_flow_keys *keys = skb->flow_keys; + if (keys->n_proto == bpf_htons(ETH_P_IP)) { + /* IP traffic from FLOW_CONTINUE_SADDR falls-back to + * standard dissector + */ + struct iphdr *iph, _iph; + + iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph); + if (iph && iph->ihl == 5 && + iph->saddr == bpf_htonl(FLOW_CONTINUE_SADDR)) { + return BPF_FLOW_DISSECTOR_CONTINUE; + } + } + return parse_eth_proto(skb, keys->n_proto); } diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh index dbd91221727d..5303ce0c977b 100755 --- a/tools/testing/selftests/bpf/test_flow_dissector.sh +++ b/tools/testing/selftests/bpf/test_flow_dissector.sh @@ -115,6 +115,14 @@ tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \ # Send 10 IPv4/UDP packets from port 10. Filter should not drop any. ./test_flow_dissector -i 4 -f 10 +echo "Testing IPv4 from 127.0.0.127 (fallback to generic dissector)..." +# Send 10 IPv4/UDP packets from port 8. Filter should not drop any. +./test_flow_dissector -i 4 -S 127.0.0.127 -f 8 +# Send 10 IPv4/UDP packets from port 9. Filter should drop all. +./test_flow_dissector -i 4 -S 127.0.0.127 -f 9 -F +# Send 10 IPv4/UDP packets from port 10. Filter should not drop any. +./test_flow_dissector -i 4 -S 127.0.0.127 -f 10 + echo "Testing IPIP..." # Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any. 
./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ -- cgit v1.2.3 From 2172fb8007eaafbef18563afb6c1ae5a976bf787 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:54 -0700 Subject: bpf: update bpf_{g,s}et_retval documentation * replace 'syscall' with 'upper layers', still mention that it's being exported via syscall errno * describe what happens in set_retval(-EPERM) + return 1 * describe what happens with bind's 'return 3' Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-5-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 22 +++++++++++++++++----- 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f87012b012e..644600dbb114 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5085,17 +5085,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. 
+ * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f38814fbb618..4fb685591035 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5085,17 +5085,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. 
* - * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3 From e7215f574079ffb138258e8ebfa3f2bf5a4a1238 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:55 -0700 Subject: selftests/bpf: Make sure bpf_{g,s}et_retval is exposed everywhere For each hook, have a simple bpf_set_retval(bpf_get_retval) program and make sure it loads for the hooks we want. The exceptions are the hooks which don't propagate the error to the callers: - sockops - recvmsg - getpeername - getsockname - cg_skb ingress and egress Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-6-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 + .../selftests/bpf/cgroup_getset_retval_hooks.h | 25 +++++++++++ .../bpf/prog_tests/cgroup_getset_retval.c | 48 ++++++++++++++++++++++ .../bpf/progs/cgroup_getset_retval_hooks.c | 16 ++++++++ 4 files changed, 90 insertions(+) create mode 100644 tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h create mode 100644 tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8d59ec7f4c2d..eecad99f1735 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -323,6 +323,7 @@ $(OUTPUT)/test_l4lb_noinline.o: BPF_CFLAGS += -fno-inline $(OUTPUT)/test_xdp_noinline.o: BPF_CFLAGS += -fno-inline $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h +$(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h # Build BPF object using Clang # $1 - input .c file diff --git a/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h new file mode 100644 index 000000000000..a525d3544fd7 --- /dev/null +++ 
b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +BPF_RETVAL_HOOK(ingress, "cgroup_skb/ingress", __sk_buff, -EINVAL) +BPF_RETVAL_HOOK(egress, "cgroup_skb/egress", __sk_buff, -EINVAL) +BPF_RETVAL_HOOK(sock_create, "cgroup/sock_create", bpf_sock, 0) +BPF_RETVAL_HOOK(sock_ops, "sockops", bpf_sock_ops, -EINVAL) +BPF_RETVAL_HOOK(dev, "cgroup/dev", bpf_cgroup_dev_ctx, 0) +BPF_RETVAL_HOOK(bind4, "cgroup/bind4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(bind6, "cgroup/bind6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(connect4, "cgroup/connect4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(connect6, "cgroup/connect6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(post_bind4, "cgroup/post_bind4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(post_bind6, "cgroup/post_bind6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(sendmsg4, "cgroup/sendmsg4", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(sendmsg6, "cgroup/sendmsg6", bpf_sock_addr, 0) +BPF_RETVAL_HOOK(sysctl, "cgroup/sysctl", bpf_sysctl, 0) +BPF_RETVAL_HOOK(recvmsg4, "cgroup/recvmsg4", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(recvmsg6, "cgroup/recvmsg6", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getsockopt, "cgroup/getsockopt", bpf_sockopt, 0) +BPF_RETVAL_HOOK(setsockopt, "cgroup/setsockopt", bpf_sockopt, 0) +BPF_RETVAL_HOOK(getpeername4, "cgroup/getpeername4", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getpeername6, "cgroup/getpeername6", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getsockname4, "cgroup/getsockname4", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(getsockname6, "cgroup/getsockname6", bpf_sock_addr, -EINVAL) +BPF_RETVAL_HOOK(sock_release, "cgroup/sock_release", bpf_sock, 0) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c index 0b47c3c000c7..4d2fa99273d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c @@ -10,6 +10,7 @@ #include 
"cgroup_getset_retval_setsockopt.skel.h" #include "cgroup_getset_retval_getsockopt.skel.h" +#include "cgroup_getset_retval_hooks.skel.h" #define SOL_CUSTOM 0xdeadbeef @@ -433,6 +434,50 @@ close_bpf_object: cgroup_getset_retval_getsockopt__destroy(obj); } +struct exposed_hook { + const char *name; + int expected_err; +} exposed_hooks[] = { + +#define BPF_RETVAL_HOOK(NAME, SECTION, CTX, EXPECTED_ERR) \ + { \ + .name = #NAME, \ + .expected_err = EXPECTED_ERR, \ + }, + +#include "cgroup_getset_retval_hooks.h" + +#undef BPF_RETVAL_HOOK +}; + +static void test_exposed_hooks(int cgroup_fd, int sock_fd) +{ + struct cgroup_getset_retval_hooks *skel; + struct bpf_program *prog; + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(exposed_hooks); i++) { + skel = cgroup_getset_retval_hooks__open(); + if (!ASSERT_OK_PTR(skel, "cgroup_getset_retval_hooks__open")) + continue; + + prog = bpf_object__find_program_by_name(skel->obj, exposed_hooks[i].name); + if (!ASSERT_NEQ(prog, NULL, "bpf_object__find_program_by_name")) + goto close_skel; + + err = bpf_program__set_autoload(prog, true); + if (!ASSERT_OK(err, "bpf_program__set_autoload")) + goto close_skel; + + err = cgroup_getset_retval_hooks__load(skel); + ASSERT_EQ(err, exposed_hooks[i].expected_err, "expected_err"); + +close_skel: + cgroup_getset_retval_hooks__destroy(skel); + } +} + void test_cgroup_getset_retval(void) { int cgroup_fd = -1; @@ -476,6 +521,9 @@ void test_cgroup_getset_retval(void) if (test__start_subtest("getsockopt-retval_sync")) test_getsockopt_retval_sync(cgroup_fd, sock_fd); + if (test__start_subtest("exposed_hooks")) + test_exposed_hooks(cgroup_fd, sock_fd); + close_fd: close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c new file mode 100644 index 000000000000..13dfb4bbfd28 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_hooks.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: 
GPL-2.0-only + +#include +#include + +#define BPF_RETVAL_HOOK(name, section, ctx, expected_err) \ + __attribute__((__section__("?" section))) \ + int name(struct ctx *_ctx) \ + { \ + bpf_set_retval(bpf_get_retval()); \ + return 1; \ + } + +#include "cgroup_getset_retval_hooks.h" + +#undef BPF_RETVAL_HOOK -- cgit v1.2.3 From 35f14dbd2fc6619dea8ac9eea18976378b18450b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 03:32:26 +0200 Subject: selftests/bpf: Add tests for reference state fixes for callbacks These are regression tests to ensure we don't end up in invalid runtime state for helpers that execute callbacks multiple times. It exercises the fixes to verifier callback handling for reference state in previous patches. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823013226.24988-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/cb_refs.c | 48 ++++++++++ tools/testing/selftests/bpf/progs/cb_refs.c | 116 +++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cb_refs.c create mode 100644 tools/testing/selftests/bpf/progs/cb_refs.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c new file mode 100644 index 000000000000..3bff680de16c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bpf/libbpf.h" +#include +#include + +#include "cb_refs.skel.h" + +static char log_buf[1024 * 1024]; + +struct { + const char *prog_name; + const char *err_msg; +} cb_refs_tests[] = { + { "underflow_prog", "reference has not been acquired before" }, + { "leak_prog", "Unreleased reference" }, + { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */ + { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* 
alloc_insn=1{1,2} */ +}; + +void test_cb_refs(void) +{ + LIBBPF_OPTS(bpf_object_open_opts, opts, .kernel_log_buf = log_buf, + .kernel_log_size = sizeof(log_buf), + .kernel_log_level = 1); + struct bpf_program *prog; + struct cb_refs *skel; + int i; + + for (i = 0; i < ARRAY_SIZE(cb_refs_tests); i++) { + LIBBPF_OPTS(bpf_test_run_opts, run_opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + skel = cb_refs__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "cb_refs__open_and_load")) + return; + prog = bpf_object__find_program_by_name(skel->obj, cb_refs_tests[i].prog_name); + bpf_program__set_autoload(prog, true); + if (!ASSERT_ERR(cb_refs__load(skel), "cb_refs__load")) + bpf_prog_test_run_opts(bpf_program__fd(prog), &run_opts); + if (!ASSERT_OK_PTR(strstr(log_buf, cb_refs_tests[i].err_msg), "expected error message")) { + fprintf(stderr, "Expected: %s\n", cb_refs_tests[i].err_msg); + fprintf(stderr, "Verifier: %s\n", log_buf); + } + cb_refs__destroy(skel); + } +} diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c new file mode 100644 index 000000000000..7653df1bc787 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cb_refs.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +struct map_value { + struct prog_test_ref_kfunc __kptr_ref *ptr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 16); +} array_map SEC(".maps"); + +extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym; +extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; + +static __noinline int cb1(void *map, void *key, void *value, void *ctx) +{ + void *p = *(void **)ctx; + bpf_kfunc_call_test_release(p); + /* Without the fix this would cause underflow */ + return 0; +} + +SEC("?tc") +int underflow_prog(void *ctx) +{ + struct prog_test_ref_kfunc *p; + 
unsigned long sl = 0; + + p = bpf_kfunc_call_test_acquire(&sl); + if (!p) + return 0; + bpf_for_each_map_elem(&array_map, cb1, &p, 0); + return 0; +} + +static __always_inline int cb2(void *map, void *key, void *value, void *ctx) +{ + unsigned long sl = 0; + + *(void **)ctx = bpf_kfunc_call_test_acquire(&sl); + /* Without the fix this would leak memory */ + return 0; +} + +SEC("?tc") +int leak_prog(void *ctx) +{ + struct prog_test_ref_kfunc *p; + struct map_value *v; + unsigned long sl; + + v = bpf_map_lookup_elem(&array_map, &(int){0}); + if (!v) + return 0; + + p = NULL; + bpf_for_each_map_elem(&array_map, cb2, &p, 0); + p = bpf_kptr_xchg(&v->ptr, p); + if (p) + bpf_kfunc_call_test_release(p); + return 0; +} + +static __always_inline int cb(void *map, void *key, void *value, void *ctx) +{ + return 0; +} + +static __always_inline int cb3(void *map, void *key, void *value, void *ctx) +{ + unsigned long sl = 0; + void *p; + + bpf_kfunc_call_test_acquire(&sl); + bpf_for_each_map_elem(&array_map, cb, &p, 0); + /* It should only complain here, not in cb. This is why we need + * callback_ref to be set to frameno. 
+ */ + return 0; +} + +SEC("?tc") +int nested_cb(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long sl = 0; + int sp = 0; + + p = bpf_kfunc_call_test_acquire(&sl); + if (!p) + return 0; + bpf_for_each_map_elem(&array_map, cb3, &sp, 0); + bpf_kfunc_call_test_release(p); + return 0; +} + +SEC("?tc") +int non_cb_transfer_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long sl = 0; + + p = bpf_kfunc_call_test_acquire(&sl); + if (!p) + return 0; + cb1(NULL, NULL, NULL, &p); + bpf_kfunc_call_test_acquire(&sl); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b03914f7ff7bc5aca056aaa49fd3ff9120d24f47 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Wed, 24 Aug 2022 16:39:06 +0000 Subject: selftests/bpf: Add cb_refs test to s390x deny list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cb_refs BPF selftest is failing execution on s390x machines. This is a newly added test that requires a feature not presently supported on this architecture. Denylist the test for this architecture. 
Fixes: 3cf7e7d8685c ("selftests/bpf: Add tests for reference state fixes for callbacks") Signed-off-by: Daniel Müller Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220824163906.1186832-1-deso@posteo.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index a708c3dcc154..37bafcbf952a 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -66,3 +66,4 @@ select_reuseport # intermittently fails on new s390x set xdp_synproxy # JIT does not support calling kernel function (kfunc) unpriv_bpf_disabled # fentry setget_sockopt # attach unexpected error: -524 (trampoline) +cb_refs # expected error message unexpected error: -524 (trampoline) -- cgit v1.2.3 From 7e165d1939284d0bf16a83c591c3c5d24a110d0a Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 24 Aug 2022 09:39:07 +0800 Subject: selftests/bpf: Fix wrong size passed to bpf_setsockopt() sizeof(new_cc) is not real memory size that new_cc points to; introduce a new_cc_len to store the size and then pass it to bpf_setsockopt(). 
Fixes: 31123c0360e0 ("selftests/bpf: bpf_setsockopt tests") Signed-off-by: Yang Yingliang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220824013907.380448-1-yangyingliang@huawei.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/setget_sockopt.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 4a4cb44a4a15..40606ef47a38 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -305,15 +305,19 @@ static int bpf_test_tcp_sockopt(__u32 i, struct loop_ctx *lc) if (t->opt == TCP_CONGESTION) { char old_cc[16], tmp_cc[16]; const char *new_cc; + int new_cc_len; if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc))) return 1; - if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc)) + if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc)) { new_cc = reno_cc; - else + new_cc_len = sizeof(reno_cc); + } else { new_cc = cubic_cc; + new_cc_len = sizeof(cubic_cc); + } if (bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, (void *)new_cc, - sizeof(new_cc))) + new_cc_len)) return 1; if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, tmp_cc, sizeof(tmp_cc))) return 1; -- cgit v1.2.3 From c35ecb95c448cde15cbde8fde93350d50bcc8be7 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 22 Aug 2022 11:10:22 -0700 Subject: selftests/net: Add test for timing a bind request to a port with a populated bhash entry This test populates the bhash table for a given port with MAX_THREADS * MAX_CONNECTIONS sockets, and then times how long a bind request on the port takes. When populating the bhash table, we create the sockets and then bind the sockets to the same address and port (SO_REUSEADDR and SO_REUSEPORT are set). 
When timing how long a bind on the port takes, we bind on a different address without SO_REUSEPORT set. We do not set SO_REUSEPORT because we are interested in the case where the bind request does not go through the tb->fastreuseport path, which is fragile (eg tb->fastreuseport path does not work if binding with a different uid). To run the script: Usage: ./bind_bhash.sh [-6 | -4] [-p port] [-a address] 6: use ipv6 4: use ipv4 port: Port number address: ip address Without any arguments, ./bind_bhash.sh defaults to ipv6 using ip address "2001:0db8:0:f101::1" on port 443. On my local machine, I see: ipv4: before - 0.002317 seconds with bhash2 - 0.000020 seconds ipv6: before - 0.002431 seconds with bhash2 - 0.000021 seconds Signed-off-by: Joanne Koong Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 3 +- tools/testing/selftests/net/Makefile | 3 + tools/testing/selftests/net/bind_bhash.c | 144 ++++++++++++++++++++++++++++++ tools/testing/selftests/net/bind_bhash.sh | 66 ++++++++++++++ 4 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/bind_bhash.c create mode 100755 tools/testing/selftests/net/bind_bhash.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 0e5751af6247..89e2d4aa812a 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -39,4 +39,5 @@ toeplitz tun cmsg_sender unix_connect -tap \ No newline at end of file +tap +bind_bhash diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 11a288b67e2f..b17ec78f3951 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -43,6 +43,7 @@ TEST_PROGS += ndisc_unsolicited_na_test.sh TEST_PROGS += arp_ndisc_untracked_subnets.sh TEST_PROGS += stress_reuseport_listen.sh TEST_PROGS := l2_tos_ttl_inherit.sh +TEST_PROGS += bind_bhash.sh TEST_PROGS_EXTENDED := 
in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest @@ -64,6 +65,7 @@ TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh TEST_GEN_FILES += io_uring_zerocopy_tx +TEST_GEN_FILES += bind_bhash TEST_FILES := settings @@ -74,3 +76,4 @@ include bpf/Makefile $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread $(OUTPUT)/tcp_inq: LDLIBS += -lpthread +$(OUTPUT)/bind_bhash: LDLIBS += -lpthread diff --git a/tools/testing/selftests/net/bind_bhash.c b/tools/testing/selftests/net/bind_bhash.c new file mode 100644 index 000000000000..57ff67a3751e --- /dev/null +++ b/tools/testing/selftests/net/bind_bhash.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This times how long it takes to bind to a port when the port already + * has multiple sockets in its bhash table. + * + * In the setup(), we populate the port's bhash table with + * MAX_THREADS * MAX_CONNECTIONS number of entries. + */ + +#include +#include +#include +#include +#include +#include + +#define MAX_THREADS 600 +#define MAX_CONNECTIONS 40 + +static const char *setup_addr_v6 = "::1"; +static const char *setup_addr_v4 = "127.0.0.1"; +static const char *setup_addr; +static const char *bind_addr; +static const char *port; +bool use_v6; +int ret; + +static int fd_array[MAX_THREADS][MAX_CONNECTIONS]; + +static int bind_socket(int opt, const char *addr) +{ + struct addrinfo *res, hint = {}; + int sock_fd, reuse = 1, err; + int domain = use_v6 ? 
AF_INET6 : AF_INET; + + sock_fd = socket(domain, SOCK_STREAM, 0); + if (sock_fd < 0) { + perror("socket fd err"); + return sock_fd; + } + + hint.ai_family = domain; + hint.ai_socktype = SOCK_STREAM; + + err = getaddrinfo(addr, port, &hint, &res); + if (err) { + /* getaddrinfo() reports failure through its return value, not errno */ + fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(err)); + err = -1; + goto cleanup; + } + + if (opt) { + err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse)); + if (err) { + perror("setsockopt failed"); + goto cleanup; + } + } + + err = bind(sock_fd, res->ai_addr, res->ai_addrlen); + if (err) { + perror("failed to bind to port"); + goto cleanup; + } + + return sock_fd; + +cleanup: + close(sock_fd); + return err; +} + +static void *setup(void *arg) +{ + int sock_fd, i; + int *array = (int *)arg; + + for (i = 0; i < MAX_CONNECTIONS; i++) { + /* NOTE(review): option names are not bit flags; with the asm-generic values (2 | 15) this presumably sets SO_REUSEPORT only - confirm intent */ + sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr); + if (sock_fd < 0) { + ret = sock_fd; + pthread_exit(&ret); + } + array[i] = sock_fd; + } + + return NULL; +} + +int main(int argc, const char *argv[]) +{ + int listener_fd, sock_fd, i, j; + pthread_t tid[MAX_THREADS]; + clock_t begin, end; + + if (argc != 4) { + printf("Usage: listener <port> <ipv6 | ipv4> <bind-addr>\n"); + return -1; + } + + port = argv[1]; + use_v6 = strcmp(argv[2], "ipv6") == 0; + bind_addr = argv[3]; + + setup_addr = use_v6 ? 
setup_addr_v6 : setup_addr_v4; + + listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr); + /* bind_socket() has already printed the reason for the failure */ + if (listener_fd < 0) + return -1; + + if (listen(listener_fd, 100) < 0) { + perror("listen failed"); + close(listener_fd); + return -1; + } + + /* Set up threads to populate the bhash table entry for the port */ + for (i = 0; i < MAX_THREADS; i++) + pthread_create(&tid[i], NULL, setup, fd_array[i]); + + for (i = 0; i < MAX_THREADS; i++) + pthread_join(tid[i], NULL); + + if (ret) + goto done; + + begin = clock(); + + /* Bind to the same port on a different address */ + sock_fd = bind_socket(0, bind_addr); + if (sock_fd < 0) + goto done; + + end = clock(); + + printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC); + + /* clean up */ + close(sock_fd); + +done: + close(listener_fd); + /* close every socket opened by every setup thread; slots never filled are still 0 in the static array and must not be closed */ + for (i = 0; i < MAX_THREADS; i++) { + for (j = 0; j < MAX_CONNECTIONS; j++) { + if (fd_array[i][j] > 0) + close(fd_array[i][j]); + } + } + + return 0; +} diff --git a/tools/testing/selftests/net/bind_bhash.sh b/tools/testing/selftests/net/bind_bhash.sh new file mode 100755 index 000000000000..ca0292d4b441 --- /dev/null +++ b/tools/testing/selftests/net/bind_bhash.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +NR_FILES=32768 +SAVED_NR_FILES=$(ulimit -n) + +# default values +port=443 +addr_v6="2001:0db8:0:f101::1" +addr_v4="10.8.8.8" +use_v6=true +addr="" + +usage() { + echo "Usage: $0 [-6 | -4] [-p port] [-a address]" + echo -e "\t6: use ipv6" + echo -e "\t4: use ipv4" + echo -e "\tport: Port number" + echo -e "\taddress: ip address" +} + +while getopts "ha:p:64" opt; do + case ${opt} in + h) + usage $0 + exit 0 + ;; + a) addr=$OPTARG;; + p) + port=$OPTARG;; + 6) + use_v6=true;; + 4) + use_v6=false;; + esac +done + +setup() { + if [[ "$use_v6" == true ]]; then + ip addr add $addr_v6 nodad dev eth0 + else + ip addr add $addr_v4 dev lo + fi + ulimit -n $NR_FILES +} + +cleanup() { + if [[ "$use_v6" == true ]]; then + ip addr del $addr_v6 dev eth0 + else + ip addr del $addr_v4/32 dev lo + fi + ulimit -n $SAVED_NR_FILES +} + +if [[ "$addr" != "" 
]]; then + addr_v4=$addr; + addr_v6=$addr; +fi +setup +if [[ "$use_v6" == true ]] ; then + ./bind_bhash $port "ipv6" $addr_v6 +else + ./bind_bhash $port "ipv4" $addr_v4 +fi +cleanup -- cgit v1.2.3 From 1be9ac87a75a4fc0e2cc254e412d2d67a58a7191 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 22 Aug 2022 11:10:23 -0700 Subject: selftests/net: Add sk_bind_sendto_listen and sk_connect_zero_addr This patch adds 2 new tests: sk_bind_sendto_listen and sk_connect_zero_addr. The sk_bind_sendto_listen test exercises the path where a socket's rcv saddr changes after it has been added to the binding tables, and then a listen() on the socket is invoked. The listen() should succeed. The sk_bind_sendto_listen test is copied over from one of syzbot's tests: https://syzkaller.appspot.com/x/repro.c?x=1673a38df00000 The sk_connect_zero_addr test exercises the path where the socket was never previously added to the binding tables and it gets assigned a saddr upon a connect() to address 0. Signed-off-by: Joanne Koong Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 2 + tools/testing/selftests/net/Makefile | 2 + .../testing/selftests/net/sk_bind_sendto_listen.c | 80 ++++++++++++++++++++++ tools/testing/selftests/net/sk_connect_zero_addr.c | 62 +++++++++++++++++ 4 files changed, 146 insertions(+) create mode 100644 tools/testing/selftests/net/sk_bind_sendto_listen.c create mode 100644 tools/testing/selftests/net/sk_connect_zero_addr.c (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 89e2d4aa812a..bec5cf96984c 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -41,3 +41,5 @@ cmsg_sender unix_connect tap bind_bhash +sk_bind_sendto_listen +sk_connect_zero_addr diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b17ec78f3951..e6a951ba5ba0 100644 --- a/tools/testing/selftests/net/Makefile +++ 
b/tools/testing/selftests/net/Makefile @@ -66,6 +66,8 @@ TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh TEST_GEN_FILES += io_uring_zerocopy_tx TEST_GEN_FILES += bind_bhash +TEST_GEN_PROGS += sk_bind_sendto_listen +TEST_GEN_PROGS += sk_connect_zero_addr TEST_FILES := settings diff --git a/tools/testing/selftests/net/sk_bind_sendto_listen.c b/tools/testing/selftests/net/sk_bind_sendto_listen.c new file mode 100644 index 000000000000..b420d830f72c --- /dev/null +++ b/tools/testing/selftests/net/sk_bind_sendto_listen.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +int main(void) +{ + int fd1, fd2, one = 1; + struct sockaddr_in6 bind_addr = { + .sin6_family = AF_INET6, + .sin6_port = htons(20000), + .sin6_flowinfo = htonl(0), + .sin6_addr = {}, + .sin6_scope_id = 0, + }; + + inet_pton(AF_INET6, "::", &bind_addr.sin6_addr); + + fd1 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP); + if (fd1 < 0) { + error(1, errno, "socket fd1"); + return -1; + } + + if (setsockopt(fd1, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) { + error(1, errno, "setsockopt(SO_REUSEADDR) fd1"); + goto out_err1; + } + + if (bind(fd1, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) { + error(1, errno, "bind fd1"); + goto out_err1; + } + + if (sendto(fd1, NULL, 0, MSG_FASTOPEN, (struct sockaddr *)&bind_addr, + sizeof(bind_addr))) { + error(1, errno, "sendto fd1"); + goto out_err1; + } + + fd2 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP); + if (fd2 < 0) { + error(1, errno, "socket fd2"); + goto out_err1; + } + + if (setsockopt(fd2, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) { + error(1, errno, "setsockopt(SO_REUSEADDR) fd2"); + goto out_err2; + } + + if (bind(fd2, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) { + error(1, errno, "bind fd2"); + goto out_err2; + } + + if (sendto(fd2, NULL, 0, MSG_FASTOPEN, (struct sockaddr *)&bind_addr, + sizeof(bind_addr)) != -1) { + error(1, errno, "sendto fd2"); + goto 
out_err2; + } + + if (listen(fd2, 0)) { + error(1, errno, "listen"); + goto out_err2; + } + + close(fd2); + close(fd1); + return 0; + +out_err2: + close(fd2); + +out_err1: + close(fd1); + return -1; +} diff --git a/tools/testing/selftests/net/sk_connect_zero_addr.c b/tools/testing/selftests/net/sk_connect_zero_addr.c new file mode 100644 index 000000000000..4be418aefd9f --- /dev/null +++ b/tools/testing/selftests/net/sk_connect_zero_addr.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +int main(void) +{ + int fd1, fd2, one = 1; + struct sockaddr_in6 bind_addr = { + .sin6_family = AF_INET6, + .sin6_port = htons(20000), + .sin6_flowinfo = htonl(0), + .sin6_addr = {}, + .sin6_scope_id = 0, + }; + + inet_pton(AF_INET6, "::", &bind_addr.sin6_addr); + + fd1 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP); + if (fd1 < 0) { + error(1, errno, "socket fd1"); + return -1; + } + + if (setsockopt(fd1, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) { + error(1, errno, "setsockopt(SO_REUSEADDR) fd1"); + goto out_err1; + } + + if (bind(fd1, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) { + error(1, errno, "bind fd1"); + goto out_err1; + } + + if (listen(fd1, 0)) { + error(1, errno, "listen"); + goto out_err1; + } + + fd2 = socket(AF_INET6, SOCK_STREAM, IPPROTO_IP); + if (fd2 < 0) { + error(1, errno, "socket fd2"); + goto out_err1; + } + + /* fd2 was never bound; connect it to the zero address ("::") that fd1 listens on */ + if (connect(fd2, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) { + error(1, errno, "connect fd2"); + goto out_err2; + } + + close(fd2); + close(fd1); + return 0; + +out_err2: + close(fd2); +out_err1: + close(fd1); + return -1; +} -- cgit v1.2.3 From c93c296fff6b369a7115916145047c8a3db6e27f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 24 Aug 2022 17:13:26 +0200 Subject: x86/sev: Mark snp_abort() noreturn Mark both the function prototype and definition as noreturn in order to prevent the compiler from doing transformations which confuse objtool like so: vmlinux.o: warning: objtool: 
sme_enable+0x71: unreachable instruction This triggers with gcc-12. Add it and sev_es_terminate() to the objtool noreturn tracking array too. Sort it while at it. Suggested-by: Michael Matz Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20220824152420.20547-1-bp@alien8.de --- arch/x86/include/asm/sev.h | 2 +- arch/x86/kernel/sev.c | 2 +- tools/objtool/check.c | 34 ++++++++++++++++++---------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 4a23e52fe0ee..ebc271bb6d8e 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -195,7 +195,7 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void snp_abort(void); +void __init __noreturn snp_abort(void); int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 4f84c3f11af5..a428c62330d3 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2112,7 +2112,7 @@ bool __init snp_init(struct boot_params *bp) return true; } -void __init snp_abort(void) +void __init __noreturn snp_abort(void) { sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0cec74da7ffe..ad51689dfb41 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -162,32 +162,34 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, /* * Unfortunately these have to be hard coded because the noreturn - * attribute isn't provided in ELF data. + * attribute isn't provided in ELF data. Keep 'em sorted. 
*/ static const char * const global_noreturns[] = { + "__invalid_creds", + "__module_put_and_kthread_exit", + "__reiserfs_panic", "__stack_chk_fail", - "panic", + "__ubsan_handle_builtin_unreachable", + "cpu_bringup_and_idle", + "cpu_startup_entry", "do_exit", + "do_group_exit", "do_task_dead", - "kthread_exit", - "make_task_dead", - "__module_put_and_kthread_exit", + "ex_handler_msr_mce", + "fortify_panic", "kthread_complete_and_exit", - "__reiserfs_panic", + "kthread_exit", + "kunit_try_catch_throw", "lbug_with_loc", - "fortify_panic", - "usercopy_abort", "machine_real_restart", + "make_task_dead", + "panic", "rewind_stack_and_make_dead", - "kunit_try_catch_throw", - "xen_start_kernel", - "cpu_bringup_and_idle", - "do_group_exit", + "sev_es_terminate", + "snp_abort", "stop_this_cpu", - "__invalid_creds", - "cpu_startup_entry", - "__ubsan_handle_builtin_unreachable", - "ex_handler_msr_mce", + "usercopy_abort", + "xen_start_kernel", }; if (!func) -- cgit v1.2.3 From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. - walking a cgroup's ancestors. - process only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the root cgroup v2. For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. 
Currently only one session is supported, which means, depending on the volume of data bpf program intends to send to user space, the number of cgroups that can be walked is limited. For example, given the current buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. In order to work around, the user may have to update their program to reduce the volume of data sent to output. For example, skip some uninteresting cgroups. In future, we may extend bpf_iter flags to allow customizing buffer size. Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 + include/uapi/linux/bpf.h | 30 +++ kernel/bpf/Makefile | 3 + kernel/bpf/cgroup_iter.c | 284 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 30 +++ tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 +- 6 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/cgroup_iter.c (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99fc7a64564f..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } 
cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 644600dbb114..0f61f09f467a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. 
+ */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 057ba8e01e70..00e05b69a4df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -24,6 +24,9 @@ endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c new file mode 100644 index 000000000000..cf6d763a57d5 --- /dev/null +++ b/kernel/bpf/cgroup_iter.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Google */ +#include +#include +#include +#include +#include + +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ + +/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. + * + * 1. Walk the descendants of a cgroup in pre-order. + * 2. Walk the descendants of a cgroup in post-order. + * 3. Walk the ancestors of a cgroup. + * 4. Show the given cgroup only. + * + * For walking descendants, cgroup_iter can walk in either pre-order or + * post-order. For walking ancestors, the iter walks up from a cgroup to + * the root. + * + * The iter program can terminate the walk early by returning 1. Walk + * continues if prog returns 0. + * + * The prog can check (seq->num == 0) to determine whether this is + * the first element. The prog may also be passed a NULL cgroup, + * which means the walk has completed and the prog has a chance to + * do post-processing, such as outputting an epilogue. + * + * Note: the iter_prog is called with cgroup_mutex held. 
+ * + * Currently only one session is supported, which means, depending on the + * volume of data bpf program intends to send to user space, the number + * of cgroups that can be walked is limited. For example, given the current + * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each + * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can + * be walked is 512. This is a limitation of cgroup_iter. If the output data + * is larger than the kernel buffer size, after all data in the kernel buffer + * is consumed by user space, the subsequent read() syscall will signal + * EOPNOTSUPP. In order to work around, the user may have to update their + * program to reduce the volume of data sent to output. For example, skip + * some uninteresting cgroups. + */ + +struct bpf_iter__cgroup { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct cgroup *, cgroup); +}; + +struct cgroup_iter_priv { + struct cgroup_subsys_state *start_css; + bool visited_all; + bool terminate; + int order; +}; + +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_lock(&cgroup_mutex); + + /* cgroup_iter doesn't support read across multiple sessions. */ + if (*pos > 0) { + if (p->visited_all) + return NULL; + + /* Haven't visited all, but because cgroup_mutex has dropped, + * return -EOPNOTSUPP to indicate incomplete iteration. 
+ */ + return ERR_PTR(-EOPNOTSUPP); + } + + ++*pos; + p->terminate = false; + p->visited_all = false; + if (p->order == BPF_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(NULL, p->start_css); + else if (p->order == BPF_ITER_DESCENDANTS_POST) + return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_ITER_ANCESTORS_UP) + return p->start_css; + else /* BPF_ITER_SELF_ONLY */ + return p->start_css; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop); + +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_unlock(&cgroup_mutex); + + /* pass NULL to the prog for post-processing */ + if (!v) { + __cgroup_iter_seq_show(seq, NULL, true); + p->visited_all = true; + } +} + +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; + struct cgroup_iter_priv *p = seq->private; + + ++*pos; + if (p->terminate) + return NULL; + + if (p->order == BPF_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(curr, p->start_css); + else if (p->order == BPF_ITER_DESCENDANTS_POST) + return css_next_descendant_post(curr, p->start_css); + else if (p->order == BPF_ITER_ANCESTORS_UP) + return curr->parent; + else /* BPF_ITER_SELF_ONLY */ + return NULL; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop) +{ + struct cgroup_iter_priv *p = seq->private; + struct bpf_iter__cgroup ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + /* cgroup is dead, skip this element */ + if (css && cgroup_is_dead(css->cgroup)) + return 0; + + ctx.meta = &meta; + ctx.cgroup = css ? css->cgroup : NULL; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + /* if prog returns > 0, terminate after this element. 
*/ + if (ret != 0) + p->terminate = true; + + return 0; +} + +static int cgroup_iter_seq_show(struct seq_file *seq, void *v) +{ + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, + false); +} + +static const struct seq_operations cgroup_iter_seq_ops = { + .start = cgroup_iter_seq_start, + .next = cgroup_iter_seq_next, + .stop = cgroup_iter_seq_stop, + .show = cgroup_iter_seq_show, +}; + +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) + +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + struct cgroup *cgrp = aux->cgroup.start; + + p->start_css = &cgrp->self; + p->terminate = false; + p->visited_all = false; + p->order = aux->cgroup.order; + return 0; +} + +static const struct bpf_iter_seq_info cgroup_iter_seq_info = { + .seq_ops = &cgroup_iter_seq_ops, + .init_seq_private = cgroup_iter_seq_init, + .seq_priv_size = sizeof(struct cgroup_iter_priv), +}; + +static int bpf_iter_attach_cgroup(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + int fd = linfo->cgroup.cgroup_fd; + u64 id = linfo->cgroup.cgroup_id; + int order = linfo->cgroup.order; + struct cgroup *cgrp; + + if (order != BPF_ITER_DESCENDANTS_PRE && + order != BPF_ITER_DESCENDANTS_POST && + order != BPF_ITER_ANCESTORS_UP && + order != BPF_ITER_SELF_ONLY) + return -EINVAL; + + if (fd && id) + return -EINVAL; + + if (fd) + cgrp = cgroup_get_from_fd(fd); + else if (id) + cgrp = cgroup_get_from_id(id); + else /* walk the entire hierarchy by default. 
*/ + cgrp = cgroup_get_from_path("/"); + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + aux->cgroup.start = cgrp; + aux->cgroup.order = order; + return 0; +} + +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) +{ + cgroup_put(aux->cgroup.start); +} + +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + char *buf; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + seq_puts(seq, "cgroup_path:\t\n"); + goto show_order; + } + + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path + * will print nothing. + * + * Path is in the calling process's cgroup namespace. + */ + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + seq_printf(seq, "cgroup_path:\t%s\n", buf); + kfree(buf); + +show_order: + if (aux->cgroup.order == BPF_ITER_DESCENDANTS_PRE) + seq_puts(seq, "order: descendants_pre\n"); + else if (aux->cgroup.order == BPF_ITER_DESCENDANTS_POST) + seq_puts(seq, "order: descendants_post\n"); + else if (aux->cgroup.order == BPF_ITER_ANCESTORS_UP) + seq_puts(seq, "order: ancestors_up\n"); + else /* BPF_ITER_SELF_ONLY */ + seq_puts(seq, "order: self_only\n"); +} + +static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.cgroup.order = aux->cgroup.order; + info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); + return 0; +} + +DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, + struct cgroup *cgroup) + +static struct bpf_iter_reg bpf_cgroup_reg_info = { + .target = "cgroup", + .feature = BPF_ITER_RESCHED, + .attach_target = bpf_iter_attach_cgroup, + .detach_target = bpf_iter_detach_cgroup, + .show_fdinfo = bpf_iter_cgroup_show_fdinfo, + .fill_link_info = bpf_iter_cgroup_fill_link_info, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cgroup, cgroup), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &cgroup_iter_seq_info, +}; + +static int __init 
bpf_cgroup_iter_init(void) +{ + bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; + return bpf_iter_reg_target(&bpf_cgroup_reg_info); +} + +late_initcall(bpf_cgroup_iter_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4fb685591035..5056cef2112f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. 
+ */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 5fce7008d1ff..84c1cfaa2b02 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,8 +764,8 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}", - { .map = { .map_fd = 1 }}); + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (__u32)1,.cgroup_fd = (__u32)1,},}", + { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so * complex, we don't do a string comparison, just verify we return -- cgit v1.2.3 From fe0dd9d4b7402c9773fc7a453fa65875abaa24ec Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:14 -0700 Subject: selftests/bpf: Test cgroup_iter. Add a selftest for cgroup_iter. The selftest creates a mini cgroup tree of the following structure: ROOT (working cgroup) | PARENT / \ CHILD1 CHILD2 and tests the following scenarios: - invalid cgroup fd. - pre-order walk over descendants from PARENT. - post-order walk over descendants from PARENT. - walk of ancestors from PARENT. - process only a single object (i.e. PARENT). - early termination. 
Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-3-haoluo@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- .../testing/selftests/bpf/prog_tests/cgroup_iter.c | 224 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_iter.h | 7 + tools/testing/selftests/bpf/progs/cgroup_iter.c | 39 ++++ 4 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_iter.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_iter.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 84c1cfaa2b02..a1bae92be1fc 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (__u32)1,.cgroup_fd = (__u32)1,},}", + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c new file mode 100644 index 000000000000..38958c37b9ce --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include +#include +#include +#include "cgroup_iter.skel.h" +#include "cgroup_helpers.h" + +#define ROOT 0 +#define PARENT 1 
+#define CHILD1 2 +#define CHILD2 3 +#define NUM_CGROUPS 4 + +#define PROLOGUE "prologue\n" +#define EPILOGUE "epilogue\n" + +static const char *cg_path[] = { + "/", "/parent", "/parent/child1", "/parent/child2" +}; + +static int cg_fd[] = {-1, -1, -1, -1}; +static unsigned long long cg_id[] = {0, 0, 0, 0}; +static char expected_output[64]; + +static int setup_cgroups(void) +{ + int fd, i = 0; + + for (i = 0; i < NUM_CGROUPS; i++) { + fd = create_and_get_cgroup(cg_path[i]); + if (fd < 0) + return fd; + + cg_fd[i] = fd; + cg_id[i] = get_cgroup_id(cg_path[i]); + } + return 0; +} + +static void cleanup_cgroups(void) +{ + int i; + + for (i = 0; i < NUM_CGROUPS; i++) + close(cg_fd[i]); +} + +static void read_from_cgroup_iter(struct bpf_program *prog, int cgroup_fd, + int order, const char *testname) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct bpf_link *link; + int len, iter_fd; + static char buf[128]; + size_t left; + char *p; + + memset(&linfo, 0, sizeof(linfo)); + linfo.cgroup.cgroup_fd = cgroup_fd; + linfo.cgroup.order = order; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(prog, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + return; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (iter_fd < 0) + goto free_link; + + memset(buf, 0, sizeof(buf)); + left = ARRAY_SIZE(buf); + p = buf; + while ((len = read(iter_fd, p, left)) > 0) { + p += len; + left -= len; + } + + ASSERT_STREQ(buf, expected_output, testname); + + /* read() after iter finishes should be ok. */ + if (len == 0) + ASSERT_OK(read(iter_fd, buf, sizeof(buf)), "second_read"); + + close(iter_fd); +free_link: + bpf_link__destroy(link); +} + +/* Invalid cgroup. 
*/ +static void test_invalid_cgroup(struct cgroup_iter *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct bpf_link *link; + + memset(&linfo, 0, sizeof(linfo)); + linfo.cgroup.cgroup_fd = (__u32)-1; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_id_printer, &opts); + ASSERT_ERR_PTR(link, "attach_iter"); + bpf_link__destroy(link); +} + +/* Specifying both cgroup_fd and cgroup_id is invalid. */ +static void test_invalid_cgroup_spec(struct cgroup_iter *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo; + struct bpf_link *link; + + memset(&linfo, 0, sizeof(linfo)); + linfo.cgroup.cgroup_fd = (__u32)cg_fd[PARENT]; + linfo.cgroup.cgroup_id = (__u64)cg_id[PARENT]; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_id_printer, &opts); + ASSERT_ERR_PTR(link, "attach_iter"); + bpf_link__destroy(link); +} + +/* Preorder walk prints parent and child in order. */ +static void test_walk_preorder(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n%8llu\n" EPILOGUE, + cg_id[PARENT], cg_id[CHILD1], cg_id[CHILD2]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_ITER_DESCENDANTS_PRE, "preorder"); +} + +/* Postorder walk prints child and parent in order. */ +static void test_walk_postorder(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n%8llu\n" EPILOGUE, + cg_id[CHILD1], cg_id[CHILD2], cg_id[PARENT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_ITER_DESCENDANTS_POST, "postorder"); +} + +/* Walking parents prints parent and then root. */ +static void test_walk_ancestors_up(struct cgroup_iter *skel) +{ + /* terminate the walk when ROOT is met. 
*/ + skel->bss->terminal_cgroup = cg_id[ROOT]; + + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n" EPILOGUE, + cg_id[PARENT], cg_id[ROOT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_ITER_ANCESTORS_UP, "ancestors_up"); + + skel->bss->terminal_cgroup = 0; +} + +/* Early termination prints parent only. */ +static void test_early_termination(struct cgroup_iter *skel) +{ + /* terminate the walk after the first element is processed. */ + skel->bss->terminate_early = 1; + + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_ITER_DESCENDANTS_PRE, "early_termination"); + + skel->bss->terminate_early = 0; +} + +/* Walking self prints self only. */ +static void test_walk_self_only(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_ITER_SELF_ONLY, "self_only"); +} + +void test_cgroup_iter(void) +{ + struct cgroup_iter *skel = NULL; + + if (setup_cgroup_environment()) + return; + + if (setup_cgroups()) + goto out; + + skel = cgroup_iter__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_iter__open_and_load")) + goto out; + + if (test__start_subtest("cgroup_iter__invalid_cgroup")) + test_invalid_cgroup(skel); + if (test__start_subtest("cgroup_iter__invalid_cgroup_spec")) + test_invalid_cgroup_spec(skel); + if (test__start_subtest("cgroup_iter__preorder")) + test_walk_preorder(skel); + if (test__start_subtest("cgroup_iter__postorder")) + test_walk_postorder(skel); + if (test__start_subtest("cgroup_iter__ancestors_up_walk")) + test_walk_ancestors_up(skel); + if (test__start_subtest("cgroup_iter__early_termination")) + test_early_termination(skel); + if (test__start_subtest("cgroup_iter__self_only")) + test_walk_self_only(skel); +out: + 
cgroup_iter__destroy(skel); + cleanup_cgroups(); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index e9846606690d..c41ee80533ca 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -17,6 +17,7 @@ #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used #define bpf_iter__sockmap bpf_iter__sockmap___not_used #define bpf_iter__bpf_link bpf_iter__bpf_link___not_used +#define bpf_iter__cgroup bpf_iter__cgroup___not_used #define btf_ptr btf_ptr___not_used #define BTF_F_COMPACT BTF_F_COMPACT___not_used #define BTF_F_NONAME BTF_F_NONAME___not_used @@ -40,6 +41,7 @@ #undef bpf_iter__bpf_sk_storage_map #undef bpf_iter__sockmap #undef bpf_iter__bpf_link +#undef bpf_iter__cgroup #undef btf_ptr #undef BTF_F_COMPACT #undef BTF_F_NONAME @@ -141,6 +143,11 @@ struct bpf_iter__bpf_link { struct bpf_link *link; }; +struct bpf_iter__cgroup { + struct bpf_iter_meta *meta; + struct cgroup *cgroup; +} __attribute__((preserve_access_index)); + struct btf_ptr { void *ptr; __u32 type_id; diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter.c b/tools/testing/selftests/bpf/progs/cgroup_iter.c new file mode 100644 index 000000000000..de03997322a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_iter.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include "bpf_iter.h" +#include +#include + +char _license[] SEC("license") = "GPL"; +int terminate_early = 0; +u64 terminal_cgroup = 0; + +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + +SEC("iter/cgroup") +int cgroup_id_printer(struct bpf_iter__cgroup *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct cgroup *cgrp = ctx->cgroup; + + /* epilogue */ + if (cgrp == NULL) { + BPF_SEQ_PRINTF(seq, "epilogue\n"); + return 0; + } + + /* prologue */ + if (ctx->meta->seq_num == 0) + 
BPF_SEQ_PRINTF(seq, "prologue\n"); + + BPF_SEQ_PRINTF(seq, "%8llu\n", cgroup_id(cgrp)); + + if (terminal_cgroup == cgroup_id(cgrp)) + return 1; + + return terminate_early ? 1 : 0; +} -- cgit v1.2.3 From 434992bb603773c94465c7e68331e68424bdc9eb Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Aug 2022 16:31:16 -0700 Subject: selftests/bpf: extend cgroup helpers This patch extends bpf selftest cgroup_helpers in various ways: - Add enable_controllers() that allows tests to enable all or a subset of controllers for a specific cgroup. - Add join_parent_cgroup(). The cgroup workdir is based on the pid, therefore a spawned child cannot join the same cgroup hierarchy of the test through join_cgroup(). join_parent_cgroup() is used in child processes to join a cgroup under the parent's workdir. - Add write_cgroup_file() and write_cgroup_file_parent() (similar to join_parent_cgroup() above). - Add get_root_cgroup() for tests that need to do checks on root cgroup. - Distinguish relative and absolute cgroup paths in function arguments. Now relative paths are called relative_path, and absolute paths are called cgroup_path. 
Signed-off-by: Yosry Ahmed Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-5-haoluo@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/cgroup_helpers.c | 202 +++++++++++++++++++++------ tools/testing/selftests/bpf/cgroup_helpers.h | 19 ++- 2 files changed, 174 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 9d59c3990ca8..e914cc45b766 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -33,49 +33,52 @@ #define CGROUP_MOUNT_DFLT "/sys/fs/cgroup" #define NETCLS_MOUNT_PATH CGROUP_MOUNT_DFLT "/net_cls" #define CGROUP_WORK_DIR "/cgroup-test-work-dir" -#define format_cgroup_path(buf, path) \ + +#define format_cgroup_path_pid(buf, path, pid) \ snprintf(buf, sizeof(buf), "%s%s%d%s", CGROUP_MOUNT_PATH, \ - CGROUP_WORK_DIR, getpid(), path) + CGROUP_WORK_DIR, pid, path) + +#define format_cgroup_path(buf, path) \ + format_cgroup_path_pid(buf, path, getpid()) + +#define format_parent_cgroup_path(buf, path) \ + format_cgroup_path_pid(buf, path, getppid()) #define format_classid_path(buf) \ snprintf(buf, sizeof(buf), "%s%s", NETCLS_MOUNT_PATH, \ CGROUP_WORK_DIR) -/** - * enable_all_controllers() - Enable all available cgroup v2 controllers - * - * Enable all available cgroup v2 controllers in order to increase - * the code coverage. - * - * If successful, 0 is returned. 
- */ -static int enable_all_controllers(char *cgroup_path) +static int __enable_controllers(const char *cgroup_path, const char *controllers) { char path[PATH_MAX + 1]; - char buf[PATH_MAX]; + char enable[PATH_MAX + 1]; char *c, *c2; int fd, cfd; ssize_t len; - snprintf(path, sizeof(path), "%s/cgroup.controllers", cgroup_path); - fd = open(path, O_RDONLY); - if (fd < 0) { - log_err("Opening cgroup.controllers: %s", path); - return 1; - } - - len = read(fd, buf, sizeof(buf) - 1); - if (len < 0) { + /* If no controllers are passed, enable all available controllers */ + if (!controllers) { + snprintf(path, sizeof(path), "%s/cgroup.controllers", + cgroup_path); + fd = open(path, O_RDONLY); + if (fd < 0) { + log_err("Opening cgroup.controllers: %s", path); + return 1; + } + len = read(fd, enable, sizeof(enable) - 1); + if (len < 0) { + close(fd); + log_err("Reading cgroup.controllers: %s", path); + return 1; + } else if (len == 0) { /* No controllers to enable */ + close(fd); + return 0; + } + enable[len] = 0; close(fd); - log_err("Reading cgroup.controllers: %s", path); - return 1; + } else { + strncpy(enable, controllers, sizeof(enable)); } - buf[len] = 0; - close(fd); - - /* No controllers available? We're probably on cgroup v1. 
*/ - if (len == 0) - return 0; snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path); cfd = open(path, O_RDWR); @@ -84,7 +87,7 @@ static int enable_all_controllers(char *cgroup_path) return 1; } - for (c = strtok_r(buf, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) { + for (c = strtok_r(enable, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) { if (dprintf(cfd, "+%s\n", c) <= 0) { log_err("Enabling controller %s: %s", c, path); close(cfd); @@ -95,6 +98,87 @@ static int enable_all_controllers(char *cgroup_path) return 0; } +/** + * enable_controllers() - Enable cgroup v2 controllers + * @relative_path: The cgroup path, relative to the workdir + * @controllers: List of controllers to enable in cgroup.controllers format + * + * + * Enable given cgroup v2 controllers, if @controllers is NULL, enable all + * available controllers. + * + * If successful, 0 is returned. + */ +int enable_controllers(const char *relative_path, const char *controllers) +{ + char cgroup_path[PATH_MAX + 1]; + + format_cgroup_path(cgroup_path, relative_path); + return __enable_controllers(cgroup_path, controllers); +} + +static int __write_cgroup_file(const char *cgroup_path, const char *file, + const char *buf) +{ + char file_path[PATH_MAX + 1]; + int fd; + + snprintf(file_path, sizeof(file_path), "%s/%s", cgroup_path, file); + fd = open(file_path, O_RDWR); + if (fd < 0) { + log_err("Opening %s", file_path); + return 1; + } + + if (dprintf(fd, "%s", buf) <= 0) { + log_err("Writing to %s", file_path); + close(fd); + return 1; + } + close(fd); + return 0; +} + +/** + * write_cgroup_file() - Write to a cgroup file + * @relative_path: The cgroup path, relative to the workdir + * @file: The name of the file in cgroupfs to write to + * @buf: Buffer to write to the file + * + * Write to a file in the given cgroup's directory. + * + * If successful, 0 is returned. 
+ */ +int write_cgroup_file(const char *relative_path, const char *file, + const char *buf) +{ + char cgroup_path[PATH_MAX - 24]; + + format_cgroup_path(cgroup_path, relative_path); + return __write_cgroup_file(cgroup_path, file, buf); +} + +/** + * write_cgroup_file_parent() - Write to a cgroup file in the parent process + * workdir + * @relative_path: The cgroup path, relative to the parent process workdir + * @file: The name of the file in cgroupfs to write to + * @buf: Buffer to write to the file + * + * Write to a file in the given cgroup's directory under the parent process + * workdir. + * + * If successful, 0 is returned. + */ +int write_cgroup_file_parent(const char *relative_path, const char *file, + const char *buf) +{ + char cgroup_path[PATH_MAX - 24]; + + format_parent_cgroup_path(cgroup_path, relative_path); + return __write_cgroup_file(cgroup_path, file, buf); +} + /** * setup_cgroup_environment() - Setup the cgroup environment * @@ -133,7 +217,9 @@ int setup_cgroup_environment(void) return 1; } - if (enable_all_controllers(cgroup_workdir)) + /* Enable all available controllers to increase test coverage */ + if (__enable_controllers(CGROUP_MOUNT_PATH, NULL) || + __enable_controllers(cgroup_workdir, NULL)) return 1; return 0; @@ -173,7 +259,7 @@ static int join_cgroup_from_top(const char *cgroup_path) /** * join_cgroup() - Join a cgroup - * @path: The cgroup path, relative to the workdir, to join + * @relative_path: The cgroup path, relative to the workdir, to join * * This function expects a cgroup to already be created, relative to the cgroup * work dir, and it joins it. For example, passing "/my-cgroup" as the path @@ -182,11 +268,27 @@ static int join_cgroup_from_top(const char *cgroup_path) * * On success, it returns 0, otherwise on failure it returns 1. 
*/ -int join_cgroup(const char *path) +int join_cgroup(const char *relative_path) +{ + char cgroup_path[PATH_MAX + 1]; + + format_cgroup_path(cgroup_path, relative_path); + return join_cgroup_from_top(cgroup_path); +} + +/** + * join_parent_cgroup() - Join a cgroup in the parent process workdir + * @relative_path: The cgroup path, relative to parent process workdir, to join + * + * See join_cgroup(). + * + * On success, it returns 0, otherwise on failure it returns 1. + */ +int join_parent_cgroup(const char *relative_path) { char cgroup_path[PATH_MAX + 1]; - format_cgroup_path(cgroup_path, path); + format_parent_cgroup_path(cgroup_path, relative_path); return join_cgroup_from_top(cgroup_path); } @@ -212,9 +314,27 @@ void cleanup_cgroup_environment(void) nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT); } +/** + * get_root_cgroup() - Get the FD of the root cgroup + * + * On success, it returns the file descriptor. On failure, it returns -1. + * If there is a failure, it prints the error to stderr. + */ +int get_root_cgroup(void) +{ + int fd; + + fd = open(CGROUP_MOUNT_PATH, O_RDONLY); + if (fd < 0) { + log_err("Opening root cgroup"); + return -1; + } + return fd; +} + /** * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD - * @path: The cgroup path, relative to the workdir, to join + * @relative_path: The cgroup path, relative to the workdir, to join * * This function creates a cgroup under the top level workdir and returns the * file descriptor. It is idempotent. @@ -222,14 +342,14 @@ void cleanup_cgroup_environment(void) * On success, it returns the file descriptor. On failure it returns -1. * If there is a failure, it prints the error to stderr. 
*/ -int create_and_get_cgroup(const char *path) +int create_and_get_cgroup(const char *relative_path) { char cgroup_path[PATH_MAX + 1]; int fd; - format_cgroup_path(cgroup_path, path); + format_cgroup_path(cgroup_path, relative_path); if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup %s .. %s", path, cgroup_path); + log_err("mkdiring cgroup %s .. %s", relative_path, cgroup_path); return -1; } @@ -244,13 +364,13 @@ int create_and_get_cgroup(const char *path) /** * get_cgroup_id() - Get cgroup id for a particular cgroup path - * @path: The cgroup path, relative to the workdir, to join + * @relative_path: The cgroup path, relative to the workdir, to join * * On success, it returns the cgroup id. On failure it returns 0, * which is an invalid cgroup id. * If there is a failure, it prints the error to stderr. */ -unsigned long long get_cgroup_id(const char *path) +unsigned long long get_cgroup_id(const char *relative_path) { int dirfd, err, flags, mount_id, fhsize; union { @@ -261,7 +381,7 @@ unsigned long long get_cgroup_id(const char *path) struct file_handle *fhp, *fhp2; unsigned long long ret = 0; - format_cgroup_path(cgroup_workdir, path); + format_cgroup_path(cgroup_workdir, relative_path); dirfd = AT_FDCWD; flags = 0; diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index fcc9cb91b211..3358734356ab 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -10,11 +10,18 @@ __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) /* cgroupv2 related */ -int cgroup_setup_and_join(const char *path); -int create_and_get_cgroup(const char *path); -unsigned long long get_cgroup_id(const char *path); - -int join_cgroup(const char *path); +int enable_controllers(const char *relative_path, const char *controllers); +int write_cgroup_file(const char *relative_path, const char *file, + const char *buf); +int write_cgroup_file_parent(const char 
*relative_path, const char *file, + const char *buf); +int cgroup_setup_and_join(const char *relative_path); +int get_root_cgroup(void); +int create_and_get_cgroup(const char *relative_path); +unsigned long long get_cgroup_id(const char *relative_path); + +int join_cgroup(const char *relative_path); +int join_parent_cgroup(const char *relative_path); int setup_cgroup_environment(void); void cleanup_cgroup_environment(void); @@ -26,4 +33,4 @@ int join_classid(void); int setup_classid_environment(void); void cleanup_classid_environment(void); -#endif /* __CGROUP_HELPERS_H */ \ No newline at end of file +#endif /* __CGROUP_HELPERS_H */ -- cgit v1.2.3 From 88886309d2e82afcaa86fc302c2ba25d9e47cbc8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Aug 2022 16:31:17 -0700 Subject: selftests/bpf: add a selftest for cgroup hierarchical stats collection Add a selftest that tests the whole workflow for collecting, aggregating (flushing), and displaying cgroup hierarchical stats. TL;DR: - Userspace program creates a cgroup hierarchy and induces memcg reclaim in parts of it. - Whenever reclaim happens, vmscan_start and vmscan_end update per-cgroup percpu readings, and tell rstat which (cgroup, cpu) pairs have updates. - When userspace tries to read the stats, dump_vmscan calls rstat to flush the stats, and outputs the stats in text format to userspace (similar to cgroupfs stats). - rstat calls vmscan_flush once for every (cgroup, cpu) pair that has updates, vmscan_flush aggregates cpu readings and propagates updates to parents. - Userspace program makes sure the stats are aggregated and read correctly. Detailed explanation: - The test loads tracing bpf programs, vmscan_start and vmscan_end, to measure the latency of cgroup reclaim. Per-cgroup readings are stored in percpu maps for efficiency. When a cgroup reading is updated on a cpu, cgroup_rstat_updated(cgroup, cpu) is called to add the cgroup to the rstat updated tree on that cpu. 
- A cgroup_iter program, dump_vmscan, is loaded and pinned to a file, for each cgroup. Reading this file invokes the program, which calls cgroup_rstat_flush(cgroup) to ask rstat to propagate the updates for all cpus and cgroups that have updates in this cgroup's subtree. Afterwards, the stats are exposed to the user. dump_vmscan returns 1 to terminate iteration early, so that we only expose stats for one cgroup per read. - An fentry program, vmscan_flush, is also loaded and attached to bpf_rstat_flush. When rstat flushing is ongoing, vmscan_flush is invoked once for each (cgroup, cpu) pair that has updates. cgroups are popped from the rstat tree in a bottom-up fashion, so calls will always be made for cgroups that have updates before their parents. The program aggregates percpu readings to a total per-cgroup reading, and also propagates them to the parent cgroup. After rstat flushing is over, all cgroups will have correct updated hierarchical readings (including all cpus and all their descendants). - Finally, the test creates a cgroup hierarchy and induces memcg reclaim in parts of it, and makes sure that the stats collection, aggregation, and reading workflow works as expected. 
Signed-off-by: Yosry Ahmed Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-6-haoluo@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../bpf/prog_tests/cgroup_hierarchical_stats.c | 357 +++++++++++++++++++++ .../bpf/progs/cgroup_hierarchical_stats.c | 226 +++++++++++++ 3 files changed, 584 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 37bafcbf952a..736b65f61022 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -67,3 +67,4 @@ xdp_synproxy # JIT does not support calling kernel f unpriv_bpf_disabled # fentry setget_sockopt # attach unexpected error: -524 (trampoline) cb_refs # expected error message unexpected error: -524 (trampoline) +cgroup_hierarchical_stats # JIT does not support calling kernel function (kfunc) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c new file mode 100644 index 000000000000..101a6d70b863 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Functions to manage eBPF programs attached to cgroup subsystems + * + * Copyright 2022 Google LLC. 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cgroup_helpers.h" +#include "cgroup_hierarchical_stats.skel.h" + +#define PAGE_SIZE 4096 +#define MB(x) (x << 20) + +#define BPFFS_ROOT "/sys/fs/bpf/" +#define BPFFS_VMSCAN BPFFS_ROOT"vmscan/" + +#define CG_ROOT_NAME "root" +#define CG_ROOT_ID 1 + +#define CGROUP_PATH(p, n) {.path = p"/"n, .name = n} + +static struct { + const char *path, *name; + unsigned long long id; + int fd; +} cgroups[] = { + CGROUP_PATH("/", "test"), + CGROUP_PATH("/test", "child1"), + CGROUP_PATH("/test", "child2"), + CGROUP_PATH("/test/child1", "child1_1"), + CGROUP_PATH("/test/child1", "child1_2"), + CGROUP_PATH("/test/child2", "child2_1"), + CGROUP_PATH("/test/child2", "child2_2"), +}; + +#define N_CGROUPS ARRAY_SIZE(cgroups) +#define N_NON_LEAF_CGROUPS 3 + +static int root_cgroup_fd; +static bool mounted_bpffs; + +/* reads file at 'path' to 'buf', returns 0 on success. */ +static int read_from_file(const char *path, char *buf, size_t size) +{ + int fd, len; + + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; + + len = read(fd, buf, size); + close(fd); + if (len < 0) + return len; + + buf[len] = 0; + return 0; +} + +/* mounts bpffs and mkdir for reading stats, returns 0 on success. */ +static int setup_bpffs(void) +{ + int err; + + /* Mount bpffs */ + err = mount("bpf", BPFFS_ROOT, "bpf", 0, NULL); + mounted_bpffs = !err; + if (ASSERT_FALSE(err && errno != EBUSY, "mount")) + return err; + + /* Create a directory to contain stat files in bpffs */ + err = mkdir(BPFFS_VMSCAN, 0755); + if (!ASSERT_OK(err, "mkdir")) + return err; + + return 0; +} + +static void cleanup_bpffs(void) +{ + /* Remove created directory in bpffs */ + ASSERT_OK(rmdir(BPFFS_VMSCAN), "rmdir "BPFFS_VMSCAN); + + /* Unmount bpffs, if it wasn't already mounted when we started */ + if (mounted_bpffs) + return; + + ASSERT_OK(umount(BPFFS_ROOT), "unmount bpffs"); +} + +/* sets up cgroups, returns 0 on success. 
*/ +static int setup_cgroups(void) +{ + int i, fd, err; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + return err; + + root_cgroup_fd = get_root_cgroup(); + if (!ASSERT_GE(root_cgroup_fd, 0, "get_root_cgroup")) + return root_cgroup_fd; + + for (i = 0; i < N_CGROUPS; i++) { + fd = create_and_get_cgroup(cgroups[i].path); + if (!ASSERT_GE(fd, 0, "create_and_get_cgroup")) + return fd; + + cgroups[i].fd = fd; + cgroups[i].id = get_cgroup_id(cgroups[i].path); + + /* + * Enable memcg controller for the entire hierarchy. + * Note that stats are collected for all cgroups in a hierarchy + * with memcg enabled anyway, but are only exposed for cgroups + * that have memcg enabled. + */ + if (i < N_NON_LEAF_CGROUPS) { + err = enable_controllers(cgroups[i].path, "memory"); + if (!ASSERT_OK(err, "enable_controllers")) + return err; + } + } + return 0; +} + +static void cleanup_cgroups(void) +{ + close(root_cgroup_fd); + for (int i = 0; i < N_CGROUPS; i++) + close(cgroups[i].fd); + cleanup_cgroup_environment(); +} + +/* Sets up cgroup hierarchy, returns 0 on success. 
*/ +static int setup_hierarchy(void) +{ + return setup_bpffs() || setup_cgroups(); +} + +static void destroy_hierarchy(void) +{ + cleanup_cgroups(); + cleanup_bpffs(); +} + +static int reclaimer(const char *cgroup_path, size_t size) +{ + static char size_buf[128]; + char *buf, *ptr; + int err; + + /* Join cgroup in the parent process workdir */ + if (join_parent_cgroup(cgroup_path)) + return EACCES; + + /* Allocate memory */ + buf = malloc(size); + if (!buf) + return ENOMEM; + + /* Write to memory to make sure it's actually allocated */ + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + *ptr = 1; + + /* Try to reclaim memory */ + snprintf(size_buf, 128, "%lu", size); + err = write_cgroup_file_parent(cgroup_path, "memory.reclaim", size_buf); + + free(buf); + /* memory.reclaim returns EAGAIN if the amount is not fully reclaimed */ + if (err && errno != EAGAIN) + return errno; + + return 0; +} + +static int induce_vmscan(void) +{ + int i, status; + + /* + * In every leaf cgroup, run a child process that allocates some memory + * and attempts to reclaim some of it. 
+ */ + for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++) { + pid_t pid; + + /* Create reclaimer child */ + pid = fork(); + if (pid == 0) { + status = reclaimer(cgroups[i].path, MB(5)); + exit(status); + } + + /* Cleanup reclaimer child */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status), "reclaimer exited"); + ASSERT_EQ(WEXITSTATUS(status), 0, "reclaim exit code"); + } + return 0; +} + +static unsigned long long +get_cgroup_vmscan_delay(unsigned long long cgroup_id, const char *file_name) +{ + unsigned long long vmscan = 0, id = 0; + static char buf[128], path[128]; + + /* For every cgroup, read the file generated by cgroup_iter */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); + if (!ASSERT_OK(read_from_file(path, buf, 128), "read cgroup_iter")) + return 0; + + /* Check the output file formatting */ + ASSERT_EQ(sscanf(buf, "cg_id: %llu, total_vmscan_delay: %llu\n", + &id, &vmscan), 2, "output format"); + + /* Check that the cgroup_id is displayed correctly */ + ASSERT_EQ(id, cgroup_id, "cgroup_id"); + /* Check that the vmscan reading is non-zero */ + ASSERT_GT(vmscan, 0, "vmscan_reading"); + return vmscan; +} + +static void check_vmscan_stats(void) +{ + unsigned long long vmscan_readings[N_CGROUPS], vmscan_root; + int i; + + for (i = 0; i < N_CGROUPS; i++) { + vmscan_readings[i] = get_cgroup_vmscan_delay(cgroups[i].id, + cgroups[i].name); + } + + /* Read stats for root too */ + vmscan_root = get_cgroup_vmscan_delay(CG_ROOT_ID, CG_ROOT_NAME); + + /* Check that child1 == child1_1 + child1_2 */ + ASSERT_EQ(vmscan_readings[1], vmscan_readings[3] + vmscan_readings[4], + "child1_vmscan"); + /* Check that child2 == child2_1 + child2_2 */ + ASSERT_EQ(vmscan_readings[2], vmscan_readings[5] + vmscan_readings[6], + "child2_vmscan"); + /* Check that test == child1 + child2 */ + ASSERT_EQ(vmscan_readings[0], vmscan_readings[1] + vmscan_readings[2], + "test_vmscan"); + /* Check that root >= test */ + ASSERT_GE(vmscan_root, vmscan_readings[1], "root_vmscan"); 
+} + +/* Creates iter link and pins in bpffs, returns 0 on success, -errno on failure. + */ +static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj, + int cgroup_fd, const char *file_name) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo = {}; + struct bpf_link *link; + static char path[128]; + int err; + + /* + * Create an iter link, parameterized by cgroup_fd. We only want to + * traverse one cgroup, so set the traversal order to "self". + */ + linfo.cgroup.cgroup_fd = cgroup_fd; + linfo.cgroup.order = BPF_ITER_SELF_ONLY; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + return -EFAULT; + + /* Pin the link to a bpffs file */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name); + err = bpf_link__pin(link, path); + ASSERT_OK(err, "pin cgroup_iter"); + + /* Remove the link, leaving only the ref held by the pinned file */ + bpf_link__destroy(link); + return err; +} + +/* Sets up programs for collecting stats, returns 0 on success. 
*/ +static int setup_progs(struct cgroup_hierarchical_stats **skel) +{ + int i, err; + + *skel = cgroup_hierarchical_stats__open_and_load(); + if (!ASSERT_OK_PTR(*skel, "open_and_load")) + return 1; + + /* Attach cgroup_iter program that will dump the stats to cgroups */ + for (i = 0; i < N_CGROUPS; i++) { + err = setup_cgroup_iter(*skel, cgroups[i].fd, cgroups[i].name); + if (!ASSERT_OK(err, "setup_cgroup_iter")) + return err; + } + + /* Also dump stats for root */ + err = setup_cgroup_iter(*skel, root_cgroup_fd, CG_ROOT_NAME); + if (!ASSERT_OK(err, "setup_cgroup_iter")) + return err; + + bpf_program__set_autoattach((*skel)->progs.dump_vmscan, false); + err = cgroup_hierarchical_stats__attach(*skel); + if (!ASSERT_OK(err, "attach")) + return err; + + return 0; +} + +static void destroy_progs(struct cgroup_hierarchical_stats *skel) +{ + static char path[128]; + int i; + + for (i = 0; i < N_CGROUPS; i++) { + /* Delete files in bpffs that cgroup_iters are pinned in */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, + cgroups[i].name); + ASSERT_OK(remove(path), "remove cgroup_iter pin"); + } + + /* Delete root file in bpffs */ + snprintf(path, 128, "%s%s", BPFFS_VMSCAN, CG_ROOT_NAME); + ASSERT_OK(remove(path), "remove cgroup_iter root pin"); + cgroup_hierarchical_stats__destroy(skel); +} + +void test_cgroup_hierarchical_stats(void) +{ + struct cgroup_hierarchical_stats *skel = NULL; + + if (setup_hierarchy()) + goto hierarchy_cleanup; + if (setup_progs(&skel)) + goto cleanup; + if (induce_vmscan()) + goto cleanup; + check_vmscan_stats(); +cleanup: + destroy_progs(skel); +hierarchy_cleanup: + destroy_hierarchy(); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c new file mode 100644 index 000000000000..8ab4253a1592 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Functions to manage 
eBPF programs attached to cgroup subsystems + * + * Copyright 2022 Google LLC. + */ +#include "vmlinux.h" +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* + * Start times are stored per-task, not per-cgroup, as multiple tasks in one + * cgroup can perform reclaim concurrently. + */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, __u64); +} vmscan_start_time SEC(".maps"); + +struct vmscan_percpu { + /* Previous percpu state, to figure out if we have new updates */ + __u64 prev; + /* Current percpu state */ + __u64 state; +}; + +struct vmscan { + /* State propagated through children, pending aggregation */ + __u64 pending; + /* Total state, including all cpus and all children */ + __u64 state; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 100); + __type(key, __u64); + __type(value, struct vmscan_percpu); +} pcpu_cgroup_vmscan_elapsed SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 100); + __type(key, __u64); + __type(value, struct vmscan); +} cgroup_vmscan_elapsed SEC(".maps"); + +extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym; +extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym; + +static struct cgroup *task_memcg(struct task_struct *task) +{ + int cgrp_id; + +#if __has_builtin(__builtin_preserve_enum_value) + cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id, memory_cgrp_id); +#else + cgrp_id = memory_cgrp_id; +#endif + return task->cgroups->subsys[cgrp_id]->cgroup; +} + +static uint64_t cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + +static int create_vmscan_percpu_elem(__u64 cg_id, __u64 state) +{ + struct vmscan_percpu pcpu_init = {.state = state, .prev = 0}; + + return bpf_map_update_elem(&pcpu_cgroup_vmscan_elapsed, &cg_id, + &pcpu_init, BPF_NOEXIST); +} + +static int create_vmscan_elem(__u64 cg_id, __u64 state, __u64 pending) +{ + 
struct vmscan init = {.state = state, .pending = pending}; + + return bpf_map_update_elem(&cgroup_vmscan_elapsed, &cg_id, + &init, BPF_NOEXIST); +} + +SEC("tp_btf/mm_vmscan_memcg_reclaim_begin") +int BPF_PROG(vmscan_start, int order, gfp_t gfp_flags) +{ + struct task_struct *task = bpf_get_current_task_btf(); + __u64 *start_time_ptr; + + start_time_ptr = bpf_task_storage_get(&vmscan_start_time, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (start_time_ptr) + *start_time_ptr = bpf_ktime_get_ns(); + return 0; +} + +SEC("tp_btf/mm_vmscan_memcg_reclaim_end") +int BPF_PROG(vmscan_end, unsigned long nr_reclaimed) +{ + struct vmscan_percpu *pcpu_stat; + struct task_struct *current = bpf_get_current_task_btf(); + struct cgroup *cgrp; + __u64 *start_time_ptr; + __u64 current_elapsed, cg_id; + __u64 end_time = bpf_ktime_get_ns(); + + /* + * cgrp is the first parent cgroup of current that has memcg enabled in + * its subtree_control, or NULL if memcg is disabled in the entire tree. + * In a cgroup hierarchy like this: + * a + * / \ + * b c + * If "a" has memcg enabled, while "b" doesn't, then processes in "b" + * will accumulate their stats directly to "a". This makes sure that no + * stats are lost from processes in leaf cgroups that don't have memcg + * enabled, but only exposes stats for cgroups that have memcg enabled. 
+ */ + cgrp = task_memcg(current); + if (!cgrp) + return 0; + + cg_id = cgroup_id(cgrp); + start_time_ptr = bpf_task_storage_get(&vmscan_start_time, current, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!start_time_ptr) + return 0; + + current_elapsed = end_time - *start_time_ptr; + pcpu_stat = bpf_map_lookup_elem(&pcpu_cgroup_vmscan_elapsed, + &cg_id); + if (pcpu_stat) + pcpu_stat->state += current_elapsed; + else if (create_vmscan_percpu_elem(cg_id, current_elapsed)) + return 0; + + cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id()); + return 0; +} + +SEC("fentry/bpf_rstat_flush") +int BPF_PROG(vmscan_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu) +{ + struct vmscan_percpu *pcpu_stat; + struct vmscan *total_stat, *parent_stat; + __u64 cg_id = cgroup_id(cgrp); + __u64 parent_cg_id = parent ? cgroup_id(parent) : 0; + __u64 *pcpu_vmscan; + __u64 state; + __u64 delta = 0; + + /* Add CPU changes on this level since the last flush */ + pcpu_stat = bpf_map_lookup_percpu_elem(&pcpu_cgroup_vmscan_elapsed, + &cg_id, cpu); + if (pcpu_stat) { + state = pcpu_stat->state; + delta += state - pcpu_stat->prev; + pcpu_stat->prev = state; + } + + total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id); + if (!total_stat) { + if (create_vmscan_elem(cg_id, delta, 0)) + return 0; + + goto update_parent; + } + + /* Collect pending stats from subtree */ + if (total_stat->pending) { + delta += total_stat->pending; + total_stat->pending = 0; + } + + /* Propagate changes to this cgroup's total */ + total_stat->state += delta; + +update_parent: + /* Skip if there are no changes to propagate, or no parent */ + if (!delta || !parent_cg_id) + return 0; + + /* Propagate changes to cgroup's parent */ + parent_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, + &parent_cg_id); + if (parent_stat) + parent_stat->pending += delta; + else + create_vmscan_elem(parent_cg_id, 0, delta); + return 0; +} + +SEC("iter.s/cgroup") +int BPF_PROG(dump_vmscan, struct bpf_iter_meta 
*meta, struct cgroup *cgrp) +{ + struct seq_file *seq = meta->seq; + struct vmscan *total_stat; + __u64 cg_id = cgrp ? cgroup_id(cgrp) : 0; + + /* Do nothing for the terminal call */ + if (!cg_id) + return 1; + + /* Flush the stats to make sure we get the most updated numbers */ + cgroup_rstat_flush(cgrp); + + total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id); + if (!total_stat) { + BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: 0\n", + cg_id); + } else { + BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: %llu\n", + cg_id, total_stat->state); + } + + /* + * We only dump stats for one cgroup here, so return 1 to stop + * iteration after the first cgroup. + */ + return 1; +} -- cgit v1.2.3 From 7184aef9c0f7a81db8fd18d183ee42481d89bf35 Mon Sep 17 00:00:00 2001 From: Lam Thai Date: Wed, 24 Aug 2022 15:59:00 -0700 Subject: bpftool: Fix a wrong type cast in btf_dumper_int When `data` points to a boolean value, casting it to `int *` is problematic and could lead to a wrong value being passed to `jsonw_bool`. Change the cast to `bool *` instead. 
Fixes: b12d6ec09730 ("bpf: btf: add btf print functionality") Signed-off-by: Lam Thai Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220824225859.9038-1-lamthai@arista.com --- tools/bpf/bpftool/btf_dumper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 125798b0bc5d..19924b6ce796 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -452,7 +452,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset, *(char *)data); break; case BTF_INT_BOOL: - jsonw_bool(jw, *(int *)data); + jsonw_bool(jw, *(bool *)data); break; default: /* shouldn't happen */ -- cgit v1.2.3 From 1800b2ac96d8bc4ccdddc2ea9e83ecaffd54d3f2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 20:55:00 +0200 Subject: selftests/bpf: Add regression test for pruning fix Add a test to ensure we do mark_chain_precision for the argument type ARG_CONST_ALLOC_SIZE_OR_ZERO. For other argument types, this was already done, but propagation was missing for this case. Without the fix, this test case loads successfully.
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823185500.467-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/precise.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 9e754423fa8b..6c03a7d805f9 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -192,3 +192,28 @@ .result = VERBOSE_ACCEPT, .retval = -1, }, +{ + "precise: mark_chain_precision for ARG_CONST_ALLOC_SIZE_OR_ZERO", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct xdp_md, ingress_ifindex)), + BPF_LD_MAP_FD(BPF_REG_6, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0, 1), + BPF_MOV64_IMM(BPF_REG_2, 0x1000), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 42), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .prog_type = BPF_PROG_TYPE_XDP, + .flags = BPF_F_TEST_STATE_FREQ, + .errstr = "invalid access to memory, mem_size=1 off=42 size=8", + .result = REJECT, +}, -- cgit v1.2.3 From 88e500affe72fb704c4f201974b5199ca6f51e6c Mon Sep 17 00:00:00 2001 From: Adel Abouchaev Date: Wed, 24 Aug 2022 11:43:51 -0700 Subject: selftests/net: fix reinitialization of TEST_PROGS in net self tests. Assigning will drop all previous tests.
Fixes: b690842d12fd ("selftests/net: test l2 tunnel TOS/TTL inheriting") Signed-off-by: Adel Abouchaev Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20220824184351.3759862-1-adel.abushaev@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index e6a951ba5ba0..f5ac1433c301 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -42,7 +42,7 @@ TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh TEST_PROGS += arp_ndisc_untracked_subnets.sh TEST_PROGS += stress_reuseport_listen.sh -TEST_PROGS := l2_tos_ttl_inherit.sh +TEST_PROGS += l2_tos_ttl_inherit.sh TEST_PROGS += bind_bhash.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh -- cgit v1.2.3 From d4ffb6f39f1a1b260966b43a4ffdb64779c650dd Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 25 Aug 2022 15:39:36 -0700 Subject: bpf: Add CGROUP prefix to cgroup_iter_order bpf_cgroup_iter_order is globally visible but the entries do not have CGROUP prefix. As requested by Andrii, put a CGROUP in the names in bpf_cgroup_iter_order. This patch fixes two previous commits: one introduced the API and the other uses the API in bpf selftest (that is, the selftest cgroup_hierarchical_stats). 
I tested this patch via the following command: test_progs -t cgroup,iter,btf_dump Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Fixes: 88886309d2e8 ("selftests/bpf: add a selftest for cgroup hierarchical stats collection") Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220825223936.1865810-1-haoluo@google.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 10 +++---- kernel/bpf/cgroup_iter.c | 32 +++++++++++----------- tools/include/uapi/linux/bpf.h | 10 +++---- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- .../bpf/prog_tests/cgroup_hierarchical_stats.c | 2 +- .../testing/selftests/bpf/prog_tests/cgroup_iter.c | 10 +++---- 6 files changed, 33 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f61f09f467a..bdf4bc6d8d6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. 
*/ }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index cf6d763a57d5..c69bce2f4403 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -74,13 +74,13 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) ++*pos; p->terminate = false; p->visited_all = false; - if (p->order == BPF_ITER_DESCENDANTS_PRE) + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) return css_next_descendant_pre(NULL, p->start_css); - else if (p->order == BPF_ITER_DESCENDANTS_POST) + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); - else if (p->order == BPF_ITER_ANCESTORS_UP) + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return p->start_css; - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ return p->start_css; } @@ -109,13 +109,13 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (p->terminate) return NULL; - if (p->order == BPF_ITER_DESCENDANTS_PRE) + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) return css_next_descendant_pre(curr, p->start_css); - else if (p->order == BPF_ITER_DESCENDANTS_POST) + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(curr, p->start_css); - else if (p->order == BPF_ITER_ANCESTORS_UP) + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -188,10 +188,10 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_ITER_DESCENDANTS_PRE && - order != BPF_ITER_DESCENDANTS_POST && - order != BPF_ITER_ANCESTORS_UP && - order != BPF_ITER_SELF_ONLY) + if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && + order != BPF_CGROUP_ITER_DESCENDANTS_POST && + order != BPF_CGROUP_ITER_ANCESTORS_UP && + order != BPF_CGROUP_ITER_SELF_ONLY) return -EINVAL; if (fd && id) 
@@ -239,13 +239,13 @@ static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, kfree(buf); show_order: - if (aux->cgroup.order == BPF_ITER_DESCENDANTS_PRE) + if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) seq_puts(seq, "order: descendants_pre\n"); - else if (aux->cgroup.order == BPF_ITER_DESCENDANTS_POST) + else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) seq_puts(seq, "order: descendants_post\n"); - else if (aux->cgroup.order == BPF_ITER_ANCESTORS_UP) + else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5056cef2112f..92f7387e378a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. 
*/ }; union bpf_iter_link_info { diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index a1bae92be1fc..7b5bbe21b549 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c index 101a6d70b863..bed1661596f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c @@ -275,7 +275,7 @@ static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj, * traverse one cgroup, so set the traversal order to "self". 
*/ linfo.cgroup.cgroup_fd = cgroup_fd; - linfo.cgroup.order = BPF_ITER_SELF_ONLY; + linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY; opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts); diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index 38958c37b9ce..c4a2adb38da1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -134,7 +134,7 @@ static void test_walk_preorder(struct cgroup_iter *skel) cg_id[PARENT], cg_id[CHILD1], cg_id[CHILD2]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_PRE, "preorder"); + BPF_CGROUP_ITER_DESCENDANTS_PRE, "preorder"); } /* Postorder walk prints child and parent in order. */ @@ -145,7 +145,7 @@ static void test_walk_postorder(struct cgroup_iter *skel) cg_id[CHILD1], cg_id[CHILD2], cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_POST, "postorder"); + BPF_CGROUP_ITER_DESCENDANTS_POST, "postorder"); } /* Walking parents prints parent and then root. 
*/ @@ -159,7 +159,7 @@ static void test_walk_ancestors_up(struct cgroup_iter *skel) cg_id[PARENT], cg_id[ROOT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_ANCESTORS_UP, "ancestors_up"); + BPF_CGROUP_ITER_ANCESTORS_UP, "ancestors_up"); skel->bss->terminal_cgroup = 0; } @@ -174,7 +174,7 @@ static void test_early_termination(struct cgroup_iter *skel) PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_PRE, "early_termination"); + BPF_CGROUP_ITER_DESCENDANTS_PRE, "early_termination"); skel->bss->terminate_early = 0; } @@ -186,7 +186,7 @@ static void test_walk_self_only(struct cgroup_iter *skel) PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_SELF_ONLY, "self_only"); + BPF_CGROUP_ITER_SELF_ONLY, "self_only"); } void test_cgroup_iter(void) -- cgit v1.2.3 From 98acee3f8db451eaab9fbd422e523c228aacf08c Mon Sep 17 00:00:00 2001 From: Nicholas Miehlbradt Date: Wed, 17 Aug 2022 15:06:40 +1000 Subject: selftests/powerpc: Add a test for execute-only memory This selftest is designed to cover execute-only protections on the Radix MMU but will also work with Hash. The tests are based on those found in pkey_exec_test with modifications to use the generic mprotect() instead of the pkey variants. 
Signed-off-by: Nicholas Miehlbradt Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220817050640.406017-2-ruscur@russell.cc --- tools/testing/selftests/powerpc/mm/Makefile | 3 +- tools/testing/selftests/powerpc/mm/exec_prot.c | 231 +++++++++++++++++++++++++ 2 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mm/exec_prot.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 27dc09d0bfee..19dd0b2ea397 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -3,7 +3,7 @@ noarg: $(MAKE) -C ../ TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ - large_vm_fork_separation bad_accesses pkey_exec_prot \ + large_vm_fork_separation bad_accesses exec_prot pkey_exec_prot \ pkey_siginfo stack_expansion_signal stack_expansion_ldst \ large_vm_gpr_corruption TEST_PROGS := stress_code_patching.sh @@ -22,6 +22,7 @@ $(OUTPUT)/wild_bctr: CFLAGS += -m64 $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64 $(OUTPUT)/large_vm_gpr_corruption: CFLAGS += -m64 $(OUTPUT)/bad_accesses: CFLAGS += -m64 +$(OUTPUT)/exec_prot: CFLAGS += -m64 $(OUTPUT)/pkey_exec_prot: CFLAGS += -m64 $(OUTPUT)/pkey_siginfo: CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/mm/exec_prot.c b/tools/testing/selftests/powerpc/mm/exec_prot.c new file mode 100644 index 000000000000..db75b2225de1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/exec_prot.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2022, Nicholas Miehlbradt, IBM Corporation + * based on pkey_exec_prot.c + * + * Test if applying execute protection on pages works as expected. 
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include + +#include +#include + +#include "pkeys.h" + + +#define PPC_INST_NOP 0x60000000 +#define PPC_INST_TRAP 0x7fe00008 +#define PPC_INST_BLR 0x4e800020 + +static volatile sig_atomic_t fault_code; +static volatile sig_atomic_t remaining_faults; +static volatile unsigned int *fault_addr; +static unsigned long pgsize, numinsns; +static unsigned int *insns; +static bool pkeys_supported; + +static bool is_fault_expected(int fault_code) +{ + if (fault_code == SEGV_ACCERR) + return true; + + /* Assume any pkey error is fine since pkey_exec_prot test covers them */ + if (fault_code == SEGV_PKUERR && pkeys_supported) + return true; + + return false; +} + +static void trap_handler(int signum, siginfo_t *sinfo, void *ctx) +{ + /* Check if this fault originated from the expected address */ + if (sinfo->si_addr != (void *)fault_addr) + sigsafe_err("got a fault for an unexpected address\n"); + + _exit(1); +} + +static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) +{ + fault_code = sinfo->si_code; + + /* Check if this fault originated from the expected address */ + if (sinfo->si_addr != (void *)fault_addr) { + sigsafe_err("got a fault for an unexpected address\n"); + _exit(1); + } + + /* Check if too many faults have occurred for a single test case */ + if (!remaining_faults) { + sigsafe_err("got too many faults for the same address\n"); + _exit(1); + } + + + /* Restore permissions in order to continue */ + if (is_fault_expected(fault_code)) { + if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE | PROT_EXEC)) { + sigsafe_err("failed to set access permissions\n"); + _exit(1); + } + } else { + sigsafe_err("got a fault with an unexpected code\n"); + _exit(1); + } + + remaining_faults--; +} + +static int check_exec_fault(int rights) +{ + /* + * Jump to the executable region. + * + * The first iteration also checks if the overwrite of the + * first instruction word from a trap to a no-op succeeded. 
+ */ + fault_code = -1; + remaining_faults = 0; + if (!(rights & PROT_EXEC)) + remaining_faults = 1; + + FAIL_IF(mprotect(insns, pgsize, rights) != 0); + asm volatile("mtctr %0; bctrl" : : "r"(insns)); + + FAIL_IF(remaining_faults != 0); + if (!(rights & PROT_EXEC)) + FAIL_IF(!is_fault_expected(fault_code)); + + return 0; +} + +static int test(void) +{ + struct sigaction segv_act, trap_act; + int i; + + /* Skip the test if the CPU doesn't support Radix */ + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00)); + + /* Check if pkeys are supported */ + pkeys_supported = pkeys_unsupported() == 0; + + /* Setup SIGSEGV handler */ + segv_act.sa_handler = 0; + segv_act.sa_sigaction = segv_handler; + FAIL_IF(sigprocmask(SIG_SETMASK, 0, &segv_act.sa_mask) != 0); + segv_act.sa_flags = SA_SIGINFO; + segv_act.sa_restorer = 0; + FAIL_IF(sigaction(SIGSEGV, &segv_act, NULL) != 0); + + /* Setup SIGTRAP handler */ + trap_act.sa_handler = 0; + trap_act.sa_sigaction = trap_handler; + FAIL_IF(sigprocmask(SIG_SETMASK, 0, &trap_act.sa_mask) != 0); + trap_act.sa_flags = SA_SIGINFO; + trap_act.sa_restorer = 0; + FAIL_IF(sigaction(SIGTRAP, &trap_act, NULL) != 0); + + /* Setup executable region */ + pgsize = getpagesize(); + numinsns = pgsize / sizeof(unsigned int); + insns = (unsigned int *)mmap(NULL, pgsize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + FAIL_IF(insns == MAP_FAILED); + + /* Write the instruction words */ + for (i = 1; i < numinsns - 1; i++) + insns[i] = PPC_INST_NOP; + + /* + * Set the first instruction as an unconditional trap. If + * the last write to this address succeeds, this should + * get overwritten by a no-op. + */ + insns[0] = PPC_INST_TRAP; + + /* + * Later, to jump to the executable region, we use a branch + * and link instruction (bctrl) which sets the return address + * automatically in LR. Use that to return back. + */ + insns[numinsns - 1] = PPC_INST_BLR; + + /* + * Pick the first instruction's address from the executable + * region. 
+ */ + fault_addr = insns; + + /* + * Read an instruction word from the address when the page + * is execute only. This should generate an access fault. + */ + fault_code = -1; + remaining_faults = 1; + printf("Testing read on --x, should fault..."); + FAIL_IF(mprotect(insns, pgsize, PROT_EXEC) != 0); + i = *fault_addr; + FAIL_IF(remaining_faults != 0 || !is_fault_expected(fault_code)); + printf("ok!\n"); + + /* + * Write an instruction word to the address when the page + * execute only. This should also generate an access fault. + */ + fault_code = -1; + remaining_faults = 1; + printf("Testing write on --x, should fault..."); + FAIL_IF(mprotect(insns, pgsize, PROT_EXEC) != 0); + *fault_addr = PPC_INST_NOP; + FAIL_IF(remaining_faults != 0 || !is_fault_expected(fault_code)); + printf("ok!\n"); + + printf("Testing exec on ---, should fault..."); + FAIL_IF(check_exec_fault(PROT_NONE)); + printf("ok!\n"); + + printf("Testing exec on r--, should fault..."); + FAIL_IF(check_exec_fault(PROT_READ)); + printf("ok!\n"); + + printf("Testing exec on -w-, should fault..."); + FAIL_IF(check_exec_fault(PROT_WRITE)); + printf("ok!\n"); + + printf("Testing exec on rw-, should fault..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_WRITE)); + printf("ok!\n"); + + printf("Testing exec on --x, should succeed..."); + FAIL_IF(check_exec_fault(PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on r-x, should succeed..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on -wx, should succeed..."); + FAIL_IF(check_exec_fault(PROT_WRITE | PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on rwx, should succeed..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_WRITE | PROT_EXEC)); + printf("ok!\n"); + + /* Cleanup */ + FAIL_IF(munmap((void *)insns, pgsize)); + + return 0; +} + +int main(void) +{ + return test_harness(test, "exec_prot"); +} -- cgit v1.2.3 From 343949e10798a52c6d6a14effc962e010ed471ae Mon Sep 17 00:00:00 2001 From: 
Benjamin Tissoires Date: Wed, 24 Aug 2022 15:40:37 +0200 Subject: libbpf: add map_get_fd_by_id and map_delete_elem in light skeleton This allows to have a better control over maps from the kernel when preloading eBPF programs. Acked-by: Yonghong Song Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220824134055.1328882-8-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/skel_internal.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 00c5f94b43be..1e82ab06c3eb 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -251,6 +251,29 @@ static inline int skel_map_update_elem(int fd, const void *key, return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); } +static inline int skel_map_delete_elem(int fd, const void *key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long)key; + + return skel_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_get_fd_by_id(__u32 id) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_id = id; + + return skel_sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); +} + static inline int skel_raw_tracepoint_open(const char *name, int prog_fd) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd); -- cgit v1.2.3 From ab9ac19c4d0615fee40ec7d49fa16c9fd33f61f8 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Thu, 25 Aug 2022 23:06:59 -0600 Subject: selftests/bpf: fix type conflict in test_tc_dtime The sys/socket.h header isn't required to build test_tc_dtime and may cause a type conflict. 
Fixes the following error: In file included from /usr/include/x86_64-linux-gnu/sys/types.h:155, from /usr/include/x86_64-linux-gnu/bits/socket.h:29, from /usr/include/x86_64-linux-gnu/sys/socket.h:33, from progs/test_tc_dtime.c:18: /usr/include/x86_64-linux-gnu/bits/stdint-intn.h:24:18: error: conflicting types for 'int8_t'; have '__int8_t' {aka 'signed char'} 24 | typedef __int8_t int8_t; | ^~~~~~ In file included from progs/test_tc_dtime.c:5: /home/buildroot/opt/cross/lib/gcc/bpf/13.0.0/include/stdint.h:34:23: note: previous declaration of 'int8_t' with type 'int8_t' {aka 'char'} 34 | typedef __INT8_TYPE__ int8_t; | ^~~~~~ /usr/include/x86_64-linux-gnu/bits/stdint-intn.h:27:19: error: conflicting types for 'int64_t'; have '__int64_t' {aka 'long long int'} 27 | typedef __int64_t int64_t; | ^~~~~~~ /home/buildroot/opt/cross/lib/gcc/bpf/13.0.0/include/stdint.h:43:24: note: previous declaration of 'int64_t' with type 'int64_t' {aka 'long int'} 43 | typedef __INT64_TYPE__ int64_t; | ^~~~~~~ make: *** [Makefile:537: /home/buildroot/bpf-next/tools/testing/selftests/bpf/bpf_gcc/test_tc_dtime.o] Error 1 Signed-off-by: James Hilliard Link: https://lore.kernel.org/r/20220826050703.869571-1-james.hilliard1@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/test_tc_dtime.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c index b596479a9ebe..125beec31834 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c @@ -15,7 +15,6 @@ #include #include #include -#include /* veth_src --- veth_src_fwd --- veth_det_fwd --- veth_dst * | | -- cgit v1.2.3 From b05d64efbb21ad231516b44317af34d2b586cfc4 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Thu, 25 Aug 2022 21:51:39 -0600 Subject: selftests/bpf: Declare subprog_noise as static in tailcall_bpf2bpf4 Due to 
bpf_map_lookup_elem being declared static we need to also declare subprog_noise as static. Fixes the following error: progs/tailcall_bpf2bpf4.c:26:9: error: 'bpf_map_lookup_elem' is static but used in inline function 'subprog_noise' which is not static [-Werror] 26 | bpf_map_lookup_elem(&nop_table, &key); | ^~~~~~~~~~~~~~~~~~~ Signed-off-by: James Hilliard Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220826035141.737919-1-james.hilliard1@gmail.com --- tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c index b67e8022d500..a017d6b2f1dd 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c @@ -19,7 +19,7 @@ struct { int count = 0; int noise = 0; -__always_inline int subprog_noise(void) +static __always_inline int subprog_noise(void) { __u32 key = 0; -- cgit v1.2.3 From aa75622c3be4d5819ce69c714acbcbd67bba5d65 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 25 Aug 2022 23:08:06 +0100 Subject: bpf: Fix a few typos in BPF helpers documentation Address a few typos in the documentation for the BPF helper functions. They were reported by Jakub [0], who ran spell checkers on the generated man page [1]. 
[0] https://lore.kernel.org/linux-man/d22dcd47-023c-8f52-d369-7b5308e6c842@gmail.com/T/#mb02e7d4b7fb61d98fa914c77b581184e9a9537af [1] https://lore.kernel.org/linux-man/eb6a1e41-c48e-ac45-5154-ac57a2c76108@gmail.com/T/#m4a8d1b003616928013ffcd1450437309ab652f9f v3: Do not copy unrelated (and breaking) elements to tools/ header v2: Turn a ',' into a ';' Reported-by: Jakub Wilk Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220825220806.107143-1-quentin@isovalent.com --- include/uapi/linux/bpf.h | 16 ++++++++-------- tools/include/uapi/linux/bpf.h | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bdf4bc6d8d6b..962960a98835 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4456,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4665,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -4723,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). 
* If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4747,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 92f7387e378a..f4ba82a1eace 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4456,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4665,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. 
* * Underneath, the value is stored locally at *task* instead of @@ -4723,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4747,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't -- cgit v1.2.3 From bc9e7fe313d5e56d4d5f34bcc04d1165f94f86fb Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 28 Jul 2022 10:39:46 +0100 Subject: perf python: Fix build when PYTHON_CONFIG is user supplied The previous change to Python autodetection had a small mistake where the auto value was used to determine the Python binary, rather than the user supplied value. 
The Python binary is only used for one part of the build process, rather than the final linking, so it was producing correct builds in most scenarios, especially when the auto detected value matched what the user wanted, or the system only had a valid set of Pythons. Change it so that the Python binary path is derived from either the PYTHON_CONFIG value or PYTHON value, depending on what is specified by the user. This was the original intention. This error was spotted in a build failure in an odd cross compilation environment after commit 4c41cb46a732fe82 ("perf python: Prefer python3") was merged. Fixes: 630af16eee495f58 ("perf tools: Use Python devtools for version autodetection rather than runtime") Signed-off-by: James Clark Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220728093946.1337642-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 0661a1cf9855..2171f02daf59 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -265,7 +265,7 @@ endif # defined. get-executable-or-default fails with an error if the first argument is supplied but # doesn't exist.
override PYTHON_CONFIG := $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON_AUTO)) -override PYTHON := $(call get-executable-or-default,PYTHON,$(subst -config,,$(PYTHON_AUTO))) +override PYTHON := $(call get-executable-or-default,PYTHON,$(subst -config,,$(PYTHON_CONFIG))) grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1)) -- cgit v1.2.3 From dbcfe5ec3f9a5799d8b49ad2c81549bbfa8390e7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 12:53:44 -0300 Subject: tools kvm headers arm64: Update KVM header from the kernel sources To pick the changes from: ae3b1da95413614f ("KVM: arm64: Fix compile error due to sign extension") That doesn't result in any changes in tooling (when built on x86), only addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm64/include/uapi/asm/kvm.h' diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Marc Zyngier Cc: Namhyung Kim Cc: Yang Yingliang Link: https://lore.kernel.org/all/YwOMCCc4E79FuvDe@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/kvm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 3bb134355874..316917b98707 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -75,9 +75,11 @@ struct kvm_regs { /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK (0xffff << KVM_ARM_DEVICE_TYPE_SHIFT) +#define KVM_ARM_DEVICE_TYPE_MASK GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ + KVM_ARM_DEVICE_TYPE_SHIFT) #define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK (0xffff << KVM_ARM_DEVICE_ID_SHIFT) +#define KVM_ARM_DEVICE_ID_MASK 
GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ + KVM_ARM_DEVICE_ID_SHIFT) /* Supported device IDs */ #define KVM_ARM_DEVICE_VGIC_V2 0 -- cgit v1.2.3 From bf515f024e4c0ca46a1b08c4f31860c01781d8a5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 22 Aug 2022 14:33:51 -0700 Subject: perf stat: Clear evsel->reset_group for each stat run If a weak group is broken then the reset_group flag remains set for the next run. Having reset_group set means the counter isn't created and ultimately a segfault. A simple reproduction of this is: # perf stat -r2 -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}:W which will be added as a test in the next patch. Fixes: 4804e0111662d7d8 ("perf stat: Use affinity for opening events") Reviewed-by: Andi Kleen Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Tested-by: Xing Zhengjun Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220822213352.75721-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 7fb81a44672d..54cd29d07ca8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -826,6 +826,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) } evlist__for_each_entry(evsel_list, counter) { + counter->reset_group = false; if (bpf_counter__load(counter, &target)) return -1; if (!evsel__is_bpf(counter)) -- cgit v1.2.3 From 0c361c6eaba7fe1a29391540dd8797e850e49f21 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 22 Aug 2022 14:33:52 -0700 Subject: perf test: Stat test for repeat with a weak group Breaking a weak group requires multiple passes of an evlist, with multiple runs this can introduce bugs ultimately leading to segfaults. Add a test to cover this. 
Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220822213352.75721-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh index 9313ef2739e0..26a51b48aee4 100755 --- a/tools/perf/tests/shell/stat.sh +++ b/tools/perf/tests/shell/stat.sh @@ -28,6 +28,24 @@ test_stat_record_report() { echo "stat record and report test [Success]" } +test_stat_repeat_weak_groups() { + echo "stat repeat weak groups test" + if ! perf stat -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}' \ + true 2>&1 | grep -q 'seconds time elapsed' + then + echo "stat repeat weak groups test [Skipped event parsing failed]" + return + fi + if ! perf stat -r2 -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}:W' \ + true > /dev/null 2>&1 + then + echo "stat repeat weak groups test [Failed]" + err=1 + return + fi + echo "stat repeat weak groups test [Success]" +} + test_topdown_groups() { # Topdown events must be grouped with the slots event first. Test that # parse-events reorders this. @@ -75,6 +93,7 @@ test_topdown_weak_groups() { test_default_stat test_stat_record_report +test_stat_repeat_weak_groups test_topdown_groups test_topdown_weak_groups exit $err -- cgit v1.2.3 From e89eaa611c7568d1288a2ccca88355a9434f2d47 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 18 Aug 2022 03:01:27 -0700 Subject: perf record: Fix manpage formatting of description of support to hybrid systems The Intel hybrid description is written in a different style than the rest of the perf record man page. There were some new command line options added after it which resulted in very strange section ordering. 
Move the hybrid include last. Also the sub sections in the hybrid document don't fit the record manpage well (especially since it talks about all kinds of unrelated commands). I left this for now, but would be better to separate this properly in the different man pages. It would be better to use sub sections for the other sections, but these don't seem to be supported in AsciiDoc? Some of the examples are still misrendered in the manpage with an indented troff command, but I don't know how to fix that. In any case it's now better than before. Signed-off-by: Andi Kleen Cc: zhengjun.xing@intel.com Link: https://lore.kernel.org/r/20220818100127.249401-1-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/intel-hybrid.txt | 10 ---------- tools/perf/Documentation/perf-record.txt | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/intel-hybrid.txt b/tools/perf/Documentation/intel-hybrid.txt index c9302096dc46..e7a776ad25d7 100644 --- a/tools/perf/Documentation/intel-hybrid.txt +++ b/tools/perf/Documentation/intel-hybrid.txt @@ -21,11 +21,6 @@ cat /sys/devices/cpu_atom/cpus It indicates cpu0-cpu15 are core cpus and cpu16-cpu23 are atom cpus. -Quickstart - -List hybrid event ------------------ - As before, use perf-list to list the symbolic event. perf list @@ -40,7 +35,6 @@ the event is belong to. Same event name but with different pmu can be supported. Enable hybrid event with a specific pmu ---------------------------------------- To enable a core only event or atom only event, following syntax is supported: @@ -53,7 +47,6 @@ For example, count the 'cycles' event on core cpus. perf stat -e cpu_core/cycles/ Create two events for one hardware event automatically ------------------------------------------------------- When creating one event and the event is available on both atom and core, two events are created automatically. 
One is for atom, the other is for @@ -132,7 +125,6 @@ For perf-stat result, it displays two events: The first 'cycles' is core event, the second 'cycles' is atom event. Thread mode example: --------------------- perf-stat reports the scaled counts for hybrid event and with a percentage displayed. The percentage is the event's running time/enabling time. @@ -176,14 +168,12 @@ perf_event_attr: 604,097,080 cpu_atom/cycles/ (99.57%) perf-record: ------------- If there is no '-e' specified in perf record, on hybrid platform, it creates two default 'cycles' and adds them to event list. One is for core, the other is for atom. perf-stat: ----------- If there is no '-e' specified in perf stat, on hybrid platform, besides of software events, following events are created and diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 099817ef5150..6ec6d0ba0a72 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -757,8 +757,6 @@ events in data directory files. Option specified with no or empty value defaults to CPU layout. Masks defined or provided by the option value are filtered through the mask provided by -C option. -include::intel-hybrid.txt[] - --debuginfod[=URLs]:: Specify debuginfod URL to be used when cacheing perf.data binaries, it follows the same syntax as the DEBUGINFOD_URLS variable, like: @@ -778,6 +776,8 @@ include::intel-hybrid.txt[] only, as of now. So the applications built without the frame pointer might see bogus addresses. +include::intel-hybrid.txt[] + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1] -- cgit v1.2.3 From d72e5cf3cf69d4c68d3b54aea232451b0a8b69d3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 24 Aug 2022 07:57:33 -0700 Subject: perf sched: Fix memory leaks in __cmd_record detected with -fsanitize=address An array of strings is passed to cmd_record but not freed. 
As cmd_record modifies the array, add another array as a copy that can be mutated allowing the original array contents to all be freed. Detected with -fsanitize=address. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220824145733.409005-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 2f6cd1b8b662..a5cf243c337f 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3355,7 +3355,8 @@ static bool schedstat_events_exposed(void) static int __cmd_record(int argc, const char **argv) { unsigned int rec_argc, i, j; - const char **rec_argv; + char **rec_argv; + const char **rec_argv_copy; const char * const record_args[] = { "record", "-a", @@ -3384,6 +3385,7 @@ static int __cmd_record(int argc, const char **argv) ARRAY_SIZE(schedstat_args) : 0; struct tep_event *waking_event; + int ret; /* * +2 for either "-e", "sched:sched_wakeup" or @@ -3391,14 +3393,18 @@ static int __cmd_record(int argc, const char **argv) */ rec_argc = ARRAY_SIZE(record_args) + 2 + schedstat_argc + argc - 1; rec_argv = calloc(rec_argc + 1, sizeof(char *)); - if (rec_argv == NULL) return -ENOMEM; + rec_argv_copy = calloc(rec_argc + 1, sizeof(char *)); + if (rec_argv_copy == NULL) { + free(rec_argv); + return -ENOMEM; + } for (i = 0; i < ARRAY_SIZE(record_args); i++) rec_argv[i] = strdup(record_args[i]); - rec_argv[i++] = "-e"; + rec_argv[i++] = strdup("-e"); waking_event = trace_event__tp_format("sched", "sched_waking"); if (!IS_ERR(waking_event)) rec_argv[i++] = strdup("sched:sched_waking"); @@ -3409,11 +3415,19 @@ static int __cmd_record(int argc, const char **argv) rec_argv[i++] = strdup(schedstat_args[j]); for (j = 1; j < 
(unsigned int)argc; j++, i++) - rec_argv[i] = argv[j]; + rec_argv[i] = strdup(argv[j]); BUG_ON(i != rec_argc); - return cmd_record(i, rec_argv); + memcpy(rec_argv_copy, rec_argv, sizeof(char *) * rec_argc); + ret = cmd_record(rec_argc, rec_argv_copy); + + for (i = 0; i < rec_argc; i++) + free(rec_argv[i]); + free(rec_argv); + free(rec_argv_copy); + + return ret; } int cmd_sched(int argc, const char **argv) -- cgit v1.2.3 From 3126204ce3d9ab083cbdc2d61ab93746232eb89b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 16 Aug 2022 05:56:12 -0700 Subject: perf docs: Update the documentation for the save_type filter Update the documentation to reflect the kernel changes. Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Ian Rogers Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220816125612.2042397-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 6ec6d0ba0a72..0228efc96686 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -397,6 +397,9 @@ following filters are defined: - abort_tx: only when the target is a hardware transaction abort - cond: conditional branches - save_type: save branch type during sampling in case binary is not available later + For the platforms with Intel Arch LBR support (12th-Gen+ client or + 4th-Gen Xeon+ server), the save branch type is unconditionally enabled + when the taken branch stack sampling is enabled. + The option requires at least one branch type among any, any_call, any_ret, ind_call, cond. 
-- cgit v1.2.3 From 48648548ef764dcb1f6ffc9c9f9057f7c610caa4 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Thu, 25 Aug 2022 09:54:58 +0800 Subject: perf stat: Capitalize topdown metrics' names Capitalize topdown metrics' names to follow the intel SDM. Before: # ./perf stat -a sleep 1 Performance counter stats for 'system wide': 228,094.05 msec cpu-clock # 225.026 CPUs utilized 842 context-switches # 3.691 /sec 224 cpu-migrations # 0.982 /sec 70 page-faults # 0.307 /sec 23,164,105 cycles # 0.000 GHz 29,403,446 instructions # 1.27 insn per cycle 5,268,185 branches # 23.097 K/sec 33,239 branch-misses # 0.63% of all branches 136,248,990 slots # 597.337 K/sec 32,976,450 topdown-retiring # 24.2% retiring 4,651,918 topdown-bad-spec # 3.4% bad speculation 26,148,695 topdown-fe-bound # 19.2% frontend bound 72,515,776 topdown-be-bound # 53.2% backend bound 6,008,540 topdown-heavy-ops # 4.4% heavy operations # 19.8% light operations 3,934,049 topdown-br-mispredict # 2.9% branch mispredict # 0.5% machine clears 16,655,439 topdown-fetch-lat # 12.2% fetch latency # 7.0% fetch bandwidth 41,635,972 topdown-mem-bound # 30.5% memory bound # 22.7% Core bound 1.013634593 seconds time elapsed After: # ./perf stat -a sleep 1 Performance counter stats for 'system wide': 228,081.94 msec cpu-clock # 225.003 CPUs utilized 824 context-switches # 3.613 /sec 224 cpu-migrations # 0.982 /sec 67 page-faults # 0.294 /sec 22,647,423 cycles # 0.000 GHz 28,870,551 instructions # 1.27 insn per cycle 5,167,099 branches # 22.655 K/sec 32,383 branch-misses # 0.63% of all branches 133,411,074 slots # 584.926 K/sec 32,352,607 topdown-retiring # 24.3% Retiring 4,456,977 topdown-bad-spec # 3.3% Bad Speculation 25,626,487 topdown-fe-bound # 19.2% Frontend Bound 70,955,316 topdown-be-bound # 53.2% Backend Bound 5,834,844 topdown-heavy-ops # 4.4% Heavy Operations # 19.9% Light Operations 3,738,781 topdown-br-mispredict # 2.8% Branch Mispredict # 0.5% Machine Clears 16,286,803 topdown-fetch-lat # 12.2% 
Fetch Latency # 7.0% Fetch Bandwidth 40,802,069 topdown-mem-bound # 30.6% Memory Bound # 22.6% Core Bound 1.013683125 seconds time elapsed Reviewed-by: Kan Liang Signed-off-by: Xing Zhengjun Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220825015458.3252239-1-zhengjun.xing@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 979c8cb918f7..788ce5e46470 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -1193,7 +1193,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, &rsd); if (retiring > 0.7) color = PERF_COLOR_GREEN; - print_metric(config, ctxp, color, "%8.1f%%", "retiring", + print_metric(config, ctxp, color, "%8.1f%%", "Retiring", retiring * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FE_BOUND) && full_td(cpu_map_idx, st, &rsd)) { @@ -1202,7 +1202,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, &rsd); if (fe_bound > 0.2) color = PERF_COLOR_RED; - print_metric(config, ctxp, color, "%8.1f%%", "frontend bound", + print_metric(config, ctxp, color, "%8.1f%%", "Frontend Bound", fe_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BE_BOUND) && full_td(cpu_map_idx, st, &rsd)) { @@ -1211,7 +1211,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, &rsd); if (be_bound > 0.2) color = PERF_COLOR_RED; - print_metric(config, ctxp, color, "%8.1f%%", "backend bound", + print_metric(config, ctxp, color, "%8.1f%%", "Backend Bound", be_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BAD_SPEC) && full_td(cpu_map_idx, st, &rsd)) { @@ -1220,7 +1220,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, &rsd); if 
(bad_spec > 0.1) color = PERF_COLOR_RED; - print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", + print_metric(config, ctxp, color, "%8.1f%%", "Bad Speculation", bad_spec * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_HEAVY_OPS) && full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { @@ -1234,13 +1234,13 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, if (retiring > 0.7 && heavy_ops > 0.1) color = PERF_COLOR_GREEN; - print_metric(config, ctxp, color, "%8.1f%%", "heavy operations", + print_metric(config, ctxp, color, "%8.1f%%", "Heavy Operations", heavy_ops * 100.); if (retiring > 0.7 && light_ops > 0.6) color = PERF_COLOR_GREEN; else color = NULL; - print_metric(config, ctxp, color, "%8.1f%%", "light operations", + print_metric(config, ctxp, color, "%8.1f%%", "Light Operations", light_ops * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BR_MISPREDICT) && full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { @@ -1254,13 +1254,13 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, if (bad_spec > 0.1 && br_mis > 0.05) color = PERF_COLOR_RED; - print_metric(config, ctxp, color, "%8.1f%%", "branch mispredict", + print_metric(config, ctxp, color, "%8.1f%%", "Branch Mispredict", br_mis * 100.); if (bad_spec > 0.1 && m_clears > 0.05) color = PERF_COLOR_RED; else color = NULL; - print_metric(config, ctxp, color, "%8.1f%%", "machine clears", + print_metric(config, ctxp, color, "%8.1f%%", "Machine Clears", m_clears * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_LAT) && full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { @@ -1274,13 +1274,13 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, if (fe_bound > 0.2 && fetch_lat > 0.15) color = PERF_COLOR_RED; - print_metric(config, ctxp, color, "%8.1f%%", "fetch latency", + print_metric(config, ctxp, color, "%8.1f%%", "Fetch Latency", fetch_lat * 100.); if (fe_bound > 0.2 && fetch_bw > 0.1) color 
= PERF_COLOR_RED; else color = NULL; - print_metric(config, ctxp, color, "%8.1f%%", "fetch bandwidth", + print_metric(config, ctxp, color, "%8.1f%%", "Fetch Bandwidth", fetch_bw * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_MEM_BOUND) && full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) { @@ -1294,13 +1294,13 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, if (be_bound > 0.2 && mem_bound > 0.2) color = PERF_COLOR_RED; - print_metric(config, ctxp, color, "%8.1f%%", "memory bound", + print_metric(config, ctxp, color, "%8.1f%%", "Memory Bound", mem_bound * 100.); if (be_bound > 0.2 && core_bound > 0.1) color = PERF_COLOR_RED; else color = NULL; - print_metric(config, ctxp, color, "%8.1f%%", "Core bound", + print_metric(config, ctxp, color, "%8.1f%%", "Core Bound", core_bound * 100.); } else if (evsel->metric_expr) { generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL, -- cgit v1.2.3 From 3721359d3907c313833a2fd6e40c36a30179ea89 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Thu, 25 Aug 2022 23:29:22 -0600 Subject: selftests/bpf: Fix bind{4,6} tcp/socket header type conflict There is a potential for us to hit a type conflict when including netinet/tcp.h with sys/socket.h, we can remove these as they are not actually needed. 
Fixes errors like the below when compiling with gcc BPF backend: In file included from /usr/include/netinet/tcp.h:91, from progs/bind4_prog.c:10: /home/buildroot/opt/cross/lib/gcc/bpf/13.0.0/include/stdint.h:34:23: error: conflicting types for 'int8_t'; have 'char' 34 | typedef __INT8_TYPE__ int8_t; | ^~~~~~ In file included from /usr/include/x86_64-linux-gnu/sys/types.h:155, from /usr/include/x86_64-linux-gnu/bits/socket.h:29, from /usr/include/x86_64-linux-gnu/sys/socket.h:33, from progs/bind4_prog.c:9: /usr/include/x86_64-linux-gnu/bits/stdint-intn.h:24:18: note: previous declaration of 'int8_t' with type 'int8_t' {aka 'signed char'} 24 | typedef __int8_t int8_t; | ^~~~~~ /home/buildroot/opt/cross/lib/gcc/bpf/13.0.0/include/stdint.h:43:24: error: conflicting types for 'int64_t'; have 'long int' 43 | typedef __INT64_TYPE__ int64_t; | ^~~~~~~ /usr/include/x86_64-linux-gnu/bits/stdint-intn.h:27:19: note: previous declaration of 'int64_t' with type 'int64_t' {aka 'long long int'} 27 | typedef __int64_t int64_t; | ^~~~~~~ make: *** [Makefile:537: /home/buildroot/bpf-next/tools/testing/selftests/bpf/bpf_gcc/bind4_prog.o] Error 1 Signed-off-by: James Hilliard Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220826052925.980431-1-james.hilliard1@gmail.com --- tools/testing/selftests/bpf/progs/bind4_prog.c | 2 -- tools/testing/selftests/bpf/progs/bind6_prog.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 474c6a62078a..a487f60b73ac 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index c19cfa869f30..d62cd9e9cf0e 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ 
b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include -- cgit v1.2.3 From 2eb680401df62c035ff50a7faf1296565b030df7 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Mon, 29 Aug 2022 09:47:09 -0600 Subject: selftests/bpf: Fix connect4_prog tcp/socket header type conflict There is a potential for us to hit a type conflict when including netinet/tcp.h and sys/socket.h, we can replace both of these includes with linux/tcp.h and bpf_tcp_helpers.h to avoid this conflict. Fixes errors like the below when compiling with gcc BPF backend: In file included from /usr/include/netinet/tcp.h:91, from progs/connect4_prog.c:11: /home/buildroot/opt/cross/lib/gcc/bpf/13.0.0/include/stdint.h:34:23: error: conflicting types for 'int8_t'; have 'char' 34 | typedef __INT8_TYPE__ int8_t; | ^~~~~~ In file included from /usr/include/x86_64-linux-gnu/sys/types.h:155, from /usr/include/x86_64-linux-gnu/bits/socket.h:29, from /usr/include/x86_64-linux-gnu/sys/socket.h:33, from progs/connect4_prog.c:10: /usr/include/x86_64-linux-gnu/bits/stdint-intn.h:24:18: note: previous declaration of 'int8_t' with type 'int8_t' {aka 'signed char'} 24 | typedef __int8_t int8_t; | ^~~~~~ /home/buildroot/opt/cross/lib/gcc/bpf/13.0.0/include/stdint.h:43:24: error: conflicting types for 'int64_t'; have 'long int' 43 | typedef __INT64_TYPE__ int64_t; | ^~~~~~~ /usr/include/x86_64-linux-gnu/bits/stdint-intn.h:27:19: note: previous declaration of 'int64_t' with type 'int64_t' {aka 'long long int'} 27 | typedef __int64_t int64_t; | ^~~~~~~ Signed-off-by: James Hilliard Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220829154710.3870139-1-james.hilliard1@gmail.com --- tools/testing/selftests/bpf/progs/connect4_prog.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 
b241932911db..ec25371de789 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -7,14 +7,15 @@ #include #include #include -#include -#include +#include #include #include #include #include +#include "bpf_tcp_helpers.h" + #define SRC_REWRITE_IP4 0x7f000004U #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 -- cgit v1.2.3 From c0f1bc4e91c52be73ae1a5e6fd53371f5a7f0333 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sun, 14 Aug 2022 00:50:19 -0500 Subject: memblock tests: add command line help option Add a help command line option to the help message. Add the help option to the short and long options so it will be recognized as a valid option. Usage: $ ./main -h Or: $ ./main --help Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/0f3b93a79de78c0da1ca90f74fe35e9a85c7cf93.1660451025.git.remckee0@gmail.com --- tools/testing/memblock/tests/common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index e43b2676af81..76a8ad818f3a 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -14,14 +14,16 @@ static struct test_memory memory_block; static const char __maybe_unused *prefixes[PREFIXES_MAX]; static int __maybe_unused nr_prefixes; -static const char *short_opts = "mv"; +static const char *short_opts = "hmv"; static const struct option long_opts[] = { + {"help", 0, NULL, 'h'}, {"movable-node", 0, NULL, 'm'}, {"verbose", 0, NULL, 'v'}, {NULL, 0, NULL, 0} }; static const char * const help_opts[] = { + "display this help message and exit", "disallow allocations from regions marked as hotplugged\n\t\t\t" "by simulating enabling the \"movable_node\" kernel\n\t\t\t" "parameter", -- cgit v1.2.3 From 61ebea2ba19826ce6dff8686b72ecbea8269f6cc Mon Sep 17 
00:00:00 2001 From: Rebecca Mckeever Date: Sun, 14 Aug 2022 00:50:20 -0500 Subject: memblock tests: update reference to obsolete build option in comments The VERBOSE build option was replaced with the --verbose runtime option, but the comments describing the ASSERT_*() macros still refer to the VERBOSE build option. Update these comments so that they refer to the --verbose runtime option. Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/5f8a4c2bde34cc029282c68d47eda982d950f421.1660451025.git.remckee0@gmail.com --- tools/testing/memblock/tests/common.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 3e7f23d341d7..d396e5423a8e 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -16,7 +16,8 @@ * ASSERT_EQ(): * Check the condition * @_expected == @_seen - * If false, print failed test message (if in VERBOSE mode) and then assert + * If false, print failed test message (if running with --verbose) and then + * assert. */ #define ASSERT_EQ(_expected, _seen) do { \ if ((_expected) != (_seen)) \ @@ -28,7 +29,8 @@ * ASSERT_NE(): * Check the condition * @_expected != @_seen - * If false, print failed test message (if in VERBOSE mode) and then assert + * If false, print failed test message (if running with --verbose) and then + * assert. */ #define ASSERT_NE(_expected, _seen) do { \ if ((_expected) == (_seen)) \ @@ -40,7 +42,8 @@ * ASSERT_LT(): * Check the condition * @_expected < @_seen - * If false, print failed test message (if in VERBOSE mode) and then assert + * If false, print failed test message (if running with --verbose) and then + * assert. 
*/ #define ASSERT_LT(_expected, _seen) do { \ if ((_expected) >= (_seen)) \ -- cgit v1.2.3 From ac76d803c4f6c2a32c9c7436d14467e099fd2bfa Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:43 -0500 Subject: memblock tests: update tests to check if memblock_alloc zeroed memory Add an assert in memblock_alloc() tests where allocation is expected to occur. The assert checks whether the entire chunk of allocated memory is cleared. The current memblock_alloc() tests do not check whether the allocated memory was zeroed. memblock_alloc() should zero the allocated memory since it is a wrapper for memblock_alloc_try_nid(). Reviewed-by: Shaoqin Huang Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/83ffb941b65074f40eb14552f8bfe5b71fe50abd.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 23 +++++++++++++++++++++++ tools/testing/memblock/tests/common.c | 7 +++++++ tools/testing/memblock/tests/common.h | 12 ++++++++++++ 3 files changed, 42 insertions(+) (limited to 'tools') diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index a14f38eb8a89..c97da91a98d6 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -22,6 +22,8 @@ static int alloc_top_down_simple_check(void) allocated_ptr = memblock_alloc(size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, size); + ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, expected_start); @@ -80,6 +82,8 @@ static int alloc_top_down_disjoint_check(void) allocated_ptr = memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -125,6 +129,8 @@ static int alloc_top_down_before_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); 
ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - total_size); @@ -173,6 +179,8 @@ static int alloc_top_down_after_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2_size); @@ -223,6 +231,8 @@ static int alloc_top_down_second_fit_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base - r3_size); @@ -277,6 +287,8 @@ static int alloc_in_between_generic_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2.size - r3_size); @@ -418,6 +430,8 @@ static int alloc_limited_space_generic_check(void) allocated_ptr = memblock_alloc(available_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, available_size); + ASSERT_EQ(rgn->size, MEM_SIZE); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -472,6 +486,8 @@ static int alloc_bottom_up_simple_check(void) allocated_ptr = memblock_alloc(SZ_2, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, SZ_2); + ASSERT_EQ(rgn->size, SZ_2); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -528,6 +544,7 @@ static int alloc_bottom_up_disjoint_check(void) allocated_ptr = memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -571,6 +588,8 @@ static int alloc_bottom_up_before_check(void) allocated_ptr = memblock_alloc(r1_size, SMP_CACHE_BYTES); 
ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r1_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -618,6 +637,8 @@ static int alloc_bottom_up_after_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base); @@ -669,6 +690,8 @@ static int alloc_bottom_up_second_fit_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base); diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index 76a8ad818f3a..eec6901081af 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -60,16 +60,23 @@ void reset_memblock_attributes(void) memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; } +static inline void fill_memblock(void) +{ + memset(memory_block.base, 1, MEM_SIZE); +} + void setup_memblock(void) { reset_memblock_regions(); memblock_add((phys_addr_t)memory_block.base, MEM_SIZE); + fill_memblock(); } void dummy_physical_memory_init(void) { memory_block.base = malloc(MEM_SIZE); assert(memory_block.base); + fill_memblock(); } void dummy_physical_memory_cleanup(void) diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index d396e5423a8e..93e559780890 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -51,6 +51,18 @@ assert((_expected) < (_seen)); \ } while (0) +/** + * ASSERT_MEM_EQ(): + * Check that the first @_size bytes of @_seen are all equal to @_expected. + * If false, print failed test message (if running with --verbose) and then + * assert. 
+ */ +#define ASSERT_MEM_EQ(_seen, _expected, _size) do { \ + for (int _i = 0; _i < (_size); _i++) { \ + ASSERT_EQ(((char *)_seen)[_i], (_expected)); \ + } \ +} while (0) + #define PREFIX_PUSH() prefix_push(__func__) /* -- cgit v1.2.3 From 25b9defb5bc4aee8beb51ded07838e12745426f9 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:44 -0500 Subject: memblock tests: update zeroed memory check for memblock_alloc_* tests Update the assert in memblock_alloc_try_nid() and memblock_alloc_from() tests that checks whether the memory is cleared so that it checks the entire chunk of allocated memory instead of just the first byte. Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/24b3271751756100142e65b75284d43b4d30c9b7.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_helpers_api.c | 8 +-- tools/testing/memblock/tests/alloc_nid_api.c | 72 ++++++------------------ 2 files changed, 20 insertions(+), 60 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index 1069b4bdd5fd..f1c7d6f170b6 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -19,7 +19,6 @@ static int alloc_from_simple_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -31,10 +30,9 @@ static int alloc_from_simple_generic_check(void) min_addr = memblock_end_of_DRAM() - SMP_CACHE_BYTES; allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -66,7 +64,6 @@ static int alloc_from_misaligned_generic_check(void) { struct 
memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -79,10 +76,9 @@ static int alloc_from_misaligned_generic_check(void) min_addr = memblock_end_of_DRAM() - (SMP_CACHE_BYTES * 2 - 1); allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - SMP_CACHE_BYTES); diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 255fd514e9f5..a069534c459e 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -19,7 +19,6 @@ static int alloc_try_nid_top_down_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -35,11 +34,10 @@ static int alloc_try_nid_top_down_simple_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -74,7 +72,6 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -91,11 +88,10 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size - misalign); @@ -128,7 +124,6 @@ static int 
alloc_try_nid_exact_address_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -144,11 +139,10 @@ static int alloc_try_nid_exact_address_generic_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -183,7 +177,6 @@ static int alloc_try_nid_top_down_narrow_range_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -198,10 +191,9 @@ static int alloc_try_nid_top_down_narrow_range_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -277,7 +269,6 @@ static int alloc_try_nid_min_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -298,10 +289,9 @@ static int alloc_try_nid_min_reserved_generic_check(void) allocated_ptr = memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, reserved_base); @@ -332,7 +322,6 @@ static int alloc_try_nid_max_reserved_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -351,10 +340,9 @@ static int alloc_try_nid_max_reserved_generic_check(void) allocated_ptr = 
memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, min_addr); @@ -389,7 +377,6 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -417,10 +404,9 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn1->size, r1.size + r3_size); ASSERT_EQ(rgn1->base, max_addr - r3_size); @@ -456,7 +442,6 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -483,10 +468,9 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r2.base); @@ -522,7 +506,6 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -550,10 +533,9 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, 
min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -634,7 +616,6 @@ static int alloc_try_nid_top_down_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -649,10 +630,9 @@ static int alloc_try_nid_top_down_cap_max_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -674,7 +654,6 @@ static int alloc_try_nid_top_down_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -689,10 +668,9 @@ static int alloc_try_nid_top_down_cap_min_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -723,7 +701,6 @@ static int alloc_try_nid_bottom_up_simple_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -740,11 +717,10 @@ static int alloc_try_nid_bottom_up_simple_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -779,7 +755,6 @@ static int 
alloc_try_nid_bottom_up_start_misaligned_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -797,11 +772,10 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr + (SMP_CACHE_BYTES - misalign)); @@ -836,7 +810,6 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -852,10 +825,9 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -890,7 +862,6 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) struct memblock_region *rgn1 = &memblock.reserved.regions[1]; struct memblock_region *rgn2 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -919,10 +890,9 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, max_addr); @@ -964,7 +934,6 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) struct memblock_region *rgn2 = &memblock.reserved.regions[1]; struct 
memblock_region *rgn3 = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; struct region r1, r2; PREFIX_PUSH(); @@ -993,10 +962,9 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); ASSERT_EQ(rgn3->size, r3_size); ASSERT_EQ(rgn3->base, memblock_start_of_DRAM()); @@ -1024,7 +992,6 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -1040,10 +1007,9 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -1065,7 +1031,6 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) { struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; - char *b; PREFIX_PUSH(); @@ -1081,10 +1046,9 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - b = (char *)allocated_ptr; ASSERT_NE(allocated_ptr, NULL); - ASSERT_EQ(*b, 0); + ASSERT_MEM_EQ(allocated_ptr, 0, size); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); -- cgit v1.2.3 From fb2e97fe853ff515df473d4acec6707816e05d87 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:45 -0500 Subject: memblock tests: add labels to verbose output for generic alloc tests Generic tests for memblock_alloc*() functions do not use separate functions for testing top-down and bottom-up allocation directions. 
Therefore, the function name that is displayed in the verbose testing output does not include the allocation direction. Add an additional prefix when running generic tests for memblock_alloc*() functions that indicates which allocation direction is set. The prefix will be displayed when the tests are run in verbose mode. Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/fb76a42253d2a196a7daea29dd8121a69904f58e.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 36 ++++++++---------------- tools/testing/memblock/tests/alloc_helpers_api.c | 12 +++----- tools/testing/memblock/tests/alloc_nid_api.c | 36 ++++++++---------------- tools/testing/memblock/tests/common.h | 16 +++++++++++ 4 files changed, 44 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index c97da91a98d6..de3405634f8a 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -751,10 +751,8 @@ static int alloc_after_check(void) static int alloc_in_between_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_in_between_generic_check(); - memblock_set_bottom_up(true); - alloc_in_between_generic_check(); + run_top_down(alloc_in_between_generic_check); + run_bottom_up(alloc_in_between_generic_check); return 0; } @@ -773,10 +771,8 @@ static int alloc_second_fit_check(void) static int alloc_small_gaps_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_small_gaps_generic_check(); - memblock_set_bottom_up(true); - alloc_small_gaps_generic_check(); + run_top_down(alloc_small_gaps_generic_check); + run_bottom_up(alloc_small_gaps_generic_check); return 0; } @@ -784,10 +780,8 @@ static int alloc_small_gaps_check(void) static int 
alloc_all_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_all_reserved_generic_check(); - memblock_set_bottom_up(true); - alloc_all_reserved_generic_check(); + run_top_down(alloc_all_reserved_generic_check); + run_bottom_up(alloc_all_reserved_generic_check); return 0; } @@ -795,10 +789,8 @@ static int alloc_all_reserved_check(void) static int alloc_no_space_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_no_space_generic_check(); - memblock_set_bottom_up(true); - alloc_no_space_generic_check(); + run_top_down(alloc_no_space_generic_check); + run_bottom_up(alloc_no_space_generic_check); return 0; } @@ -806,10 +798,8 @@ static int alloc_no_space_check(void) static int alloc_limited_space_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_limited_space_generic_check(); - memblock_set_bottom_up(true); - alloc_limited_space_generic_check(); + run_top_down(alloc_limited_space_generic_check); + run_bottom_up(alloc_limited_space_generic_check); return 0; } @@ -817,10 +807,8 @@ static int alloc_limited_space_check(void) static int alloc_no_memory_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_no_memory_generic_check(); - memblock_set_bottom_up(true); - alloc_no_memory_generic_check(); + run_top_down(alloc_no_memory_generic_check); + run_bottom_up(alloc_no_memory_generic_check); return 0; } diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index f1c7d6f170b6..06577bd0e349 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -357,10 +357,8 @@ static int alloc_from_bottom_up_min_addr_cap_check(void) static int alloc_from_simple_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - 
alloc_from_simple_generic_check(); - memblock_set_bottom_up(true); - alloc_from_simple_generic_check(); + run_top_down(alloc_from_simple_generic_check); + run_bottom_up(alloc_from_simple_generic_check); return 0; } @@ -368,10 +366,8 @@ static int alloc_from_simple_check(void) static int alloc_from_misaligned_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_from_misaligned_generic_check(); - memblock_set_bottom_up(true); - alloc_from_misaligned_generic_check(); + run_top_down(alloc_from_misaligned_generic_check); + run_bottom_up(alloc_from_misaligned_generic_check); return 0; } diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index a069534c459e..9324d706ee3a 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -1142,10 +1142,8 @@ static int alloc_try_nid_cap_min_check(void) static int alloc_try_nid_min_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_min_reserved_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_min_reserved_generic_check(); + run_top_down(alloc_try_nid_min_reserved_generic_check); + run_bottom_up(alloc_try_nid_min_reserved_generic_check); return 0; } @@ -1153,10 +1151,8 @@ static int alloc_try_nid_min_reserved_check(void) static int alloc_try_nid_max_reserved_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_max_reserved_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_max_reserved_generic_check(); + run_top_down(alloc_try_nid_max_reserved_generic_check); + run_bottom_up(alloc_try_nid_max_reserved_generic_check); return 0; } @@ -1164,10 +1160,8 @@ static int alloc_try_nid_max_reserved_check(void) static int alloc_try_nid_exact_address_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - 
alloc_try_nid_exact_address_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_exact_address_generic_check(); + run_top_down(alloc_try_nid_exact_address_generic_check); + run_bottom_up(alloc_try_nid_exact_address_generic_check); return 0; } @@ -1175,10 +1169,8 @@ static int alloc_try_nid_exact_address_check(void) static int alloc_try_nid_reserved_full_merge_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_reserved_full_merge_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_reserved_full_merge_generic_check(); + run_top_down(alloc_try_nid_reserved_full_merge_generic_check); + run_bottom_up(alloc_try_nid_reserved_full_merge_generic_check); return 0; } @@ -1186,10 +1178,8 @@ static int alloc_try_nid_reserved_full_merge_check(void) static int alloc_try_nid_reserved_all_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_reserved_all_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_reserved_all_generic_check(); + run_top_down(alloc_try_nid_reserved_all_generic_check); + run_bottom_up(alloc_try_nid_reserved_all_generic_check); return 0; } @@ -1197,10 +1187,8 @@ static int alloc_try_nid_reserved_all_check(void) static int alloc_try_nid_low_max_check(void) { test_print("\tRunning %s...\n", __func__); - memblock_set_bottom_up(false); - alloc_try_nid_low_max_generic_check(); - memblock_set_bottom_up(true); - alloc_try_nid_low_max_generic_check(); + run_top_down(alloc_try_nid_low_max_generic_check); + run_bottom_up(alloc_try_nid_low_max_generic_check); return 0; } diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 93e559780890..c53f9c365714 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -100,4 +100,20 @@ static inline void test_pass_pop(void) prefix_pop(); } +static inline void run_top_down(int (*func)()) +{ + 
memblock_set_bottom_up(false); + prefix_push("top-down"); + func(); + prefix_pop(); +} + +static inline void run_bottom_up(int (*func)()) +{ + memblock_set_bottom_up(true); + prefix_push("bottom-up"); + func(); + prefix_pop(); +} + #endif -- cgit v1.2.3 From 21a233f68afe55aafa8b79705c97f7a1d37be3e1 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:46 -0500 Subject: memblock tests: add additional tests for basic api and memblock_alloc Add tests for memblock_add(), memblock_reserve(), memblock_remove(), memblock_free(), and memblock_alloc() for the following test scenarios. memblock_add() and memblock_reserve(): - add/reserve a memory block in the gap between two existing memory blocks, and check that the blocks are merged into one region - try to add/reserve memblock regions that extend past PHYS_ADDR_MAX memblock_remove() and memblock_free(): - remove/free a region when it is the only available region + These tests ensure that the first region is overwritten with a "dummy" region when the last remaining region of that type is removed or freed. 
- remove/free() a region that overlaps with two existing regions of the relevant type - try to remove/free memblock regions that extend past PHYS_ADDR_MAX memblock_alloc(): - try to allocate a region that is larger than the total size of available memory (memblock.memory) Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/c23c0393c5b9a53fe7f676996913c629495e9727.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 44 +++ tools/testing/memblock/tests/basic_api.c | 499 +++++++++++++++++++++++++++++++ 2 files changed, 543 insertions(+) (limited to 'tools') diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index de3405634f8a..e20e326d636f 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -469,6 +469,40 @@ static int alloc_no_memory_generic_check(void) return 0; } +/* + * A test that tries to allocate a region that is larger than the total size of + * available memory (memblock.memory): + * + * +-----------------------------------+ + * | new | + * +-----------------------------------+ + * | | + * | | + * +---------------------------------+ + * + * Expect no allocation to happen. + */ +static int alloc_too_large_generic_check(void) +{ + struct memblock_region *rgn = &memblock.reserved.regions[0]; + void *allocated_ptr = NULL; + + PREFIX_PUSH(); + + setup_memblock(); + + allocated_ptr = memblock_alloc(MEM_SIZE + SZ_2, SMP_CACHE_BYTES); + + ASSERT_EQ(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, 0); + ASSERT_EQ(rgn->base, 0); + ASSERT_EQ(memblock.reserved.total_size, 0); + + test_pass_pop(); + + return 0; +} + /* * A simple test that tries to allocate a small memory region. 
* Expect to allocate an aligned region at the beginning of the available @@ -813,6 +847,15 @@ static int alloc_no_memory_check(void) return 0; } +static int alloc_too_large_check(void) +{ + test_print("\tRunning %s...\n", __func__); + run_top_down(alloc_too_large_generic_check); + run_bottom_up(alloc_too_large_generic_check); + + return 0; +} + int memblock_alloc_checks(void) { const char *func_testing = "memblock_alloc"; @@ -835,6 +878,7 @@ int memblock_alloc_checks(void) alloc_no_space_check(); alloc_limited_space_check(); alloc_no_memory_check(); + alloc_too_large_check(); dummy_physical_memory_cleanup(); diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index 66f46f261e66..ea79396e4611 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -326,6 +326,102 @@ static int memblock_add_twice_check(void) return 0; } +/* + * A test that tries to add two memory blocks that don't overlap with one + * another and then add a third memory block in the space between the first two: + * + * | +--------+--------+--------+ | + * | | r1 | r3 | r2 | | + * +--------+--------+--------+--------+--+ + * + * Expect to merge the three entries into one region that starts at r1.base + * and has size of r1.size + r2.size + r3.size. The region counter and total + * size of the available memory are updated. 
+ */ +static int memblock_add_between_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = SZ_1G, + .size = SZ_8K + }; + struct region r2 = { + .base = SZ_1G + SZ_16K, + .size = SZ_8K + }; + struct region r3 = { + .base = SZ_1G + SZ_8K, + .size = SZ_8K + }; + + PREFIX_PUSH(); + + total_size = r1.size + r2.size + r3.size; + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_add(r3.base, r3.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries to add a memory block r when r extends past + * PHYS_ADDR_MAX: + * + * +--------+ + * | r | + * +--------+ + * | +----+ + * | | rgn| + * +----------------------------+----+ + * + * Expect to add a memory block of size PHYS_ADDR_MAX - r.base. Expect the + * total size of available memory and the counter to be updated. 
+ */ +static int memblock_add_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.memory.regions[0]; + + struct region r = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = PHYS_ADDR_MAX - r.base; + + reset_memblock_regions(); + memblock_add(r.base, r.size); + + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_add_checks(void) { prefix_reset(); @@ -339,6 +435,8 @@ static int memblock_add_checks(void) memblock_add_overlap_bottom_check(); memblock_add_within_check(); memblock_add_twice_check(); + memblock_add_between_check(); + memblock_add_near_max_check(); prefix_pop(); @@ -604,6 +702,102 @@ static int memblock_reserve_twice_check(void) return 0; } +/* + * A test that tries to mark two memory blocks that don't overlap as reserved + * and then reserve a third memory block in the space between the first two: + * + * | +--------+--------+--------+ | + * | | r1 | r3 | r2 | | + * +--------+--------+--------+--------+--+ + * + * Expect to merge the three entries into one reserved region that starts at + * r1.base and has size of r1.size + r2.size + r3.size. The region counter and + * total for memblock.reserved are updated. 
+ */ +static int memblock_reserve_between_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.reserved.regions[0]; + + struct region r1 = { + .base = SZ_1G, + .size = SZ_8K + }; + struct region r2 = { + .base = SZ_1G + SZ_16K, + .size = SZ_8K + }; + struct region r3 = { + .base = SZ_1G + SZ_8K, + .size = SZ_8K + }; + + PREFIX_PUSH(); + + total_size = r1.size + r2.size + r3.size; + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + memblock_reserve(r3.base, r3.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries to reserve a memory block r when r extends past + * PHYS_ADDR_MAX: + * + * +--------+ + * | r | + * +--------+ + * | +----+ + * | | rgn| + * +----------------------------+----+ + * + * Expect to reserve a memory block of size PHYS_ADDR_MAX - r.base. Expect the + * total size of reserved memory and the counter to be updated. 
+ */ +static int memblock_reserve_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.reserved.regions[0]; + + struct region r = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = PHYS_ADDR_MAX - r.base; + + reset_memblock_regions(); + memblock_reserve(r.base, r.size); + + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_reserve_checks(void) { prefix_reset(); @@ -616,6 +810,8 @@ static int memblock_reserve_checks(void) memblock_reserve_overlap_bottom_check(); memblock_reserve_within_check(); memblock_reserve_twice_check(); + memblock_reserve_between_check(); + memblock_reserve_near_max_check(); prefix_pop(); @@ -887,6 +1083,155 @@ static int memblock_remove_within_check(void) return 0; } +/* + * A simple test that tries to remove a region r1 from the array of + * available memory regions when r1 is the only available region. + * Expect to add a memory block r1 and then remove r1 so that a dummy + * region is added. The region counter stays the same, and the total size + * is updated. 
+ */ +static int memblock_remove_only_region_check(void) +{ + struct memblock_region *rgn; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = SZ_2K, + .size = SZ_4K + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_remove(r1.base, r1.size); + + ASSERT_EQ(rgn->base, 0); + ASSERT_EQ(rgn->size, 0); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, 0); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries remove a region r2 from the array of available + * memory regions when r2 extends past PHYS_ADDR_MAX: + * + * +--------+ + * | r2 | + * +--------+ + * | +---+....+ + * | |rgn| | + * +------------------------+---+----+ + * + * Expect that only the portion between PHYS_ADDR_MAX and r2.base is removed. + * Expect the total size of available memory to be updated and the counter to + * not be updated. + */ +static int memblock_remove_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = PHYS_ADDR_MAX - SZ_2M, + .size = SZ_2M + }; + + struct region r2 = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = r1.size - (PHYS_ADDR_MAX - r2.base); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_remove(r2.base, r2.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to remove a region r3 that overlaps with two existing + * regions r1 and r2: + * + * +----------------+ + * | r3 | + * +----------------+ + * | +----+..... 
........+--------+ + * | | |r1 : : |r2 | | + * +----+----+----+---+-------+--------+-----+ + * + * Expect that only the intersections of r1 with r3 and r2 with r3 are removed + * from the available memory pool. Expect the total size of available memory to + * be updated and the counter to not be updated. + */ +static int memblock_remove_overlap_two_check(void) +{ + struct memblock_region *rgn1, *rgn2; + phys_addr_t new_r1_size, new_r2_size, r2_end, r3_end, total_size; + + rgn1 = &memblock.memory.regions[0]; + rgn2 = &memblock.memory.regions[1]; + + struct region r1 = { + .base = SZ_16M, + .size = SZ_32M + }; + struct region r2 = { + .base = SZ_64M, + .size = SZ_64M + }; + struct region r3 = { + .base = SZ_32M, + .size = SZ_64M + }; + + PREFIX_PUSH(); + + r2_end = r2.base + r2.size; + r3_end = r3.base + r3.size; + new_r1_size = r3.base - r1.base; + new_r2_size = r2_end - r3_end; + total_size = new_r1_size + new_r2_size; + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_remove(r3.base, r3.size); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, new_r1_size); + + ASSERT_EQ(rgn2->base, r3_end); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.memory.cnt, 2); + ASSERT_EQ(memblock.memory.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_remove_checks(void) { prefix_reset(); @@ -898,6 +1243,9 @@ static int memblock_remove_checks(void) memblock_remove_overlap_top_check(); memblock_remove_overlap_bottom_check(); memblock_remove_within_check(); + memblock_remove_only_region_check(); + memblock_remove_near_max_check(); + memblock_remove_overlap_two_check(); prefix_pop(); @@ -1163,6 +1511,154 @@ static int memblock_free_within_check(void) return 0; } +/* + * A simple test that tries to free a memory block r1 that was marked + * earlier as reserved when r1 is the only available region. 
+ * Expect to reserve a memory block r1 and then free r1 so that r1 is + * overwritten with a dummy region. The region counter stays the same, + * and the total size is updated. + */ +static int memblock_free_only_region_check(void) +{ + struct memblock_region *rgn; + + rgn = &memblock.reserved.regions[0]; + + struct region r1 = { + .base = SZ_2K, + .size = SZ_4K + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_free((void *)r1.base, r1.size); + + ASSERT_EQ(rgn->base, 0); + ASSERT_EQ(rgn->size, 0); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, 0); + + test_pass_pop(); + + return 0; +} + +/* + * A simple test that tries free a region r2 when r2 extends past PHYS_ADDR_MAX: + * + * +--------+ + * | r2 | + * +--------+ + * | +---+....+ + * | |rgn| | + * +------------------------+---+----+ + * + * Expect that only the portion between PHYS_ADDR_MAX and r2.base is freed. + * Expect the total size of reserved memory to be updated and the counter to + * not be updated. + */ +static int memblock_free_near_max_check(void) +{ + struct memblock_region *rgn; + phys_addr_t total_size; + + rgn = &memblock.reserved.regions[0]; + + struct region r1 = { + .base = PHYS_ADDR_MAX - SZ_2M, + .size = SZ_2M + }; + + struct region r2 = { + .base = PHYS_ADDR_MAX - SZ_1M, + .size = SZ_2M + }; + + PREFIX_PUSH(); + + total_size = r1.size - (PHYS_ADDR_MAX - r2.base); + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_free((void *)r2.base, r2.size); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to free a reserved region r3 that overlaps with two + * existing reserved regions r1 and r2: + * + * +----------------+ + * | r3 | + * +----------------+ + * | +----+..... 
........+--------+ + * | | |r1 : : |r2 | | + * +----+----+----+---+-------+--------+-----+ + * + * Expect that only the intersections of r1 with r3 and r2 with r3 are freed + * from the collection of reserved memory. Expect the total size of reserved + * memory to be updated and the counter to not be updated. + */ +static int memblock_free_overlap_two_check(void) +{ + struct memblock_region *rgn1, *rgn2; + phys_addr_t new_r1_size, new_r2_size, r2_end, r3_end, total_size; + + rgn1 = &memblock.reserved.regions[0]; + rgn2 = &memblock.reserved.regions[1]; + + struct region r1 = { + .base = SZ_16M, + .size = SZ_32M + }; + struct region r2 = { + .base = SZ_64M, + .size = SZ_64M + }; + struct region r3 = { + .base = SZ_32M, + .size = SZ_64M + }; + + PREFIX_PUSH(); + + r2_end = r2.base + r2.size; + r3_end = r3.base + r3.size; + new_r1_size = r3.base - r1.base; + new_r2_size = r2_end - r3_end; + total_size = new_r1_size + new_r2_size; + + reset_memblock_regions(); + memblock_reserve(r1.base, r1.size); + memblock_reserve(r2.base, r2.size); + memblock_free((void *)r3.base, r3.size); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, new_r1_size); + + ASSERT_EQ(rgn2->base, r3_end); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); + + return 0; +} + static int memblock_free_checks(void) { prefix_reset(); @@ -1174,6 +1670,9 @@ static int memblock_free_checks(void) memblock_free_overlap_top_check(); memblock_free_overlap_bottom_check(); memblock_free_within_check(); + memblock_free_only_region_check(); + memblock_free_near_max_check(); + memblock_free_overlap_two_check(); prefix_pop(); -- cgit v1.2.3 From deee033e0f8ea66a9f4acfc1eb069fdef3013bec Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:47 -0500 Subject: memblock tests: update alloc_api to test memblock_alloc_raw Update memblock_alloc() tests so that they test either 
memblock_alloc() or memblock_alloc_raw() depending on the value of alloc_test_flags. Run through all the existing tests in memblock_alloc_api twice: once for memblock_alloc() and once for memblock_alloc_raw(). When the tests run memblock_alloc(), they test that the entire memory region is zero. When the tests run memblock_alloc_raw(), they test that the entire memory region is nonzero. The content of the memory region is initialized to nonzero, and we expect it to remain unchanged if running memblock_alloc_raw(). Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/5a7cfb2f807ee2cb53ee77f9f5c910107b253d6e.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 91 ++++++++++++++++++++------------ tools/testing/memblock/tests/common.h | 27 ++++++++++ 2 files changed, 85 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index e20e326d636f..36dd7e254cce 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -1,6 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "alloc_api.h" +static int alloc_test_flags = TEST_F_NONE; + +static inline const char * const get_memblock_alloc_name(int flags) +{ + if (flags & TEST_F_RAW) + return "memblock_alloc_raw"; + return "memblock_alloc"; +} + +static inline void *run_memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + if (alloc_test_flags & TEST_F_RAW) + return memblock_alloc_raw(size, align); + return memblock_alloc(size, align); +} + /* * A simple test that tries to allocate a small memory region. * Expect to allocate an aligned region near the end of the available memory. 
@@ -19,10 +35,10 @@ static int alloc_top_down_simple_check(void) expected_start = memblock_end_of_DRAM() - SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc(size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, expected_start); @@ -79,10 +95,10 @@ static int alloc_top_down_disjoint_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, alignment); + allocated_ptr = run_memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -126,10 +142,10 @@ static int alloc_top_down_before_check(void) memblock_reserve(memblock_end_of_DRAM() - total_size, r1_size); - allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - total_size); @@ -176,10 +192,10 @@ static int alloc_top_down_after_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2_size); @@ -228,10 +244,10 @@ static int alloc_top_down_second_fit_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); + 
allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_test_flags); ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base - r3_size); @@ -284,10 +300,10 @@ static int alloc_in_between_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base - r2.size - r3_size); @@ -332,7 +348,7 @@ static int alloc_small_gaps_generic_check(void) region_end += gap_size + region_size; } - allocated_ptr = memblock_alloc(region_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(region_size, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); @@ -356,7 +372,7 @@ static int alloc_all_reserved_generic_check(void) /* Simulate full memory */ memblock_reserve(memblock_start_of_DRAM(), MEM_SIZE); - allocated_ptr = memblock_alloc(SZ_256, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_256, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); @@ -392,7 +408,7 @@ static int alloc_no_space_generic_check(void) /* Simulate almost-full memory */ memblock_reserve(memblock_start_of_DRAM(), reserved_size); - allocated_ptr = memblock_alloc(SZ_1K, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_1K, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); @@ -427,10 +443,10 @@ static int alloc_limited_space_generic_check(void) /* Simulate almost-full memory */ memblock_reserve(memblock_start_of_DRAM(), reserved_size); - allocated_ptr = memblock_alloc(available_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(available_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); 
- ASSERT_MEM_EQ(allocated_ptr, 0, available_size); + assert_mem_content(allocated_ptr, available_size, alloc_test_flags); ASSERT_EQ(rgn->size, MEM_SIZE); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -457,7 +473,7 @@ static int alloc_no_memory_generic_check(void) reset_memblock_regions(); - allocated_ptr = memblock_alloc(SZ_1K, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_1K, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); ASSERT_EQ(rgn->size, 0); @@ -491,7 +507,7 @@ static int alloc_too_large_generic_check(void) setup_memblock(); - allocated_ptr = memblock_alloc(MEM_SIZE + SZ_2, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(MEM_SIZE + SZ_2, SMP_CACHE_BYTES); ASSERT_EQ(allocated_ptr, NULL); ASSERT_EQ(rgn->size, 0); @@ -517,10 +533,10 @@ static int alloc_bottom_up_simple_check(void) setup_memblock(); - allocated_ptr = memblock_alloc(SZ_2, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(SZ_2, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, SZ_2); + assert_mem_content(allocated_ptr, SZ_2, alloc_test_flags); ASSERT_EQ(rgn->size, SZ_2); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -575,10 +591,10 @@ static int alloc_bottom_up_disjoint_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, alignment); + allocated_ptr = run_memblock_alloc(r2_size, alignment); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -619,10 +635,10 @@ static int alloc_bottom_up_before_check(void) memblock_reserve(memblock_start_of_DRAM() + r1_size, r2_size); - allocated_ptr = memblock_alloc(r1_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r1_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r1_size); + assert_mem_content(allocated_ptr, r1_size, alloc_test_flags); 
ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -668,10 +684,10 @@ static int alloc_bottom_up_after_check(void) memblock_reserve(r1.base, r1.size); - allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r2_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r1.base); @@ -721,10 +737,10 @@ static int alloc_bottom_up_second_fit_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); + allocated_ptr = run_memblock_alloc(r3_size, SMP_CACHE_BYTES); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_test_flags); ASSERT_EQ(rgn->size, r2.size + r3_size); ASSERT_EQ(rgn->base, r2.base); @@ -856,13 +872,14 @@ static int alloc_too_large_check(void) return 0; } -int memblock_alloc_checks(void) +static int memblock_alloc_checks_internal(int flags) { - const char *func_testing = "memblock_alloc"; + const char *func = get_memblock_alloc_name(flags); + alloc_test_flags = flags; prefix_reset(); - prefix_push(func_testing); - test_print("Running %s tests...\n", func_testing); + prefix_push(func); + test_print("Running %s tests...\n", func); reset_memblock_attributes(); dummy_physical_memory_init(); @@ -886,3 +903,11 @@ int memblock_alloc_checks(void) return 0; } + +int memblock_alloc_checks(void) +{ + memblock_alloc_checks_internal(TEST_F_NONE); + memblock_alloc_checks_internal(TEST_F_RAW); + + return 0; +} diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index c53f9c365714..78128e109a95 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -12,6 +12,13 @@ #define MEM_SIZE SZ_16K +enum test_flags 
{ + /* No special request. */ + TEST_F_NONE = 0x0, + /* Perform raw allocations (no zeroing of memory). */ + TEST_F_RAW = 0x1, +}; + /** * ASSERT_EQ(): * Check the condition @@ -63,6 +70,18 @@ } \ } while (0) +/** + * ASSERT_MEM_NE(): + * Check that none of the first @_size bytes of @_seen are equal to @_expected. + * If false, print failed test message (if running with --verbose) and then + * assert. + */ +#define ASSERT_MEM_NE(_seen, _expected, _size) do { \ + for (int _i = 0; _i < (_size); _i++) { \ + ASSERT_NE(((char *)_seen)[_i], (_expected)); \ + } \ +} while (0) + #define PREFIX_PUSH() prefix_push(__func__) /* @@ -116,4 +135,12 @@ static inline void run_bottom_up(int (*func)()) prefix_pop(); } +static inline void assert_mem_content(void *mem, int size, int flags) +{ + if (flags & TEST_F_RAW) + ASSERT_MEM_NE(mem, 0, size); + else + ASSERT_MEM_EQ(mem, 0, size); +} + #endif -- cgit v1.2.3 From ae544fd62c14265dc663a65b3f9c6c5a6134098a Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:48 -0500 Subject: memblock tests: update alloc_nid_api to test memblock_alloc_try_nid_raw Update memblock_alloc_try_nid() tests so that they test either memblock_alloc_try_nid() or memblock_alloc_try_nid_raw() depending on the value of alloc_nid_test_flags. Run through all the existing tests in alloc_nid_api twice: once for memblock_alloc_try_nid() and once for memblock_alloc_try_nid_raw(). When the tests run memblock_alloc_try_nid(), they test that the entire memory region is zero. When the tests run memblock_alloc_try_nid_raw(), they test that the entire memory region is nonzero. The content of the memory region is initialized to nonzero, and we expect it to remain unchanged if running memblock_alloc_try_nid_raw(). 
Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/6fa8938f67872841c10a00afb042947d1d280a04.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_nid_api.c | 180 +++++++++++++++++---------- 1 file changed, 111 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 9324d706ee3a..32b3c1594fdd 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -1,6 +1,26 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "alloc_nid_api.h" +static int alloc_nid_test_flags = TEST_F_NONE; + +static inline const char * const get_memblock_alloc_try_nid_name(int flags) +{ + if (flags & TEST_F_RAW) + return "memblock_alloc_try_nid_raw"; + return "memblock_alloc_try_nid"; +} + +static inline void *run_memblock_alloc_try_nid(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid) +{ + if (alloc_nid_test_flags & TEST_F_RAW) + return memblock_alloc_try_nid_raw(size, align, min_addr, + max_addr, nid); + return memblock_alloc_try_nid(size, align, min_addr, max_addr, nid); +} + /* * A simple test that tries to allocate a memory region within min_addr and * max_addr range: @@ -32,12 +52,13 @@ static int alloc_try_nid_top_down_simple_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -86,12 +107,13 @@ static int 
alloc_try_nid_top_down_end_misaligned_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512 + misalign; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size - misalign); @@ -137,12 +159,13 @@ static int alloc_try_nid_exact_address_generic_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES; max_addr = min_addr + size; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -189,11 +212,12 @@ static int alloc_try_nid_top_down_narrow_range_check(void) min_addr = memblock_start_of_DRAM() + SZ_512; max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, max_addr - size); @@ -241,8 +265,9 @@ static int alloc_try_nid_low_max_generic_check(void) min_addr = memblock_start_of_DRAM(); max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc_try_nid(size, 
SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_EQ(allocated_ptr, NULL); @@ -287,11 +312,12 @@ static int alloc_try_nid_min_reserved_generic_check(void) memblock_reserve(reserved_base, r1_size); - allocated_ptr = memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, reserved_base); @@ -338,11 +364,12 @@ static int alloc_try_nid_max_reserved_generic_check(void) memblock_reserve(max_addr, r1_size); - allocated_ptr = memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r2_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r2_size); + assert_mem_content(allocated_ptr, r2_size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, min_addr); @@ -402,11 +429,12 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn1->size, r1.size + r3_size); ASSERT_EQ(rgn1->base, max_addr - r3_size); @@ -466,11 +494,12 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) 
memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, total_size); ASSERT_EQ(rgn->base, r2.base); @@ -531,11 +560,12 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, r1.base); @@ -597,8 +627,9 @@ static int alloc_try_nid_reserved_all_generic_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_EQ(allocated_ptr, NULL); @@ -628,11 +659,12 @@ static int alloc_try_nid_top_down_cap_max_check(void) min_addr = memblock_end_of_DRAM() - SZ_1K; max_addr = memblock_end_of_DRAM() + SZ_256; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); 
ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -666,11 +698,12 @@ static int alloc_try_nid_top_down_cap_min_check(void) min_addr = memblock_start_of_DRAM() - SZ_256; max_addr = memblock_end_of_DRAM(); - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); @@ -714,13 +747,13 @@ static int alloc_try_nid_bottom_up_simple_check(void) min_addr = memblock_start_of_DRAM() + SMP_CACHE_BYTES * 2; max_addr = min_addr + SZ_512; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -769,13 +802,13 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) min_addr = memblock_start_of_DRAM() + misalign; max_addr = min_addr + SZ_512; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); rgn_end = rgn->base + rgn->size; ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr + (SMP_CACHE_BYTES - misalign)); @@ -822,12 +855,12 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) min_addr = 
memblock_start_of_DRAM() + SZ_512; max_addr = min_addr + SMP_CACHE_BYTES; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -887,12 +920,12 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn1->size, r1.size); ASSERT_EQ(rgn1->base, max_addr); @@ -959,12 +992,12 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, r3_size); + assert_mem_content(allocated_ptr, r3_size, alloc_nid_test_flags); ASSERT_EQ(rgn3->size, r3_size); ASSERT_EQ(rgn3->base, memblock_start_of_DRAM()); @@ -1004,12 +1037,12 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) min_addr = memblock_start_of_DRAM() + SZ_1K; max_addr = memblock_end_of_DRAM() + SZ_256; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = 
run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, min_addr); @@ -1043,12 +1076,12 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) min_addr = memblock_start_of_DRAM(); max_addr = memblock_end_of_DRAM() - SZ_256; - allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, - min_addr, max_addr, - NUMA_NO_NODE); + allocated_ptr = run_memblock_alloc_try_nid(size, SMP_CACHE_BYTES, + min_addr, max_addr, + NUMA_NO_NODE); ASSERT_NE(allocated_ptr, NULL); - ASSERT_MEM_EQ(allocated_ptr, 0, size); + assert_mem_content(allocated_ptr, size, alloc_nid_test_flags); ASSERT_EQ(rgn->size, size); ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); @@ -1193,13 +1226,14 @@ static int alloc_try_nid_low_max_check(void) return 0; } -int memblock_alloc_nid_checks(void) +static int memblock_alloc_nid_checks_internal(int flags) { - const char *func_testing = "memblock_alloc_try_nid"; + const char *func = get_memblock_alloc_try_nid_name(flags); + alloc_nid_test_flags = flags; prefix_reset(); - prefix_push(func_testing); - test_print("Running %s tests...\n", func_testing); + prefix_push(func); + test_print("Running %s tests...\n", func); reset_memblock_attributes(); dummy_physical_memory_init(); @@ -1225,3 +1259,11 @@ int memblock_alloc_nid_checks(void) return 0; } + +int memblock_alloc_nid_checks(void) +{ + memblock_alloc_nid_checks_internal(TEST_F_NONE); + memblock_alloc_nid_checks_internal(TEST_F_RAW); + + return 0; +} -- cgit v1.2.3 From a541c6d428f775efcfe25236062c96b59e31b57a Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:49 -0500 Subject: memblock tests: add tests for memblock_*bottom_up functions Add simple tests for memblock_set_bottom_up() and memblock_bottom_up(). 
Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/b03701d2faeaf00f7184e4b72903de4e5e939437.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/basic_api.c | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'tools') diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index ea79396e4611..c7490291c485 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -1679,6 +1679,50 @@ static int memblock_free_checks(void) return 0; } +static int memblock_set_bottom_up_check(void) +{ + prefix_push("memblock_set_bottom_up"); + + memblock_set_bottom_up(false); + ASSERT_EQ(memblock.bottom_up, false); + memblock_set_bottom_up(true); + ASSERT_EQ(memblock.bottom_up, true); + + reset_memblock_attributes(); + test_pass_pop(); + + return 0; +} + +static int memblock_bottom_up_check(void) +{ + prefix_push("memblock_bottom_up"); + + memblock_set_bottom_up(false); + ASSERT_EQ(memblock_bottom_up(), memblock.bottom_up); + ASSERT_EQ(memblock_bottom_up(), false); + memblock_set_bottom_up(true); + ASSERT_EQ(memblock_bottom_up(), memblock.bottom_up); + ASSERT_EQ(memblock_bottom_up(), true); + + reset_memblock_attributes(); + test_pass_pop(); + + return 0; +} + +static int memblock_bottom_up_checks(void) +{ + test_print("Running memblock_*bottom_up tests...\n"); + + prefix_reset(); + memblock_set_bottom_up_check(); + prefix_reset(); + memblock_bottom_up_check(); + + return 0; +} + int memblock_basic_checks(void) { memblock_initialization_check(); @@ -1686,6 +1730,7 @@ int memblock_basic_checks(void) memblock_reserve_checks(); memblock_remove_checks(); memblock_free_checks(); + memblock_bottom_up_checks(); return 0; } -- cgit v1.2.3 From dcd45ad2ad784c35bfba8ae93c285574bc2a8a1e Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 27 Aug 2022 00:42:50 
-0500 Subject: memblock tests: add tests for memblock_trim_memory Add tests for memblock_trim_memory() for the following scenarios: - all regions aligned - one unaligned region that is smaller than the alignment - one unaligned region that is unaligned at the base - one unaligned region that is unaligned at the end Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/0e5f55154a3b66581e04ba3717978795cbc08a5b.1661578349.git.remckee0@gmail.com --- tools/testing/memblock/tests/basic_api.c | 223 +++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) (limited to 'tools') diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index c7490291c485..a13a57ba0815 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -8,6 +8,7 @@ #define FUNC_RESERVE "memblock_reserve" #define FUNC_REMOVE "memblock_remove" #define FUNC_FREE "memblock_free" +#define FUNC_TRIM "memblock_trim_memory" static int memblock_initialization_check(void) { @@ -1723,6 +1724,227 @@ static int memblock_bottom_up_checks(void) return 0; } +/* + * A test that tries to trim memory when both ends of the memory region are + * aligned. Expect that the memory will not be trimmed. Expect the counter to + * not be updated. + */ +static int memblock_trim_memory_aligned_check(void) +{ + struct memblock_region *rgn; + const phys_addr_t alignment = SMP_CACHE_BYTES; + + rgn = &memblock.memory.regions[0]; + + struct region r = { + .base = alignment, + .size = alignment * 4 + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_add(r.base, r.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, r.size); + + ASSERT_EQ(memblock.memory.cnt, 1); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to trim memory when there are two available regions, r1 and + * r2. 
Region r1 is aligned on both ends and region r2 is unaligned on one end + * and smaller than the alignment: + * + * alignment + * |--------| + * | +-----------------+ +------+ | + * | | r1 | | r2 | | + * +--------+-----------------+--------+------+---+ + * ^ ^ ^ ^ ^ + * |________|________|________| | + * | Unaligned address + * Aligned addresses + * + * Expect that r1 will not be trimmed and r2 will be removed. Expect the + * counter to be updated. + */ +static int memblock_trim_memory_too_small_check(void) +{ + struct memblock_region *rgn; + const phys_addr_t alignment = SMP_CACHE_BYTES; + + rgn = &memblock.memory.regions[0]; + + struct region r1 = { + .base = alignment, + .size = alignment * 2 + }; + struct region r2 = { + .base = alignment * 4, + .size = alignment - SZ_2 + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, r1.size); + + ASSERT_EQ(memblock.memory.cnt, 1); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to trim memory when there are two available regions, r1 and + * r2. Region r1 is aligned on both ends and region r2 is unaligned at the base + * and aligned at the end: + * + * Unaligned address + * | + * v + * | +-----------------+ +---------------+ | + * | | r1 | | r2 | | + * +--------+-----------------+----------+---------------+---+ + * ^ ^ ^ ^ ^ ^ + * |________|________|________|________|________| + * | + * Aligned addresses + * + * Expect that r1 will not be trimmed and r2 will be trimmed at the base. + * Expect the counter to not be updated. 
+ */ +static int memblock_trim_memory_unaligned_base_check(void) +{ + struct memblock_region *rgn1, *rgn2; + const phys_addr_t alignment = SMP_CACHE_BYTES; + phys_addr_t offset = SZ_2; + phys_addr_t new_r2_base, new_r2_size; + + rgn1 = &memblock.memory.regions[0]; + rgn2 = &memblock.memory.regions[1]; + + struct region r1 = { + .base = alignment, + .size = alignment * 2 + }; + struct region r2 = { + .base = alignment * 4 + offset, + .size = alignment * 2 - offset + }; + + PREFIX_PUSH(); + + new_r2_base = r2.base + (alignment - offset); + new_r2_size = r2.size - (alignment - offset); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1.size); + + ASSERT_EQ(rgn2->base, new_r2_base); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.memory.cnt, 2); + + test_pass_pop(); + + return 0; +} + +/* + * A test that tries to trim memory when there are two available regions, r1 and + * r2. Region r1 is aligned on both ends and region r2 is aligned at the base + * and unaligned at the end: + * + * Unaligned address + * | + * v + * | +-----------------+ +---------------+ | + * | | r1 | | r2 | | + * +--------+-----------------+--------+---------------+---+ + * ^ ^ ^ ^ ^ ^ + * |________|________|________|________|________| + * | + * Aligned addresses + * + * Expect that r1 will not be trimmed and r2 will be trimmed at the end. + * Expect the counter to not be updated. 
+ */ +static int memblock_trim_memory_unaligned_end_check(void) +{ + struct memblock_region *rgn1, *rgn2; + const phys_addr_t alignment = SMP_CACHE_BYTES; + phys_addr_t offset = SZ_2; + phys_addr_t new_r2_size; + + rgn1 = &memblock.memory.regions[0]; + rgn2 = &memblock.memory.regions[1]; + + struct region r1 = { + .base = alignment, + .size = alignment * 2 + }; + struct region r2 = { + .base = alignment * 4, + .size = alignment * 2 - offset + }; + + PREFIX_PUSH(); + + new_r2_size = r2.size - (alignment - offset); + + reset_memblock_regions(); + memblock_add(r1.base, r1.size); + memblock_add(r2.base, r2.size); + memblock_trim_memory(alignment); + + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1.size); + + ASSERT_EQ(rgn2->base, r2.base); + ASSERT_EQ(rgn2->size, new_r2_size); + + ASSERT_EQ(memblock.memory.cnt, 2); + + test_pass_pop(); + + return 0; +} + +static int memblock_trim_memory_checks(void) +{ + prefix_reset(); + prefix_push(FUNC_TRIM); + test_print("Running %s tests...\n", FUNC_TRIM); + + memblock_trim_memory_aligned_check(); + memblock_trim_memory_too_small_check(); + memblock_trim_memory_unaligned_base_check(); + memblock_trim_memory_unaligned_end_check(); + + prefix_pop(); + + return 0; +} + int memblock_basic_checks(void) { memblock_initialization_check(); @@ -1731,6 +1953,7 @@ int memblock_basic_checks(void) memblock_remove_checks(); memblock_free_checks(); memblock_bottom_up_checks(); + memblock_trim_memory_checks(); return 0; } -- cgit v1.2.3 From 6f95de6d713130c953af0a40b13c1da519f91c4e Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Mon, 29 Aug 2022 16:18:28 -0700 Subject: bpftool: Add support for querying cgroup_iter link Support dumping info of a cgroup_iter link. This includes showing the cgroup's id and the order for walking the cgroup hierarchy. 
Example output is as follows: > bpftool link show 1: iter prog 2 target_name bpf_map 2: iter prog 3 target_name bpf_prog 3: iter prog 12 target_name cgroup cgroup_id 72 order self_only > bpftool -p link show [{ "id": 1, "type": "iter", "prog_id": 2, "target_name": "bpf_map" },{ "id": 2, "type": "iter", "prog_id": 3, "target_name": "bpf_prog" },{ "id": 3, "type": "iter", "prog_id": 12, "target_name": "cgroup", "cgroup_id": 72, "order": "self_only" } ] Signed-off-by: Hao Luo Reviewed-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220829231828.1016835-1-haoluo@google.com Signed-off-by: Martin KaFai Lau --- tools/bpf/bpftool/link.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 7a20931c3250..ef0dc2f8d5a2 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -83,6 +83,29 @@ static bool is_iter_map_target(const char *target_name) strcmp(target_name, "bpf_sk_storage_map") == 0; } +static bool is_iter_cgroup_target(const char *target_name) +{ + return strcmp(target_name, "cgroup") == 0; +} + +static const char *cgroup_order_string(__u32 order) +{ + switch (order) { + case BPF_CGROUP_ITER_ORDER_UNSPEC: + return "order_unspec"; + case BPF_CGROUP_ITER_SELF_ONLY: + return "self_only"; + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + return "descendants_pre"; + case BPF_CGROUP_ITER_DESCENDANTS_POST: + return "descendants_post"; + case BPF_CGROUP_ITER_ANCESTORS_UP: + return "ancestors_up"; + default: /* won't happen */ + return "unknown"; + } +} + static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr) { const char *target_name = u64_to_ptr(info->iter.target_name); @@ -91,6 +114,12 @@ static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr) if (is_iter_map_target(target_name)) jsonw_uint_field(wtr, "map_id", info->iter.map.map_id); + + if (is_iter_cgroup_target(target_name)) { + 
jsonw_lluint_field(wtr, "cgroup_id", info->iter.cgroup.cgroup_id); + jsonw_string_field(wtr, "order", + cgroup_order_string(info->iter.cgroup.order)); + } } static int get_prog_info(int prog_id, struct bpf_prog_info *info) @@ -208,6 +237,12 @@ static void show_iter_plain(struct bpf_link_info *info) if (is_iter_map_target(target_name)) printf("map_id %u ", info->iter.map.map_id); + + if (is_iter_cgroup_target(target_name)) { + printf("cgroup_id %llu ", info->iter.cgroup.cgroup_id); + printf("order %s ", + cgroup_order_string(info->iter.cgroup.order)); + } } static int show_link_close_plain(int fd, struct bpf_link_info *info) -- cgit v1.2.3 From b118509076b39cc5e616c0680312b5caaca535fe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Aug 2022 08:49:16 +0200 Subject: netfilter: remove nf_conntrack_helper sysctl and modparam toggles __nf_ct_try_assign_helper() remains in place but it now requires a template to configure the helper. A toggle to disable automatic helper assignment was added by: a9006892643a ("netfilter: nf_ct_helper: allow to disable automatic helper assignment") in 2012 to address the issues described in "Secure use of iptables and connection tracking helpers". Automatic conntrack helper assignment was disabled by: 3bb398d925ec ("netfilter: nf_ct_helper: disable automatic helper assignment") back in 2016. This patch removes the sysctl and modparam toggles, users now have to rely on explicit conntrack helper configuration via ruleset. Update tools/testing/selftests/netfilter/nft_conntrack_helper.sh to check that auto-assignment does not happen anymore. 
Acked-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 - include/net/netns/conntrack.h | 1 - net/netfilter/nf_conntrack_core.c | 7 +- net/netfilter/nf_conntrack_helper.c | 80 +++------------------- net/netfilter/nf_conntrack_netlink.c | 5 -- net/netfilter/nf_conntrack_standalone.c | 10 --- net/netfilter/nft_ct.c | 3 - .../selftests/netfilter/nft_conntrack_helper.sh | 36 +++++++--- 8 files changed, 37 insertions(+), 107 deletions(-) (limited to 'tools') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a32be8aa7ed2..6a2019aaa464 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -53,8 +53,6 @@ struct nf_conntrack_net { /* only used when new connection is allocated: */ atomic_t count; unsigned int expect_count; - u8 sysctl_auto_assign_helper; - bool auto_assign_helper_warned; /* only used from work queues, configuration plane, and so on: */ unsigned int users4; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index c396a3862e80..e1290c159184 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -101,7 +101,6 @@ struct netns_ct { u8 sysctl_log_invalid; /* Log invalid packets */ u8 sysctl_events; u8 sysctl_acct; - u8 sysctl_auto_assign_helper; u8 sysctl_tstamp; u8 sysctl_checksum; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 71c2f4f95d36..1357a2729a4b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1782,7 +1782,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } spin_unlock_bh(&nf_conntrack_expect_lock); } - if (!exp) + if (!exp && tmpl) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); /* Other CPU might have obtained a pointer to this object before it was @@ -2068,10 +2068,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if 
(ct->master || (help && !hlist_empty(&help->expectations))) return; - - rcu_read_lock(); - __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); @@ -2797,7 +2793,6 @@ int nf_conntrack_init_net(struct net *net) nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); - nf_conntrack_helper_pernet_init(net); nf_conntrack_proto_pernet_init(net); return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e96b32221444..ff737a76052e 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -35,11 +35,6 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = false; -module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); -MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 0)"); - static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; @@ -51,24 +46,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } -static struct nf_conntrack_helper * -__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) -{ - struct nf_conntrack_helper *helper; - struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; - unsigned int h; - - if (!nf_ct_helper_count) - return NULL; - - h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) - return helper; - } - return NULL; -} - struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { @@ -209,33 +186,11 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) } 
EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -static struct nf_conntrack_helper * -nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - if (!cnet->sysctl_auto_assign_helper) { - if (cnet->auto_assign_helper_warned) - return NULL; - if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) - return NULL; - pr_info("nf_conntrack: default automatic helper assignment " - "has been turned off for security reasons and CT-based " - "firewall rule not found. Use the iptables CT target " - "to attach helpers instead.\n"); - cnet->auto_assign_helper_warned = true; - return NULL; - } - - return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); -} - int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; - struct net *net = nf_ct_net(ct); /* We already got a helper explicitly attached. The function * nf_conntrack_alter_reply - in case NAT is in use - asks for looking @@ -246,23 +201,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; - if (tmpl != NULL) { - help = nfct_help(tmpl); - if (help != NULL) { - helper = rcu_dereference(help->helper); - set_bit(IPS_HELPER_BIT, &ct->status); - } + if (WARN_ON_ONCE(!tmpl)) + return 0; + + help = nfct_help(tmpl); + if (help != NULL) { + helper = rcu_dereference(help->helper); + set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { - helper = nf_ct_lookup_helper(ct, net); - if (helper == NULL) { - if (help) - RCU_INIT_POINTER(help->helper, NULL); - return 0; - } + if (help) + RCU_INIT_POINTER(help->helper, NULL); + return 0; } if (help == NULL) { @@ -545,19 +498,6 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); -void nf_ct_set_auto_assign_helper_warned(struct net *net) -{ - 
nf_ct_pernet(net)->auto_assign_helper_warned = true; -} -EXPORT_SYMBOL_GPL(nf_ct_set_auto_assign_helper_warned); - -void nf_conntrack_helper_pernet_init(struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; -} - int nf_conntrack_helper_init(void) { nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 04169b54f2a2..7562b215b932 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2298,11 +2298,6 @@ ctnetlink_create_conntrack(struct net *net, ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } - } else { - /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - if (err < 0) - goto err2; } err = ctnetlink_setup_nat(ct, cda); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 05895878610c..4ffe84c5a82c 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -561,7 +561,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, - NF_SYSCTL_CT_HELPER, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif @@ -680,14 +679,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - [NF_SYSCTL_CT_HELPER] = { - .procname = "nf_conntrack_helper", - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", @@ -1100,7 +1091,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; 
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; - table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index b04995c3e17f..a3f01f209a53 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1089,9 +1089,6 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (err < 0) goto err_put_helper; - /* Avoid the bogus warning, helper will be assigned after CT init */ - nf_ct_set_auto_assign_helper_warned(ctx->net); - return 0; err_put_helper: diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh index bf6b9626c7dd..faa7778d7bd1 100755 --- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh +++ b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh @@ -102,26 +102,42 @@ check_for_helper() ip netns exec ${netns} conntrack -L -f $family -p tcp --dport $port 2> /dev/null |grep -q 'helper=ftp' if [ $? 
-ne 0 ] ; then - echo "FAIL: ${netns} did not show attached helper $message" 1>&2 - ret=1 + if [ $autoassign -eq 0 ] ;then + echo "FAIL: ${netns} did not show attached helper $message" 1>&2 + ret=1 + else + echo "PASS: ${netns} did not show attached helper $message" 1>&2 + fi + else + if [ $autoassign -eq 0 ] ;then + echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 + else + echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2 + ret=1 + fi fi - echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2 return 0 } test_helper() { local port=$1 - local msg=$2 + local autoassign=$2 + + if [ $autoassign -eq 0 ] ;then + msg="set via ruleset" + else + msg="auto-assign" + fi sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null & sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null & sleep 1 - check_for_helper "$ns1" "ip $msg" $port - check_for_helper "$ns2" "ip $msg" $port + check_for_helper "$ns1" "ip $msg" $port $autoassign + check_for_helper "$ns2" "ip $msg" $port $autoassign wait @@ -173,9 +189,9 @@ if [ $? -ne 0 ];then fi fi -test_helper 2121 "set via ruleset" -ip netns exec ${ns1} sysctl -q 'net.netfilter.nf_conntrack_helper=1' -ip netns exec ${ns2} sysctl -q 'net.netfilter.nf_conntrack_helper=1' -test_helper 21 "auto-assign" +test_helper 2121 0 +ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1' +test_helper 21 1 exit $ret -- cgit v1.2.3 From be94ecf7608cc11ff46442012e710bb8fb139b99 Mon Sep 17 00:00:00 2001 From: Paul Heidekrüger Date: Tue, 14 Jun 2022 15:48:11 +0000 Subject: tools/memory-model: Clarify LKMM's limitations in litmus-tests.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As discussed, clarify LKMM not recognizing certain kinds of orderings. 
In particular, highlight the fact that LKMM might deliberately make weaker guarantees than compilers and architectures. [ paulmck: Fix whitespace issue noted by checkpatch.pl. ] Link: https://lore.kernel.org/all/YpoW1deb%2FQeeszO1@ethstick13.dse.in.tum.de/T/#u Co-developed-by: Alan Stern Signed-off-by: Alan Stern Signed-off-by: Paul Heidekrüger Reviewed-by: Marco Elver Reviewed-by: Joel Fernandes (Google) Cc: Charalampos Mainas Cc: Pramod Bhatotia Cc: Soham Chakraborty Cc: Martin Fink Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/litmus-tests.txt | 37 +++++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt index 8a9d5d2787f9..26554b1c5575 100644 --- a/tools/memory-model/Documentation/litmus-tests.txt +++ b/tools/memory-model/Documentation/litmus-tests.txt @@ -946,22 +946,39 @@ Limitations of the Linux-kernel memory model (LKMM) include: carrying a dependency, then the compiler can break that dependency by substituting a constant of that value. - Conversely, LKMM sometimes doesn't recognize that a particular - optimization is not allowed, and as a result, thinks that a - dependency is not present (because the optimization would break it). - The memory model misses some pretty obvious control dependencies - because of this limitation. A simple example is: + Conversely, LKMM will sometimes overestimate the amount of + reordering compilers and CPUs can carry out, leading it to miss + some pretty obvious cases of ordering. A simple example is: r1 = READ_ONCE(x); if (r1 == 0) smp_mb(); WRITE_ONCE(y, 1); - There is a control dependency from the READ_ONCE to the WRITE_ONCE, - even when r1 is nonzero, but LKMM doesn't realize this and thinks - that the write may execute before the read if r1 != 0. 
(Yes, that - doesn't make sense if you think about it, but the memory model's - intelligence is limited.) + The WRITE_ONCE() does not depend on the READ_ONCE(), and as a + result, LKMM does not claim ordering. However, even though no + dependency is present, the WRITE_ONCE() will not be executed before + the READ_ONCE(). There are two reasons for this: + + The presence of the smp_mb() in one of the branches + prevents the compiler from moving the WRITE_ONCE() + up before the "if" statement, since the compiler has + to assume that r1 will sometimes be 0 (but see the + comment below); + + CPUs do not execute stores before po-earlier conditional + branches, even in cases where the store occurs after the + two arms of the branch have recombined. + + It is clear that it is not dangerous in the slightest for LKMM to + make weaker guarantees than architectures. In fact, it is + desirable, as it gives compilers room for making optimizations. + For instance, suppose that a 0 value in r1 would trigger undefined + behavior elsewhere. Then a clever compiler might deduce that r1 + can never be 0 in the if condition. As a result, said clever + compiler might deem it safe to optimize away the smp_mb(), + eliminating the branch and any ordering an architecture would + guarantee otherwise. 2. Multiple access sizes for a single variable are not supported, and neither are misaligned or partially overlapping accesses. -- cgit v1.2.3 From a30d551f34df66d739949a6140b50496afa36f66 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:32 +0200 Subject: tools/nolibc: make argc 32-bit in riscv startup code The "ld a0, 0(sp)" instruction doesn't build on RISCV32 because that would load a 64-bit value into a 32-bit register. But argc is 32-bit, not 64, so we ought to use "lw" here. Tested on both RISCV32 and RISCV64. Cc: Pranith Kumar Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/include/nolibc/arch-riscv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 95e2b7924925..ba04771cb3a3 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -190,7 +190,7 @@ __asm__ (".section .text\n" ".option norelax\n" "lla gp, __global_pointer$\n" ".option pop\n" - "ld a0, 0(sp)\n" // argc (a0) was in the stack + "lw a0, 0(sp)\n" // argc (a0) was in the stack "add a1, sp, "SZREG"\n" // argv (a1) = sp "slli a2, a0, "PTRLOG"\n" // envp (a2) = SZREG*argc ... "add a2, a2, "SZREG"\n" // + SZREG (skip null) -- cgit v1.2.3 From 8b53e83b08cfdc3f430b5415cd1031d5e7e1f935 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:33 +0200 Subject: tools/nolibc: fix build warning in sys_mmap() when my_syscall6 is not defined We return -ENOSYS when there's no syscall6() operation, but we must cast it to void* to avoid a warning. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/sys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 08491070387b..b8c96878c9ce 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -692,7 +692,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd, { #ifndef my_syscall6 /* Function not implemented. */ - return -ENOSYS; + return (void *)-ENOSYS; #else int n; -- cgit v1.2.3 From 364702f7551451f2fab341f1b31adf911c888375 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:34 +0200 Subject: tools/nolibc: make sys_mmap() automatically use the right __NR_mmap definition __NR_mmap2 was used for i386 but it's also needed for other archs such as RISCV32 or ARM. Let's decide to use it based on the __NR_mmap2 definition as it's not defined on other archs. 
Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/sys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index b8c96878c9ce..ce3ee03aa679 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -697,7 +697,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd, int n; -#if defined(__i386__) +#if defined(__NR_mmap2) n = __NR_mmap2; offset >>= 12; #else -- cgit v1.2.3 From 362aecb2d8cfad0268d6c0ae5f448e9b6eee7ffb Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:35 +0200 Subject: selftests/nolibc: add basic infrastructure to ease creation of nolibc tests This creates a "nolibc" selftest that intends to test various parts of the nolibc component, both in terms of build and execution for a given architecture. The aim is for it to be as simple to run as a kernel build, by just passing the compiler (for the build) and the ARCH (for kernel and execution). It brings a basic skeleton made of a single C file that will ease testing and error reporting. The code will be arranged so that it remains easy to add basic tests for syscalls or library calls that may rely on a condition to be executed, and whose result is compared to a value or to an error with a specific errno value. Tests will just use a relative line number in switch/case statements as an index, saving the user from having to maintain arrays and complicated functions which can often just be one-liners. MAINTAINERS was updated. Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- MAINTAINERS | 1 + tools/testing/selftests/nolibc/Makefile | 43 +++ tools/testing/selftests/nolibc/nolibc-test.c | 395 +++++++++++++++++++++++++++ 3 files changed, 439 insertions(+) create mode 100644 tools/testing/selftests/nolibc/Makefile create mode 100644 tools/testing/selftests/nolibc/nolibc-test.c (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 8a5012ba6ff9..89f939ad1996 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14444,6 +14444,7 @@ M: Willy Tarreau S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/wtarreau/nolibc.git F: tools/include/nolibc/ +F: tools/testing/selftests/nolibc/ NSDEPS M: Matthias Maennich diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile new file mode 100644 index 000000000000..fd0a67082334 --- /dev/null +++ b/tools/testing/selftests/nolibc/Makefile @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for nolibc tests +include ../../../scripts/Makefile.include + +# we're in ".../tools/testing/selftests/nolibc" +ifeq ($(srctree),) +srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) +endif + +ifeq ($(ARCH),) +include $(srctree)/scripts/subarch.include +ARCH = $(SUBARCH) +endif + +# OUTPUT is only set when run from the main makefile, otherwise +# it defaults to this nolibc directory. 
+OUTPUT ?= $(CURDIR)/ + +ifeq ($(V),1) +Q= +else +Q=@ +endif + +CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables +LDFLAGS := -s + +all: nolibc-test + +nolibc-test: nolibc-test.c + $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ + -nostdlib -static -include ../../../include/nolibc/nolibc.h $^ -lgcc + +initramfs: nolibc-test + $(QUIET_MKDIR)mkdir -p initramfs + $(call QUIET_INSTALL, initramfs/init) + $(Q)cp nolibc-test initramfs/init + +clean: + $(call QUIET_CLEAN, nolibc-test) + $(Q)rm -f nolibc-test + $(call QUIET_CLEAN, initramfs) + $(Q)rm -rf initramfs diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c new file mode 100644 index 000000000000..6c050d4381fe --- /dev/null +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* platform-specific include files coming from the compiler */ +#include + +/* libc-specific include files + * The program may be built in 2 ways: + * $(CC) -nostdlib -include /path/to/nolibc.h => NOLIBC already defined + * $(CC) -nostdlib -I/path/to/nolibc/sysroot + */ +#ifndef NOLIBC +#include +#include +#include +#endif + +/* will be used by nolibc by getenv() */ +char **environ; + +#define CASE_ERR(err) \ + case err: return #err + +/* returns the error name (e.g. "ENOENT") for common errors, "SUCCESS" for 0, + * or the decimal value for less common ones. 
+ */ +const char *errorname(int err) +{ + switch (err) { + case 0: return "SUCCESS"; + CASE_ERR(EPERM); + CASE_ERR(ENOENT); + CASE_ERR(ESRCH); + CASE_ERR(EINTR); + CASE_ERR(EIO); + CASE_ERR(ENXIO); + CASE_ERR(E2BIG); + CASE_ERR(ENOEXEC); + CASE_ERR(EBADF); + CASE_ERR(ECHILD); + CASE_ERR(EAGAIN); + CASE_ERR(ENOMEM); + CASE_ERR(EACCES); + CASE_ERR(EFAULT); + CASE_ERR(ENOTBLK); + CASE_ERR(EBUSY); + CASE_ERR(EEXIST); + CASE_ERR(EXDEV); + CASE_ERR(ENODEV); + CASE_ERR(ENOTDIR); + CASE_ERR(EISDIR); + CASE_ERR(EINVAL); + CASE_ERR(ENFILE); + CASE_ERR(EMFILE); + CASE_ERR(ENOTTY); + CASE_ERR(ETXTBSY); + CASE_ERR(EFBIG); + CASE_ERR(ENOSPC); + CASE_ERR(ESPIPE); + CASE_ERR(EROFS); + CASE_ERR(EMLINK); + CASE_ERR(EPIPE); + CASE_ERR(EDOM); + CASE_ERR(ERANGE); + CASE_ERR(ENOSYS); + default: + return itoa(err); + } +} + +static int pad_spc(int llen, int cnt, const char *fmt, ...) +{ + va_list args; + int len; + int ret; + + for (len = 0; len < cnt - llen; len++) + putchar(' '); + + va_start(args, fmt); + ret = vfprintf(stdout, fmt, args); + va_end(args); + return ret < 0 ? ret : ret + len; +} + +/* The tests below are intended to be used by the macroes, which evaluate + * expression , print the status to stdout, and update the "ret" + * variable to count failures. The functions themselves return the number + * of failures, thus either 0 or 1. + */ + +#define EXPECT_ZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_zr(expr, llen); } while (0) + +static int expect_zr(int expr, int llen) +{ + int ret = !(expr == 0); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_NZ(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_nz(expr, llen; } while (0) + +static int expect_nz(int expr, int llen) +{ + int ret = !(expr != 0); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? 
"[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_EQ(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_eq(expr, llen, val); } while (0) + +static int expect_eq(int expr, int llen, int val) +{ + int ret = !(expr == val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_NE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ne(expr, llen, val); } while (0) + +static int expect_ne(int expr, int llen, int val) +{ + int ret = !(expr != val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_GE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ge(expr, llen, val); } while (0) + +static int expect_ge(int expr, int llen, int val) +{ + int ret = !(expr >= val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_GT(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_gt(expr, llen, val); } while (0) + +static int expect_gt(int expr, int llen, int val) +{ + int ret = !(expr > val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_LE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_le(expr, llen, val); } while (0) + +static int expect_le(int expr, int llen, int val) +{ + int ret = !(expr <= val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? 
"[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_LT(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_lt(expr, llen, val); } while (0) + +static int expect_lt(int expr, int llen, int val) +{ + int ret = !(expr < val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_SYSZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syszr(expr, llen); } while (0) + +static int expect_syszr(int expr, int llen) +{ + int ret = 0; + + if (expr) { + ret = 1; + llen += printf(" = %d %s ", expr, errorname(errno)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += printf(" = %d ", expr); + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_SYSEQ(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syseq(expr, llen, val); } while (0) + +static int expect_syseq(int expr, int llen, int val) +{ + int ret = 0; + + if (expr != val) { + ret = 1; + llen += printf(" = %d %s ", expr, errorname(errno)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += printf(" = %d ", expr); + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_SYSNE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_sysne(expr, llen, val); } while (0) + +static int expect_sysne(int expr, int llen, int val) +{ + int ret = 0; + + if (expr == val) { + ret = 1; + llen += printf(" = %d %s ", expr, errorname(errno)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += printf(" = %d ", expr); + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_SYSER(cond, expr, expret, experr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syserr(expr, expret, experr, llen); } while (0) + +static int expect_syserr(int expr, int expret, int experr, int llen) +{ + int ret = 0; 
+ int _errno = errno; + + llen += printf(" = %d %s ", expr, errorname(_errno)); + if (expr != expret || _errno != experr) { + ret = 1; + llen += printf(" != (%d %s) ", expret, errorname(experr)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_PTRZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrzr(expr, llen); } while (0) + +static int expect_ptrzr(const void *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%p> ", expr); + if (expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_PTRNZ(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrnz(expr, llen); } while (0) + +static int expect_ptrnz(const void *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%p> ", expr); + if (!expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STRZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strzr(expr, llen); } while (0) + +static int expect_strzr(const char *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STRNZ(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strnz(expr, llen); } while (0) + +static int expect_strnz(const char *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (!expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STREQ(cond, expr, cmp) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += 
expect_streq(expr, llen, cmp); } while (0) + +static int expect_streq(const char *expr, int llen, const char *cmp) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (strcmp(expr, cmp) != 0) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STRNE(cond, expr, cmp) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strne(expr, llen, cmp); } while (0) + +static int expect_strne(const char *expr, int llen, const char *cmp) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (strcmp(expr, cmp) == 0) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + +/* declare tests based on line numbers. There must be exactly one test per line. */ +#define CASE_TEST(name) \ + case __LINE__: llen += printf("%d %s", test, #name); + + +int main(int argc, char **argv, char **envp) +{ + int min = 0; + int max = __INT_MAX__; + int ret = 0; + + environ = envp; + + printf("Total number of errors: %d\n", ret); + printf("Exiting with status %d\n", !!ret); + return !!ret; +} -- cgit v1.2.3 From 23da7bc923609bc90b6989c6c6a17e74c8f433ed Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:36 +0200 Subject: selftests/nolibc: support a test definition format It now becomes possible to pass a string either in argv[1] or in the NOLIBC_TEST environment variable (the former having precedence), to specify which tests to run. The format is: testname[:range]*[,testname...] Where a range is either a single value or the min and max numbers of the test IDs in a sequence, delimited by a dash. Multiple ranges are possible. This should provide enough flexibility to focus on certain failing parts just by playing with the boot command line in a boot loader or in qemu depending on what is accessible. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 91 ++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 6c050d4381fe..49177ea9943c 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -17,6 +17,12 @@ /* will be used by nolibc by getenv() */ char **environ; +/* definition of a series of tests */ +struct test { + const char *name; // test name + int (*func)(int min, int max); // handler +}; + #define CASE_ERR(err) \ case err: return #err @@ -376,19 +382,104 @@ static int expect_strne(const char *expr, int llen, const char *cmp) return ret; } + /* declare tests based on line numbers. There must be exactly one test per line. */ #define CASE_TEST(name) \ case __LINE__: llen += printf("%d %s", test, #name); +/* This is the definition of known test names, with their functions */ +static struct test test_names[] = { + /* add new tests here */ + { 0 } +}; + int main(int argc, char **argv, char **envp) { int min = 0; int max = __INT_MAX__; int ret = 0; + int err; + int idx; + char *test; environ = envp; + /* the definition of a series of tests comes from either argv[1] or the + * "NOLIBC_TEST" environment variable. It's made of a comma-delimited + * series of test names and optional ranges: + * syscall:5-15[:.*],stdlib:8-10 + */ + test = argv[1]; + if (!test) + test = getenv("NOLIBC_TEST"); + + if (test) { + char *comma, *colon, *dash, *value; + + do { + comma = strchr(test, ','); + if (comma) + *(comma++) = '\0'; + + colon = strchr(test, ':'); + if (colon) + *(colon++) = '\0'; + + for (idx = 0; test_names[idx].name; idx++) { + if (strcmp(test, test_names[idx].name) == 0) + break; + } + + if (test_names[idx].name) { + /* The test was named, it will be called at least + * once. We may have an optional range at + * here, which defaults to the full range. 
+ */ + do { + min = 0; max = __INT_MAX__; + value = colon; + if (value && *value) { + colon = strchr(value, ':'); + if (colon) + *(colon++) = '\0'; + + dash = strchr(value, '-'); + if (dash) + *(dash++) = '\0'; + + /* support :val: :min-max: :min-: :-max: */ + if (*value) + min = atoi(value); + if (!dash) + max = min; + else if (*dash) + max = atoi(dash); + + value = colon; + } + + /* now's time to call the test */ + printf("Running test '%s'\n", test_names[idx].name); + err = test_names[idx].func(min, max); + ret += err; + printf("Errors during this test: %d\n\n", err); + } while (colon && *colon); + } else + printf("Ignoring unknown test name '%s'\n", test); + + test = comma; + } while (test && *test); + } else { + /* no test mentioned, run everything */ + for (idx = 0; test_names[idx].name; idx++) { + printf("Running test '%s'\n", test_names[idx].name); + err = test_names[idx].func(min, max); + ret += err; + printf("Errors during this test: %d\n\n", err); + } + } + printf("Total number of errors: %d\n", ret); printf("Exiting with status %d\n", !!ret); return !!ret; -- cgit v1.2.3 From b4844fa0bdb4075ad842e89d6c41e3d0c6124848 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:37 +0200 Subject: selftests/nolibc: implement a few tests for various syscalls This adds 63 tests covering about 34 syscalls. Both successes and failures are tested. Two tests fail when run as unprivileged user (link_dir which returns EACCES instead of EPERM, and chroot which returns EPERM). One test (execve("/")) expects to fail on EACCES, but needs to have valid arguments otherwise the kernel will log a message. And a few tests require /proc to be mounted. The code is not pretty since all tests are one-liners, sometimes resulting in long lines, especially when using compound statements to preset a line, but it's convenient and doesn't obfuscate the code, which is important to understand what failed. Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 110 +++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 49177ea9943c..dc87832912ce 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -388,9 +388,119 @@ static int expect_strne(const char *expr, int llen, const char *cmp) case __LINE__: llen += printf("%d %s", test, #name); +/* used by some syscall tests below */ +int test_getdents64(const char *dir) +{ + char buffer[4096]; + int fd, ret; + int err; + + ret = fd = open(dir, O_RDONLY | O_DIRECTORY, 0); + if (ret < 0) + return ret; + + ret = getdents64(fd, (void *)buffer, sizeof(buffer)); + err = errno; + close(fd); + + errno = err; + return ret; +} + +/* Run syscall tests between IDs and . + * Return 0 on success, non-zero on failure. + */ +int run_syscall(int min, int max) +{ + struct stat stat_buf; + int test; + int tmp; + int ret = 0; + void *p1, *p2; + + for (test = min; test >= 0 && test <= max; test++) { + int llen = 0; // line length + + /* avoid leaving empty lines below, this will insert holes into + * test numbers. 
+ */ + switch (test + __LINE__ + 1) { + CASE_TEST(getpid); EXPECT_SYSNE(1, getpid(), -1); break; + CASE_TEST(getppid); EXPECT_SYSNE(1, getppid(), -1); break; + CASE_TEST(gettid); EXPECT_SYSNE(1, gettid(), -1); break; + CASE_TEST(getpgid_self); EXPECT_SYSNE(1, getpgid(0), -1); break; + CASE_TEST(getpgid_bad); EXPECT_SYSER(1, getpgid(-1), -1, ESRCH); break; + CASE_TEST(kill_0); EXPECT_SYSZR(1, kill(getpid(), 0)); break; + CASE_TEST(kill_CONT); EXPECT_SYSZR(1, kill(getpid(), 0)); break; + CASE_TEST(kill_BADPID); EXPECT_SYSER(1, kill(INT_MAX, 0), -1, ESRCH); break; + CASE_TEST(sbrk); if ((p1 = p2 = sbrk(4096)) != (void *)-1) p2 = sbrk(-4096); EXPECT_SYSZR(1, (p2 == (void *)-1) || p2 == p1); break; + CASE_TEST(brk); EXPECT_SYSZR(1, brk(sbrk(0))); break; + CASE_TEST(chdir_root); EXPECT_SYSZR(1, chdir("/")); break; + CASE_TEST(chdir_dot); EXPECT_SYSZR(1, chdir(".")); break; + CASE_TEST(chdir_blah); EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break; + CASE_TEST(chmod_net); EXPECT_SYSZR(1, chmod("/proc/self/net", 0555)); break; + CASE_TEST(chmod_self); EXPECT_SYSER(1, chmod("/proc/self", 0555), -1, EPERM); break; + CASE_TEST(chown_self); EXPECT_SYSER(1, chown("/proc/self", 0, 0), -1, EPERM); break; + CASE_TEST(chroot_root); EXPECT_SYSZR(1, chroot("/")); break; + CASE_TEST(chroot_blah); EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break; + CASE_TEST(chroot_exe); EXPECT_SYSER(1, chroot("/proc/self/exe"), -1, ENOTDIR); break; + CASE_TEST(close_m1); EXPECT_SYSER(1, close(-1), -1, EBADF); break; + CASE_TEST(close_dup); EXPECT_SYSZR(1, close(dup(0))); break; + CASE_TEST(dup_0); tmp = dup(0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; + CASE_TEST(dup_m1); tmp = dup(-1); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; + CASE_TEST(dup2_0); tmp = dup2(0, 100); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; + CASE_TEST(dup2_m1); tmp = dup2(-1, 100); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; + CASE_TEST(dup3_0); tmp = dup3(0, 100, 
0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; + CASE_TEST(dup3_m1); tmp = dup3(-1, 100, 0); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; + CASE_TEST(execve_root); EXPECT_SYSER(1, execve("/", (char*[]){ [0] = "/", [1] = NULL }, NULL), -1, EACCES); break; + CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; + CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; + CASE_TEST(gettimeofday_null); EXPECT_SYSZR(1, gettimeofday(NULL, NULL)); break; + CASE_TEST(gettimeofday_bad1); EXPECT_SYSER(1, gettimeofday((void *)1, NULL), -1, EFAULT); break; + CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; + CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; + CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; + CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; + CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; + CASE_TEST(link_blah); EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break; + CASE_TEST(link_dir); EXPECT_SYSER(1, link("/", "/blah"), -1, EPERM); break; + CASE_TEST(link_cross); EXPECT_SYSER(1, link("/proc/self/net", "/blah"), -1, EXDEV); break; + CASE_TEST(lseek_m1); EXPECT_SYSER(1, lseek(-1, 0, SEEK_SET), -1, EBADF); break; + CASE_TEST(lseek_0); EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break; + CASE_TEST(mkdir_root); EXPECT_SYSER(1, mkdir("/", 0755), -1, EEXIST); break; + CASE_TEST(open_tty); EXPECT_SYSNE(1, tmp = open("/dev/null", 0), -1); if (tmp != -1) close(tmp); break; + CASE_TEST(open_blah); EXPECT_SYSER(1, tmp = open("/proc/self/blah", 0), -1, ENOENT); if (tmp != -1) close(tmp); break; + CASE_TEST(poll_null); EXPECT_SYSZR(1, poll(NULL, 0, 0)); break; + CASE_TEST(poll_stdout); EXPECT_SYSNE(1, ({ struct pollfd fds = { 1, POLLOUT, 0}; poll(&fds, 1, 0); }), -1); break; + 
CASE_TEST(poll_fault); EXPECT_SYSER(1, poll((void *)1, 1, 0), -1, EFAULT); break; + CASE_TEST(read_badf); EXPECT_SYSER(1, read(-1, &tmp, 1), -1, EBADF); break; + CASE_TEST(sched_yield); EXPECT_SYSZR(1, sched_yield()); break; + CASE_TEST(select_null); EXPECT_SYSZR(1, ({ struct timeval tv = { 0 }; select(0, NULL, NULL, NULL, &tv); })); break; + CASE_TEST(select_stdout); EXPECT_SYSNE(1, ({ fd_set fds; FD_ZERO(&fds); FD_SET(1, &fds); select(2, NULL, &fds, NULL, NULL); }), -1); break; + CASE_TEST(select_fault); EXPECT_SYSER(1, select(1, (void *)1, NULL, NULL, 0), -1, EFAULT); break; + CASE_TEST(stat_blah); EXPECT_SYSER(1, stat("/proc/self/blah", &stat_buf), -1, ENOENT); break; + CASE_TEST(stat_fault); EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break; + CASE_TEST(symlink_root); EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break; + CASE_TEST(unlink_root); EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break; + CASE_TEST(unlink_blah); EXPECT_SYSER(1, unlink("/proc/self/blah"), -1, ENOENT); break; + CASE_TEST(wait_child); EXPECT_SYSER(1, wait(&tmp), -1, ECHILD); break; + CASE_TEST(waitpid_min); EXPECT_SYSER(1, waitpid(INT_MIN, &tmp, WNOHANG), -1, ESRCH); break; + CASE_TEST(waitpid_child); EXPECT_SYSER(1, waitpid(getpid(), &tmp, WNOHANG), -1, ECHILD); break; + CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break; + CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break; + case __LINE__: + return ret; /* must be last */ + /* note: do not set any defaults so as to permit holes above */ + } + } + return ret; +} + + /* This is the definition of known test names, with their functions */ static struct test test_names[] = { /* add new tests here */ + { .name = "syscall", .func = run_syscall }, { 0 } }; -- cgit v1.2.3 From 95bc989488eb7150949cf2fcce3f486741d1c57f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:38 +0200 Subject: selftests/nolibc: add a few tests for some libc functions The test series called "stdlib" 
covers some libc functions (string, stdlib etc). By default they are automatically run after "syscall" but may be requested in argument or in variable NOLIBC_TEST. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index dc87832912ce..b928f099431f 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -496,11 +496,46 @@ int run_syscall(int min, int max) return ret; } +int run_stdlib(int min, int max) +{ + int test; + int tmp; + int ret = 0; + void *p1, *p2; + + for (test = min; test >= 0 && test <= max; test++) { + int llen = 0; // line length + + /* avoid leaving empty lines below, this will insert holes into + * test numbers. + */ + switch (test + __LINE__ + 1) { + CASE_TEST(getenv_TERM); EXPECT_STRNZ(1, getenv("TERM")); break; + CASE_TEST(getenv_blah); EXPECT_STRZR(1, getenv("blah")); break; + CASE_TEST(setcmp_blah_blah); EXPECT_EQ(1, strcmp("blah", "blah"), 0); break; + CASE_TEST(setcmp_blah_blah2); EXPECT_NE(1, strcmp("blah", "blah2"), 0); break; + CASE_TEST(setncmp_blah_blah); EXPECT_EQ(1, strncmp("blah", "blah", 10), 0); break; + CASE_TEST(setncmp_blah_blah4); EXPECT_EQ(1, strncmp("blah", "blah4", 4), 0); break; + CASE_TEST(setncmp_blah_blah5); EXPECT_NE(1, strncmp("blah", "blah5", 5), 0); break; + CASE_TEST(setncmp_blah_blah6); EXPECT_NE(1, strncmp("blah", "blah6", 6), 0); break; + CASE_TEST(strchr_foobar_o); EXPECT_STREQ(1, strchr("foobar", 'o'), "oobar"); break; + CASE_TEST(strchr_foobar_z); EXPECT_STRZR(1, strchr("foobar", 'z')); break; + CASE_TEST(strrchr_foobar_o); EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break; + CASE_TEST(strrchr_foobar_z); EXPECT_STRZR(1, strrchr("foobar", 'z')); break; + case __LINE__: + return ret; /* must be last */ + 
/* note: do not set any defaults so as to permit holes above */ + } + } + return ret; +} + /* This is the definition of known test names, with their functions */ static struct test test_names[] = { /* add new tests here */ { .name = "syscall", .func = run_syscall }, + { .name = "stdlib", .func = run_stdlib }, { 0 } }; -- cgit v1.2.3 From f49896d7d9a69b7565a59f5085c78ca1c08c7dd7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:40 +0200 Subject: selftests/nolibc: exit with poweroff on success when getpid() == 1 The idea is to ease automated testing under qemu. If the test succeeds while running as PID 1, indicating the system was booted with init=/test, let's just power off so that qemu can exit with a successful code. In other situations it will exit and provoke a panic, which may be caught for example with CONFIG_PVPANIC. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index b928f099431f..291d96bfd7c1 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -626,6 +626,20 @@ int main(int argc, char **argv, char **envp) } printf("Total number of errors: %d\n", ret); + + if (getpid() == 1) { + /* we're running as init, there's no other process on the + * system, thus likely started from a VM for a quick check. + * Exiting will provoke a kernel panic that may be reported + * as an error by Qemu or the hypervisor, while stopping + * cleanly will often be reported as a success. This allows + * to use the output of this program for bisecting kernels. 
+ */ + printf("Leaving init with final status: %d\n", !!ret); + if (ret == 0) + reboot(LINUX_REBOOT_CMD_POWER_OFF); + } + printf("Exiting with status %d\n", !!ret); return !!ret; } -- cgit v1.2.3 From aa73a86cda26705c7f0af1afe9bb255a52accf87 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:41 +0200 Subject: selftests/nolibc: on x86, support exiting with isa-debug-exit QEMU, when started with "-device isa-debug-exit -no-reboot" will exit with status code 2N+1 when N is written to 0x501. This is particularly convenient for automated tests but this is not portable. As such we only enable this on x86_64 when pid==1. In addition, this requires an ioperm() call but in order not to have to define arch-specific syscalls we just perform the syscall by hand there. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 291d96bfd7c1..eeb254749239 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -638,6 +638,15 @@ int main(int argc, char **argv, char **envp) printf("Leaving init with final status: %d\n", !!ret); if (ret == 0) reboot(LINUX_REBOOT_CMD_POWER_OFF); +#if defined(__x86_64__) + /* QEMU started with "-device isa-debug-exit -no-reboot" will + * exit with status code 2N+1 when N is written to 0x501. We + * hard-code the syscall here as it's arch-dependent. 
+ */ + else if (my_syscall3(__NR_ioperm, 0x501, 1, 1) == 0) + asm volatile ("outb %%al, %%dx" :: "d"(0x501), "a"(0)); + /* if it does nothing, fall back to the regular panic */ +#endif } printf("Exiting with status %d\n", !!ret); -- cgit v1.2.3 From 1a5454f625997049d886d8c3dae8e8de2a553125 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:42 +0200 Subject: selftests/nolibc: recreate and populate /dev and /proc if missing Most of the time the program will be run alone in an initramfs. There is no value in requiring the user to populate /dev and /proc for such tests, we can do it ourselves, and it participates to the tests at the same time. What's done here is that when called as init (getpid()==1) we check if /dev exists or create it, if /dev/console and /dev/null exists, otherwise we try to mount a devtmpfs there, and if it fails we fall back to mknod. The console is reopened if stdout was closed. Finally /proc is created and mounted if /proc/self cannot be found. This is sufficient for most tests. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 56 ++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index eeb254749239..a697182c87f5 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -530,6 +530,54 @@ int run_stdlib(int min, int max) return ret; } +/* prepare what needs to be prepared for pid 1 (stdio, /dev, /proc, etc) */ +int prepare(void) +{ + struct stat stat_buf; + + /* It's possible that /dev doesn't even exist or was not mounted, so + * we'll try to create it, mount it, or create minimal entries into it. + * We want at least /dev/null and /dev/console. 
+ */ + if (stat("/dev/.", &stat_buf) == 0 || mkdir("/dev", 0755) == 0) { + if (stat("/dev/console", &stat_buf) != 0 || + stat("/dev/null", &stat_buf) != 0) { + /* try devtmpfs first, otherwise fall back to manual creation */ + if (mount("/dev", "/dev", "devtmpfs", 0, 0) != 0) { + mknod("/dev/console", 0600 | S_IFCHR, makedev(5, 1)); + mknod("/dev/null", 0666 | S_IFCHR, makedev(1, 3)); + } + } + } + + /* If no /dev/console was found before calling init, stdio is closed so + * we need to reopen it from /dev/console. If it failed above, it will + * still fail here and we cannot emit a message anyway. + */ + if (close(dup(1)) == -1) { + int fd = open("/dev/console", O_RDWR); + + if (fd >= 0) { + if (fd != 0) + dup2(fd, 0); + if (fd != 1) + dup2(fd, 1); + if (fd != 2) + dup2(fd, 2); + if (fd > 2) + close(fd); + puts("\nSuccessfully reopened /dev/console."); + } + } + + /* try to mount /proc if not mounted. Silently fail otherwise */ + if (stat("/proc/.", &stat_buf) == 0 || mkdir("/proc", 0755) == 0) { + if (stat("/proc/self", &stat_buf) != 0) + mount("/proc", "/proc", "proc", 0, 0); + } + + return 0; +} /* This is the definition of known test names, with their functions */ static struct test test_names[] = { @@ -550,6 +598,14 @@ int main(int argc, char **argv, char **envp) environ = envp; + /* when called as init, it's possible that no console was opened, for + * example if no /dev file system was provided. We'll check that fd#1 + * was opened, and if not we'll attempt to create and open /dev/console + * and /dev/null that we'll use for later tests. + */ + if (getpid() == 1) + prepare(); + /* the definition of a series of tests comes from either argv[1] or the * "NOLIBC_TEST" environment variable. 
It's made of a comma-delimited * series of test names and optional ranges: -- cgit v1.2.3 From 7172f1c6854cb424e3be3401e3df34c1c38cffc2 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:43 +0200 Subject: selftests/nolibc: condition some tests on /proc existence If /proc is not available (program run inside a chroot or without sufficient permissions), it's better to disable the associated tests. Some will be preserved like the ones which check for a failure to create some entries there since they're still supposed to fail. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index a697182c87f5..662dea691749 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -413,11 +413,15 @@ int test_getdents64(const char *dir) int run_syscall(int min, int max) { struct stat stat_buf; + int proc; int test; int tmp; int ret = 0; void *p1, *p2; + /* indicates whether or not /proc is mounted */ + proc = stat("/proc", &stat_buf) == 0; + for (test = min; test >= 0 && test <= max; test++) { int llen = 0; // line length @@ -438,12 +442,12 @@ int run_syscall(int min, int max) CASE_TEST(chdir_root); EXPECT_SYSZR(1, chdir("/")); break; CASE_TEST(chdir_dot); EXPECT_SYSZR(1, chdir(".")); break; CASE_TEST(chdir_blah); EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break; - CASE_TEST(chmod_net); EXPECT_SYSZR(1, chmod("/proc/self/net", 0555)); break; - CASE_TEST(chmod_self); EXPECT_SYSER(1, chmod("/proc/self", 0555), -1, EPERM); break; - CASE_TEST(chown_self); EXPECT_SYSER(1, chown("/proc/self", 0, 0), -1, EPERM); break; + CASE_TEST(chmod_net); EXPECT_SYSZR(proc, chmod("/proc/self/net", 0555)); break; + CASE_TEST(chmod_self); EXPECT_SYSER(proc, chmod("/proc/self", 
0555), -1, EPERM); break; + CASE_TEST(chown_self); EXPECT_SYSER(proc, chown("/proc/self", 0, 0), -1, EPERM); break; CASE_TEST(chroot_root); EXPECT_SYSZR(1, chroot("/")); break; CASE_TEST(chroot_blah); EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break; - CASE_TEST(chroot_exe); EXPECT_SYSER(1, chroot("/proc/self/exe"), -1, ENOTDIR); break; + CASE_TEST(chroot_exe); EXPECT_SYSER(proc, chroot("/proc/self/exe"), -1, ENOTDIR); break; CASE_TEST(close_m1); EXPECT_SYSER(1, close(-1), -1, EBADF); break; CASE_TEST(close_dup); EXPECT_SYSZR(1, close(dup(0))); break; CASE_TEST(dup_0); tmp = dup(0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; @@ -464,7 +468,7 @@ int run_syscall(int min, int max) CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; CASE_TEST(link_blah); EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break; CASE_TEST(link_dir); EXPECT_SYSER(1, link("/", "/blah"), -1, EPERM); break; - CASE_TEST(link_cross); EXPECT_SYSER(1, link("/proc/self/net", "/blah"), -1, EXDEV); break; + CASE_TEST(link_cross); EXPECT_SYSER(proc, link("/proc/self/net", "/blah"), -1, EXDEV); break; CASE_TEST(lseek_m1); EXPECT_SYSER(1, lseek(-1, 0, SEEK_SET), -1, EBADF); break; CASE_TEST(lseek_0); EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break; CASE_TEST(mkdir_root); EXPECT_SYSER(1, mkdir("/", 0755), -1, EEXIST); break; -- cgit v1.2.3 From 1da02f510882cd5684dc04dc7119056e01da90bd Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:44 +0200 Subject: selftests/nolibc: support glibc as well Adding support for glibc can be useful to distinguish between bugs in nolibc and bugs in the kernel when a syscall reports an unusual value. It's not that much work and should not affect the long term maintainability of the tests. 
The necessary changes can essentially be summed up like this: - set _GNU_SOURCE at the top to access some definitions - many includes added when we know we don't come from nolibc (missing the stdio include guard) - disable gettid() which is not exposed by glibc - disable gettimeofday's support of bad pointers since these crash in glibc - add a simple itoa() for errorname(); strerror() is too verbose (no way to get short messages). strerrorname_np() was added in modern glibc (2.32) to do exactly this but that's too recent to be usable as the default fallback. - use the standard ioperm() definition. Maybe we need to implement ioperm() in nolibc if that's useful. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 47 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 662dea691749..78bced95ac63 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1,17 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + /* platform-specific include files coming from the compiler */ #include /* libc-specific include files - * The program may be built in 2 ways: + * The program may be built in 3 ways: * $(CC) -nostdlib -include /path/to/nolibc.h => NOLIBC already defined - * $(CC) -nostdlib -I/path/to/nolibc/sysroot + * $(CC) -nostdlib -I/path/to/nolibc/sysroot => _NOLIBC_* guards are present + * $(CC) with default libc => NOLIBC* never defined */ #ifndef NOLIBC #include #include #include +#ifndef _NOLIBC_STDIO_H +/* standard libcs need more includes */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif #endif /* will be used by nolibc by getenv() */ @@ -23,6
+47,17 @@ struct test { int (*func)(int min, int max); // handler }; +#ifndef _NOLIBC_STDLIB_H +char *itoa(int i) +{ + static char buf[12]; + int ret; + + ret = snprintf(buf, sizeof(buf), "%d", i); + return (ret >= 0 && ret < sizeof(buf)) ? buf : "#err"; +} +#endif + #define CASE_ERR(err) \ case err: return #err @@ -431,7 +466,9 @@ int run_syscall(int min, int max) switch (test + __LINE__ + 1) { CASE_TEST(getpid); EXPECT_SYSNE(1, getpid(), -1); break; CASE_TEST(getppid); EXPECT_SYSNE(1, getppid(), -1); break; +#ifdef NOLIBC CASE_TEST(gettid); EXPECT_SYSNE(1, gettid(), -1); break; +#endif CASE_TEST(getpgid_self); EXPECT_SYSNE(1, getpgid(0), -1); break; CASE_TEST(getpgid_bad); EXPECT_SYSER(1, getpgid(-1), -1, ESRCH); break; CASE_TEST(kill_0); EXPECT_SYSZR(1, kill(getpid(), 0)); break; @@ -460,9 +497,11 @@ int run_syscall(int min, int max) CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; CASE_TEST(gettimeofday_null); EXPECT_SYSZR(1, gettimeofday(NULL, NULL)); break; +#ifdef NOLIBC CASE_TEST(gettimeofday_bad1); EXPECT_SYSER(1, gettimeofday((void *)1, NULL), -1, EFAULT); break; CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; +#endif CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; @@ -703,7 +742,11 @@ int main(int argc, char **argv, char **envp) * exit with status code 2N+1 when N is written to 0x501. We * hard-code the syscall here as it's arch-dependent. 
*/ +#if defined(_NOLIBC_SYS_H) else if (my_syscall3(__NR_ioperm, 0x501, 1, 1) == 0) +#else + else if (ioperm(0x501, 1, 1) == 0) +#endif asm volatile ("outb %%al, %%dx" :: "d"(0x501), "a"(0)); /* if it does nothing, fall back to the regular panic */ #endif -- cgit v1.2.3 From d248cabff5da2f3f2ce0ab99c1f96a15e8fb98c6 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:45 +0200 Subject: selftests/nolibc: add a "kernel" target to build the kernel with the initramfs The "kernel" target rebuilds the kernel with the current config for the selected arch, with an initramfs containing the nolibc-test utility. Since image names depend on the architecture, the currently supported ones are referenced and resolved based on the architecture. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index fd0a67082334..4a2ab0e73ce2 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -12,6 +12,16 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif +# kernel image names by architecture +IMAGE_i386 = arch/x86/boot/bzImage +IMAGE_x86 = arch/x86/boot/bzImage +IMAGE_arm64 = arch/arm64/boot/Image +IMAGE_arm = arch/arm/boot/zImage +IMAGE_mips = vmlinuz +IMAGE_riscv = arch/riscv/boot/Image +IMAGE = $(IMAGE_$(ARCH)) +IMAGE_NAME = $(notdir $(IMAGE)) + # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. 
OUTPUT ?= $(CURDIR)/ @@ -36,6 +46,9 @@ initramfs: nolibc-test $(call QUIET_INSTALL, initramfs/init) $(Q)cp nolibc-test initramfs/init +kernel: initramfs + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs + clean: $(call QUIET_CLEAN, nolibc-test) $(Q)rm -f nolibc-test -- cgit v1.2.3 From 5c43fd7954108bd138f7a1b4db4fb997ae1dc696 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:46 +0200 Subject: selftests/nolibc: add a "defconfig" target While most archs will work fine with "make defconfig", not all will do, and it's not always easy to remember the most suitable choice to use for a specific architecture. This adds a "defconfig" target to the Makefile so that one may easily run "make -C ... defconfig" and make sure to clean and rebuild a fresh config. This is *not* used by default because we want to preserve the user's config by default. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/Makefile | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 4a2ab0e73ce2..c104719eae8b 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -22,6 +22,15 @@ IMAGE_riscv = arch/riscv/boot/Image IMAGE = $(IMAGE_$(ARCH)) IMAGE_NAME = $(notdir $(IMAGE)) +# default kernel configurations that appear to be usable +DEFCONFIG_i386 = defconfig +DEFCONFIG_x86 = defconfig +DEFCONFIG_arm64 = defconfig +DEFCONFIG_arm = multi_v7_defconfig +DEFCONFIG_mips = malta_defconfig +DEFCONFIG_riscv = defconfig +DEFCONFIG = $(DEFCONFIG_$(ARCH)) + # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. 
OUTPUT ?= $(CURDIR)/ @@ -46,6 +55,9 @@ initramfs: nolibc-test $(call QUIET_INSTALL, initramfs/init) $(Q)cp nolibc-test initramfs/init +defconfig: + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + kernel: initramfs $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs -- cgit v1.2.3 From 662ea60e373be8a6b8a925b237cdb93a2af353c1 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:47 +0200 Subject: selftests/nolibc: add a "run" target to start the kernel in QEMU The "run" target will build the kernel and start it in QEMU. The "rerun" target will not have the kernel dependency and will just try to start QEMU. The QEMU architecture used to start the kernel is derived from the configured ARCH. This might need to be improved for archs which include different variants under the same name (mips vs mipsel, +/-64, riscv32 vs riscv64). This could be tested for i386, x86, arm, arm64, mips and riscv (the latter two reporting issues on some tests). It is possible to pass a test specification for nolibc-test in the TEST variable, which will be passed as-is as NOLIBC_TEST. On success, the number of successful tests is printed. On failure, failed lines are individually printed. Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- tools/testing/selftests/nolibc/Makefile | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index c104719eae8b..7c1f5360f454 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -31,6 +31,27 @@ DEFCONFIG_mips = malta_defconfig DEFCONFIG_riscv = defconfig DEFCONFIG = $(DEFCONFIG_$(ARCH)) +# optional tests to run (default = all) +TEST = + +# QEMU_ARCH: arch names used by qemu +QEMU_ARCH_i386 = i386 +QEMU_ARCH_x86 = x86_64 +QEMU_ARCH_arm64 = aarch64 +QEMU_ARCH_arm = arm +QEMU_ARCH_mips = mipsel # works with malta_defconfig +QEMU_ARCH_riscv = riscv64 +QEMU_ARCH = $(QEMU_ARCH_$(ARCH)) + +# QEMU_ARGS : some arch-specific args to pass to qemu +QEMU_ARGS_i386 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_mips = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS = $(QEMU_ARGS_$(ARCH)) + # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. 
OUTPUT ?= $(CURDIR)/ @@ -61,8 +82,20 @@ defconfig: kernel: initramfs $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs +# run the tests after building the kernel +run: kernel + $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" + $(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed." + +# re-run the tests from an existing kernel +rerun: + $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" + $(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed." + clean: $(call QUIET_CLEAN, nolibc-test) $(Q)rm -f nolibc-test $(call QUIET_CLEAN, initramfs) $(Q)rm -rf initramfs + $(call QUIET_CLEAN, run.out) + $(Q)rm -rf run.out -- cgit v1.2.3 From b25c5284db0a481a427c9838f32662587d947d27 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:48 +0200 Subject: selftests/nolibc: "sysroot" target installs a local copy of the sysroot It's not convenient to rely on a sysroot built in another directory, especially when running cross-compilation tests, where one has to switch back and forth between directories. Let's make it possible to install the sysroot directly in the test directory. It's not big and even benefits from being copied by arch so that it's easier to switch between archs if needed. The new "sysroot" target does this, it just calls "headers_standalone" from nolibc to install the sysroot right here. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/nolibc/Makefile | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 7c1f5360f454..210f5369fdfc 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -67,9 +67,16 @@ LDFLAGS := -s all: nolibc-test -nolibc-test: nolibc-test.c +sysroot: sysroot/$(ARCH)/include + +sysroot/$(ARCH)/include: + $(QUIET_MKDIR)mkdir -p sysroot + $(Q)$(MAKE) -C ../../../include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone + $(Q)mv sysroot/sysroot sysroot/$(ARCH) + +nolibc-test: nolibc-test.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include ../../../include/nolibc/nolibc.h $^ -lgcc + -nostdlib -static -Isysroot/$(ARCH)/include $< -lgcc initramfs: nolibc-test $(QUIET_MKDIR)mkdir -p initramfs @@ -93,6 +100,8 @@ rerun: $(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed." clean: + $(call QUIET_CLEAN, sysroot) + $(Q)rm -rf sysroot $(call QUIET_CLEAN, nolibc-test) $(Q)rm -f nolibc-test $(call QUIET_CLEAN, initramfs) -- cgit v1.2.3 From ffc297fe2259a701f2bd52a6fb8481abc89d331d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:49 +0200 Subject: selftests/nolibc: add a "help" target It presents the supported targets, and becomes the default target to save the user from having to read the makefile. The "all" target was placed after it and now points to "run" to do everything since it's no longer the default one. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/nolibc/Makefile | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 210f5369fdfc..69ea659caca9 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -65,7 +65,32 @@ endif CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables LDFLAGS := -s -all: nolibc-test +help: + @echo "Supported targets under selftests/nolibc:" + @echo " all call the \"run\" target below" + @echo " help this help" + @echo " sysroot create the nolibc sysroot here (uses \$$ARCH)" + @echo " nolibc-test build the executable (uses \$$CC and \$$CROSS_COMPILE)" + @echo " initramfs prepare the initramfs with nolibc-test" + @echo " defconfig create a fresh new default config (uses \$$ARCH)" + @echo " kernel (re)build the kernel with the initramfs (uses \$$ARCH)" + @echo " run runs the kernel in QEMU after building it (uses \$$ARCH, \$$TEST)" + @echo " rerun runs a previously prebuilt kernel in QEMU (uses \$$ARCH, \$$TEST)" + @echo " clean clean the sysroot, initramfs, build and output files" + @echo "" + @echo "The output file is \"run.out\". Test ranges may be passed using \$$TEST." 
+ @echo "" + @echo "Currently using the following variables:" + @echo " ARCH = $(ARCH)" + @echo " CROSS_COMPILE = $(CROSS_COMPILE)" + @echo " CC = $(CC)" + @echo " OUTPUT = $(OUTPUT)" + @echo " TEST = $(TEST)" + @echo " QEMU_ARCH = $(if $(QEMU_ARCH),$(QEMU_ARCH),UNKNOWN_ARCH) [determined from \$$ARCH]" + @echo " IMAGE_NAME = $(if $(IMAGE_NAME),$(IMAGE_NAME),UNKNOWN_ARCH) [determined from \$$ARCH]" + @echo "" + +all: run sysroot: sysroot/$(ARCH)/include -- cgit v1.2.3 From 43cf168fa99992ee70ff041a61f866f56aa47f3b Mon Sep 17 00:00:00 2001 From: Fernanda Ma'rouf Date: Wed, 20 Jul 2022 05:37:45 +0200 Subject: selftests/nolibc: Avoid generated files being committed After running the nolibc tests, the "git status" is not clean because the generated files are not ignored. Create a `.gitignore` inside the selftests/nolibc directory to ignore them. Cc: Ammar Faizi Cc: Fernanda Ma'rouf Signed-off-by: Fernanda Ma'rouf Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 tools/testing/selftests/nolibc/.gitignore (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/.gitignore b/tools/testing/selftests/nolibc/.gitignore new file mode 100644 index 000000000000..4696df589d68 --- /dev/null +++ b/tools/testing/selftests/nolibc/.gitignore @@ -0,0 +1,4 @@ +/initramfs/ +/nolibc-test +/run.out +/sysroot/ -- cgit v1.2.3 From 3f5df3ac646e21a79a421ae4037c4ef0632bcaa9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 30 Aug 2022 09:48:40 -0700 Subject: perf metric: Return early if no CPU PMU table exists Previous behavior is to segfault if there is no CPU PMU table and a metric is sought. To reproduce compile with NO_JEVENTS=1 then request a metric, for example, "perf stat -M IPC true". 
Committer testing: Before: $ make -k NO_JEVENTS=1 BUILD_BPF_SKEL=1 O=/tmp/build/perf-urgent -C tools/perf install-bin $ perf stat -M IPC true Segmentation fault (core dumped) $ After: $ perf stat -M IPC true Usage: perf stat [] [] -M, --metrics monitor specified metrics or metric groups (separated by ,) $ Fixes: 00facc760903be66 ("perf jevents: Switch build to use jevents.py") Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Caleb Biggers Cc: Florian Fischer Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Mark Rutland Cc: Miaoqian Lin Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20220830164846.401143-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 464475fd6b9a..c93bcaf6d55d 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1655,6 +1655,9 @@ int metricgroup__parse_groups(const struct option *opt, struct evlist *perf_evlist = *(struct evlist **)opt->value; const struct pmu_events_table *table = pmu_events_table__find(); + if (!table) + return -EINVAL; + return parse_groups(perf_evlist, str, metric_no_group, metric_no_merge, NULL, metric_events, table); } -- cgit v1.2.3 From 35503ce12a2c3d5d9a94e3cd85a06739b0120f79 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 31 Aug 2022 14:40:41 +0200 Subject: perf script: Skip dummy event attr check Hongtao Yu reported problem when displaying uregs in perf script for system wide perf.data: # perf script -F uregs | head -10 Samples for 'dummy:HG' event do not have UREGS attribute set. Cannot print 'uregs' field. 
The problem is the extra dummy event added for system wide, which does not have proper sample_type setup. Skipping attr check completely for dummy event as suggested by Namhyung, because it does not have any samples anyway. Reported-by: Hongtao Yu Suggested-by: Namhyung Kim Signed-off-by: Jiri Olsa Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220831124041.219925-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 13580a9c50b8..304d234d8e84 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -566,6 +566,8 @@ static struct evsel *find_first_output_type(struct evlist *evlist, struct evsel *evsel; evlist__for_each_entry(evlist, evsel) { + if (evsel__is_dummy_event(evsel)) + continue; if (output_type(evsel->core.attr.type) == (int)type) return evsel; } -- cgit v1.2.3 From 8a7d61bdc2fac2c460a2f32a062f5c6dbd21a764 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Aug 2022 15:39:05 +0200 Subject: selftests/xsk: Add missing close() on netns fd Commit 1034b03e54ac ("selftests: xsk: Simplify cleanup of ifobjects") removed close on netns fd, which is not correct, so let us restore it. 
Fixes: 1034b03e54ac ("selftests: xsk: Simplify cleanup of ifobjects") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220830133905.9945-1-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 74d56d971baf..091402dc5390 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -1606,6 +1606,8 @@ static struct ifobject *ifobject_create(void) if (!ifobj->umem) goto out_umem; + ifobj->ns_fd = -1; + return ifobj; out_umem: @@ -1617,6 +1619,8 @@ out_xsk_arr: static void ifobject_delete(struct ifobject *ifobj) { + if (ifobj->ns_fd != -1) + close(ifobj->ns_fd); free(ifobj->umem); free(ifobj->xsk_arr); free(ifobj); -- cgit v1.2.3 From 14e5ce79943a72b9bf0fff8a5867320a9fa3e40d Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Mon, 29 Aug 2022 15:05:46 -0600 Subject: libbpf: Add GCC support for bpf_tail_call_static The bpf_tail_call_static function is currently not defined unless using clang >= 8. To support bpf_tail_call_static on GCC we can check if __clang__ is not defined to enable bpf_tail_call_static. We need to use GCC assembly syntax when the compiler does not define __clang__ as LLVM inline assembly is not fully compatible with GCC. 
Signed-off-by: James Hilliard Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220829210546.755377-1-james.hilliard1@gmail.com --- tools/lib/bpf/bpf_helpers.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 7349b16b8e2f..867b734839dd 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -131,7 +131,7 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (!defined(__clang__) || __clang_major__ >= 8) && defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -139,8 +139,8 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) __bpf_unreachable(); /* - * Provide a hard guarantee that LLVM won't optimize setting r2 (map - * pointer) and r3 (constant map index) from _different paths_ ending + * Provide a hard guarantee that the compiler won't optimize setting r2 + * (map pointer) and r3 (constant map index) from _different paths_ ending * up at the _same_ call insn as otherwise we won't be able to use the * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key @@ -148,12 +148,19 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) * * Note on clobber list: we need to stay in-line with BPF calling * convention, so even if we don't end up using r0, r4, r5, we need - * to mark them as clobber so that LLVM doesn't end up using them - * before / after the call. + * to mark them as clobber so that the compiler doesn't end up using + * them before / after the call. 
*/ - asm volatile("r1 = %[ctx]\n\t" + asm volatile( +#ifdef __clang__ + "r1 = %[ctx]\n\t" "r2 = %[map]\n\t" "r3 = %[slot]\n\t" +#else + "mov %%r1,%[ctx]\n\t" + "mov %%r2,%[map]\n\t" + "mov %%r3,%[slot]\n\t" +#endif "call 12" :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) : "r0", "r1", "r2", "r3", "r4", "r5"); -- cgit v1.2.3 From 5a3a59981027b53ec0f729ad76a43ce2b64ad968 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Mon, 29 Aug 2022 11:47:48 -0700 Subject: selftests: net: sort .gitignore file This is the result of `sort tools/testing/selftests/net/.gitignore`, but preserving the comment at the top. Suggested-by: Jakub Kicinski Signed-off-by: Axel Rasmussen Link: https://lore.kernel.org/r/20220829184748.1535580-1-axelrasmussen@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 50 +++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 0e5751af6247..de7d5cc15f85 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,42 +1,42 @@ # SPDX-License-Identifier: GPL-2.0-only +cmsg_sender +fin_ack_lat +gro +hwtstamp_config +ioam6_parser +ip_defrag ipsec +ipv6_flowlabel +ipv6_flowlabel_mgr msg_zerocopy -socket +nettest psock_fanout psock_snd psock_tpacket -stress_reuseport_listen +reuseaddr_conflict +reuseaddr_ports_exhausted reuseport_addr_any reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa reuseport_dualstack -reuseaddr_conflict -tcp_mmap -udpgso -udpgso_bench_rx -udpgso_bench_tx -tcp_inq -tls -txring_overwrite -ip_defrag -ipv6_flowlabel -ipv6_flowlabel_mgr -so_txtime -tcp_fastopen_backup_key -nettest -fin_ack_lat -reuseaddr_ports_exhausted -hwtstamp_config rxtimestamp -timestamping -txtimestamp +socket so_netns_cookie +so_txtime +stress_reuseport_listen +tap +tcp_fastopen_backup_key +tcp_inq +tcp_mmap test_unix_oob -gro -ioam6_parser 
+timestamping +tls toeplitz tun -cmsg_sender +txring_overwrite +txtimestamp +udpgso +udpgso_bench_rx +udpgso_bench_tx unix_connect -tap \ No newline at end of file -- cgit v1.2.3 From 197072945a708d62181895409effdfcda80c7798 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 30 Aug 2022 16:19:53 -0700 Subject: selftest/bpf: Ensure no module loading in bpf_setsockopt(TCP_CONGESTION) This patch adds a test to ensure bpf_setsockopt(TCP_CONGESTION, "not_exist") will not trigger the kernel module autoload. Before the fix: [ 40.535829] BUG: sleeping function called from invalid context at include/linux/sched/mm.h:274 [...] [ 40.552134] tcp_ca_find_autoload.constprop.0+0xcb/0x200 [ 40.552689] tcp_set_congestion_control+0x99/0x7b0 [ 40.553203] do_tcp_setsockopt+0x3ed/0x2240 [...] [ 40.556041] __bpf_setsockopt+0x124/0x640 Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220830231953.792412-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/progs/setget_sockopt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 40606ef47a38..79debf3c2f44 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -32,6 +32,7 @@ struct sockopt_test { unsigned int flip:1; }; +static const char not_exist_cc[] = "not_exist"; static const char cubic_cc[] = "cubic"; static const char reno_cc[] = "reno"; @@ -307,6 +308,9 @@ static int bpf_test_tcp_sockopt(__u32 i, struct loop_ctx *lc) const char *new_cc; int new_cc_len; + if (!bpf_setsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, + (void *)not_exist_cc, sizeof(not_exist_cc))) + return 1; if (bpf_getsockopt(ctx, IPPROTO_TCP, TCP_CONGESTION, old_cc, sizeof(old_cc))) return 1; if (!bpf_strncmp(old_cc, sizeof(old_cc), cubic_cc)) { -- cgit v1.2.3 From 1c636b6277a2b2bf504df490b8dbadd2bd34ccd4 Mon Sep 
17 00:00:00 2001 From: Hou Tao Date: Wed, 31 Aug 2022 12:26:29 +0800 Subject: selftests/bpf: Add test cases for htab update One test demonstrates that the reentrancy of hash map update on the same bucket should fail, and another one shows that concurrent updates of the same hash map bucket should succeed and not fail due to the reentrancy checking for bucket lock. There is no trampoline support on s390x, so move htab_update to denylist. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20220831042629.130006-4-houtao@huaweicloud.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../testing/selftests/bpf/prog_tests/htab_update.c | 126 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/htab_update.c | 29 +++++ 3 files changed, 156 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_update.c create mode 100644 tools/testing/selftests/bpf/progs/htab_update.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 736b65f61022..ba02b559ca68 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -68,3 +68,4 @@ unpriv_bpf_disabled # fentry setget_sockopt # attach unexpected error: -524 (trampoline) cb_refs # expected error message unexpected error: -524 (trampoline) cgroup_hierarchical_stats # JIT does not support calling kernel function (kfunc) +htab_update # failed to attach: ERROR: strerror_r(-524)=22 (trampoline) diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c new file mode 100644 index 000000000000..2bc85f4814f4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022. 
Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include +#include +#include +#include "htab_update.skel.h" + +struct htab_update_ctx { + int fd; + int loop; + bool stop; +}; + +static void test_reenter_update(void) +{ + struct htab_update *skel; + unsigned int key, value; + int err; + + skel = htab_update__open(); + if (!ASSERT_OK_PTR(skel, "htab_update__open")) + return; + + /* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */ + bpf_program__set_autoload(skel->progs.lookup_elem_raw, true); + err = htab_update__load(skel); + if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err) + goto out; + + skel->bss->pid = getpid(); + err = htab_update__attach(skel); + if (!ASSERT_OK(err, "htab_update__attach")) + goto out; + + /* Will trigger the reentrancy of bpf_map_update_elem() */ + key = 0; + value = 0; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0); + if (!ASSERT_OK(err, "add element")) + goto out; + + ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy"); +out: + htab_update__destroy(skel); +} + +static void *htab_update_thread(void *arg) +{ + struct htab_update_ctx *ctx = arg; + cpu_set_t cpus; + int i; + + /* Pinned on CPU 0 */ + CPU_ZERO(&cpus); + CPU_SET(0, &cpus); + pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus); + + i = 0; + while (i++ < ctx->loop && !ctx->stop) { + unsigned int key = 0, value = 0; + int err; + + err = bpf_map_update_elem(ctx->fd, &key, &value, 0); + if (err) { + ctx->stop = true; + return (void *)(long)err; + } + } + + return NULL; +} + +static void test_concurrent_update(void) +{ + struct htab_update_ctx ctx; + struct htab_update *skel; + unsigned int i, nr; + pthread_t *tids; + int err; + + skel = htab_update__open_and_load(); + if (!ASSERT_OK_PTR(skel, "htab_update__open_and_load")) + return; + + ctx.fd = bpf_map__fd(skel->maps.htab); + ctx.loop = 1000; + ctx.stop = false; + + nr = 4; + tids = calloc(nr, sizeof(*tids)); + if (!ASSERT_NEQ(tids, 
NULL, "no mem")) + goto out; + + for (i = 0; i < nr; i++) { + err = pthread_create(&tids[i], NULL, htab_update_thread, &ctx); + if (!ASSERT_OK(err, "pthread_create")) { + unsigned int j; + + ctx.stop = true; + for (j = 0; j < i; j++) + pthread_join(tids[j], NULL); + goto out; + } + } + + for (i = 0; i < nr; i++) { + void *thread_err = NULL; + + pthread_join(tids[i], &thread_err); + ASSERT_EQ(thread_err, NULL, "update error"); + } + +out: + if (tids) + free(tids); + htab_update__destroy(skel); +} + +void test_htab_update(void) +{ + if (test__start_subtest("reenter_update")) + test_reenter_update(); + if (test__start_subtest("concurrent_update")) + test_concurrent_update(); +} diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c new file mode 100644 index 000000000000..7481bb30b29b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/htab_update.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} htab SEC(".maps"); + +int pid = 0; +int update_err = 0; + +SEC("?fentry/lookup_elem_raw") +int lookup_elem_raw(void *ctx) +{ + __u32 key = 0, value = 1; + + if ((bpf_get_current_pid_tgid() >> 32) != pid) + return 0; + + update_err = bpf_map_update_elem(&htab, &key, &value, 0); + return 0; +} -- cgit v1.2.3 From 0e4d354762cefd3e16b4cff8988ff276e45effc4 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Mon, 29 Aug 2022 13:18:51 +0200 Subject: net-next: Fix IP_UNICAST_IF option behavior for connected sockets The IP_UNICAST_IF socket option is used to set the outgoing interface for outbound packets. 
The IP_UNICAST_IF socket option was added as it was needed by the Wine project, since no other existing option (SO_BINDTODEVICE socket option, IP_PKTINFO socket option or the bind function) provided the characteristics needed by the IP_UNICAST_IF socket option. [1] The IP_UNICAST_IF socket option works well for unconnected sockets, that is, the interface specified by the IP_UNICAST_IF socket option is taken into consideration in the route lookup process when a packet is being sent. However, for connected sockets, the outbound interface is chosen when connecting the socket, and in the route lookup process which is done when a packet is being sent, the interface specified by the IP_UNICAST_IF socket option is ignored. This inconsistent behavior was reported and discussed in an issue opened on systemd's GitHub project [2]. Also, a bug report was submitted in the kernel's bugzilla [3]. To understand the problem in more detail, we can look at what happens for UDP packets over IPv4 (The same analysis was done separately in the referenced systemd issue). When a UDP packet is sent the udp_sendmsg function gets called and the following happens: 1. The oif member of the struct ipcm_cookie ipc (which stores the output interface of the packet) is initialized by the ipcm_init_sk function to inet->sk.sk_bound_dev_if (the device set by the SO_BINDTODEVICE socket option). 2. If the IP_PKTINFO socket option was set, the oif member gets overridden by the call to the ip_cmsg_send function. 3. If no output interface was selected yet, the interface specified by the IP_UNICAST_IF socket option is used. 4. If the socket is connected and no destination address is specified in the send function, the struct ipcm_cookie ipc is not taken into consideration and the cached route that was calculated in the connect function is used. Thus, for a connected socket, the IP_UNICAST_IF sockopt isn't taken into consideration. 
This patch corrects the behavior of the IP_UNICAST_IF socket option for connect()ed sockets by taking into consideration the IP_UNICAST_IF sockopt when connecting the socket. In order to avoid reconnecting the socket, this option is still ignored when applied on an already connected socket until connect() is called again by the user. Change the __ip4_datagram_connect function, which is called during socket connection, to take into consideration the interface set by the IP_UNICAST_IF socket option, in a similar way to what is done in the udp_sendmsg function. [1] https://lore.kernel.org/netdev/1328685717.4736.4.camel@edumazet-laptop/T/ [2] https://github.com/systemd/systemd/issues/11935#issuecomment-618691018 [3] https://bugzilla.kernel.org/show_bug.cgi?id=210255 Signed-off-by: Richard Gobert Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220829111554.GA1771@debian Signed-off-by: Jakub Kicinski --- net/ipv4/datagram.c | 2 ++ tools/testing/selftests/net/fcnal-test.sh | 30 ++++++++++++++++++++++++++++++ tools/testing/selftests/net/nettest.c | 16 ++++++++++++++-- 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index ffd57523331f..405a8c2aea64 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -42,6 +42,8 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; + } else if (!oif) { + oif = inet->uc_index; } fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif, diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 03b586760164..31c3b6ebd388 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -1466,6 +1466,13 @@ ipv4_udp_novrf() run_cmd nettest -D -r ${a} -d ${NSA_DEV} -S -0 ${NSA_IP} log_test_addr ${a} $? 
0 "Client, device bind via IP_UNICAST_IF" + log_start + run_cmd_nsb nettest -D -s & + sleep 1 + run_cmd nettest -D -r ${a} -d ${NSA_DEV} -S -0 ${NSA_IP} -U + log_test_addr ${a} $? 0 "Client, device bind via IP_UNICAST_IF, with connect()" + + log_start show_hint "Should fail 'Connection refused'" run_cmd nettest -D -r ${a} @@ -1525,6 +1532,13 @@ ipv4_udp_novrf() run_cmd nettest -D -d ${NSA_DEV} -S -r ${a} log_test_addr ${a} $? 0 "Global server, device client via IP_UNICAST_IF, local connection" + log_start + run_cmd nettest -s -D & + sleep 1 + run_cmd nettest -D -d ${NSA_DEV} -S -r ${a} -U + log_test_addr ${a} $? 0 "Global server, device client via IP_UNICAST_IF, local connection, with connect()" + + # IPv4 with device bind has really weird behavior - it overrides the # fib lookup, generates an rtable and tries to send the packet. This # causes failures for local traffic at different places @@ -1550,6 +1564,15 @@ ipv4_udp_novrf() sleep 1 run_cmd nettest -D -r ${a} -d ${NSA_DEV} -S log_test_addr ${a} $? 1 "Global server, device client via IP_UNICAST_IF, local connection" + + log_start + show_hint "Should fail since addresses on loopback are out of device scope" + run_cmd nettest -D -s & + sleep 1 + run_cmd nettest -D -r ${a} -d ${NSA_DEV} -S -U + log_test_addr ${a} $? 1 "Global server, device client via IP_UNICAST_IF, local connection, with connect()" + + done a=${NSA_IP} @@ -3157,6 +3180,13 @@ ipv6_udp_novrf() sleep 1 run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV} -S log_test_addr ${a} $? 1 "Global server, device client via IP_UNICAST_IF, local connection" + + log_start + show_hint "Should fail 'No route to host' since addresses on loopback are out of device scope" + run_cmd nettest -6 -D -s & + sleep 1 + run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV} -S -U + log_test_addr ${a} $? 
1 "Global server, device client via IP_UNICAST_IF, local connection, with connect()" done a=${NSA_IP6} diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c index d9a6fd2cd9d3..7900fa98eccb 100644 --- a/tools/testing/selftests/net/nettest.c +++ b/tools/testing/selftests/net/nettest.c @@ -127,6 +127,9 @@ struct sock_args { /* ESP in UDP encap test */ int use_xfrm; + + /* use send() and connect() instead of sendto */ + int datagram_connect; }; static int server_mode; @@ -979,6 +982,11 @@ static int send_msg(int sd, void *addr, socklen_t alen, struct sock_args *args) log_err_errno("write failed sending msg to peer"); return 1; } + } else if (args->datagram_connect) { + if (send(sd, msg, msglen, 0) < 0) { + log_err_errno("send failed sending msg to peer"); + return 1; + } } else if (args->ifindex && args->use_cmsg) { if (send_msg_cmsg(sd, addr, alen, args->ifindex, args->version)) return 1; @@ -1659,7 +1667,7 @@ static int connectsock(void *addr, socklen_t alen, struct sock_args *args) if (args->has_local_ip && bind_socket(sd, args)) goto err; - if (args->type != SOCK_STREAM) + if (args->type != SOCK_STREAM && !args->datagram_connect) goto out; if (args->password && tcp_md5sig(sd, addr, alen, args)) @@ -1854,7 +1862,7 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args) return client_status; } -#define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SCi6xL:0:1:2:3:Fbqf" +#define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf" #define OPT_FORCE_BIND_KEY_IFINDEX 1001 #define OPT_NO_BIND_KEY_IFINDEX 1002 @@ -1891,6 +1899,7 @@ static void print_usage(char *prog) " -I dev bind socket to given device name - server mode\n" " -S use setsockopt (IP_UNICAST_IF or IP_MULTICAST_IF)\n" " to set device binding\n" + " -U Use connect() and send() for datagram sockets\n" " -f bind socket with the IP[V6]_FREEBIND option\n" " -C use cmsg and IP_PKTINFO to specify device binding\n" "\n" @@ -2074,6 +2083,9 @@ int main(int 
argc, char *argv[]) case 'x': args.use_xfrm = 1; break; + case 'U': + args.datagram_connect = 1; + break; default: print_usage(argv[0]); return 1; -- cgit v1.2.3 From 75847100c351c7a49dddd60d1d023bd3e6640682 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Sep 2022 11:54:00 +0100 Subject: selftests/net: temporarily disable io_uring zc test We're going to change the API; to avoid build problems with a couple of the following commits, disable io_uring testing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/12b7507223df04fbd12aa05fc0cb544b51d7ed79.1662027856.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- tools/testing/selftests/net/io_uring_zerocopy_tx.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.c b/tools/testing/selftests/net/io_uring_zerocopy_tx.c index 9d64c560a2d6..7446ef364e9f 100644 --- a/tools/testing/selftests/net/io_uring_zerocopy_tx.c +++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.c @@ -36,6 +36,8 @@ #include #include +#if 0 + #define NOTIF_TAG 0xfffffffULL #define NONZC_TAG 0 #define ZC_TAG 1 @@ -603,3 +605,10 @@ int main(int argc, char **argv) error(1, 0, "unknown cfg_test %s", cfg_test); return 0; } + +#else +int main(int argc, char **argv) +{ + return 0; +} +#endif -- cgit v1.2.3 From 916d72c10a4ca80ea51f1421e774cb765b53f28f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Sep 2022 11:54:05 +0100 Subject: selftests/net: return back io_uring zc send tests Enable io_uring zerocopy send tests back and fix them up to follow the new interface. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c8e5018c516093bdad0b6e19f2f9847dea17e4d2.1662027856.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- tools/testing/selftests/net/io_uring_zerocopy_tx.c | 110 +++++++-------------- .../testing/selftests/net/io_uring_zerocopy_tx.sh | 10 +- 2 files changed, 41 insertions(+), 79 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.c b/tools/testing/selftests/net/io_uring_zerocopy_tx.c index 7446ef364e9f..8ce48aca8321 100644 --- a/tools/testing/selftests/net/io_uring_zerocopy_tx.c +++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.c @@ -36,8 +36,6 @@ #include #include -#if 0 - #define NOTIF_TAG 0xfffffffULL #define NONZC_TAG 0 #define ZC_TAG 1 @@ -49,7 +47,6 @@ enum { MODE_MIXED = 3, }; -static bool cfg_flush = false; static bool cfg_cork = false; static int cfg_mode = MODE_ZC_FIXED; static int cfg_nr_reqs = 8; @@ -168,21 +165,6 @@ static int io_uring_register_buffers(struct io_uring *ring, return (ret < 0) ? -errno : ret; } -static int io_uring_register_notifications(struct io_uring *ring, - unsigned nr, - struct io_uring_notification_slot *slots) -{ - int ret; - struct io_uring_notification_register r = { - .nr_slots = nr, - .data = (unsigned long)slots, - }; - - ret = syscall(__NR_io_uring_register, ring->ring_fd, - IORING_REGISTER_NOTIFIERS, &r, sizeof(r)); - return (ret < 0) ? 
-errno : ret; -} - static int io_uring_mmap(int fd, struct io_uring_params *p, struct io_uring_sq *sq, struct io_uring_cq *cq) { @@ -299,11 +281,10 @@ static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd, static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd, const void *buf, size_t len, int flags, - unsigned slot_idx, unsigned zc_flags) + unsigned zc_flags) { io_uring_prep_send(sqe, sockfd, buf, len, flags); - sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF; - sqe->notification_idx = slot_idx; + sqe->opcode = (__u8) IORING_OP_SEND_ZC; sqe->ioprio = zc_flags; } @@ -376,7 +357,6 @@ static int do_setup_tx(int domain, int type, int protocol) static void do_tx(int domain, int type, int protocol) { - struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}}; struct io_uring_sqe *sqe; struct io_uring_cqe *cqe; unsigned long packets = 0, bytes = 0; @@ -392,10 +372,6 @@ static void do_tx(int domain, int type, int protocol) if (ret) error(1, ret, "io_uring: queue init"); - ret = io_uring_register_notifications(&ring, 1, b); - if (ret) - error(1, ret, "io_uring: tx ctx registration"); - iov.iov_base = payload; iov.iov_len = cfg_payload_len; @@ -411,9 +387,8 @@ static void do_tx(int domain, int type, int protocol) for (i = 0; i < cfg_nr_reqs; i++) { unsigned zc_flags = 0; unsigned buf_idx = 0; - unsigned slot_idx = 0; unsigned mode = cfg_mode; - unsigned msg_flags = 0; + unsigned msg_flags = MSG_WAITALL; if (cfg_mode == MODE_MIXED) mode = rand() % 3; @@ -425,13 +400,10 @@ static void do_tx(int domain, int type, int protocol) cfg_payload_len, msg_flags); sqe->user_data = NONZC_TAG; } else { - if (cfg_flush) { - zc_flags |= IORING_RECVSEND_NOTIF_FLUSH; - compl_cqes++; - } + compl_cqes++; io_uring_prep_sendzc(sqe, fd, payload, cfg_payload_len, - msg_flags, slot_idx, zc_flags); + msg_flags, zc_flags); if (mode == MODE_ZC_FIXED) { sqe->ioprio |= IORING_RECVSEND_FIXED_BUF; sqe->buf_index = buf_idx; @@ -444,51 +416,57 @@ static void 
do_tx(int domain, int type, int protocol) if (ret != cfg_nr_reqs) error(1, ret, "submit"); + if (cfg_cork) + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); for (i = 0; i < cfg_nr_reqs; i++) { ret = io_uring_wait_cqe(&ring, &cqe); if (ret) error(1, ret, "wait cqe"); - if (cqe->user_data == NOTIF_TAG) { + if (cqe->user_data != NONZC_TAG && + cqe->user_data != ZC_TAG) + error(1, -EINVAL, "invalid cqe->user_data"); + + if (cqe->flags & IORING_CQE_F_NOTIF) { + if (cqe->flags & IORING_CQE_F_MORE) + error(1, -EINVAL, "invalid notif flags"); compl_cqes--; i--; - } else if (cqe->user_data != NONZC_TAG && - cqe->user_data != ZC_TAG) { - error(1, cqe->res, "invalid user_data"); - } else if (cqe->res <= 0 && cqe->res != -EAGAIN) { + } else if (cqe->res <= 0) { + if (cqe->flags & IORING_CQE_F_MORE) + error(1, cqe->res, "more with a failed send"); error(1, cqe->res, "send failed"); } else { - if (cqe->res > 0) { - packets++; - bytes += cqe->res; - } - /* failed requests don't flush */ - if (cfg_flush && - cqe->res <= 0 && - cqe->user_data == ZC_TAG) - compl_cqes--; + if (cqe->user_data == ZC_TAG && + !(cqe->flags & IORING_CQE_F_MORE)) + error(1, cqe->res, "missing more flag"); + packets++; + bytes += cqe->res; } io_uring_cqe_seen(&ring); } - if (cfg_cork) - do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); } while (gettimeofday_ms() < tstop); - if (close(fd)) - error(1, errno, "close"); - - fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n", - packets, bytes >> 20, - packets / (cfg_runtime_ms / 1000), - (bytes >> 20) / (cfg_runtime_ms / 1000)); - while (compl_cqes) { ret = io_uring_wait_cqe(&ring, &cqe); if (ret) error(1, ret, "wait cqe"); + if (cqe->flags & IORING_CQE_F_MORE) + error(1, -EINVAL, "invalid notif flags"); + if (!(cqe->flags & IORING_CQE_F_NOTIF)) + error(1, -EINVAL, "missing notif flag"); + io_uring_cqe_seen(&ring); compl_cqes--; } + + fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n", + packets, bytes >> 20, + packets / (cfg_runtime_ms / 1000), + (bytes >> 
20) / (cfg_runtime_ms / 1000)); + + if (close(fd)) + error(1, errno, "close"); } static void do_test(int domain, int type, int protocol) @@ -502,8 +480,8 @@ static void do_test(int domain, int type, int protocol) static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-f] [-n] [-z0] [-s] " - "(-4|-6) [-t