summary | refs | log | tree | commit | diff
path: root/tools/testing
diff options
context:
space:
mode:
Diffstat (limited to 'tools/testing')
-rw-r--r--tools/testing/selftests/arm64/fp/fp-ptrace.c5
-rw-r--r--tools/testing/selftests/arm64/fp/sve-ptrace.c61
-rw-r--r--tools/testing/selftests/arm64/fp/zt-test.S2
-rw-r--r--tools/testing/selftests/bpf/.gitignore2
-rw-r--r--tools/testing/selftests/bpf/Makefile44
-rw-r--r--tools/testing/selftests/bpf/benchs/bench_ringbufs.c65
-rw-r--r--tools/testing/selftests/bpf/benchs/bench_trigger.c4
-rwxr-xr-xtools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh4
-rw-r--r--tools/testing/selftests/bpf/bpf_arena_list.h6
-rw-r--r--tools/testing/selftests/bpf/bpf_arena_strsearch.h128
-rw-r--r--tools/testing/selftests/bpf/bpf_kfuncs.h12
-rw-r--r--tools/testing/selftests/bpf/network_helpers.c52
-rw-r--r--tools/testing/selftests/bpf/network_helpers.h16
-rw-r--r--tools/testing/selftests/bpf/prog_tests/arena_strsearch.c30
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_gotox.c292
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c504
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf.c65
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_split.c87
-rw-r--r--tools/testing/selftests/bpf/prog_tests/check_mtu.c23
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cls_redirect.c122
-rw-r--r--tools/testing/selftests/bpf/prog_tests/file_reader.c117
-rw-r--r--tools/testing/selftests/bpf/prog_tests/htab_update.c37
-rw-r--r--tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c3
-rw-r--r--tools/testing/selftests/bpf/prog_tests/perf_branches.c22
-rw-r--r--tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c56
-rw-r--r--tools/testing/selftests/bpf/prog_tests/res_spin_lock.c8
-rw-r--r--tools/testing/selftests/bpf/prog_tests/ringbuf.c65
-rw-r--r--tools/testing/selftests/bpf/prog_tests/select_reuseport.c67
-rw-r--r--tools/testing/selftests/bpf/prog_tests/send_signal.c5
-rw-r--r--tools/testing/selftests/bpf/prog_tests/string_kfuncs.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_lsm.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_tc_edt.c145
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c714
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_tunnel.c107
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_xsk.c2596
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_xsk.h298
-rw-r--r--tools/testing/selftests/bpf/prog_tests/verifier.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/wq.c56
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xsk.c151
-rw-r--r--tools/testing/selftests/bpf/progs/arena_strsearch.c146
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_cc_cubic.c9
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_cubic.c7
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_dctcp.c6
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_gotox.c448
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c17
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_misc.h4
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_tracing_net.h14
-rw-r--r--tools/testing/selftests/bpf/progs/connect4_prog.c21
-rw-r--r--tools/testing/selftests/bpf/progs/dynptr_success.c12
-rw-r--r--tools/testing/selftests/bpf/progs/file_reader.c145
-rw-r--r--tools/testing/selftests/bpf/progs/file_reader_fail.c52
-rw-r--r--tools/testing/selftests/bpf/progs/htab_update.c19
-rw-r--r--tools/testing/selftests/bpf/progs/ip_check_defrag.c5
-rw-r--r--tools/testing/selftests/bpf/progs/lsm.c8
-rw-r--r--tools/testing/selftests/bpf/progs/lsm_tailcall.c8
-rw-r--r--tools/testing/selftests/bpf/progs/rcu_read_lock.c40
-rw-r--r--tools/testing/selftests/bpf/progs/refcounted_kptr.c60
-rw-r--r--tools/testing/selftests/bpf/progs/ringbuf_bench.c11
-rw-r--r--tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c12
-rw-r--r--tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c2
-rw-r--r--tools/testing/selftests/bpf/progs/string_kfuncs_success.c10
-rw-r--r--tools/testing/selftests/bpf/progs/strobemeta.h6
-rw-r--r--tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c2
-rw-r--r--tools/testing/selftests/bpf/progs/test_check_mtu.c12
-rw-r--r--tools/testing/selftests/bpf/progs/test_perf_branches.c3
-rw-r--r--tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c98
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_edt.c11
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_tunnel.c95
-rw-r--r--tools/testing/selftests/bpf/progs/trigger_bench.c6
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_async_cb_context.c181
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_bounds.c154
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c59
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_gotox.c389
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_live_stack.c50
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_lsm.c4
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c5
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_sock.c39
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_subprog_precision.c53
-rw-r--r--tools/testing/selftests/bpf/progs/wq.c17
-rw-r--r--tools/testing/selftests/bpf/progs/wq_failures.c23
-rwxr-xr-xtools/testing/selftests/bpf/test_bpftool_build.sh4
-rw-r--r--tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c236
-rw-r--r--tools/testing/selftests/bpf/test_loader.c29
-rw-r--r--tools/testing/selftests/bpf/test_maps.c3
-rw-r--r--tools/testing/selftests/bpf/test_tag.c2
-rwxr-xr-xtools/testing/selftests/bpf/test_tc_edt.sh100
-rwxr-xr-xtools/testing/selftests/bpf/test_tc_tunnel.sh320
-rw-r--r--tools/testing/selftests/bpf/xskxceiver.c2512
-rw-r--r--tools/testing/selftests/bpf/xskxceiver.h156
-rw-r--r--tools/testing/selftests/cgroup/test_core.c7
-rw-r--r--tools/testing/selftests/cgroup/test_cpu.c7
-rw-r--r--tools/testing/selftests/cgroup/test_cpuset.c7
-rw-r--r--tools/testing/selftests/cgroup/test_freezer.c7
-rw-r--r--tools/testing/selftests/cgroup/test_kill.c7
-rw-r--r--tools/testing/selftests/cgroup/test_kmem.c7
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c7
-rw-r--r--tools/testing/selftests/cgroup/test_zswap.c7
-rw-r--r--tools/testing/selftests/coredump/.gitignore4
-rw-r--r--tools/testing/selftests/coredump/Makefile8
-rw-r--r--tools/testing/selftests/coredump/coredump_socket_protocol_test.c1568
-rw-r--r--tools/testing/selftests/coredump/coredump_socket_test.c742
-rw-r--r--tools/testing/selftests/coredump/coredump_test.h59
-rw-r--r--tools/testing/selftests/coredump/coredump_test_helpers.c383
-rw-r--r--tools/testing/selftests/coredump/stackdump_test.c1662
-rw-r--r--tools/testing/selftests/dma/dma_map_benchmark.c2
-rw-r--r--tools/testing/selftests/filesystems/utils.c2
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc107
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc18
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc40
-rw-r--r--tools/testing/selftests/kselftest/runner.sh14
-rw-r--r--tools/testing/selftests/kvm/arm64/set_id_regs.c10
-rw-r--r--tools/testing/selftests/livepatch/functions.sh6
-rw-r--r--tools/testing/selftests/namespaces/.gitignore9
-rw-r--r--tools/testing/selftests/namespaces/Makefile24
-rw-r--r--tools/testing/selftests/namespaces/cred_change_test.c814
-rw-r--r--tools/testing/selftests/namespaces/listns_efault_test.c530
-rw-r--r--tools/testing/selftests/namespaces/listns_pagination_bug.c138
-rw-r--r--tools/testing/selftests/namespaces/listns_permissions_test.c759
-rw-r--r--tools/testing/selftests/namespaces/listns_test.c679
-rw-r--r--tools/testing/selftests/namespaces/ns_active_ref_test.c2672
-rw-r--r--tools/testing/selftests/namespaces/nsid_test.c107
-rw-r--r--tools/testing/selftests/namespaces/regression_pidfd_setns_test.c113
-rw-r--r--tools/testing/selftests/namespaces/siocgskns_test.c1824
-rw-r--r--tools/testing/selftests/namespaces/stress_test.c626
-rw-r--r--tools/testing/selftests/namespaces/wrappers.h35
-rw-r--r--tools/testing/selftests/nolibc/Makefile.nolibc6
-rw-r--r--tools/testing/selftests/nolibc/nolibc-test.c13
-rwxr-xr-xtools/testing/selftests/nolibc/run-tests.sh8
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h15
-rw-r--r--tools/testing/selftests/pidfd/pidfd_info_test.c73
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-again.sh56
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-series.sh116
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE041
-rw-r--r--tools/testing/selftests/rseq/rseq-s390.h39
-rwxr-xr-xtools/testing/selftests/run_kselftest.sh14
-rw-r--r--tools/testing/selftests/sched_ext/Makefile1
-rw-r--r--tools/testing/selftests/sched_ext/peek_dsq.bpf.c251
-rw-r--r--tools/testing/selftests/sched_ext/peek_dsq.c224
-rw-r--r--tools/testing/selftests/timers/nanosleep.c55
-rw-r--r--tools/testing/selftests/timers/posix_timers.c32
-rw-r--r--tools/testing/selftests/ublk/kublk.c70
-rw-r--r--tools/testing/selftests/ublk/kublk.h9
-rw-r--r--tools/testing/selftests/vDSO/vdso_config.h4
-rw-r--r--tools/testing/selftests/x86/test_vsyscall.c21
146 files changed, 20226 insertions, 5403 deletions
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
index a85c19e9524e..0114108ab25f 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -1071,7 +1071,7 @@ static bool sve_write_supported(struct test_config *config)
static bool sve_write_fpsimd_supported(struct test_config *config)
{
- if (!sve_supported())
+ if (!sve_supported() && !sme_supported())
return false;
if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA))
@@ -1231,9 +1231,6 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config)
vl = vl_expected(config);
vq = __sve_vq_from_vl(vl);
- if (!vl)
- return;
-
iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD);
iov.iov_base = malloc(iov.iov_len);
if (!iov.iov_base) {
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index e0fc3a001e28..f44d44618575 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -394,6 +394,58 @@ out:
free(svebuf);
}
+/* Write the FPSIMD registers via the SVE regset when SVE is not supported (header VL 0), then read them back via the FPSIMD regset */
+static void ptrace_sve_fpsimd_no_sve(pid_t child)
+{
+ void *svebuf;
+ struct user_sve_header *sve;
+ struct user_fpsimd_state *fpsimd, new_fpsimd;
+ unsigned int i, j;
+ unsigned char *p;
+ int ret;
+
+ svebuf = malloc(SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD));
+ if (!svebuf) {
+ ksft_test_result_fail("Failed to allocate FPSIMD buffer\n");
+ return; /* NOTE(review): one result reported on this path, two on every other path - confirm the test plan tolerates it */
+ }
+
+ /* On a system without SVE the VL should be set to 0 */
+ memset(svebuf, 0, SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD));
+ sve = svebuf;
+ sve->flags = SVE_PT_REGS_FPSIMD;
+ sve->size = SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD);
+ sve->vl = 0;
+
+ /* Try to set a known FPSIMD state (byte j of every vreg is j) via the SVE regset, using the SVE_PT_REGS_FPSIMD layout */
+ fpsimd = (struct user_fpsimd_state *)((char *)sve +
+ SVE_PT_FPSIMD_OFFSET);
+ for (i = 0; i < 32; ++i) {
+ p = (unsigned char *)&fpsimd->vregs[i];
+
+ for (j = 0; j < sizeof(fpsimd->vregs[i]); ++j)
+ p[j] = j;
+ }
+
+ ret = set_sve(child, &vec_types[0], sve);
+ ksft_test_result(ret == 0, "FPSIMD write via SVE\n");
+ if (ret) { /* the write failed, so there is nothing to verify */
+ ksft_test_result_skip("Verify FPSIMD write via SVE\n");
+ goto out;
+ }
+
+ /* Verify via the FPSIMD regset: the state read back must equal the buffer that was written */
+ if (get_fpsimd(child, &new_fpsimd)) {
+ ksft_test_result_skip("Verify FPSIMD write via SVE\n");
+ goto out;
+ }
+ ksft_test_result(memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0,
+ "Verify FPSIMD write via SVE\n");
+
+out:
+ free(svebuf);
+}
+
/* Validate attempting to set SVE data and read SVE data */
static void ptrace_set_sve_get_sve_data(pid_t child,
const struct vec_type *type,
@@ -826,6 +878,15 @@ static int do_parent(pid_t child)
}
}
+ /* We support SVE writes of FPSIMD format on SME only systems */
+ if (!(getauxval(AT_HWCAP) & HWCAP_SVE) &&
+ (getauxval(AT_HWCAP2) & HWCAP2_SME)) {
+ ptrace_sve_fpsimd_no_sve(child);
+ } else {
+ ksft_test_result_skip("FPSIMD write via SVE\n");
+ ksft_test_result_skip("Verify FPSIMD write via SVE\n");
+ }
+
ret = EXIT_SUCCESS;
error:
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index 38080f3c3280..a8df05771670 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -276,7 +276,7 @@ function barf
bl putdec
puts ", iteration="
mov x0, x22
- bl putdec
+ bl putdecn
puts "\tExpected ["
mov x0, x10
mov x1, x12
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index be1ee7ba7ce0..19c1638e312a 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -23,6 +23,7 @@ test_tcpnotify_user
test_libbpf
xdping
test_cpp
+test_progs_verification_cert
*.d
*.subskel.h
*.skel.h
@@ -32,7 +33,6 @@ test_cpp
/cpuv4
/host-tools
/tools
-/runqslower
/bench
/veristat
/sign-file
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f00587d4ede6..b7030a6e2e76 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -46,6 +46,7 @@ endif
CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \
-Wall -Werror -fno-omit-frame-pointer \
+ -Wno-unused-but-set-variable \
$(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \
-I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
-I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT)
@@ -98,14 +99,11 @@ TEST_GEN_PROGS += test_progs-cpuv4
TEST_INST_SUBDIRS += cpuv4
endif
-TEST_GEN_FILES = test_tc_edt.bpf.o
TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c)
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
test_lirc_mode2.sh \
- test_tc_tunnel.sh \
- test_tc_edt.sh \
test_xdping.sh \
test_bpftool_build.sh \
test_bpftool.sh \
@@ -127,7 +125,6 @@ TEST_KMOD_TARGETS = $(addprefix $(OUTPUT)/,$(TEST_KMODS))
TEST_GEN_PROGS_EXTENDED = \
bench \
flow_dissector_load \
- runqslower \
test_cpp \
test_lirc_mode2_user \
veristat \
@@ -209,8 +206,6 @@ HOST_INCLUDE_DIR := $(INCLUDE_DIR)
endif
HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
-RUNQSLOWER_OUTPUT := $(BUILD_DIR)/runqslower/
-
VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
$(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
../../../../vmlinux \
@@ -232,7 +227,7 @@ $(notdir $(TEST_GEN_PROGS) $(TEST_KMODS) \
MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \
$(BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/bpftool \
$(HOST_BUILD_DIR)/resolve_btfids \
- $(RUNQSLOWER_OUTPUT) $(INCLUDE_DIR))
+ $(INCLUDE_DIR))
$(MAKE_DIRS):
$(call msg,MKDIR,,$@)
$(Q)mkdir -p $@
@@ -304,17 +299,6 @@ TRUNNER_BPFTOOL := $(DEFAULT_BPFTOOL)
USE_BOOTSTRAP := "bootstrap/"
endif
-$(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
- $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \
- OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \
- BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \
- BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \
- BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \
- BPF_TARGET_ENDIAN=$(BPF_TARGET_ENDIAN) \
- EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \
- EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' && \
- cp $(RUNQSLOWER_OUTPUT)runqslower $@
-
TEST_GEN_PROGS_EXTENDED += $(TRUNNER_BPFTOOL)
$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ)
@@ -453,7 +437,9 @@ BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \
-I$(abspath $(OUTPUT)/../usr/include) \
-std=gnu11 \
-fno-strict-aliasing \
- -Wno-compare-distinct-pointer-types
+ -Wno-compare-distinct-pointer-types \
+ -Wno-initializer-overrides \
+ #
# TODO: enable me -Wsign-compare
CLANG_CFLAGS = $(CLANG_SYS_INCLUDES)
@@ -498,7 +484,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \
LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \
core_kern.c core_kern_overflow.c test_ringbuf.c \
- test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
+ test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \
+ test_ringbuf_overwrite.c
LSKELS_SIGNED := fentry_test.c fexit_test.c atomics.c
@@ -543,6 +530,8 @@ TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \
$$(notdir $$(wildcard $(TRUNNER_TESTS_DIR)/*.c)))
TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \
$$(filter %.c,$(TRUNNER_EXTRA_SOURCES)))
+TRUNNER_LIB_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \
+ $$(filter %.c,$(TRUNNER_LIB_SOURCES)))
TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES))
TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h
TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c))
@@ -686,6 +675,10 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \
$$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@)
$(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@
+$(TRUNNER_LIB_OBJS): $(TRUNNER_OUTPUT)/%.o:$(TOOLSDIR)/lib/%.c
+ $$(call msg,LIB-OBJ,$(TRUNNER_BINARY),$$@)
+ $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@
+
# non-flavored in-srctree builds receive special treatment, in particular, we
# do not need to copy extra resources (see e.g. test_btf_dump_case())
$(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT)
@@ -699,6 +692,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS)
$(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \
$(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \
+ $(TRUNNER_LIB_OBJS) \
$(RESOLVE_BTFIDS) \
$(TRUNNER_BPFTOOL) \
$(OUTPUT)/veristat \
@@ -721,7 +715,8 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP)
$(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR)
$(VERIFY_SIG_HDR): $(VERIFICATION_CERT)
- $(Q)xxd -i -n test_progs_verification_cert $< > $@
+ $(Q)ln -fs $< test_progs_verification_cert && \
+ xxd -i test_progs_verification_cert > $@
# Define test_progs test runner.
TRUNNER_TESTS_DIR := prog_tests
@@ -745,6 +740,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \
$(VERIFY_SIG_HDR) \
flow_dissector_load.h \
ip_check_defrag_frags.h
+TRUNNER_LIB_SOURCES := find_bit.c
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \
$(OUTPUT)/liburandom_read.so \
$(OUTPUT)/xdp_synproxy \
@@ -782,6 +778,7 @@ endif
TRUNNER_TESTS_DIR := map_tests
TRUNNER_BPF_PROGS_DIR := progs
TRUNNER_EXTRA_SOURCES := test_maps.c
+TRUNNER_LIB_SOURCES :=
TRUNNER_EXTRA_FILES :=
TRUNNER_BPF_BUILD_RULE := $$(error no BPF objects should be built)
TRUNNER_BPF_CFLAGS :=
@@ -803,7 +800,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT)
$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
# Include find_bit.c to compile xskxceiver.
-EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c
+EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c prog_tests/test_xsk.c prog_tests/test_xsk.h
$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT)
$(call msg,BINARY,,$@)
$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
@@ -893,7 +890,8 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \
$(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \
no_alu32 cpuv4 bpf_gcc \
liburandom_read.so) \
- $(OUTPUT)/FEATURE-DUMP.selftests
+ $(OUTPUT)/FEATURE-DUMP.selftests \
+ test_progs_verification_cert
.PHONY: docs docs-clean
diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
index e1ee979e6acc..01bdce692799 100644
--- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
+++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
@@ -19,6 +19,8 @@ static struct {
int ringbuf_sz; /* per-ringbuf, in bytes */
bool ringbuf_use_output; /* use slower output API */
int perfbuf_sz; /* per-CPU size, in pages */
+ bool overwrite;
+ bool bench_producer;
} args = {
.back2back = false,
.batch_cnt = 500,
@@ -27,6 +29,8 @@ static struct {
.ringbuf_sz = 512 * 1024,
.ringbuf_use_output = false,
.perfbuf_sz = 128,
+ .overwrite = false,
+ .bench_producer = false,
};
enum {
@@ -35,6 +39,8 @@ enum {
ARG_RB_BATCH_CNT = 2002,
ARG_RB_SAMPLED = 2003,
ARG_RB_SAMPLE_RATE = 2004,
+ ARG_RB_OVERWRITE = 2005,
+ ARG_RB_BENCH_PRODUCER = 2006,
};
static const struct argp_option opts[] = {
@@ -43,6 +49,8 @@ static const struct argp_option opts[] = {
{ "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"},
{ "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"},
{ "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"},
+ { "rb-overwrite", ARG_RB_OVERWRITE, NULL, 0, "Overwrite mode"},
+ { "rb-bench-producer", ARG_RB_BENCH_PRODUCER, NULL, 0, "Benchmark producer"},
{},
};
@@ -72,6 +80,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
argp_usage(state);
}
break;
+ case ARG_RB_OVERWRITE:
+ args.overwrite = true;
+ break;
+ case ARG_RB_BENCH_PRODUCER:
+ args.bench_producer = true;
+ break;
default:
return ARGP_ERR_UNKNOWN;
}
@@ -95,8 +109,33 @@ static inline void bufs_trigger_batch(void)
static void bufs_validate(void)
{
- if (env.consumer_cnt != 1) {
- fprintf(stderr, "rb-libbpf benchmark needs one consumer!\n");
+ if (args.bench_producer && strcmp(env.bench_name, "rb-libbpf")) {
+ fprintf(stderr, "--rb-bench-producer only works with rb-libbpf!\n");
+ exit(1);
+ }
+
+ if (args.overwrite && !args.bench_producer) {
+ fprintf(stderr, "overwrite mode only works with --rb-bench-producer for now!\n");
+ exit(1);
+ }
+
+ if (args.bench_producer && env.consumer_cnt != 0) {
+ fprintf(stderr, "no consumer is needed for --rb-bench-producer!\n");
+ exit(1);
+ }
+
+ if (args.bench_producer && args.back2back) {
+ fprintf(stderr, "back-to-back mode makes no sense for --rb-bench-producer!\n");
+ exit(1);
+ }
+
+ if (args.bench_producer && args.sampled) {
+ fprintf(stderr, "sampling mode makes no sense for --rb-bench-producer!\n");
+ exit(1);
+ }
+
+ if (!args.bench_producer && env.consumer_cnt != 1) {
+ fprintf(stderr, "benchmarks without --rb-bench-producer require exactly one consumer!\n");
exit(1);
}
@@ -128,12 +167,17 @@ static void ringbuf_libbpf_measure(struct bench_res *res)
{
struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
- res->hits = atomic_swap(&buf_hits.value, 0);
+ if (args.bench_producer)
+ res->hits = atomic_swap(&ctx->skel->bss->hits, 0);
+ else
+ res->hits = atomic_swap(&buf_hits.value, 0);
res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
}
static struct ringbuf_bench *ringbuf_setup_skeleton(void)
{
+ __u32 flags;
+ struct bpf_map *ringbuf;
struct ringbuf_bench *skel;
setup_libbpf();
@@ -146,12 +190,19 @@ static struct ringbuf_bench *ringbuf_setup_skeleton(void)
skel->rodata->batch_cnt = args.batch_cnt;
skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0;
+ skel->rodata->bench_producer = args.bench_producer;
if (args.sampled)
/* record data + header take 16 bytes */
skel->rodata->wakeup_data_size = args.sample_rate * 16;
- bpf_map__set_max_entries(skel->maps.ringbuf, args.ringbuf_sz);
+ ringbuf = skel->maps.ringbuf;
+ if (args.overwrite) {
+ flags = bpf_map__map_flags(ringbuf) | BPF_F_RB_OVERWRITE;
+ bpf_map__set_map_flags(ringbuf, flags);
+ }
+
+ bpf_map__set_max_entries(ringbuf, args.ringbuf_sz);
if (ringbuf_bench__load(skel)) {
fprintf(stderr, "failed to load skeleton\n");
@@ -171,10 +222,12 @@ static void ringbuf_libbpf_setup(void)
{
struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
struct bpf_link *link;
+ int map_fd;
ctx->skel = ringbuf_setup_skeleton();
- ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf),
- buf_process_sample, NULL, NULL);
+
+ map_fd = bpf_map__fd(ctx->skel->maps.ringbuf);
+ ctx->ringbuf = ring_buffer__new(map_fd, buf_process_sample, NULL, NULL);
if (!ctx->ringbuf) {
fprintf(stderr, "failed to create ringbuf\n");
exit(1);
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 1e2aff007c2a..34018fc3927f 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -180,10 +180,10 @@ static void trigger_kernel_count_setup(void)
{
setup_ctx();
bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
- bpf_program__set_autoload(ctx.skel->progs.trigger_count, true);
+ bpf_program__set_autoload(ctx.skel->progs.trigger_kernel_count, true);
load_ctx();
/* override driver program */
- ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count);
+ ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_kernel_count);
}
static void trigger_kprobe_setup(void)
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
index 91e3567962ff..83e05e837871 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
@@ -49,3 +49,7 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)"
done
+header "Ringbuf, multi-producer contention in overwrite mode, no consumer"
+for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
+ summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)"
+done
diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h
index 85dbc3ea4da5..e16fa7d95fcf 100644
--- a/tools/testing/selftests/bpf/bpf_arena_list.h
+++ b/tools/testing/selftests/bpf/bpf_arena_list.h
@@ -64,14 +64,12 @@ static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
static inline void __list_del(arena_list_node_t *n)
{
- arena_list_node_t *next = n->next, *tmp;
+ arena_list_node_t *next = n->next;
arena_list_node_t * __arena *pprev = n->pprev;
cast_user(next);
cast_kern(pprev);
- tmp = *pprev;
- cast_kern(tmp);
- WRITE_ONCE(tmp, next);
+ WRITE_ONCE(*pprev, next);
if (next) {
cast_user(pprev);
cast_kern(next);
diff --git a/tools/testing/selftests/bpf/bpf_arena_strsearch.h b/tools/testing/selftests/bpf/bpf_arena_strsearch.h
new file mode 100644
index 000000000000..c1b6eaa905bb
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_arena_strsearch.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#pragma once
+#include "bpf_arena_common.h"
+
+__noinline int bpf_arena_strlen(const char __arena *s __arg_arena) /* counts the bytes of s that precede the first '\0' */
+{
+ const char __arena *sc;
+
+ for (sc = s; *sc != '\0'; ++sc)
+ cond_break; /* NOTE(review): if cond_break can leave the loop early, the count returned is short - confirm macro semantics */
+ return sc - s; /* distance from the start of s to where the scan stopped */
+}
+
+/**
+ * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0)
+ * @pat: Shell-style pattern to match, e.g. "*.[ch]".
+ * @str: String to match. The pattern must match the entire string.
+ *
+ * Perform shell-style glob matching, returning true (1) if the match
+ * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0).
+ *
+ * Pattern metacharacters are ?, *, [ and \.
+ * (And, inside character classes, !, - and ].)
+ *
+ * This is a small and simple implementation intended for device blacklists
+ * where a string is matched against a number of patterns. Thus, it
+ * does not preprocess the patterns. It is non-recursive, and run-time
+ * is at most quadratic: strlen(@str)*strlen(@pat).
+ *
+ * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa");
+ * it takes 6 passes over the pattern before matching the string.
+ *
+ * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT
+ * treat / or leading . specially; it isn't actually used for pathnames.
+ *
+ * Note that according to glob(7) (and unlike bash), character classes
+ * are complemented by a leading !; this does not support the regex-style
+ * [^a-z] syntax.
+ *
+ * An opening bracket without a matching close is matched literally.
+ */
+__noinline bool glob_match(char const __arena *pat __arg_arena, char const __arena *str __arg_arena)
+{
+ /*
+ * Backtrack to previous * on mismatch and retry starting one
+ * character later in the string. Because * matches all characters
+ * (no exception for /), it can be easily proved that there's
+ * never a need to backtrack multiple levels.
+ */
+ char const __arena *back_pat = NULL, *back_str;
+
+ /*
+ * Loop over each token (character or class) in pat, matching
+ * it against the remaining unmatched tail of str. Return false
+ * on mismatch, or true after matching the trailing nul bytes.
+ */
+ for (;;) {
+ unsigned char c = *str++;
+ unsigned char d = *pat++;
+
+ switch (d) {
+ case '?': /* Wildcard: anything but nul */
+ if (c == '\0')
+ return false;
+ break;
+ case '*': /* Any-length wildcard */
+ if (*pat == '\0') /* Optimize trailing * case */
+ return true;
+ back_pat = pat;
+ back_str = --str; /* Allow zero-length match */
+ break;
+ case '[': { /* Character class */
+ bool match = false, inverted = (*pat == '!');
+ char const __arena *class = pat + inverted;
+ unsigned char a = *class++;
+
+ /*
+ * Iterate over each span in the character class.
+ * A span is either a single character a, or a
+ * range a-b. The first span may begin with ']'.
+ */
+ do {
+ unsigned char b = a;
+
+ if (a == '\0') /* Malformed */
+ goto literal;
+
+ if (class[0] == '-' && class[1] != ']') {
+ b = class[1];
+
+ if (b == '\0')
+ goto literal;
+
+ class += 2;
+ /* Any special action if a > b? */
+ }
+ match |= (a <= c && c <= b);
+ cond_break;
+ } while ((a = *class++) != ']');
+
+ if (match == inverted)
+ goto backtrack;
+ pat = class;
+ }
+ break;
+ case '\\':
+ d = *pat++;
+ __attribute__((__fallthrough__));
+ default: /* Literal character */
+literal:
+ if (c == d) {
+ if (d == '\0')
+ return true;
+ break;
+ }
+backtrack:
+ if (c == '\0' || !back_pat)
+ return false; /* No point continuing */
+ /* Try again from last *, one character later in str. */
+ pat = back_pat;
+ str = ++back_str;
+ break;
+ }
+ cond_break;
+ }
+ return false; /* every break above only leaves the switch, so this is reached only if cond_break exits the loop - NOTE(review): confirm macro semantics */
+}
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 794d44d19c88..e0189254bb6e 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -28,8 +28,8 @@ extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
* Either a direct pointer to the dynptr data or a pointer to the user-provided
* buffer if unable to obtain a direct pointer
*/
-extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
- void *buffer, __u32 buffer__szk) __ksym __weak;
+extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset,
+ void *buffer, __u64 buffer__szk) __ksym __weak;
/* Description
* Obtain a read-write pointer to the dynptr's data
@@ -37,13 +37,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
* Either a direct pointer to the dynptr data or a pointer to the user-provided
* buffer if unable to obtain a direct pointer
*/
-extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset,
- void *buffer, __u32 buffer__szk) __ksym __weak;
+extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer,
+ __u64 buffer__szk) __ksym __weak;
-extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak;
+extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak;
extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak;
extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak;
-extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak;
+extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak;
extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak;
/* Description
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index cdf7b6641444..0a6a5561bed3 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -97,7 +97,7 @@ int settimeo(int fd, int timeout_ms)
int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen,
const struct network_helper_opts *opts)
{
- int fd;
+ int on = 1, fd;
if (!opts)
opts = &default_opts;
@@ -111,6 +111,12 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t a
if (settimeo(fd, opts->timeout_ms))
goto error_close;
+ if (type == SOCK_STREAM &&
+ setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on))) {
+ log_err("Failed to enable SO_REUSEADDR");
+ goto error_close;
+ }
+
if (opts->post_socket_cb &&
opts->post_socket_cb(fd, opts->cb_opts)) {
log_err("Failed to call post_socket_cb");
@@ -766,6 +772,50 @@ int send_recv_data(int lfd, int fd, uint32_t total_bytes)
return err;
}
+int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd)
+{
+ int ifindex, ret;
+
+ if (!ASSERT_TRUE(ingress_fd >= 0 || egress_fd >= 0,
+ "at least one program fd is valid"))
+ return -1;
+
+ ifindex = if_nametoindex(dev);
+ if (!ASSERT_NEQ(ifindex, 0, "get ifindex"))
+ return -1;
+
+ DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex,
+ .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS);
+ DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1,
+ .priority = 1, .prog_fd = ingress_fd);
+ DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1,
+ .priority = 1, .prog_fd = egress_fd);
+
+ ret = bpf_tc_hook_create(&hook);
+ if (!ASSERT_OK(ret, "create tc hook"))
+ return ret;
+
+ if (ingress_fd >= 0) {
+ hook.attach_point = BPF_TC_INGRESS;
+ ret = bpf_tc_attach(&hook, &opts1);
+ if (!ASSERT_OK(ret, "bpf_tc_attach")) {
+ bpf_tc_hook_destroy(&hook);
+ return ret;
+ }
+ }
+
+ if (egress_fd >= 0) {
+ hook.attach_point = BPF_TC_EGRESS;
+ ret = bpf_tc_attach(&hook, &opts2);
+ if (!ASSERT_OK(ret, "bpf_tc_attach")) {
+ bpf_tc_hook_destroy(&hook);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
#ifdef TRAFFIC_MONITOR
struct tmonitor_ctx {
pcap_t *pcap;
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index ef208eefd571..79a010c88e11 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -255,6 +255,22 @@ struct tmonitor_ctx;
typedef int (*tm_print_fn_t)(const char *format, va_list args);
+/**
+ * tc_prog_attach - attach BPF program(s) to an interface
+ *
+ * Takes file descriptors pointing to at least one and at most two BPF
+ * programs, and attaches those programs to an interface's ingress, egress
+ * or both.
+ *
+ * @dev: string containing the interface name
+ * @ingress_fd: file descriptor of the program to attach to interface ingress
+ * @egress_fd: file descriptor of the program to attach to interface egress
+ *
+ * Returns 0 on success, -1 if no valid file descriptor has been found or
+ * the interface name is invalid, or the libbpf error if hook creation or attach failed.
+ */
+int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd);
+
#ifdef TRAFFIC_MONITOR
struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name,
const char *subtest_name);
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c b/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c
new file mode 100644
index 000000000000..f81a0c066505
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include "arena_strsearch.skel.h"
+
+static void test_arena_str(void)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts);
+ struct arena_strsearch *skel;
+ int ret;
+
+ skel = arena_strsearch__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "arena_strsearch__open_and_load"))
+ return;
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_strsearch), &opts);
+ ASSERT_OK(ret, "ret_add");
+ ASSERT_OK(opts.retval, "retval");
+ if (skel->bss->skip) {
+ printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__);
+ test__skip();
+ }
+ arena_strsearch__destroy(skel);
+}
+
+void test_arena_strsearch(void)
+{
+ if (test__start_subtest("arena_strsearch"))
+ test_arena_str();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c
new file mode 100644
index 000000000000..d138cc7b1bda
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <sys/syscall.h>
+#include <bpf/bpf.h>
+
+#include "bpf_gotox.skel.h"
+
+static void __test_run(struct bpf_program *prog, void *ctx_in, size_t ctx_size_in)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .ctx_in = ctx_in,
+ .ctx_size_in = ctx_size_in,
+ );
+ int err, prog_fd;
+
+ prog_fd = bpf_program__fd(prog);
+ err = bpf_prog_test_run_opts(prog_fd, &topts);
+ ASSERT_OK(err, "test_run_opts err");
+}
+
+static void __subtest(struct bpf_gotox *skel, void (*check)(struct bpf_gotox *))
+{
+ if (skel->data->skip)
+ test__skip();
+ else
+ check(skel);
+}
+
+static void check_simple(struct bpf_gotox *skel,
+ struct bpf_program *prog,
+ __u64 ctx_in,
+ __u64 expected)
+{
+ skel->bss->ret_user = 0;
+
+ __test_run(prog, &ctx_in, sizeof(ctx_in));
+
+ if (!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user"))
+ return;
+}
+
+static void check_simple_fentry(struct bpf_gotox *skel,
+ struct bpf_program *prog,
+ __u64 ctx_in,
+ __u64 expected)
+{
+ skel->bss->in_user = ctx_in;
+ skel->bss->ret_user = 0;
+
+ /* trigger */
+ usleep(1);
+
+ if (!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user"))
+ return;
+}
+
+/* validate that for two loads of the same jump table libbpf generates only one map */
+static void check_one_map_two_jumps(struct bpf_gotox *skel)
+{
+ struct bpf_prog_info prog_info;
+ struct bpf_map_info map_info;
+ __u32 len;
+ __u32 map_ids[16];
+ int prog_fd, map_fd;
+ int ret;
+ int i;
+ bool seen = false;
+
+ memset(&prog_info, 0, sizeof(prog_info));
+ prog_info.map_ids = (long)map_ids;
+ prog_info.nr_map_ids = ARRAY_SIZE(map_ids);
+ prog_fd = bpf_program__fd(skel->progs.one_map_two_jumps);
+ if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd(one_map_two_jumps)"))
+ return;
+
+ len = sizeof(prog_info);
+ ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &len);
+ if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(prog_fd)"))
+ return;
+
+ for (i = 0; i < prog_info.nr_map_ids; i++) {
+ map_fd = bpf_map_get_fd_by_id(map_ids[i]);
+ if (!ASSERT_GE(map_fd, 0, "bpf_map_get_fd_by_id"))
+ return;
+
+ len = sizeof(map_info);
+ memset(&map_info, 0, len);
+ ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &len);
+ if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(map_fd)")) {
+ close(map_fd);
+ return;
+ }
+
+ if (map_info.type == BPF_MAP_TYPE_INSN_ARRAY) {
+ if (!ASSERT_EQ(seen, false, "more than one INSN_ARRAY map")) {
+ close(map_fd);
+ return;
+ }
+ seen = true;
+ }
+ close(map_fd);
+ }
+
+ ASSERT_EQ(seen, true, "no INSN_ARRAY map");
+}
+
+static void check_one_switch(struct bpf_gotox *skel)
+{
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {2, 3, 4, 5, 7, 19, 19};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.one_switch, in[i], out[i]);
+}
+
+static void check_one_switch_non_zero_sec_off(struct bpf_gotox *skel)
+{
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {2, 3, 4, 5, 7, 19, 19};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.one_switch_non_zero_sec_off, in[i], out[i]);
+}
+
+static void check_two_switches(struct bpf_gotox *skel)
+{
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {103, 104, 107, 205, 115, 1019, 1019};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.two_switches, in[i], out[i]);
+}
+
+static void check_big_jump_table(struct bpf_gotox *skel)
+{
+ __u64 in[] = {0, 11, 27, 31, 22, 45, 99};
+ __u64 out[] = {2, 3, 4, 5, 19, 19, 19};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.big_jump_table, in[i], out[i]);
+}
+
+static void check_one_jump_two_maps(struct bpf_gotox *skel)
+{
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {12, 15, 7 , 15, 12, 15, 15};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.one_jump_two_maps, in[i], out[i]);
+}
+
+static void check_static_global(struct bpf_gotox *skel)
+{
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {2, 3, 4, 5, 7, 19, 19};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.use_static_global1, in[i], out[i]);
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.use_static_global2, in[i], out[i]);
+}
+
+static void check_nonstatic_global(struct bpf_gotox *skel)
+{
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {2, 3, 4, 5, 7, 19, 19};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.use_nonstatic_global1, in[i], out[i]);
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple(skel, skel->progs.use_nonstatic_global2, in[i], out[i]);
+}
+
+static void check_other_sec(struct bpf_gotox *skel)
+{
+ struct bpf_link *link;
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {2, 3, 4, 5, 7, 19, 19};
+ int i;
+
+ link = bpf_program__attach(skel->progs.simple_test_other_sec);
+ if (!ASSERT_OK_PTR(link, "link"))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple_fentry(skel, skel->progs.simple_test_other_sec, in[i], out[i]);
+
+ bpf_link__destroy(link);
+}
+
+static void check_static_global_other_sec(struct bpf_gotox *skel)
+{
+ struct bpf_link *link;
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {2, 3, 4, 5, 7, 19, 19};
+ int i;
+
+ link = bpf_program__attach(skel->progs.use_static_global_other_sec);
+ if (!ASSERT_OK_PTR(link, "link"))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple_fentry(skel, skel->progs.use_static_global_other_sec, in[i], out[i]);
+
+ bpf_link__destroy(link);
+}
+
+static void check_nonstatic_global_other_sec(struct bpf_gotox *skel)
+{
+ struct bpf_link *link;
+ __u64 in[] = {0, 1, 2, 3, 4, 5, 77};
+ __u64 out[] = {2, 3, 4, 5, 7, 19, 19};
+ int i;
+
+ link = bpf_program__attach(skel->progs.use_nonstatic_global_other_sec);
+ if (!ASSERT_OK_PTR(link, "link"))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(in); i++)
+ check_simple_fentry(skel, skel->progs.use_nonstatic_global_other_sec, in[i], out[i]);
+
+ bpf_link__destroy(link);
+}
+
+void test_bpf_gotox(void)
+{
+ struct bpf_gotox *skel;
+ int ret;
+
+ skel = bpf_gotox__open();
+ if (!ASSERT_NEQ(skel, NULL, "bpf_gotox__open"))
+ return;
+
+ ret = bpf_gotox__load(skel);
+ if (!ASSERT_OK(ret, "bpf_gotox__load"))
+ return;
+
+ skel->bss->pid = getpid();
+
+ if (test__start_subtest("one-switch"))
+ __subtest(skel, check_one_switch);
+
+ if (test__start_subtest("one-switch-non-zero-sec-offset"))
+ __subtest(skel, check_one_switch_non_zero_sec_off);
+
+ if (test__start_subtest("two-switches"))
+ __subtest(skel, check_two_switches);
+
+ if (test__start_subtest("big-jump-table"))
+ __subtest(skel, check_big_jump_table);
+
+ if (test__start_subtest("static-global"))
+ __subtest(skel, check_static_global);
+
+ if (test__start_subtest("nonstatic-global"))
+ __subtest(skel, check_nonstatic_global);
+
+ if (test__start_subtest("other-sec"))
+ __subtest(skel, check_other_sec);
+
+ if (test__start_subtest("static-global-other-sec"))
+ __subtest(skel, check_static_global_other_sec);
+
+ if (test__start_subtest("nonstatic-global-other-sec"))
+ __subtest(skel, check_nonstatic_global_other_sec);
+
+ if (test__start_subtest("one-jump-two-maps"))
+ __subtest(skel, check_one_jump_two_maps);
+
+ if (test__start_subtest("one-map-two-jumps"))
+ __subtest(skel, check_one_map_two_jumps);
+
+ bpf_gotox__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c b/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c
new file mode 100644
index 000000000000..269870bec941
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <bpf/bpf.h>
+#include <test_progs.h>
+
+#ifdef __x86_64__
+static int map_create(__u32 map_type, __u32 max_entries)
+{
+ const char *map_name = "insn_array";
+ __u32 key_size = 4;
+ __u32 value_size = sizeof(struct bpf_insn_array_value);
+
+ return bpf_map_create(map_type, map_name, key_size, value_size, max_entries, NULL);
+}
+
+static int prog_load(struct bpf_insn *insns, __u32 insn_cnt, int *fd_array, __u32 fd_array_cnt)
+{
+ LIBBPF_OPTS(bpf_prog_load_opts, opts);
+
+ opts.fd_array = fd_array;
+ opts.fd_array_cnt = fd_array_cnt;
+
+ return bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, &opts);
+}
+
+static void __check_success(struct bpf_insn *insns, __u32 insn_cnt, __u32 *map_in, __u32 *map_out)
+{
+ struct bpf_insn_array_value val = {};
+ int prog_fd = -1, map_fd, i;
+
+ map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, insn_cnt);
+ if (!ASSERT_GE(map_fd, 0, "map_create"))
+ return;
+
+ for (i = 0; i < insn_cnt; i++) {
+ val.orig_off = map_in[i];
+ if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
+ goto cleanup;
+ }
+
+ if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
+ goto cleanup;
+
+ prog_fd = prog_load(insns, insn_cnt, &map_fd, 1);
+ if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
+ goto cleanup;
+
+ for (i = 0; i < insn_cnt; i++) {
+ char buf[64];
+
+ if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
+ goto cleanup;
+
+ snprintf(buf, sizeof(buf), "val.xlated_off should be equal map_out[%d]", i);
+ ASSERT_EQ(val.xlated_off, map_out[i], buf);
+ }
+
+cleanup:
+ close(prog_fd);
+ close(map_fd);
+}
+
+/*
+ * Load a program which will not be mangled in any way by the verifier. Add an
+ * insn_array map pointing to every instruction. Check that it hasn't changed
+ * after the program load.
+ */
+static void check_one_to_one_mapping(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ __u32 map_in[] = {0, 1, 2, 3, 4, 5};
+ __u32 map_out[] = {0, 1, 2, 3, 4, 5};
+
+ __check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
+}
+
+/*
+ * Load a program with two patches (get jiffies, for simplicity). Add an
+ * insn_array map pointing to every instruction. Check how it was changed
+ * after the program load.
+ */
+static void check_simple(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ __u32 map_in[] = {0, 1, 2, 3, 4, 5};
+ __u32 map_out[] = {0, 1, 4, 5, 8, 9};
+
+ __check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
+}
+
+/*
+ * Verifier can delete code in two cases: nops & dead code. From insn
+ * array's point of view, the two cases are the same, so test using
+ * the simplest method: by loading some nops
+ */
+static void check_deletions(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ __u32 map_in[] = {0, 1, 2, 3, 4, 5};
+ __u32 map_out[] = {0, -1, 1, -1, 2, 3};
+
+ __check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
+}
+
+/*
+ * Same test as check_deletions, but also add code which adds instructions
+ */
+static void check_deletions_with_functions(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ };
+ __u32 map_in[] = { 0, 1, 2, 3, 4, 5, /* func */ 6, 7, 8, 9, 10};
+ __u32 map_out[] = {-1, 0, -1, 3, 4, 5, /* func */ -1, 6, -1, 9, 10};
+
+ __check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
+}
+
+/*
+ * Try to load a program with a map which points to outside of the program
+ */
+static void check_out_of_bounds_index(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ int prog_fd, map_fd;
+ struct bpf_insn_array_value val = {};
+ int key;
+
+ map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1);
+ if (!ASSERT_GE(map_fd, 0, "map_create"))
+ return;
+
+ key = 0;
+ val.orig_off = ARRAY_SIZE(insns); /* too big */
+ if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem"))
+ goto cleanup;
+
+ if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
+ goto cleanup;
+
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
+ if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) {
+ close(prog_fd);
+ goto cleanup;
+ }
+
+cleanup:
+ close(map_fd);
+}
+
+/*
+ * Try to load a program with a map which points to the middle of a 16-byte insn
+ */
+static void check_mid_insn_index(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_LD_IMM64(BPF_REG_0, 0), /* 2 x 8 */
+ BPF_EXIT_INSN(),
+ };
+ int prog_fd, map_fd;
+ struct bpf_insn_array_value val = {};
+ int key;
+
+ map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1);
+ if (!ASSERT_GE(map_fd, 0, "map_create"))
+ return;
+
+ key = 0;
+ val.orig_off = 1; /* middle of 16-byte instruction */
+ if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem"))
+ goto cleanup;
+
+ if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
+ goto cleanup;
+
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
+ if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) {
+ close(prog_fd);
+ goto cleanup;
+ }
+
+cleanup:
+ close(map_fd);
+}
+
+static void check_incorrect_index(void)
+{
+ check_out_of_bounds_index();
+ check_mid_insn_index();
+}
+
+static int set_bpf_jit_harden(char *level)
+{
+ char old_level;
+ int err = -1;
+ int fd = -1;
+
+ fd = open("/proc/sys/net/core/bpf_jit_harden", O_RDWR | O_NONBLOCK);
+ if (fd < 0) {
+ ASSERT_FAIL("open .../bpf_jit_harden returned %d (errno=%d)", fd, errno);
+ return -1;
+ }
+
+ err = read(fd, &old_level, 1);
+ if (err != 1) {
+ ASSERT_FAIL("read from .../bpf_jit_harden returned %d (errno=%d)", err, errno);
+ err = -1;
+ goto end;
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ err = write(fd, level, 1);
+ if (err != 1) {
+ ASSERT_FAIL("write to .../bpf_jit_harden returned %d (errno=%d)", err, errno);
+ err = -1;
+ goto end;
+ }
+
+ err = 0;
+ *level = old_level;
+end:
+ if (fd >= 0)
+ close(fd);
+ return err;
+}
+
+static void check_blindness(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+ int prog_fd = -1, map_fd;
+ struct bpf_insn_array_value val = {};
+ char bpf_jit_harden = '@'; /* non-existent value */
+ int i;
+
+ map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns));
+ if (!ASSERT_GE(map_fd, 0, "map_create"))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(insns); i++) {
+ val.orig_off = i;
+ if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
+ goto cleanup;
+ }
+
+ if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
+ goto cleanup;
+
+ bpf_jit_harden = '2';
+ if (set_bpf_jit_harden(&bpf_jit_harden)) {
+ bpf_jit_harden = '@'; /* open, read or write failed => no write was done */
+ goto cleanup;
+ }
+
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
+ if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(insns); i++) {
+ char fmt[32];
+
+ if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
+ goto cleanup;
+
+ snprintf(fmt, sizeof(fmt), "val should be equal 3*%d", i);
+ ASSERT_EQ(val.xlated_off, i * 3, fmt);
+ }
+
+cleanup:
+ /* restore the old one */
+ if (bpf_jit_harden != '@')
+ set_bpf_jit_harden(&bpf_jit_harden);
+
+ close(prog_fd);
+ close(map_fd);
+}
+
+/* Once the map has been initialized, it must be frozen before a program can be loaded with it */
+static void check_load_unfrozen_map(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ int prog_fd = -1, map_fd;
+ struct bpf_insn_array_value val = {};
+ int i;
+
+ map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns));
+ if (!ASSERT_GE(map_fd, 0, "map_create"))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(insns); i++) {
+ val.orig_off = i;
+ if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
+ goto cleanup;
+ }
+
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
+ if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)"))
+ goto cleanup;
+
+ /* correctness: now freeze the map, the program should load fine */
+
+ if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
+ goto cleanup;
+
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
+ if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(insns); i++) {
+ if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
+ goto cleanup;
+
+ ASSERT_EQ(val.xlated_off, i, "val should be equal i");
+ }
+
+cleanup:
+ close(prog_fd);
+ close(map_fd);
+}
+
+/* Map can be used only by one BPF program */
+static void check_no_map_reuse(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ int prog_fd = -1, map_fd, extra_fd = -1;
+ struct bpf_insn_array_value val = {};
+ int i;
+
+ map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns));
+ if (!ASSERT_GE(map_fd, 0, "map_create"))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(insns); i++) {
+ val.orig_off = i;
+ if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
+ goto cleanup;
+ }
+
+ if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
+ goto cleanup;
+
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
+ if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(insns); i++) {
+ if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
+ goto cleanup;
+
+ ASSERT_EQ(val.xlated_off, i, "val should be equal i");
+ }
+
+ extra_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
+ if (!ASSERT_EQ(extra_fd, -EBUSY, "program should have been rejected (extra_fd != -EBUSY)"))
+ goto cleanup;
+
+ /* correctness: check that prog is still loadable without fd_array */
+ extra_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0);
+ if (!ASSERT_GE(extra_fd, 0, "bpf(BPF_PROG_LOAD): expected no error"))
+ goto cleanup;
+
+cleanup:
+ close(extra_fd);
+ close(prog_fd);
+ close(map_fd);
+}
+
+static void check_bpf_no_lookup(void)
+{
+ struct bpf_insn insns[] = {
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ };
+ int prog_fd = -1, map_fd;
+
+ map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1);
+ if (!ASSERT_GE(map_fd, 0, "map_create"))
+ return;
+
+ insns[0].imm = map_fd;
+
+ if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
+ goto cleanup;
+
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0);
+ if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)"))
+ goto cleanup;
+
+ /* correctness: check that prog is still loadable with normal map */
+ close(map_fd);
+ map_fd = map_create(BPF_MAP_TYPE_ARRAY, 1);
+ insns[0].imm = map_fd;
+ prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0);
+ if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
+ goto cleanup;
+
+cleanup:
+ close(prog_fd);
+ close(map_fd);
+}
+
+static void check_bpf_side(void)
+{
+ check_bpf_no_lookup();
+}
+
+static void __test_bpf_insn_array(void)
+{
+ /* Test if offsets are adjusted properly */
+
+ if (test__start_subtest("one2one"))
+ check_one_to_one_mapping();
+
+ if (test__start_subtest("simple"))
+ check_simple();
+
+ if (test__start_subtest("deletions"))
+ check_deletions();
+
+ if (test__start_subtest("deletions-with-functions"))
+ check_deletions_with_functions();
+
+ if (test__start_subtest("blindness"))
+ check_blindness();
+
+ /* Check all kinds of operations and related restrictions */
+
+ if (test__start_subtest("incorrect-index"))
+ check_incorrect_index();
+
+ if (test__start_subtest("load-unfrozen-map"))
+ check_load_unfrozen_map();
+
+ if (test__start_subtest("no-map-reuse"))
+ check_no_map_reuse();
+
+ if (test__start_subtest("bpf-side-ops"))
+ check_bpf_side();
+}
+#else
+static void __test_bpf_insn_array(void)
+{
+ test__skip();
+}
+#endif
+
+void test_bpf_insn_array(void)
+{
+ __test_bpf_insn_array();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index 8a9ba4292109..054ecb6b1e9f 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -7496,6 +7496,71 @@ static struct btf_dedup_test dedup_tests[] = {
},
},
{
+ .descr = "dedup: recursive typedef",
+ /*
+ * This test simulates a recursive typedef, which in Go is defined as follows:
+ *
+ * type Foo func() Foo
+ *
+ * In BTF terms, this is represented as a TYPEDEF referencing
+ * a FUNC_PROTO that returns the same TYPEDEF.
+ */
+ .input = {
+ .raw_types = {
+ /*
+ * [1] typedef Foo -> func() Foo
+ * [2] func_proto() -> Foo
+ * [3] typedef Foo -> func() Foo
+ * [4] func_proto() -> Foo
+ */
+ BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 0), /* [2] */
+ BTF_TYPEDEF_ENC(NAME_NTH(1), 4), /* [3] */
+ BTF_FUNC_PROTO_ENC(3, 0), /* [4] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0Foo"),
+ },
+ .expect = {
+ .raw_types = {
+ BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 0), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0Foo"),
+ },
+},
+{
+ .descr = "dedup: typedef",
+ /*
+ * // CU 1:
+ * typedef int foo;
+ *
+ * // CU 2:
+ * typedef int foo;
+ */
+ .input = {
+ .raw_types = {
+ /* CU 1 */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */
+ /* CU 2 */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [3] */
+ BTF_TYPEDEF_ENC(NAME_NTH(1), 3), /* [4] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0foo"),
+ },
+ .expect = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0foo"),
+ },
+},
+{
.descr = "dedup: typedef tags",
.input = {
.raw_types = {
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_split.c b/tools/testing/selftests/bpf/prog_tests/btf_split.c
index 3696fb9a05ed..2d47cad50a51 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_split.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_split.c
@@ -12,11 +12,45 @@ static void btf_dump_printf(void *ctx, const char *fmt, va_list args)
vfprintf(ctx, fmt, args);
}
+/* Write raw BTF to file, return number of bytes written or negative errno */
+static ssize_t btf_raw_write(struct btf *btf, char *file)
+{
+ ssize_t written = 0;
+ const void *data;
+ __u32 size = 0;
+ int fd, ret;
+
+ fd = mkstemp(file);
+ if (!ASSERT_GE(fd, 0, "create_file"))
+ return -errno;
+
+ data = btf__raw_data(btf, &size);
+ if (!ASSERT_OK_PTR(data, "btf__raw_data")) {
+ close(fd);
+ return -EINVAL;
+ }
+ while (written < size) {
+ ret = write(fd, data + written, size - written);
+ if (!ASSERT_GE(ret, 0, "write succeeded")) {
+ close(fd);
+ return -errno;
+ }
+ written += ret;
+ }
+ close(fd);
+ return written;
+}
+
static void __test_btf_split(bool multi)
{
+ char multisplit_btf_file[] = "/tmp/test_btf_multisplit.XXXXXX";
+ char split_btf_file[] = "/tmp/test_btf_split.XXXXXX";
+ char base_btf_file[] = "/tmp/test_btf_base.XXXXXX";
+ ssize_t multisplit_btf_sz = 0, split_btf_sz = 0, base_btf_sz = 0;
struct btf_dump *d = NULL;
- const struct btf_type *t;
- struct btf *btf1, *btf2, *btf3 = NULL;
+ const struct btf_type *t, *ot;
+ struct btf *btf1 = NULL, *btf2 = NULL, *btf3 = NULL;
+ struct btf *btf4 = NULL, *btf5 = NULL, *btf6 = NULL;
int str_off, i, err;
btf1 = btf__new_empty();
@@ -123,6 +157,45 @@ static void __test_btf_split(bool multi)
" int uf2;\n"
"};\n\n", "c_dump");
+ /* write base, split BTFs to files and ensure parsing succeeds */
+ base_btf_sz = btf_raw_write(btf1, base_btf_file);
+ if (base_btf_sz < 0)
+ goto cleanup;
+ split_btf_sz = btf_raw_write(btf2, split_btf_file);
+ if (split_btf_sz < 0)
+ goto cleanup;
+ btf4 = btf__parse(base_btf_file, NULL);
+ if (!ASSERT_OK_PTR(btf4, "parse_base"))
+ goto cleanup;
+ btf5 = btf__parse_split(split_btf_file, btf4);
+ if (!ASSERT_OK_PTR(btf5, "parse_split"))
+ goto cleanup;
+ if (multi) {
+ multisplit_btf_sz = btf_raw_write(btf3, multisplit_btf_file);
+ if (multisplit_btf_sz < 0)
+ goto cleanup;
+ btf6 = btf__parse_split(multisplit_btf_file, btf5);
+ if (!ASSERT_OK_PTR(btf6, "parse_multisplit"))
+ goto cleanup;
+ } else {
+ btf6 = btf5;
+ }
+
+ if (!ASSERT_EQ(btf__type_cnt(btf3), btf__type_cnt(btf6), "cmp_type_cnt"))
+ goto cleanup;
+
+ /* compare parsed to original BTF */
+ for (i = 1; i < btf__type_cnt(btf6); i++) {
+ t = btf__type_by_id(btf6, i);
+ if (!ASSERT_OK_PTR(t, "type_in_parsed_btf"))
+ goto cleanup;
+ ot = btf__type_by_id(btf3, i);
+ if (!ASSERT_OK_PTR(ot, "type_in_orig_btf"))
+ goto cleanup;
+ if (!ASSERT_EQ(memcmp(t, ot, sizeof(*ot)), 0, "cmp_parsed_orig_btf"))
+ goto cleanup;
+ }
+
cleanup:
if (dump_buf_file)
fclose(dump_buf_file);
@@ -132,6 +205,16 @@ cleanup:
btf__free(btf2);
if (btf2 != btf3)
btf__free(btf3);
+ btf__free(btf4);
+ btf__free(btf5);
+ if (btf5 != btf6)
+ btf__free(btf6);
+ if (base_btf_sz > 0)
+ unlink(base_btf_file);
+ if (split_btf_sz > 0)
+ unlink(split_btf_file);
+ if (multisplit_btf_sz > 0)
+ unlink(multisplit_btf_file);
}
void test_btf_split(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c
index 2a9a30650350..65b4512967e7 100644
--- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c
+++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c
@@ -153,6 +153,26 @@ static void test_check_mtu_run_tc(struct test_check_mtu *skel,
ASSERT_EQ(mtu_result, mtu_expect, "MTU-compare-user");
}
+static void test_chk_segs_flag(struct test_check_mtu *skel, __u32 mtu)
+{
+ int err, prog_fd = bpf_program__fd(skel->progs.tc_chk_segs_flag);
+ struct __sk_buff skb = {
+ .gso_size = 10,
+ };
+ LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .ctx_in = &skb,
+ .ctx_size_in = sizeof(skb),
+ );
+
+ /* Lower the mtu to test the BPF_MTU_CHK_SEGS */
+ SYS_NOFAIL("ip link set dev lo mtu 10");
+ err = bpf_prog_test_run_opts(prog_fd, &topts);
+ SYS_NOFAIL("ip link set dev lo mtu %u", mtu);
+ ASSERT_OK(err, "test_run");
+ ASSERT_EQ(topts.retval, BPF_OK, "retval");
+}
static void test_check_mtu_tc(__u32 mtu, __u32 ifindex)
{
@@ -177,11 +197,12 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex)
test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu);
test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu);
test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu);
+ test_chk_segs_flag(skel, mtu);
cleanup:
test_check_mtu__destroy(skel);
}
-void serial_test_check_mtu(void)
+void test_ns_check_mtu(void)
{
int mtu_lo;
diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
index 34b59f6baca1..7488a7606e6a 100644
--- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
@@ -22,79 +22,37 @@
static int duration = 0;
-struct addr_port {
- in_port_t port;
- union {
- struct in_addr in_addr;
- struct in6_addr in6_addr;
- };
-};
-
-struct tuple {
- int family;
- struct addr_port src;
- struct addr_port dst;
-};
-
-static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
-{
- const struct sockaddr_in6 *in6;
- const struct sockaddr_in *in;
-
- switch (sa->sa_family) {
- case AF_INET:
- in = (const struct sockaddr_in *)sa;
- ap->in_addr = in->sin_addr;
- ap->port = in->sin_port;
- return true;
-
- case AF_INET6:
- in6 = (const struct sockaddr_in6 *)sa;
- ap->in6_addr = in6->sin6_addr;
- ap->port = in6->sin6_port;
- return true;
-
- default:
- return false;
- }
-}
-static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
- int *server, int *conn, struct tuple *tuple)
+static bool set_up_conn(const struct sockaddr_storage *addr, socklen_t len, int type,
+ int *server, int *conn,
+ struct sockaddr_storage *src,
+ struct sockaddr_storage *dst)
{
struct sockaddr_storage ss;
socklen_t slen = sizeof(ss);
- struct sockaddr *sa = (struct sockaddr *)&ss;
- *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL);
+ *server = start_server_addr(type, addr, len, NULL);
if (*server < 0)
return false;
- if (CHECK_FAIL(getsockname(*server, sa, &slen)))
+ if (CHECK_FAIL(getsockname(*server, (struct sockaddr *)&ss, &slen)))
goto close_server;
- *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL);
+ *conn = connect_to_addr(type, &ss, slen, NULL);
if (*conn < 0)
goto close_server;
/* We want to simulate packets arriving at conn, so we have to
* swap src and dst.
*/
- slen = sizeof(ss);
- if (CHECK_FAIL(getsockname(*conn, sa, &slen)))
- goto close_conn;
-
- if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst)))
+ slen = sizeof(*dst);
+ if (CHECK_FAIL(getsockname(*conn, (struct sockaddr *)dst, &slen)))
goto close_conn;
- slen = sizeof(ss);
- if (CHECK_FAIL(getpeername(*conn, sa, &slen)))
+ slen = sizeof(*src);
+ if (CHECK_FAIL(getpeername(*conn, (struct sockaddr *)src, &slen)))
goto close_conn;
- if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src)))
- goto close_conn;
-
- tuple->family = ss.ss_family;
return true;
close_conn:
@@ -110,17 +68,16 @@ static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
{
struct sockaddr_in *addr4;
struct sockaddr_in6 *addr6;
+ memset(addr, 0, sizeof(*addr));
switch (family) {
case AF_INET:
addr4 = (struct sockaddr_in *)addr;
- memset(addr4, 0, sizeof(*addr4));
addr4->sin_family = family;
addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
return sizeof(*addr4);
case AF_INET6:
addr6 = (struct sockaddr_in6 *)addr;
- memset(addr6, 0, sizeof(*addr6));
addr6->sin6_family = family;
addr6->sin6_addr = in6addr_loopback;
return sizeof(*addr6);
@@ -242,9 +199,15 @@ static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto)
}
static size_t build_input(const struct test_cfg *test, void *const buf,
- const struct tuple *tuple)
+ const struct sockaddr_storage *src,
+ const struct sockaddr_storage *dst)
{
- in_port_t sport = tuple->src.port;
+ struct sockaddr_in6 *src_in6 = (struct sockaddr_in6 *)src;
+ struct sockaddr_in6 *dst_in6 = (struct sockaddr_in6 *)dst;
+ struct sockaddr_in *src_in = (struct sockaddr_in *)src;
+ struct sockaddr_in *dst_in = (struct sockaddr_in *)dst;
+ sa_family_t family = src->ss_family;
+ in_port_t sport, dport;
encap_headers_t encap;
struct iphdr ip;
struct ipv6hdr ipv6;
@@ -254,8 +217,11 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
uint8_t *p = buf;
int proto;
+ sport = (family == AF_INET) ? src_in->sin_port : src_in6->sin6_port;
+ dport = (family == AF_INET) ? dst_in->sin_port : dst_in6->sin6_port;
+
proto = IPPROTO_IPIP;
- if (tuple->family == AF_INET6)
+ if (family == AF_INET6)
proto = IPPROTO_IPV6;
encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto);
@@ -270,15 +236,15 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
if (test->type == UDP)
proto = IPPROTO_UDP;
- switch (tuple->family) {
+ switch (family) {
case AF_INET:
ip = (struct iphdr){
.ihl = 5,
.version = 4,
.ttl = IPDEFTTL,
.protocol = proto,
- .saddr = tuple->src.in_addr.s_addr,
- .daddr = tuple->dst.in_addr.s_addr,
+ .saddr = src_in->sin_addr.s_addr,
+ .daddr = dst_in->sin_addr.s_addr,
};
p = mempcpy(p, &ip, sizeof(ip));
break;
@@ -287,8 +253,8 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
.version = 6,
.hop_limit = IPDEFTTL,
.nexthdr = proto,
- .saddr = tuple->src.in6_addr,
- .daddr = tuple->dst.in6_addr,
+ .saddr = src_in6->sin6_addr,
+ .daddr = dst_in6->sin6_addr,
};
p = mempcpy(p, &ipv6, sizeof(ipv6));
break;
@@ -303,18 +269,16 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
case TCP:
tcp = (struct tcphdr){
.source = sport,
- .dest = tuple->dst.port,
+ .dest = dport,
+ .syn = (test->flags == SYN),
+ .ack = (test->flags == ACK),
};
- if (test->flags == SYN)
- tcp.syn = true;
- if (test->flags == ACK)
- tcp.ack = true;
p = mempcpy(p, &tcp, sizeof(tcp));
break;
case UDP:
udp = (struct udphdr){
.source = sport,
- .dest = tuple->dst.port,
+ .dest = dport,
};
p = mempcpy(p, &udp, sizeof(udp));
break;
@@ -339,27 +303,26 @@ static void test_cls_redirect_common(struct bpf_program *prog)
LIBBPF_OPTS(bpf_test_run_opts, tattr);
int families[] = { AF_INET, AF_INET6 };
struct sockaddr_storage ss;
- struct sockaddr *addr;
socklen_t slen;
int i, j, err, prog_fd;
int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
- struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
+ struct sockaddr_storage srcs[__NR_KIND][ARRAY_SIZE(families)];
+ struct sockaddr_storage dsts[__NR_KIND][ARRAY_SIZE(families)];
- addr = (struct sockaddr *)&ss;
for (i = 0; i < ARRAY_SIZE(families); i++) {
slen = prepare_addr(&ss, families[i]);
if (CHECK_FAIL(!slen))
goto cleanup;
- if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM,
+ if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_DGRAM,
&servers[UDP][i], &conns[UDP][i],
- &tuples[UDP][i])))
+ &srcs[UDP][i], &dsts[UDP][i])))
goto cleanup;
- if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM,
+ if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_STREAM,
&servers[TCP][i], &conns[TCP][i],
- &tuples[TCP][i])))
+ &srcs[TCP][i], &dsts[TCP][i])))
goto cleanup;
}
@@ -368,11 +331,12 @@ static void test_cls_redirect_common(struct bpf_program *prog)
struct test_cfg *test = &tests[i];
for (j = 0; j < ARRAY_SIZE(families); j++) {
- struct tuple *tuple = &tuples[test->type][j];
+ struct sockaddr_storage *src = &srcs[test->type][j];
+ struct sockaddr_storage *dst = &dsts[test->type][j];
char input[256];
char tmp[256];
- test_str(tmp, sizeof(tmp), test, tuple->family);
+ test_str(tmp, sizeof(tmp), test, families[j]);
if (!test__start_subtest(tmp))
continue;
@@ -380,7 +344,7 @@ static void test_cls_redirect_common(struct bpf_program *prog)
tattr.data_size_out = sizeof(tmp);
tattr.data_in = input;
- tattr.data_size_in = build_input(test, input, tuple);
+ tattr.data_size_in = build_input(test, input, src, dst);
if (CHECK_FAIL(!tattr.data_size_in))
continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/file_reader.c b/tools/testing/selftests/bpf/prog_tests/file_reader.c
new file mode 100644
index 000000000000..5cde32b35da4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/file_reader.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "file_reader.skel.h"
+#include "file_reader_fail.skel.h"
+#include <dlfcn.h>
+#include <sys/mman.h>
+
+const char *user_ptr = "hello world";
+char file_contents[256000];
+
+/*
+ * Look up this function's own address with dladdr() and return the
+ * dli_fbase field of the result. Returns NULL, after printing a message to
+ * stderr, when dladdr() fails.
+ */
+void *get_executable_base_addr(void)
+{
+ Dl_info info;
+
+ if (!dladdr((void *)&get_executable_base_addr, &info)) {
+ fprintf(stderr, "dladdr failed\n");
+ return NULL;
+ }
+
+ return info.dli_fbase;
+}
+
+/*
+ * Fill file_contents with the first sizeof(file_contents) bytes of
+ * /proc/self/exe, then call madvise(MADV_PAGEOUT) page by page on
+ * 2 * sizeof(file_contents) bytes starting at the page-aligned address
+ * returned by get_executable_base_addr().
+ *
+ * Returns 0 on success, 1 if the open, the read or the base address lookup
+ * fails, and errno if a madvise() call fails.
+ */
+static int initialize_file_contents(void)
+{
+ int fd, page_sz = sysconf(_SC_PAGESIZE);
+ ssize_t n = 0, cur, off;
+ void *addr;
+
+ fd = open("/proc/self/exe", O_RDONLY);
+ if (!ASSERT_OK_FD(fd, "Open /proc/self/exe\n"))
+ return 1;
+
+ /* read() may return short counts, keep going until the buffer is full */
+ do {
+ cur = read(fd, file_contents + n, sizeof(file_contents) - n);
+ if (!ASSERT_GT(cur, 0, "read success"))
+ break;
+ n += cur;
+ } while (n < sizeof(file_contents));
+
+ close(fd);
+
+ /* A read error or early EOF leaves n short and is reported here */
+ if (!ASSERT_EQ(n, sizeof(file_contents), "Read /proc/self/exe\n"))
+ return 1;
+
+ addr = get_executable_base_addr();
+ if (!ASSERT_NEQ(addr, NULL, "get executable address"))
+ return 1;
+
+ /* page-align base file address */
+ addr = (void *)((unsigned long)addr & ~(page_sz - 1));
+
+ /*
+ * Page out range 0..512K, use 0..256K for positive tests and
+ * 256K..512K for negative tests expecting page faults
+ */
+ for (off = 0; off < sizeof(file_contents) * 2; off += page_sz) {
+ if (!ASSERT_OK(madvise(addr + off, page_sz, MADV_PAGEOUT),
+ "madvise pageout"))
+ return errno;
+ }
+
+ return 0;
+}
+
+/*
+ * Run a single program of the file_reader skeleton, selected by @prog_name.
+ *
+ * The file contents are (re)initialized, the skeleton is opened with autoload
+ * enabled only for the program whose name equals @prog_name, and user_buf and
+ * pid in the skeleton's bss are filled in before load and attach. After
+ * /proc/self/exe has been opened and closed once, bss->err must be 0 and
+ * bss->run_success must be 1.
+ */
+static void run_test(const char *prog_name)
+{
+ struct file_reader *skel;
+ struct bpf_program *prog;
+ int err, fd;
+
+ err = initialize_file_contents();
+ if (!ASSERT_OK(err, "initialize file contents"))
+ return;
+
+ skel = file_reader__open();
+ if (!ASSERT_OK_PTR(skel, "file_reader__open"))
+ return;
+
+ /* Autoload exactly the program under test, disable all the others */
+ bpf_object__for_each_program(prog, skel->obj) {
+ bpf_program__set_autoload(prog, strcmp(bpf_program__name(prog), prog_name) == 0);
+ }
+
+ memcpy(skel->bss->user_buf, file_contents, sizeof(file_contents));
+ skel->bss->pid = getpid();
+
+ err = file_reader__load(skel);
+ if (!ASSERT_OK(err, "file_reader__load"))
+ goto cleanup;
+
+ err = file_reader__attach(skel);
+ if (!ASSERT_OK(err, "file_reader__attach"))
+ goto cleanup;
+
+ /*
+ * NOTE(review): this open() is presumably what fires the attached
+ * program; confirm against the BPF side. Its result is not asserted.
+ */
+ fd = open("/proc/self/exe", O_RDONLY);
+ if (fd >= 0)
+ close(fd);
+
+ ASSERT_EQ(skel->bss->err, 0, "err");
+ ASSERT_EQ(skel->bss->run_success, 1, "run_success");
+cleanup:
+ file_reader__destroy(skel);
+}
+
+/*
+ * Test entry point. The "on_open_expect_fault" and
+ * "on_open_validate_file_read" subtests each run the program of the same
+ * name through run_test(); the "negative" subtest hands the file_reader_fail
+ * skeleton to RUN_TESTS().
+ */
+void test_file_reader(void)
+{
+ if (test__start_subtest("on_open_expect_fault"))
+ run_test("on_open_expect_fault");
+
+ if (test__start_subtest("on_open_validate_file_read"))
+ run_test("on_open_validate_file_read");
+
+ if (test__start_subtest("negative"))
+ RUN_TESTS(file_reader_fail);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c
index 2bc85f4814f4..d0b405eb2966 100644
--- a/tools/testing/selftests/bpf/prog_tests/htab_update.c
+++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c
@@ -15,17 +15,17 @@ struct htab_update_ctx {
static void test_reenter_update(void)
{
struct htab_update *skel;
- unsigned int key, value;
+ void *value = NULL;
+ unsigned int key, value_size;
int err;
skel = htab_update__open();
if (!ASSERT_OK_PTR(skel, "htab_update__open"))
return;
- /* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */
- bpf_program__set_autoload(skel->progs.lookup_elem_raw, true);
+ bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true);
err = htab_update__load(skel);
- if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err)
+ if (!ASSERT_TRUE(!err, "htab_update__load") || err)
goto out;
skel->bss->pid = getpid();
@@ -33,14 +33,33 @@ static void test_reenter_update(void)
if (!ASSERT_OK(err, "htab_update__attach"))
goto out;
- /* Will trigger the reentrancy of bpf_map_update_elem() */
+ value_size = bpf_map__value_size(skel->maps.htab);
+
+ value = calloc(1, value_size);
+ if (!ASSERT_OK_PTR(value, "calloc value"))
+ goto out;
+ /*
+ * First update: plain insert. This should NOT trigger the re-entrancy
+ * path, because there is no old element to free yet.
+ */
key = 0;
- value = 0;
- err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0);
- if (!ASSERT_OK(err, "add element"))
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY);
+ if (!ASSERT_OK(err, "first update (insert)"))
+ goto out;
+
+ /*
+ * Second update: replace existing element with same key and trigger
+ * the reentrancy of bpf_map_update_elem().
+ * check_and_free_fields() calls bpf_obj_free_fields() on the old
+ * value, which is where fentry program runs and performs a nested
+ * bpf_map_update_elem(), triggering -EDEADLK.
+ */
+ memset(value, 0, value_size);
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY);
+ if (!ASSERT_OK(err, "second update (replace)"))
goto out;
- ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy");
+ ASSERT_EQ(skel->bss->update_err, -EDEADLK, "no reentrancy");
out:
htab_update__destroy(skel);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
index 1de14b111931..6e35e13c2022 100644
--- a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c
@@ -57,7 +57,8 @@ static void subtest_kmem_cache_iter_check_slabinfo(struct kmem_cache_iter *skel)
if (!ASSERT_OK(ret, "kmem_cache_lookup"))
break;
- ASSERT_STREQ(r.name, name, "kmem_cache_name");
+ ASSERT_STRNEQ(r.name, name, sizeof(r.name) - 1,
+ "kmem_cache_name");
ASSERT_EQ(r.obj_size, objsize, "kmem_cache_objsize");
seen++;
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
index bc24f83339d6..0a7ef770c487 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
@@ -15,6 +15,10 @@ static void check_good_sample(struct test_perf_branches *skel)
int pbe_size = sizeof(struct perf_branch_entry);
int duration = 0;
+ if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
+ "checked sample validity before prog run"))
+ return;
+
if (CHECK(!skel->bss->valid, "output not valid",
"no valid sample from prog"))
return;
@@ -45,6 +49,10 @@ static void check_bad_sample(struct test_perf_branches *skel)
int written_stack = skel->bss->written_stack_out;
int duration = 0;
+ if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
+ "checked sample validity before prog run"))
+ return;
+
if (CHECK(!skel->bss->valid, "output not valid",
"no valid sample from prog"))
return;
@@ -83,8 +91,12 @@ static void test_perf_branches_common(int perf_fd,
err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err))
goto out_destroy;
- /* spin the loop for a while (random high number) */
- for (i = 0; i < 1000000; ++i)
+
+ /* Spin the loop for a while by using a high iteration count, and by
+ * checking whether the specific run count marker has been explicitly
+ * incremented at least once by the backing perf_event BPF program.
+ */
+ for (i = 0; i < 100000000 && !*(volatile int *)&skel->bss->run_cnt; ++i)
++j;
test_perf_branches__detach(skel);
@@ -116,11 +128,11 @@ static void test_perf_branches_hw(void)
pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
/*
- * Some setups don't support branch records (virtual machines, !x86),
- * so skip test in this case.
+ * Some setups don't support LBR (virtual machines, !x86, AMD Milan Zen
+ * 3 which only supports BRS), so skip test in this case.
*/
if (pfd < 0) {
- if (errno == ENOENT || errno == EOPNOTSUPP) {
+ if (errno == ENOENT || errno == EOPNOTSUPP || errno == EINVAL) {
printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n",
__func__);
test__skip();
diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
index c9f855e5da24..246eb259c08a 100644
--- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
@@ -28,6 +28,7 @@ static void test_success(void)
bpf_program__set_autoload(skel->progs.two_regions, true);
bpf_program__set_autoload(skel->progs.non_sleepable_1, true);
bpf_program__set_autoload(skel->progs.non_sleepable_2, true);
+ bpf_program__set_autoload(skel->progs.nested_rcu_region, true);
bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true);
bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog, true);
bpf_program__set_autoload(skel->progs.rcu_read_lock_global_subprog, true);
@@ -78,7 +79,8 @@ static const char * const inproper_region_tests[] = {
"non_sleepable_rcu_mismatch",
"inproper_sleepable_helper",
"inproper_sleepable_kfunc",
- "nested_rcu_region",
+ "nested_rcu_region_unbalanced_1",
+ "nested_rcu_region_unbalanced_2",
"rcu_read_lock_global_subprog_lock",
"rcu_read_lock_global_subprog_unlock",
"rcu_read_lock_sleepable_helper_global_subprog",
diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
index d6bd5e16e637..d2c0542716a8 100644
--- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
@@ -44,3 +44,59 @@ void test_refcounted_kptr_wrong_owner(void)
ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval");
refcounted_kptr__destroy(skel);
}
+
+/*
+ * Exercise the percpu_hash map of the refcounted_kptr skeleton:
+ *  1. update key 0 with an all-zero buffer holding one u64 per possible CPU;
+ *  2. run percpu_hash_refcount_leak and expect it to return 2;
+ *  3. update key 0 again with the same zeroed buffer;
+ *  4. run check_percpu_hash_refcount and expect it to return 1.
+ * Both programs are run once through bpf_prog_test_run_opts() with pkt_v4 as
+ * input data.
+ */
+void test_percpu_hash_refcounted_kptr_refcount_leak(void)
+{
+ struct refcounted_kptr *skel;
+ int cpu_nr, fd, err, key = 0;
+ struct bpf_map *map;
+ size_t values_sz;
+ u64 *values;
+ LIBBPF_OPTS(bpf_test_run_opts, opts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .repeat = 1,
+ );
+
+ cpu_nr = libbpf_num_possible_cpus();
+ if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus"))
+ return;
+
+ /* One u64 slot per possible CPU, as passed to bpf_map__update_elem() below */
+ values = calloc(cpu_nr, sizeof(u64));
+ if (!ASSERT_OK_PTR(values, "calloc values"))
+ return;
+
+ skel = refcounted_kptr__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load")) {
+ free(values);
+ return;
+ }
+
+ values_sz = cpu_nr * sizeof(u64);
+ /* The buffer already comes zeroed from calloc(); this keeps it explicit */
+ memset(values, 0, values_sz);
+
+ map = skel->maps.percpu_hash;
+ err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0);
+ if (!ASSERT_OK(err, "bpf_map__update_elem"))
+ goto out;
+
+ fd = bpf_program__fd(skel->progs.percpu_hash_refcount_leak);
+ err = bpf_prog_test_run_opts(fd, &opts);
+ if (!ASSERT_OK(err, "bpf_prog_test_run_opts"))
+ goto out;
+ if (!ASSERT_EQ(opts.retval, 2, "opts.retval"))
+ goto out;
+
+ /* Update the same key a second time before running the checker program */
+ err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0);
+ if (!ASSERT_OK(err, "bpf_map__update_elem"))
+ goto out;
+
+ fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount);
+ err = bpf_prog_test_run_opts(fd, &opts);
+ ASSERT_OK(err, "bpf_prog_test_run_opts");
+ ASSERT_EQ(opts.retval, 1, "opts.retval");
+
+out:
+ refcounted_kptr__destroy(skel);
+ free(values);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c
index 8c6c2043a432..f0a8c828f8f1 100644
--- a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c
@@ -110,8 +110,8 @@ void serial_test_res_spin_lock_stress(void)
ASSERT_OK(load_module("bpf_test_rqspinlock.ko", false), "load module AA");
sleep(5);
unload_module("bpf_test_rqspinlock", false);
-
- ASSERT_OK(load_module_params("bpf_test_rqspinlock.ko", "test_ab=1", false), "load module ABBA");
- sleep(5);
- unload_module("bpf_test_rqspinlock", false);
+ /*
+ * Insert bpf_test_rqspinlock.ko manually with test_mode=[1|2] to test
+ * other cases (ABBA, ABBCCA).
+ */
}
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
index d1e4cb28a72c..64520684d2cb 100644
--- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
@@ -17,6 +17,7 @@
#include "test_ringbuf_n.lskel.h"
#include "test_ringbuf_map_key.lskel.h"
#include "test_ringbuf_write.lskel.h"
+#include "test_ringbuf_overwrite.lskel.h"
#define EDONE 7777
@@ -497,6 +498,68 @@ cleanup:
test_ringbuf_map_key_lskel__destroy(skel_map_key);
}
+/*
+ * Subtest for the test_ringbuf_overwrite skeleton.
+ *
+ * The ring buffer is sized to one page and five lengths (LEN1..LEN5) are
+ * passed to the BPF program through rodata. After the program has been
+ * loaded, attached and given a chance to run, the values it stored in bss
+ * are compared with what is expected: only the third reservation fails, and
+ * ring size, available data and the consumer, producer and overwrite
+ * positions match the numbers computed from the five lengths.
+ */
+static void ringbuf_overwrite_mode_subtest(void)
+{
+ unsigned long size, len1, len2, len3, len4, len5;
+ unsigned long expect_avail_data, expect_prod_pos, expect_over_pos;
+ struct test_ringbuf_overwrite_lskel *skel;
+ int page_size = getpagesize();
+ int err;
+
+ skel = test_ringbuf_overwrite_lskel__open();
+ if (!ASSERT_OK_PTR(skel, "skel_open"))
+ return;
+
+ size = page_size;
+ len1 = page_size / 2;
+ len2 = page_size / 4;
+ /* What is left of the ring after len1, len2 and three record headers */
+ len3 = size - len1 - len2 - BPF_RINGBUF_HDR_SZ * 3;
+ len4 = len3 - 8;
+ len5 = len3; /* retry with len3 */
+
+ skel->maps.ringbuf.max_entries = size;
+ skel->rodata->LEN1 = len1;
+ skel->rodata->LEN2 = len2;
+ skel->rodata->LEN3 = len3;
+ skel->rodata->LEN4 = len4;
+ skel->rodata->LEN5 = len5;
+
+ skel->bss->pid = getpid();
+
+ err = test_ringbuf_overwrite_lskel__load(skel);
+ if (!ASSERT_OK(err, "skel_load"))
+ goto cleanup;
+
+ err = test_ringbuf_overwrite_lskel__attach(skel);
+ if (!ASSERT_OK(err, "skel_attach"))
+ goto cleanup;
+
+ /* NOTE(review): presumably the syscall the BPF program hooks; confirm in progs/ */
+ syscall(__NR_getpgid);
+
+ ASSERT_EQ(skel->bss->reserve1_fail, 0, "reserve 1");
+ ASSERT_EQ(skel->bss->reserve2_fail, 0, "reserve 2");
+ ASSERT_EQ(skel->bss->reserve3_fail, 1, "reserve 3");
+ ASSERT_EQ(skel->bss->reserve4_fail, 0, "reserve 4");
+ ASSERT_EQ(skel->bss->reserve5_fail, 0, "reserve 5");
+
+ ASSERT_EQ(skel->bss->ring_size, size, "check_ring_size");
+
+ /* Records 2, 4 and 5 plus their headers are expected to remain available */
+ expect_avail_data = len2 + len4 + len5 + 3 * BPF_RINGBUF_HDR_SZ;
+ ASSERT_EQ(skel->bss->avail_data, expect_avail_data, "check_avail_size");
+
+ ASSERT_EQ(skel->bss->cons_pos, 0, "check_cons_pos");
+
+ /* The producer is expected to have advanced past the four successful records */
+ expect_prod_pos = len1 + len2 + len4 + len5 + 4 * BPF_RINGBUF_HDR_SZ;
+ ASSERT_EQ(skel->bss->prod_pos, expect_prod_pos, "check_prod_pos");
+
+ /* The overwrite position is expected to sit just past the first record */
+ expect_over_pos = len1 + BPF_RINGBUF_HDR_SZ;
+ ASSERT_EQ(skel->bss->over_pos, expect_over_pos, "check_over_pos");
+
+ test_ringbuf_overwrite_lskel__detach(skel);
+cleanup:
+ test_ringbuf_overwrite_lskel__destroy(skel);
+}
+
void test_ringbuf(void)
{
if (test__start_subtest("ringbuf"))
@@ -507,4 +570,6 @@ void test_ringbuf(void)
ringbuf_map_key_subtest();
if (test__start_subtest("ringbuf_write"))
ringbuf_write_subtest();
+ if (test__start_subtest("ringbuf_overwrite_mode"))
+ ringbuf_overwrite_mode_subtest();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
index 036d4760d2c1..3dbcc091f16c 100644
--- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
@@ -41,11 +41,7 @@ static struct bpf_object *obj;
static __u32 index_zero;
static int epfd;
-static union sa46 {
- struct sockaddr_in6 v6;
- struct sockaddr_in v4;
- sa_family_t family;
-} srv_sa;
+static struct sockaddr_storage srv_sa;
#define RET_IF(condition, tag, format...) ({ \
if (CHECK_FAIL(condition)) { \
@@ -135,24 +131,24 @@ static int prepare_bpf_obj(void)
return 0;
}
-static void sa46_init_loopback(union sa46 *sa, sa_family_t family)
+static void ss_init_loopback(struct sockaddr_storage *sa, sa_family_t family)
{
memset(sa, 0, sizeof(*sa));
- sa->family = family;
- if (sa->family == AF_INET6)
- sa->v6.sin6_addr = in6addr_loopback;
+ sa->ss_family = family;
+ if (sa->ss_family == AF_INET6)
+ ((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_loopback;
else
- sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ ((struct sockaddr_in *)sa)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
}
-static void sa46_init_inany(union sa46 *sa, sa_family_t family)
+static void ss_init_inany(struct sockaddr_storage *sa, sa_family_t family)
{
memset(sa, 0, sizeof(*sa));
- sa->family = family;
- if (sa->family == AF_INET6)
- sa->v6.sin6_addr = in6addr_any;
+ sa->ss_family = family;
+ if (sa->ss_family == AF_INET6)
+ ((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_any;
else
- sa->v4.sin_addr.s_addr = INADDR_ANY;
+ ((struct sockaddr_in *)sa)->sin_addr.s_addr = INADDR_ANY;
}
static int read_int_sysctl(const char *sysctl)
@@ -228,7 +224,7 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd,
int cli_fd)
{
struct data_check expected = {}, result;
- union sa46 cli_sa;
+ struct sockaddr_storage cli_sa;
socklen_t addrlen;
int err;
@@ -251,26 +247,32 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd,
}
if (family == AF_INET6) {
+ struct sockaddr_in6 *srv_v6 = (struct sockaddr_in6 *)&srv_sa;
+ struct sockaddr_in6 *cli_v6 = (struct sockaddr_in6 *)&cli_sa;
+
expected.eth_protocol = htons(ETH_P_IPV6);
- expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] &&
- !srv_sa.v6.sin6_addr.s6_addr32[2] &&
- !srv_sa.v6.sin6_addr.s6_addr32[1] &&
- !srv_sa.v6.sin6_addr.s6_addr32[0];
+ expected.bind_inany = !srv_v6->sin6_addr.s6_addr32[3] &&
+ !srv_v6->sin6_addr.s6_addr32[2] &&
+ !srv_v6->sin6_addr.s6_addr32[1] &&
+ !srv_v6->sin6_addr.s6_addr32[0];
- memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32,
- sizeof(cli_sa.v6.sin6_addr));
+ memcpy(&expected.skb_addrs[0], cli_v6->sin6_addr.s6_addr32,
+ sizeof(cli_v6->sin6_addr));
memcpy(&expected.skb_addrs[4], &in6addr_loopback,
sizeof(in6addr_loopback));
- expected.skb_ports[0] = cli_sa.v6.sin6_port;
- expected.skb_ports[1] = srv_sa.v6.sin6_port;
+ expected.skb_ports[0] = cli_v6->sin6_port;
+ expected.skb_ports[1] = srv_v6->sin6_port;
} else {
+ struct sockaddr_in *srv_v4 = (struct sockaddr_in *)&srv_sa;
+ struct sockaddr_in *cli_v4 = (struct sockaddr_in *)&cli_sa;
+
expected.eth_protocol = htons(ETH_P_IP);
- expected.bind_inany = !srv_sa.v4.sin_addr.s_addr;
+ expected.bind_inany = !srv_v4->sin_addr.s_addr;
- expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr;
+ expected.skb_addrs[0] = cli_v4->sin_addr.s_addr;
expected.skb_addrs[1] = htonl(INADDR_LOOPBACK);
- expected.skb_ports[0] = cli_sa.v4.sin_port;
- expected.skb_ports[1] = srv_sa.v4.sin_port;
+ expected.skb_ports[0] = cli_v4->sin_port;
+ expected.skb_ports[1] = srv_v4->sin_port;
}
if (memcmp(&result, &expected, offsetof(struct data_check,
@@ -364,16 +366,15 @@ static void check_results(void)
static int send_data(int type, sa_family_t family, void *data, size_t len,
enum result expected)
{
- union sa46 cli_sa;
+ struct sockaddr_storage cli_sa;
int fd, err;
fd = socket(family, type, 0);
RET_ERR(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno);
- sa46_init_loopback(&cli_sa, family);
+ ss_init_loopback(&cli_sa, family);
err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa));
RET_ERR(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno);
-
err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa,
sizeof(srv_sa));
RET_ERR(err != len && expected >= PASS,
@@ -589,9 +590,9 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany)
socklen_t addrlen;
if (inany)
- sa46_init_inany(&srv_sa, family);
+ ss_init_inany(&srv_sa, family);
else
- sa46_init_loopback(&srv_sa, family);
+ ss_init_loopback(&srv_sa, family);
addrlen = sizeof(srv_sa);
/*
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 1702aa592c2c..7ac4d5a488aa 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -206,6 +206,11 @@ destroy_skel:
skel_open_load_failure:
close(pipe_c2p[0]);
close(pipe_p2c[1]);
+ /*
+ * Child is either about to exit cleanly or stuck in case of errors.
+ * Nudge it to exit.
+ */
+ kill(pid, SIGKILL);
wait(NULL);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c
index 4d66fad3c8bd..0f3bf594e7a5 100644
--- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c
+++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c
@@ -20,7 +20,9 @@ static const char * const test_cases[] = {
"strcspn_str",
"strcspn_reject",
"strstr",
+ "strcasestr",
"strnstr",
+ "strncasestr",
};
void run_too_long_tests(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
index 2a27f3714f5c..bdc4fc06bc5a 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
@@ -139,7 +139,7 @@ static void test_lsm_tailcall(void)
if (CHECK_FAIL(!err))
goto close_prog;
- prog_fd = bpf_program__fd(skel->progs.lsm_file_alloc_security_prog);
+ prog_fd = bpf_program__fd(skel->progs.lsm_kernfs_init_security_prog);
if (CHECK_FAIL(prog_fd < 0))
goto close_prog;
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c b/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c
new file mode 100644
index 000000000000..462512fb191f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/*
+ * BPF-based flow shaping
+ *
+ * The test brings up two veths in two isolated namespaces, attaches a flow
+ * shaping program to one of them, and ensures that a manual speedtest maximum
+ * value matches the rate set in the BPF shapers.
+ */
+
+#include <asm-generic/socket.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <math.h>
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <bpf/libbpf.h>
+#include <pthread.h>
+#include "test_progs.h"
+#include "network_helpers.h"
+#include "test_tc_edt.skel.h"
+
+#define SERVER_NS "tc-edt-server-ns"
+#define CLIENT_NS "tc-edt-client-ns"
+#define IP4_ADDR_VETH1 "192.168.1.1"
+#define IP4_ADDR_VETH2 "192.168.1.2"
+#define IP4_ADDR_VETH2_HEX 0xC0A80102
+
+#define TIMEOUT_MS 2000
+#define TEST_PORT 9000
+#define TARGET_RATE_MBPS 5.0
+#define TX_BYTES_COUNT (1 * 1000 * 1000)
+#define RATE_ERROR_PERCENT 2.0
+
+/*
+ * File descriptors of one client/server connection.
+ * NOTE(review): no function in this file appears to reference this struct;
+ * confirm whether it is still needed.
+ */
+struct connection {
+ int server_listen_fd;
+ int server_conn_fd;
+ int client_conn_fd;
+};
+
+/*
+ * Build the test topology and attach the shaper.
+ *
+ * Creates CLIENT_NS and SERVER_NS, adds veth1 in the client namespace with
+ * its peer veth2 placed in the server namespace, assigns the IPv4 addresses
+ * and brings both links up. In the server namespace an fq qdisc is installed
+ * as root on veth2 and skel's tc_prog is attached to veth2. Finally
+ * bss->target_rate is set to TARGET_RATE_MBPS * 1000 * 1000.
+ *
+ * Returns 0 on success. On failure returns -1 after closing any namespace
+ * token already opened and removing any namespace already created.
+ */
+static int setup(struct test_tc_edt *skel)
+{
+ struct nstoken *nstoken_client, *nstoken_server;
+ int ret;
+
+ if (!ASSERT_OK(make_netns(CLIENT_NS), "create client ns"))
+ goto fail;
+ if (!ASSERT_OK(make_netns(SERVER_NS), "create server ns"))
+ goto fail_delete_client_ns;
+
+ nstoken_client = open_netns(CLIENT_NS);
+ if (!ASSERT_OK_PTR(nstoken_client, "open client ns"))
+ goto fail_delete_server_ns;
+ SYS(fail_close_client_ns, "ip link add veth1 type veth peer name %s",
+ "veth2 netns " SERVER_NS);
+ SYS(fail_close_client_ns, "ip -4 addr add " IP4_ADDR_VETH1 "/24 dev veth1");
+ SYS(fail_close_client_ns, "ip link set veth1 up");
+
+ /* The client token is still held here; both are closed in reverse order below */
+ nstoken_server = open_netns(SERVER_NS);
+ if (!ASSERT_OK_PTR(nstoken_server, "enter server ns"))
+ goto fail_close_client_ns;
+ SYS(fail_close_server_ns, "ip -4 addr add " IP4_ADDR_VETH2 "/24 dev veth2");
+ SYS(fail_close_server_ns, "ip link set veth2 up");
+ SYS(fail_close_server_ns, "tc qdisc add dev veth2 root fq");
+ ret = tc_prog_attach("veth2", -1, bpf_program__fd(skel->progs.tc_prog));
+ if (!ASSERT_OK(ret, "attach bpf prog"))
+ goto fail_close_server_ns;
+ skel->bss->target_rate = TARGET_RATE_MBPS * 1000 * 1000;
+ close_netns(nstoken_server);
+ close_netns(nstoken_client);
+
+ return 0;
+
+ /* Error unwinding: each label undoes one step and falls through to the next */
+fail_close_server_ns:
+ close_netns(nstoken_server);
+fail_close_client_ns:
+ close_netns(nstoken_client);
+fail_delete_server_ns:
+ remove_netns(SERVER_NS);
+fail_delete_client_ns:
+ remove_netns(CLIENT_NS);
+fail:
+ return -1;
+}
+
+/* Remove the two network namespaces created by setup(). */
+static void cleanup(void)
+{
+ remove_netns(CLIENT_NS);
+ remove_netns(SERVER_NS);
+}
+
+/*
+ * Measure the transfer rate between the two namespaces and compare it with
+ * TARGET_RATE_MBPS.
+ *
+ * A TCP server is started on IP4_ADDR_VETH2:TEST_PORT inside SERVER_NS and a
+ * client connects to it from CLIENT_NS. TX_BYTES_COUNT bytes are transferred
+ * with send_recv_data(); the elapsed time gives the observed rate, whose
+ * relative error against the target must not exceed RATE_ERROR_PERCENT.
+ *
+ * Every exit path closes the namespace token it holds and the sockets it
+ * opened, so a failure neither leaks file descriptors nor leaves the caller
+ * in the wrong network namespace.
+ */
+static void run_test(void)
+{
+	int server_fd, client_fd, err;
+	double rate_mbps, rate_error;
+	struct nstoken *nstoken;
+	__u64 ts_start, ts_end;
+
+	nstoken = open_netns(SERVER_NS);
+	if (!ASSERT_OK_PTR(nstoken, "open server ns"))
+		return;
+	server_fd = start_server(AF_INET, SOCK_STREAM, IP4_ADDR_VETH2,
+				 TEST_PORT, TIMEOUT_MS);
+	/* Assert first so the reported errno is the one from start_server() */
+	if (!ASSERT_OK_FD(server_fd, "start server")) {
+		close_netns(nstoken);
+		return;
+	}
+	close_netns(nstoken);
+
+	nstoken = open_netns(CLIENT_NS);
+	if (!ASSERT_OK_PTR(nstoken, "open client ns"))
+		goto close_server;
+	client_fd = connect_to_fd(server_fd, 0);
+	if (!ASSERT_OK_FD(client_fd, "connect client")) {
+		close_netns(nstoken);
+		goto close_server;
+	}
+
+	ts_start = get_time_ns();
+	err = send_recv_data(server_fd, client_fd, TX_BYTES_COUNT);
+	ts_end = get_time_ns();
+	close_netns(nstoken);
+	ASSERT_OK(err, "send_recv_data");
+
+	/*
+	 * Bytes per microsecond, assuming get_time_ns() returns nanoseconds;
+	 * that is the same unit as TARGET_RATE_MBPS once target_rate is
+	 * TARGET_RATE_MBPS * 1000 * 1000.
+	 */
+	rate_mbps = TX_BYTES_COUNT / ((ts_end - ts_start) / 1000.0);
+	rate_error =
+		fabs((rate_mbps - TARGET_RATE_MBPS) * 100.0 / TARGET_RATE_MBPS);
+
+	ASSERT_LE(rate_error, RATE_ERROR_PERCENT,
+		  "rate error is lower than threshold");
+
+	close(client_fd);
+close_server:
+	close(server_fd);
+}
+
+/*
+ * Test entry point: load the test_tc_edt skeleton, build the two-namespace
+ * topology, run the rate measurement, then tear everything down.
+ */
+void test_tc_edt(void)
+{
+	struct test_tc_edt *skel;
+
+	skel = test_tc_edt__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel open and load"))
+		return;
+
+	/*
+	 * setup() removes its own namespaces when it fails, so only the
+	 * skeleton is left to release on that path.
+	 */
+	if (!ASSERT_OK(setup(skel), "global setup"))
+		goto destroy_skel;
+
+	run_test();
+
+	cleanup();
+destroy_skel:
+	test_tc_edt__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c
new file mode 100644
index 000000000000..0fe0a8f62486
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c
@@ -0,0 +1,714 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/*
+ * End-to-end eBPF tunnel test suite
+ * The file tests BPF network tunnels implementation. For each tunnel
+ * type, the test validates that:
+ * - basic communication can first be established between the two veths
+ * - when adding a BPF-based encapsulation on client egress, it now fails
+ * to communicate with the server
+ * - when adding a kernel-based decapsulation on server ingress, client
+ * can now connect
+ * - when replacing the kernel-based decapsulation with a BPF-based one,
+ * the client can still connect
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <bpf/libbpf.h>
+
+#include "test_progs.h"
+#include "network_helpers.h"
+#include "test_tc_tunnel.skel.h"
+
+#define SERVER_NS "tc-tunnel-server-ns"
+#define CLIENT_NS "tc-tunnel-client-ns"
+#define MAC_ADDR_VETH1 "00:11:22:33:44:55"
+#define IP4_ADDR_VETH1 "192.168.1.1"
+#define IP6_ADDR_VETH1 "fd::1"
+#define MAC_ADDR_VETH2 "66:77:88:99:AA:BB"
+#define IP4_ADDR_VETH2 "192.168.1.2"
+#define IP6_ADDR_VETH2 "fd::2"
+
+#define TEST_NAME_MAX_LEN 64
+#define PROG_NAME_MAX_LEN 64
+#define TUNNEL_ARGS_MAX_LEN 128
+#define BUFFER_LEN 2000
+#define DEFAULT_TEST_DATA_SIZE 100
+#define GSO_TEST_DATA_SIZE BUFFER_LEN
+
+#define TIMEOUT_MS 1000
+#define TEST_PORT 8000
+#define UDP_PORT 5555
+#define MPLS_UDP_PORT 6635
+#define FOU_MPLS_PROTO 137
+#define VXLAN_ID 1
+#define VXLAN_PORT 8472
+#define MPLS_TABLE_ENTRIES_COUNT 65536
+
+static char tx_buffer[BUFFER_LEN], rx_buffer[BUFFER_LEN];
+
+struct subtest_cfg { /* Static description plus runtime state of one tunnel subtest */
+ char *ebpf_tun_type; /* first half of the subtest name and of the "__encap_" program name */
+ char *iproute_tun_type; /* link type given to "ip link add name testtun0 type ..." */
+ char *mac_tun_type; /* second half of the subtest name; "mpls" selects the MPLS FOU port/proto */
+ int ipproto; /* 6 selects AF_INET6 and the IPv6 veth addresses, anything else IPv4 */
+ void (*extra_decap_mod_args_cb)(struct subtest_cfg *cfg, char *dst); /* optional: fills extra_decap_mod_args */
+ bool tunnel_need_veth_mac; /* set testtun0 MAC address to MAC_ADDR_VETH2 */
+ bool configure_fou_rx_port; /* add/remove an "ip fou" RX port around kernel decapsulation */
+ char *tmode; /* optional extra arguments inserted after the link type, may be NULL */
+ bool expect_kern_decap_failure; /* skip the kernel decapsulation step of the test */
+ bool configure_mpls; /* run the MPLS sysctl/route configuration for kernel decapsulation */
+ bool test_gso; /* additionally send GSO_TEST_DATA_SIZE bytes */
+ char *tunnel_client_addr; /* "remote" of testtun0; derived from ipproto when left NULL */
+ char *tunnel_server_addr; /* "local" of testtun0; derived from ipproto when left NULL */
+ char name[TEST_NAME_MAX_LEN]; /* subtest name, filled by build_subtest_name() */
+ char *server_addr; /* address the test server listens on */
+ int client_egress_prog_fd; /* encapsulation program attached on veth1 */
+ int server_ingress_prog_fd; /* decapsulation program attached on veth2 */
+ char extra_decap_mod_args[TUNNEL_ARGS_MAX_LEN]; /* trailing arguments of the "ip link add" command */
+ int server_fd; /* listening socket created by run_server() */
+};
+
+struct connection { /* Both ends of one established client/server connection */
+ int client_fd; /* socket returned by connect_to_addr_str() */
+ int server_fd; /* socket returned by accept() on the listening socket */
+};
+
+/*
+ * Write "<ebpf_tun_type>_<mac_tun_type>" into dst (at most size bytes).
+ * Returns 0 on success, or the negative snprintf() result on error.
+ */
+static int build_subtest_name(struct subtest_cfg *cfg, char *dst, size_t size)
+{
+	int len = snprintf(dst, size, "%s_%s", cfg->ebpf_tun_type,
+			   cfg->mac_tun_type);
+
+	if (len < 0)
+		return len;
+
+	return 0;
+}
+
+/*
+ * Resolve the BPF programs used by a subtest: the encapsulation program is
+ * looked up by name ("__encap_" followed by the subtest name), the
+ * decapsulation program is always skel->progs.decap_f.
+ * Returns 0 on success, a negative value otherwise.
+ */
+static int set_subtest_progs(struct subtest_cfg *cfg, struct test_tc_tunnel *skel)
+{
+	char prog_name[PROG_NAME_MAX_LEN];
+	struct bpf_program *encap_prog;
+	int prefix_len, err;
+
+	prefix_len = snprintf(prog_name, PROG_NAME_MAX_LEN, "__encap_");
+	if (prefix_len < 0)
+		return prefix_len;
+
+	/* Append the subtest name right after the prefix */
+	err = build_subtest_name(cfg, &prog_name[prefix_len],
+				 PROG_NAME_MAX_LEN - prefix_len);
+	if (err < 0)
+		return err;
+
+	encap_prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!encap_prog)
+		return -1;
+
+	cfg->client_egress_prog_fd = bpf_program__fd(encap_prog);
+	cfg->server_ingress_prog_fd = bpf_program__fd(skel->progs.decap_f);
+
+	return 0;
+}
+
+/*
+ * Pick the server address from the subtest IP version and, unless the
+ * configuration table already provides both tunnel endpoints, derive the
+ * tunnel addresses from the IP version as well.
+ */
+static void set_subtest_addresses(struct subtest_cfg *cfg)
+{
+	bool is_v6 = cfg->ipproto == 6;
+
+	cfg->server_addr = is_v6 ? IP6_ADDR_VETH2 : IP4_ADDR_VETH2;
+
+	/* Tunnel types needing specific addressing set it in the table */
+	if (cfg->tunnel_client_addr && cfg->tunnel_server_addr)
+		return;
+
+	cfg->tunnel_client_addr = is_v6 ? IP6_ADDR_VETH1 : IP4_ADDR_VETH1;
+	cfg->tunnel_server_addr = is_v6 ? IP6_ADDR_VETH2 : IP4_ADDR_VETH2;
+}
+
+/*
+ * Start a TCP server on cfg->server_addr:TEST_PORT inside SERVER_NS and
+ * store its socket in cfg->server_fd. Returns 0 on success, -1 otherwise.
+ */
+static int run_server(struct subtest_cfg *cfg)
+{
+	struct network_helper_opts opts = {
+		.timeout_ms = TIMEOUT_MS
+	};
+	struct nstoken *server_ns;
+	int family;
+
+	server_ns = open_netns(SERVER_NS);
+	if (!ASSERT_OK_PTR(server_ns, "open server ns"))
+		return -1;
+
+	family = (cfg->ipproto == 6) ? AF_INET6 : AF_INET;
+	cfg->server_fd = start_server_str(family, SOCK_STREAM, cfg->server_addr,
+					  TEST_PORT, &opts);
+	close_netns(server_ns);
+
+	return ASSERT_OK_FD(cfg->server_fd, "start server") ? 0 : -1;
+}
+
+/*
+ * Receive len bytes on the server side of the connection and compare them
+ * with the start of tx_buffer. Returns 0 when both the length and the
+ * content match, 1 otherwise.
+ */
+static int check_server_rx_data(struct subtest_cfg *cfg,
+				struct connection *conn, int len)
+{
+	int received;
+
+	memset(rx_buffer, 0, BUFFER_LEN);
+	received = recv(conn->server_fd, rx_buffer, len, 0);
+
+	if (ASSERT_EQ(received, len, "check rx data len") &&
+	    ASSERT_MEMEQ(tx_buffer, rx_buffer, len, "check received data"))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Connect a client to the subtest server and accept the connection on the
+ * server side. Returns a heap-allocated connection holding both sockets,
+ * or NULL on any failure (nothing is left open or allocated in that case).
+ */
+static struct connection *connect_client_to_server(struct subtest_cfg *cfg)
+{
+	struct network_helper_opts opts = {.timeout_ms = 500};
+	int family = cfg->ipproto == 6 ? AF_INET6 : AF_INET;
+	struct connection *conn;
+	int client_fd, server_fd;
+
+	conn = malloc(sizeof(*conn));
+	if (!conn)
+		return NULL;
+
+	client_fd = connect_to_addr_str(family, SOCK_STREAM, cfg->server_addr,
+					TEST_PORT, &opts);
+	if (client_fd < 0)
+		goto free_conn;
+
+	server_fd = accept(cfg->server_fd, NULL, NULL);
+	if (server_fd < 0)
+		goto close_client;
+
+	conn->server_fd = server_fd;
+	conn->client_fd = client_fd;
+
+	return conn;
+
+close_client:
+	close(client_fd);
+free_conn:
+	free(conn);
+	return NULL;
+}
+
+static void disconnect_client_from_server(struct subtest_cfg *cfg, /* cfg is not used here */
+ struct connection *conn)
+{
+ close(conn->server_fd); /* close both ends, then release the descriptor */
+ close(conn->client_fd);
+ free(conn);
+}
+
+/*
+ * Open a connection to the subtest server and, when it is expected to
+ * succeed, send test data through it and verify what the server received.
+ *
+ * With must_succeed == false the function returns 0 when the connection
+ * attempt fails as expected. Otherwise it returns 0 when all data checks
+ * pass and -1 on any failure.
+ */
+static int send_and_test_data(struct subtest_cfg *cfg, bool must_succeed)
+{
+	struct connection *conn;
+	int sent, res = -1;
+
+	conn = connect_client_to_server(cfg);
+	if (!must_succeed) {
+		if (ASSERT_ERR_PTR(conn, "connection that must fail"))
+			return 0;
+		/* Unexpectedly connected: tear the connection down and fail */
+		goto end;
+	}
+
+	if (!ASSERT_OK_PTR(conn, "connection that must succeed"))
+		return -1;
+
+	sent = send(conn->client_fd, tx_buffer, DEFAULT_TEST_DATA_SIZE, 0);
+	if (!ASSERT_EQ(sent, DEFAULT_TEST_DATA_SIZE, "send data from client"))
+		goto end;
+	if (check_server_rx_data(cfg, conn, DEFAULT_TEST_DATA_SIZE))
+		goto end;
+
+	if (cfg->test_gso) {
+		sent = send(conn->client_fd, tx_buffer, GSO_TEST_DATA_SIZE, 0);
+		if (!ASSERT_EQ(sent, GSO_TEST_DATA_SIZE, "send (large) data from client"))
+			goto end;
+		/* NOTE(review): only the first DEFAULT_TEST_DATA_SIZE bytes of
+		 * the large payload are verified - confirm this is intended.
+		 */
+		if (check_server_rx_data(cfg, conn, DEFAULT_TEST_DATA_SIZE))
+			goto end;
+	}
+
+	res = 0;
+end:
+	disconnect_client_from_server(cfg, conn);
+	return res;
+}
+
+static void vxlan_decap_mod_args_cb(struct subtest_cfg *cfg, char *dst) /* cfg is not used here */
+{
+ snprintf(dst, TUNNEL_ARGS_MAX_LEN, "id %d dstport %d udp6zerocsumrx", /* dst is assumed to hold TUNNEL_ARGS_MAX_LEN bytes */
+ VXLAN_ID, VXLAN_PORT);
+}
+
+/*
+ * Write the extra tunnel arguments for FOU-based subtests into dst (up to
+ * TUNNEL_ARGS_MAX_LEN bytes). The destination port is MPLS_UDP_PORT for
+ * "mpls" subtests and UDP_PORT otherwise.
+ */
+static void udp_decap_mod_args_cb(struct subtest_cfg *cfg, char *dst)
+{
+	int dport = UDP_PORT;
+
+	if (strcmp(cfg->mac_tun_type, "mpls") == 0)
+		dport = MPLS_UDP_PORT;
+
+	snprintf(dst, TUNNEL_ARGS_MAX_LEN,
+		 "encap fou encap-sport auto encap-dport %d", dport);
+}
+
+/*
+ * Add (add == true) or delete the "ip fou" receive port matching the
+ * subtest: MPLS subtests use MPLS_UDP_PORT/FOU_MPLS_PROTO, the others use
+ * UDP_PORT with ipproto 41 for IPv6 or 4 otherwise.
+ * Returns 0 on success, 1 when the command fails.
+ */
+static int configure_fou_rx_port(struct subtest_cfg *cfg, bool add)
+{
+	const char *action = add ? "add" : "del";
+	const char *v6_flag = cfg->ipproto == 6 ? " -6" : "";
+	int port, fou_proto;
+
+	if (strcmp(cfg->mac_tun_type, "mpls") == 0) {
+		port = MPLS_UDP_PORT;
+		fou_proto = FOU_MPLS_PROTO;
+	} else {
+		port = UDP_PORT;
+		fou_proto = cfg->ipproto == 6 ? 41 : 4;
+	}
+
+	SYS(fail, "ip fou %s port %d ipproto %d%s", action, port, fou_proto,
+	    v6_flag);
+
+	return 0;
+fail:
+	return 1;
+}
+
+static int add_fou_rx_port(struct subtest_cfg *cfg) /* "ip fou add" for this subtest; 0 on success, 1 on failure */
+{
+ return configure_fou_rx_port(cfg, true);
+}
+
+static int del_fou_rx_port(struct subtest_cfg *cfg) /* "ip fou del" for this subtest; 0 on success, 1 on failure */
+{
+ return configure_fou_rx_port(cfg, false);
+}
+
+static int update_tunnel_intf_addr(struct subtest_cfg *cfg) /* cfg is not used here */
+{
+ SYS(fail, "ip link set dev testtun0 address " MAC_ADDR_VETH2); /* give testtun0 the MAC address of veth2 */
+ return 0;
+fail:
+ return -1;
+}
+
+static int configure_kernel_for_mpls(struct subtest_cfg *cfg) /* 0 on success, -1 as soon as one command fails */
+{
+ SYS(fail, "sysctl -qw net.mpls.platform_labels=%d", /* size the MPLS label table */
+ MPLS_TABLE_ENTRIES_COUNT);
+ SYS(fail, "ip -f mpls route add 1000 dev lo"); /* removed again in remove_kernel_decapsulation() */
+ SYS(fail, "ip link set lo up");
+ SYS(fail, "sysctl -qw net.mpls.conf.testtun0.input=1"); /* requires testtun0 to exist already */
+ SYS(fail, "sysctl -qw net.ipv4.conf.lo.rp_filter=0");
+ return 0;
+fail:
+ return -1;
+}
+
+/*
+ * Attach the subtest encapsulation program on veth1 (third argument of
+ * tc_prog_attach(), the second one being left unset with -1).
+ * Returns the tc_prog_attach() result.
+ */
+static int configure_encapsulation(struct subtest_cfg *cfg)
+{
+	return tc_prog_attach("veth1", -1, cfg->client_egress_prog_fd);
+}
+
+/*
+ * Create and bring up the kernel tunnel device (testtun0) in SERVER_NS,
+ * together with the optional FOU port, MAC address and MPLS settings the
+ * subtest asks for. Returns 0 on success, -1 otherwise.
+ */
+static int configure_kernel_decapsulation(struct subtest_cfg *cfg)
+{
+	const char *tmode = cfg->tmode ? cfg->tmode : "";
+	struct nstoken *server_ns;
+	int ret = -1;
+
+	server_ns = open_netns(SERVER_NS);
+	if (!ASSERT_OK_PTR(server_ns, "open server ns"))
+		return -1;
+
+	if (cfg->configure_fou_rx_port) {
+		if (!ASSERT_OK(add_fou_rx_port(cfg), "configure FOU RX port"))
+			goto out;
+	}
+
+	SYS(out, "ip link add name testtun0 type %s %s remote %s local %s %s",
+	    cfg->iproute_tun_type, tmode,
+	    cfg->tunnel_client_addr, cfg->tunnel_server_addr,
+	    cfg->extra_decap_mod_args);
+
+	if (cfg->tunnel_need_veth_mac) {
+		if (!ASSERT_OK(update_tunnel_intf_addr(cfg), "update testtun0 mac"))
+			goto out;
+	}
+
+	if (cfg->configure_mpls) {
+		if (!ASSERT_OK(configure_kernel_for_mpls(cfg),
+			       "configure MPLS decap"))
+			goto out;
+	}
+
+	SYS(out, "sysctl -qw net.ipv4.conf.all.rp_filter=0");
+	SYS(out, "sysctl -qw net.ipv4.conf.testtun0.rp_filter=0");
+	SYS(out, "ip link set dev testtun0 up");
+
+	ret = 0;
+out:
+	close_netns(server_ns);
+	return ret;
+}
+
+static void remove_kernel_decapsulation(struct subtest_cfg *cfg) /* best-effort undo of the kernel decapsulation setup */
+{
+ SYS_NOFAIL("ip link del testtun0");
+ if (cfg->configure_mpls)
+ SYS_NOFAIL("ip -f mpls route del 1000 dev lo"); /* route added by configure_kernel_for_mpls() */
+ if (cfg->configure_fou_rx_port)
+ del_fou_rx_port(cfg); /* return value deliberately ignored */
+}
+
+/*
+ * Switch the server to BPF-based decapsulation: delete the kernel tunnel
+ * device when one was set up, then attach the decapsulation program on
+ * veth2. Returns 0 on success, -1 otherwise.
+ */
+static int configure_ebpf_decapsulation(struct subtest_cfg *cfg)
+{
+	struct nstoken *server_ns;
+	int ret = -1;
+
+	server_ns = open_netns(SERVER_NS);
+	if (!ASSERT_OK_PTR(server_ns, "open server ns"))
+		return -1;
+
+	/* testtun0 only exists when kernel decapsulation was configured */
+	if (!cfg->expect_kern_decap_failure)
+		SYS(out, "ip link del testtun0");
+
+	if (ASSERT_OK(tc_prog_attach("veth2", cfg->server_ingress_prog_fd, -1),
+		      "attach_program"))
+		ret = 0;
+out:
+	close_netns(server_ns);
+	return ret;
+}
+
+/*
+ * Run one subtest from the client namespace:
+ *  1. plain communication must work;
+ *  2. BPF encapsulation is attached on the client;
+ *  3. unless kernel decapsulation is expected to fail, it is configured
+ *     and communication must work;
+ *  4. BPF decapsulation replaces it and communication must still work.
+ * The server socket opened by run_server() is closed before returning.
+ */
+static void run_test(struct subtest_cfg *cfg)
+{
+	struct nstoken *client_ns;
+
+	if (!ASSERT_OK(run_server(cfg), "run server"))
+		return;
+
+	client_ns = open_netns(CLIENT_NS);
+	if (!ASSERT_OK_PTR(client_ns, "open client ns"))
+		goto out;
+
+	/* Step 1: no tunnel involved yet */
+	if (!ASSERT_OK(send_and_test_data(cfg, true), "connect without any encap"))
+		goto out;
+
+	/* Step 2: client-side encapsulation */
+	if (!ASSERT_OK(configure_encapsulation(cfg), "configure encapsulation"))
+		goto out;
+
+	/* Step 3: kernel decapsulation, when supported for this tunnel */
+	if (!cfg->expect_kern_decap_failure &&
+	    (!ASSERT_OK(configure_kernel_decapsulation(cfg),
+			"configure kernel decapsulation") ||
+	     !ASSERT_OK(send_and_test_data(cfg, true),
+			"connect with encap prog and kern decap")))
+		goto out;
+
+	/* Step 4: BPF decapsulation */
+	if (ASSERT_OK(configure_ebpf_decapsulation(cfg), "configure ebpf decapsulation"))
+		ASSERT_OK(send_and_test_data(cfg, true), "connect with encap and decap progs");
+
+out:
+	close_netns(client_ns);
+	close(cfg->server_fd);
+}
+
+/*
+ * Global setup shared by all subtests: fill tx_buffer with random bytes,
+ * create the client and server namespaces and connect them with a veth
+ * pair (veth1 in CLIENT_NS, veth2 in SERVER_NS).
+ *
+ * Returns 0 on success and -1 on failure. On failure every namespace
+ * created so far is deleted, including the client namespace when only the
+ * creation of the server namespace failed.
+ */
+static int setup(void)
+{
+	struct nstoken *nstoken_client, *nstoken_server;
+	int fd, err;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (!ASSERT_OK_FD(fd, "open urandom"))
+		goto fail;
+	err = read(fd, tx_buffer, BUFFER_LEN);
+	close(fd);
+
+	if (!ASSERT_EQ(err, BUFFER_LEN, "read random bytes"))
+		goto fail;
+
+	/* Configure the testing network */
+	if (!ASSERT_OK(make_netns(CLIENT_NS), "create client ns"))
+		goto fail;
+	if (!ASSERT_OK(make_netns(SERVER_NS), "create server ns"))
+		goto fail_delete_client_ns;
+
+	nstoken_client = open_netns(CLIENT_NS);
+	if (!ASSERT_OK_PTR(nstoken_client, "open client ns"))
+		goto fail_delete_ns;
+	SYS(fail_close_ns_client, "ip link add %s type veth peer name %s",
+	    "veth1 mtu 1500 netns " CLIENT_NS " address " MAC_ADDR_VETH1,
+	    "veth2 mtu 1500 netns " SERVER_NS " address " MAC_ADDR_VETH2);
+	SYS(fail_close_ns_client, "ethtool -K veth1 tso off");
+	SYS(fail_close_ns_client, "ip link set veth1 up");
+	nstoken_server = open_netns(SERVER_NS);
+	if (!ASSERT_OK_PTR(nstoken_server, "open server ns"))
+		goto fail_close_ns_client;
+	SYS(fail_close_ns_server, "ip link set veth2 up");
+
+	close_netns(nstoken_server);
+	close_netns(nstoken_client);
+	return 0;
+
+fail_close_ns_server:
+	close_netns(nstoken_server);
+fail_close_ns_client:
+	close_netns(nstoken_client);
+fail_delete_ns:
+	SYS_NOFAIL("ip netns del " SERVER_NS);
+fail_delete_client_ns:
+	SYS_NOFAIL("ip netns del " CLIENT_NS);
+fail:
+	return -1;
+}
+
+/*
+ * Per-subtest setup: resolve addresses and BPF programs, build the extra
+ * tunnel arguments when the subtest provides a callback, then configure
+ * IPv4/IPv6 addresses and routes on veth1 (client) and addresses on veth2
+ * (server). Returns 0 on success, -1 otherwise.
+ */
+static int subtest_setup(struct test_tc_tunnel *skel, struct subtest_cfg *cfg)
+{
+	struct nstoken *client_ns, *server_ns;
+	int ret = -1;
+
+	set_subtest_addresses(cfg);
+	if (!ASSERT_OK(set_subtest_progs(cfg, skel),
+		       "find subtest progs"))
+		return ret;
+	if (cfg->extra_decap_mod_args_cb)
+		cfg->extra_decap_mod_args_cb(cfg, cfg->extra_decap_mod_args);
+
+	/* Client side: addresses plus host routes towards the server */
+	client_ns = open_netns(CLIENT_NS);
+	if (!ASSERT_OK_PTR(client_ns, "open client ns"))
+		return ret;
+	SYS(leave_client_ns,
+	    "ip -4 addr add " IP4_ADDR_VETH1 "/24 dev veth1");
+	SYS(leave_client_ns, "ip -4 route flush table main");
+	SYS(leave_client_ns,
+	    "ip -4 route add " IP4_ADDR_VETH2 " mtu 1450 dev veth1");
+	SYS(leave_client_ns,
+	    "ip -6 addr add " IP6_ADDR_VETH1 "/64 dev veth1 nodad");
+	SYS(leave_client_ns, "ip -6 route flush table main");
+	SYS(leave_client_ns,
+	    "ip -6 route add " IP6_ADDR_VETH2 " mtu 1430 dev veth1");
+
+	/* Server side: addresses only */
+	server_ns = open_netns(SERVER_NS);
+	if (!ASSERT_OK_PTR(server_ns, "open server ns"))
+		goto leave_client_ns;
+	SYS(leave_server_ns,
+	    "ip -4 addr add " IP4_ADDR_VETH2 "/24 dev veth2");
+	SYS(leave_server_ns,
+	    "ip -6 addr add " IP6_ADDR_VETH2 "/64 dev veth2 nodad");
+
+	ret = 0;
+
+	/* Success also unwinds through the labels below */
+leave_server_ns:
+	close_netns(server_ns);
+leave_client_ns:
+	close_netns(client_ns);
+	return ret;
+}
+
+
+/*
+ * Undo what a subtest configured: in each namespace, delete the tc qdisc
+ * on the veth and flush its addresses; on the server side also remove the
+ * kernel decapsulation setup when the subtest used one. All commands are
+ * best-effort. The assertion messages name the namespace actually opened.
+ */
+static void subtest_cleanup(struct subtest_cfg *cfg)
+{
+	struct nstoken *nstoken;
+
+	nstoken = open_netns(CLIENT_NS);
+	if (ASSERT_OK_PTR(nstoken, "open client ns")) {
+		SYS_NOFAIL("tc qdisc delete dev veth1 parent ffff:fff1");
+		SYS_NOFAIL("ip a flush veth1");
+		close_netns(nstoken);
+	}
+	nstoken = open_netns(SERVER_NS);
+	if (ASSERT_OK_PTR(nstoken, "open server ns")) {
+		SYS_NOFAIL("tc qdisc delete dev veth2 parent ffff:fff1");
+		SYS_NOFAIL("ip a flush veth2");
+		if (!cfg->expect_kern_decap_failure)
+			remove_kernel_decapsulation(cfg);
+		close_netns(nstoken);
+	}
+}
+
+static void cleanup(void) /* Remove the two namespaces created by setup(). */
+{
+ remove_netns(CLIENT_NS);
+ remove_netns(SERVER_NS);
+}
+
+static struct subtest_cfg subtests_cfg[] = { /* one entry per subtest; names are "<ebpf_tun_type>_<mac_tun_type>" */
+ { /* ipip_none */
+ .ebpf_tun_type = "ipip",
+ .mac_tun_type = "none",
+ .iproute_tun_type = "ipip",
+ .ipproto = 4,
+ },
+ { /* ipip6_none: IPv4 traffic, explicit IPv6 tunnel endpoints */
+ .ebpf_tun_type = "ipip6",
+ .mac_tun_type = "none",
+ .iproute_tun_type = "ip6tnl",
+ .ipproto = 4,
+ .tunnel_client_addr = IP6_ADDR_VETH1,
+ .tunnel_server_addr = IP6_ADDR_VETH2,
+ },
+ { /* ip6tnl_none */
+ .ebpf_tun_type = "ip6tnl",
+ .iproute_tun_type = "ip6tnl",
+ .mac_tun_type = "none",
+ .ipproto = 6,
+ },
+ { /* sit_none: IPv6 traffic, explicit IPv4 tunnel endpoints */
+ .mac_tun_type = "none",
+ .ebpf_tun_type = "sit",
+ .iproute_tun_type = "sit",
+ .ipproto = 6,
+ .tunnel_client_addr = IP4_ADDR_VETH1,
+ .tunnel_server_addr = IP4_ADDR_VETH2,
+ },
+ { /* vxlan_eth */
+ .ebpf_tun_type = "vxlan",
+ .mac_tun_type = "eth",
+ .iproute_tun_type = "vxlan",
+ .ipproto = 4,
+ .extra_decap_mod_args_cb = vxlan_decap_mod_args_cb,
+ .tunnel_need_veth_mac = true
+ },
+ { /* ip6vxlan_eth */
+ .ebpf_tun_type = "ip6vxlan",
+ .mac_tun_type = "eth",
+ .iproute_tun_type = "vxlan",
+ .ipproto = 6,
+ .extra_decap_mod_args_cb = vxlan_decap_mod_args_cb,
+ .tunnel_need_veth_mac = true
+ },
+ { /* gre_none */
+ .ebpf_tun_type = "gre",
+ .mac_tun_type = "none",
+ .iproute_tun_type = "gre",
+ .ipproto = 4,
+ .test_gso = true
+ },
+ { /* gre_eth */
+ .ebpf_tun_type = "gre",
+ .mac_tun_type = "eth",
+ .iproute_tun_type = "gretap",
+ .ipproto = 4,
+ .tunnel_need_veth_mac = true,
+ .test_gso = true
+ },
+ { /* gre_mpls */
+ .ebpf_tun_type = "gre",
+ .mac_tun_type = "mpls",
+ .iproute_tun_type = "gre",
+ .ipproto = 4,
+ .configure_mpls = true,
+ .test_gso = true
+ },
+ { /* ip6gre_none */
+ .ebpf_tun_type = "ip6gre",
+ .mac_tun_type = "none",
+ .iproute_tun_type = "ip6gre",
+ .ipproto = 6,
+ .test_gso = true,
+ },
+ { /* ip6gre_eth */
+ .ebpf_tun_type = "ip6gre",
+ .mac_tun_type = "eth",
+ .iproute_tun_type = "ip6gretap",
+ .ipproto = 6,
+ .tunnel_need_veth_mac = true,
+ .test_gso = true
+ },
+ { /* ip6gre_mpls */
+ .ebpf_tun_type = "ip6gre",
+ .mac_tun_type = "mpls",
+ .iproute_tun_type = "ip6gre",
+ .ipproto = 6,
+ .configure_mpls = true,
+ .test_gso = true
+ },
+ { /* udp_none */
+ .ebpf_tun_type = "udp",
+ .mac_tun_type = "none",
+ .iproute_tun_type = "ipip",
+ .ipproto = 4,
+ .extra_decap_mod_args_cb = udp_decap_mod_args_cb,
+ .configure_fou_rx_port = true,
+ .test_gso = true
+ },
+ { /* udp_eth: kernel decapsulation step is skipped */
+ .ebpf_tun_type = "udp",
+ .mac_tun_type = "eth",
+ .iproute_tun_type = "ipip",
+ .ipproto = 4,
+ .extra_decap_mod_args_cb = udp_decap_mod_args_cb,
+ .configure_fou_rx_port = true,
+ .expect_kern_decap_failure = true,
+ .test_gso = true
+ },
+ { /* udp_mpls */
+ .ebpf_tun_type = "udp",
+ .mac_tun_type = "mpls",
+ .iproute_tun_type = "ipip",
+ .ipproto = 4,
+ .extra_decap_mod_args_cb = udp_decap_mod_args_cb,
+ .configure_fou_rx_port = true,
+ .tmode = "mode any ttl 255",
+ .configure_mpls = true,
+ .test_gso = true
+ },
+ { /* ip6udp_none */
+ .ebpf_tun_type = "ip6udp",
+ .mac_tun_type = "none",
+ .iproute_tun_type = "ip6tnl",
+ .ipproto = 6,
+ .extra_decap_mod_args_cb = udp_decap_mod_args_cb,
+ .configure_fou_rx_port = true,
+ .test_gso = true
+ },
+ { /* ip6udp_eth: kernel decapsulation step is skipped */
+ .ebpf_tun_type = "ip6udp",
+ .mac_tun_type = "eth",
+ .iproute_tun_type = "ip6tnl",
+ .ipproto = 6,
+ .extra_decap_mod_args_cb = udp_decap_mod_args_cb,
+ .configure_fou_rx_port = true,
+ .expect_kern_decap_failure = true,
+ .test_gso = true
+ },
+ { /* ip6udp_mpls: kernel decapsulation step is skipped */
+ .ebpf_tun_type = "ip6udp",
+ .mac_tun_type = "mpls",
+ .iproute_tun_type = "ip6tnl",
+ .ipproto = 6,
+ .extra_decap_mod_args_cb = udp_decap_mod_args_cb,
+ .configure_fou_rx_port = true,
+ .tmode = "mode any ttl 255",
+ .expect_kern_decap_failure = true,
+ .test_gso = true
+ },
+};
+
+/*
+ * Test entry point: load the BPF skeleton, build the shared test network,
+ * then run every selected entry of subtests_cfg[] as its own subtest.
+ * subtest_cleanup() runs after each started subtest whether or not its
+ * setup succeeded. The skeleton is destroyed on every path once loaded.
+ */
+void test_tc_tunnel(void)
+{
+	struct test_tc_tunnel *skel;
+	struct subtest_cfg *cfg;
+	int i, ret;
+
+	skel = test_tc_tunnel__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel open and load"))
+		return;
+
+	if (!ASSERT_OK(setup(), "global setup"))
+		goto destroy;
+
+	for (i = 0; i < ARRAY_SIZE(subtests_cfg); i++) {
+		cfg = &subtests_cfg[i];
+		ret = build_subtest_name(cfg, cfg->name, TEST_NAME_MAX_LEN);
+		if (ret < 0 || !test__start_subtest(cfg->name))
+			continue;
+		if (subtest_setup(skel, cfg) == 0)
+			run_test(cfg);
+		subtest_cleanup(cfg);
+	}
+	cleanup();
+destroy:
+	test_tc_tunnel__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index bae0e9de277d..eb9309931272 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -534,85 +534,6 @@ static void ping6_dev1(void)
close_netns(nstoken);
}
-static int attach_tc_prog(int ifindex, int igr_fd, int egr_fd)
-{
- DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex,
- .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS);
- DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1,
- .priority = 1, .prog_fd = igr_fd);
- DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1,
- .priority = 1, .prog_fd = egr_fd);
- int ret;
-
- ret = bpf_tc_hook_create(&hook);
- if (!ASSERT_OK(ret, "create tc hook"))
- return ret;
-
- if (igr_fd >= 0) {
- hook.attach_point = BPF_TC_INGRESS;
- ret = bpf_tc_attach(&hook, &opts1);
- if (!ASSERT_OK(ret, "bpf_tc_attach")) {
- bpf_tc_hook_destroy(&hook);
- return ret;
- }
- }
-
- if (egr_fd >= 0) {
- hook.attach_point = BPF_TC_EGRESS;
- ret = bpf_tc_attach(&hook, &opts2);
- if (!ASSERT_OK(ret, "bpf_tc_attach")) {
- bpf_tc_hook_destroy(&hook);
- return ret;
- }
- }
-
- return 0;
-}
-
-static int generic_attach(const char *dev, int igr_fd, int egr_fd)
-{
- int ifindex;
-
- if (!ASSERT_OK_FD(igr_fd, "check ingress fd"))
- return -1;
- if (!ASSERT_OK_FD(egr_fd, "check egress fd"))
- return -1;
-
- ifindex = if_nametoindex(dev);
- if (!ASSERT_NEQ(ifindex, 0, "get ifindex"))
- return -1;
-
- return attach_tc_prog(ifindex, igr_fd, egr_fd);
-}
-
-static int generic_attach_igr(const char *dev, int igr_fd)
-{
- int ifindex;
-
- if (!ASSERT_OK_FD(igr_fd, "check ingress fd"))
- return -1;
-
- ifindex = if_nametoindex(dev);
- if (!ASSERT_NEQ(ifindex, 0, "get ifindex"))
- return -1;
-
- return attach_tc_prog(ifindex, igr_fd, -1);
-}
-
-static int generic_attach_egr(const char *dev, int egr_fd)
-{
- int ifindex;
-
- if (!ASSERT_OK_FD(egr_fd, "check egress fd"))
- return -1;
-
- ifindex = if_nametoindex(dev);
- if (!ASSERT_NEQ(ifindex, 0, "get ifindex"))
- return -1;
-
- return attach_tc_prog(ifindex, -1, egr_fd);
-}
-
static void test_vxlan_tunnel(void)
{
struct test_tunnel_kern *skel = NULL;
@@ -635,12 +556,12 @@ static void test_vxlan_tunnel(void)
goto done;
get_src_prog_fd = bpf_program__fd(skel->progs.vxlan_get_tunnel_src);
set_src_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_src);
- if (generic_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd))
+ if (tc_prog_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd))
goto done;
/* load and attach bpf prog to veth dev tc hook point */
set_dst_prog_fd = bpf_program__fd(skel->progs.veth_set_outer_dst);
- if (generic_attach_igr("veth1", set_dst_prog_fd))
+ if (tc_prog_attach("veth1", set_dst_prog_fd, -1))
goto done;
/* load and attach prog set_md to tunnel dev tc hook point at_ns0 */
@@ -648,7 +569,7 @@ static void test_vxlan_tunnel(void)
if (!ASSERT_OK_PTR(nstoken, "setns src"))
goto done;
set_dst_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_dst);
- if (generic_attach_egr(VXLAN_TUNL_DEV0, set_dst_prog_fd))
+ if (tc_prog_attach(VXLAN_TUNL_DEV0, -1, set_dst_prog_fd))
goto done;
close_netns(nstoken);
@@ -695,7 +616,7 @@ static void test_ip6vxlan_tunnel(void)
goto done;
get_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_get_tunnel_src);
set_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_src);
- if (generic_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd))
+ if (tc_prog_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd))
goto done;
/* load and attach prog set_md to tunnel dev tc hook point at_ns0 */
@@ -703,7 +624,7 @@ static void test_ip6vxlan_tunnel(void)
if (!ASSERT_OK_PTR(nstoken, "setns src"))
goto done;
set_dst_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_dst);
- if (generic_attach_egr(IP6VXLAN_TUNL_DEV0, set_dst_prog_fd))
+ if (tc_prog_attach(IP6VXLAN_TUNL_DEV0, -1, set_dst_prog_fd))
goto done;
close_netns(nstoken);
@@ -764,7 +685,7 @@ static void test_ipip_tunnel(enum ipip_encap encap)
skel->progs.ipip_set_tunnel);
}
- if (generic_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd))
+ if (tc_prog_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd))
goto done;
ping_dev0();
@@ -797,7 +718,7 @@ static void test_xfrm_tunnel(void)
/* attach tc prog to tunnel dev */
tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state);
- if (generic_attach_igr("veth1", tc_prog_fd))
+ if (tc_prog_attach("veth1", tc_prog_fd, -1))
goto done;
/* attach xdp prog to tunnel dev */
@@ -870,7 +791,7 @@ static void test_gre_tunnel(enum gre_test test)
if (!ASSERT_OK(err, "add tunnel"))
goto done;
- if (generic_attach(GRE_TUNL_DEV1, get_fd, set_fd))
+ if (tc_prog_attach(GRE_TUNL_DEV1, get_fd, set_fd))
goto done;
ping_dev0();
@@ -911,7 +832,7 @@ static void test_ip6gre_tunnel(enum ip6gre_test test)
set_fd = bpf_program__fd(skel->progs.ip6gretap_set_tunnel);
get_fd = bpf_program__fd(skel->progs.ip6gretap_get_tunnel);
- if (generic_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd))
+ if (tc_prog_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd))
goto done;
ping6_veth0();
@@ -954,7 +875,7 @@ static void test_erspan_tunnel(enum erspan_test test)
set_fd = bpf_program__fd(skel->progs.erspan_set_tunnel);
get_fd = bpf_program__fd(skel->progs.erspan_get_tunnel);
- if (generic_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd))
+ if (tc_prog_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd))
goto done;
ping_dev0();
@@ -990,7 +911,7 @@ static void test_ip6erspan_tunnel(enum erspan_test test)
set_fd = bpf_program__fd(skel->progs.ip4ip6erspan_set_tunnel);
get_fd = bpf_program__fd(skel->progs.ip4ip6erspan_get_tunnel);
- if (generic_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd))
+ if (tc_prog_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd))
goto done;
ping6_veth0();
@@ -1017,7 +938,7 @@ static void test_geneve_tunnel(void)
set_fd = bpf_program__fd(skel->progs.geneve_set_tunnel);
get_fd = bpf_program__fd(skel->progs.geneve_get_tunnel);
- if (generic_attach(GENEVE_TUNL_DEV1, get_fd, set_fd))
+ if (tc_prog_attach(GENEVE_TUNL_DEV1, get_fd, set_fd))
goto done;
ping_dev0();
@@ -1044,7 +965,7 @@ static void test_ip6geneve_tunnel(void)
set_fd = bpf_program__fd(skel->progs.ip6geneve_set_tunnel);
get_fd = bpf_program__fd(skel->progs.ip6geneve_get_tunnel);
- if (generic_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd))
+ if (tc_prog_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd))
goto done;
ping_dev0();
@@ -1083,7 +1004,7 @@ static void test_ip6tnl_tunnel(enum ip6tnl_test test)
get_fd = bpf_program__fd(skel->progs.ip6ip6_get_tunnel);
break;
}
- if (generic_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd))
+ if (tc_prog_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd))
goto done;
ping6_veth0();
diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
new file mode 100644
index 000000000000..5af28f359cfd
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
@@ -0,0 +1,2596 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <linux/bitmap.h>
+#include <linux/if_link.h>
+#include <linux/mman.h>
+#include <linux/netdev.h>
+#include <poll.h>
+#include <pthread.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "network_helpers.h"
+#include "test_xsk.h"
+#include "xsk_xdp_common.h"
+#include "xsk_xdp_progs.skel.h"
+
+#define DEFAULT_BATCH_SIZE 64
+#define MIN_PKT_SIZE 64
+#define MAX_ETH_JUMBO_SIZE 9000
+#define MAX_INTERFACES 2
+#define MAX_TEARDOWN_ITER 10
+#define MAX_TX_BUDGET_DEFAULT 32
+#define PKT_DUMP_NB_TO_PRINT 16
+/* Just to align the data in the packet */
+#define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2)
+#define POLL_TMOUT 1000
+#define THREAD_TMOUT 3
+#define UMEM_HEADROOM_TEST_SIZE 128
+#define XSK_DESC__INVALID_OPTION (0xffff)
+#define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1)
+#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024)
+#define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024)
+
+static const u8 g_mac[ETH_ALEN] = {0x55, 0x44, 0x33, 0x22, 0x11, 0x00};
+
+bool opt_verbose;
+pthread_barrier_t barr;
+pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int pkts_in_flight;
+
+/* Each 32-bit payload word holds the packet sequence number in its upper
+ * 16 bits and the intra-packet word sequence number in its lower 16 bits.
+ * Both are numbered from 0, so the 3rd packet's 5th word of data contains
+ * (2 << 16) | 4. start and size are byte counts; words are stored in
+ * network byte order.
+ */
+static void write_payload(void *dest, u32 pkt_nb, u32 start, u32 size)
+{
+	u32 first_word = start / sizeof(u32);
+	u32 nb_words = size / sizeof(u32);
+	u32 *word = dest;
+	u32 i;
+
+	for (i = 0; i < nb_words; i++)
+		word[i] = htonl(pkt_nb << 16 | (first_word + i));
+}
+
+static void gen_eth_hdr(struct xsk_socket_info *xsk, struct ethhdr *eth_hdr) /* fill eth_hdr from the socket's MAC addresses */
+{
+ memcpy(eth_hdr->h_dest, xsk->dst_mac, ETH_ALEN);
+ memcpy(eth_hdr->h_source, xsk->src_mac, ETH_ALEN);
+ eth_hdr->h_proto = htons(ETH_P_LOOPBACK); /* stored in network byte order */
+}
+
+/* Tell whether the interface object currently holds a created umem. */
+static bool is_umem_valid(struct ifobject *ifobj)
+{
+	return ifobj->umem->umem != NULL;
+}
+
+/* Map a test mode to XDP attach flags: SKB mode for TEST_MODE_SKB,
+ * driver mode for every other mode.
+ */
+static u32 mode_to_xdp_flags(enum test_mode mode)
+{
+	if (mode == TEST_MODE_SKB)
+		return XDP_FLAGS_SKB_MODE;
+
+	return XDP_FLAGS_DRV_MODE;
+}
+
+/* Total umem size in bytes (number of frames times frame size).
+ * One operand is widened first so the product is computed in 64 bits
+ * instead of being truncated to the operands' own type before the return
+ * (presumably 32-bit fields - the struct definition is not visible here).
+ */
+static u64 umem_size(struct xsk_umem_info *umem)
+{
+	return (u64)umem->num_frames * umem->frame_size;
+}
+
+/*
+ * Create the umem described by @umem on top of @buffer (@size bytes).
+ * Fill/completion ring sizes default to the XSK defaults unless @umem
+ * overrides them; unaligned chunk mode is enabled on request.
+ * For a shared umem on an Rx interface, allocation starts at an offset of
+ * one umem size. Returns 0 on success or the xsk_umem__create() error.
+ */
+int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer,
+		       u64 size)
+{
+	struct xsk_umem_config cfg = {
+		.fill_size = umem->fill_size ? umem->fill_size :
+					       XSK_RING_PROD__DEFAULT_NUM_DESCS,
+		.comp_size = umem->comp_size ? umem->comp_size :
+					       XSK_RING_CONS__DEFAULT_NUM_DESCS,
+		.frame_size = umem->frame_size,
+		.frame_headroom = umem->frame_headroom,
+		.flags = XSK_UMEM__DEFAULT_FLAGS
+	};
+	int err;
+
+	if (umem->unaligned_mode)
+		cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+
+	err = xsk_umem__create(&umem->umem, buffer, size,
+			       &umem->fq, &umem->cq, &cfg);
+	if (err)
+		return err;
+
+	umem->buffer = buffer;
+	if (!ifobj->shared_umem || !ifobj->rx_on)
+		return 0;
+
+	umem->base_addr = umem_size(umem);
+	umem->next_buffer = umem_size(umem);
+
+	return 0;
+}
+
+/* Hand out the next frame address of the umem and advance the cursor by
+ * one frame, wrapping back to base_addr once the end of the umem area
+ * (base_addr + umem size) is reached.
+ */
+static u64 umem_alloc_buffer(struct xsk_umem_info *umem)
+{
+	const u64 addr = umem->next_buffer;
+
+	umem->next_buffer += umem->frame_size;
+	if (umem->next_buffer < umem->base_addr + umem_size(umem))
+		return addr;
+
+	umem->next_buffer = umem->base_addr;
+	return addr;
+}
+
+static void umem_reset_alloc(struct xsk_umem_info *umem) /* restart buffer allocation from address 0 */
+{
+ umem->next_buffer = 0; /* NOTE(review): resets to 0, not to base_addr - confirm intended for shared umem */
+}
+
+/*
+ * Enable busy polling on the XSK socket: prefer busy poll, set the busy
+ * poll value to 20 and the budget to the socket batch size.
+ * Returns 0 on success or -errno of the first setsockopt() that fails.
+ */
+static int enable_busy_poll(struct xsk_socket_info *xsk)
+{
+	const struct {
+		int optname;
+		int value;
+	} opts[] = {
+		{ SO_PREFER_BUSY_POLL, 1 },
+		{ SO_BUSY_POLL, 20 },
+		{ SO_BUSY_POLL_BUDGET, xsk->batch_size },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
+		int sock_opt = opts[i].value;
+
+		if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET,
+			       opts[i].optname, (void *)&sock_opt,
+			       sizeof(sock_opt)) < 0)
+			return -errno;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the XSK socket of @xsk on queue 0 of @ifobject, bound to @umem.
+ * Ring sizes come from the umem overrides when set (fill size for Rx,
+ * completion size for Tx), otherwise from the socket Rx queue size and
+ * the default Tx size. Only the rings enabled on the interface are
+ * passed. Returns the xsk_socket__create() result.
+ */
+int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem,
+			 struct ifobject *ifobject, bool shared)
+{
+	struct xsk_socket_config cfg = {};
+
+	xsk->umem = umem;
+
+	cfg.rx_size = umem->fill_size ? umem->fill_size : xsk->rxqsize;
+	cfg.tx_size = umem->comp_size ? umem->comp_size :
+					XSK_RING_PROD__DEFAULT_NUM_DESCS;
+
+	cfg.bind_flags = ifobject->bind_flags;
+	if (shared)
+		cfg.bind_flags |= XDP_SHARED_UMEM;
+	/* MTUs above a standard Ethernet packet need multi-buffer support */
+	if (ifobject->mtu > MAX_ETH_PKT_SIZE)
+		cfg.bind_flags |= XDP_USE_SG;
+
+	return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem,
+				  ifobject->rx_on ? &xsk->rx : NULL,
+				  ifobject->tx_on ? &xsk->tx : NULL, &cfg);
+}
+
+#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags"
+/* Read the value stored in MAX_SKB_FRAGS_PATH. Returns 0 when the file
+ * cannot be opened or parsed; a message is printed in both cases.
+ */
+static unsigned int get_max_skb_frags(void)
+{
+	unsigned int frags = 0;
+	FILE *fp = fopen(MAX_SKB_FRAGS_PATH, "r");
+
+	if (fp == NULL) {
+		ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH);
+		return 0;
+	}
+
+	if (fscanf(fp, "%u", &frags) != 1)
+		ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH);
+	fclose(fp);
+
+	return frags;
+}
+
+/*
+ * Apply ifobj->ring to the interface, retrying up to SOCK_RECONF_CTR
+ * times while the call fails with EBUSY (sleeping USLEEP_MAX between
+ * attempts). Returns 0 on success or -errno of the failing attempt.
+ */
+static int set_ring_size(struct ifobject *ifobj)
+{
+	u32 attempt;
+	int ret;
+
+	for (attempt = 1; attempt <= SOCK_RECONF_CTR; attempt++) {
+		ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring);
+		if (ret == 0)
+			break;
+
+		/* Give up on the last attempt or on any error but EBUSY */
+		if (attempt >= SOCK_RECONF_CTR || errno != EBUSY)
+			return -errno;
+
+		usleep(USLEEP_MAX);
+	}
+
+	return ret;
+}
+
+int hw_ring_size_reset(struct ifobject *ifobj) /* restore the saved default Tx/Rx ring sizes; returns set_ring_size() result */
+{
+ ifobj->ring.tx_pending = ifobj->set_ring.default_tx;
+ ifobj->ring.rx_pending = ifobj->set_ring.default_rx;
+ return set_ring_size(ifobj);
+}
+
+static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, /* reset both interface objects and the test spec to defaults */
+ struct ifobject *ifobj_rx)
+{
+ u32 i, j;
+
+ for (i = 0; i < MAX_INTERFACES; i++) {
+ struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; /* index 0 is the Tx interface, index 1 the Rx one */
+
+ ifobj->xsk = &ifobj->xsk_arr[0];
+ ifobj->use_poll = false;
+ ifobj->use_fill_ring = true;
+ ifobj->release_rx = true;
+ ifobj->validation_func = NULL;
+ ifobj->use_metadata = false;
+
+ if (i == 0) { /* Tx interface only transmits */
+ ifobj->rx_on = false;
+ ifobj->tx_on = true;
+ } else { /* Rx interface only receives */
+ ifobj->rx_on = true;
+ ifobj->tx_on = false;
+ }
+
+ memset(ifobj->umem, 0, sizeof(*ifobj->umem)); /* clears any per-test umem overrides */
+ ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS;
+ ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+
+ for (j = 0; j < MAX_SOCKETS; j++) {
+ memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j]));
+ ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+ ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE;
+ if (i == 0)
+ ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default;
+ else
+ ifobj->xsk_arr[j].pkt_stream = test->rx_pkt_stream_default;
+
+ memcpy(ifobj->xsk_arr[j].src_mac, g_mac, ETH_ALEN);
+ memcpy(ifobj->xsk_arr[j].dst_mac, g_mac, ETH_ALEN);
+ ifobj->xsk_arr[j].src_mac[5] += ((j * 2) + 0); /* socket j: last source MAC byte offset by 2j */
+ ifobj->xsk_arr[j].dst_mac[5] += ((j * 2) + 1); /* socket j: last destination MAC byte offset by 2j + 1 */
+ }
+ }
+
+ if (ifobj_tx->hw_ring_size_supp)
+ hw_ring_size_reset(ifobj_tx); /* return value is not checked */
+
+ test->ifobj_tx = ifobj_tx;
+ test->ifobj_rx = ifobj_rx;
+ test->current_step = 0;
+ test->total_steps = 1;
+ test->nb_sockets = 1;
+ test->fail = false;
+ test->set_ring = false;
+ test->adjust_tail = false;
+ test->adjust_tail_support = false;
+ test->mtu = MAX_ETH_PKT_SIZE;
+ test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; /* default XDP program and xsk map on both sides */
+ test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk;
+ test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog;
+ test->xskmap_tx = ifobj_tx->xdp_progs->maps.xsk;
+}
+
+/*
+ * Prepare @test for running @test_to_run in the given @mode.
+ *
+ * The test spec is zeroed except for the two default packet streams, which
+ * are carried over. Both interfaces get XDP_USE_NEED_WAKEUP plus either
+ * XDP_ZEROCOPY (TEST_MODE_ZC) or XDP_COPY as bind flags.
+ */
+void test_init(struct test_spec *test, struct ifobject *ifobj_tx,
+	       struct ifobject *ifobj_rx, enum test_mode mode,
+	       const struct test_spec *test_to_run)
+{
+	struct pkt_stream *saved_tx = test->tx_pkt_stream_default;
+	struct pkt_stream *saved_rx = test->rx_pkt_stream_default;
+	u32 if_idx;
+
+	memset(test, 0, sizeof(*test));
+	test->tx_pkt_stream_default = saved_tx;
+	test->rx_pkt_stream_default = saved_rx;
+
+	for (if_idx = 0; if_idx < MAX_INTERFACES; if_idx++) {
+		struct ifobject *ifobj = if_idx ? ifobj_rx : ifobj_tx;
+
+		ifobj->bind_flags = XDP_USE_NEED_WAKEUP;
+		ifobj->bind_flags |= (mode == TEST_MODE_ZC) ? XDP_ZEROCOPY : XDP_COPY;
+	}
+
+	memcpy(test->name, test_to_run->name, MAX_TEST_NAME_SIZE);
+	test->test_func = test_to_run->test_func;
+	test->mode = mode;
+
+	__test_spec_init(test, ifobj_tx, ifobj_rx);
+}
+
+/* Re-run the default initialisation on the interfaces already attached to @test. */
+static void test_spec_reset(struct test_spec *test)
+{
+	struct ifobject *tx = test->ifobj_tx;
+	struct ifobject *rx = test->ifobj_rx;
+
+	__test_spec_init(test, tx, rx);
+}
+
+/* Record the XDP programs and XSK maps the test should use on the Rx and Tx side. */
+static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx,
+				   struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx,
+				   struct bpf_map *xskmap_tx)
+{
+	/* Rx side */
+	test->xdp_prog_rx = xdp_prog_rx;
+	test->xskmap_rx = xskmap_rx;
+
+	/* Tx side */
+	test->xdp_prog_tx = xdp_prog_tx;
+	test->xskmap_tx = xskmap_tx;
+}
+
+/*
+ * Make sure both interfaces use @mtu, Rx first and then Tx.
+ *
+ * An interface whose cached mtu already matches is left alone. Returns 0 on
+ * success or the first error reported by xsk_set_mtu().
+ */
+static int test_spec_set_mtu(struct test_spec *test, int mtu)
+{
+	struct ifobject *ifobjs[] = { test->ifobj_rx, test->ifobj_tx };
+	unsigned int n;
+
+	for (n = 0; n < sizeof(ifobjs) / sizeof(ifobjs[0]); n++) {
+		struct ifobject *ifobj = ifobjs[n];
+		int err;
+
+		if (ifobj->mtu == mtu)
+			continue;
+
+		err = xsk_set_mtu(ifobj->ifindex, mtu);
+		if (err)
+			return err;
+		ifobj->mtu = mtu;
+	}
+
+	return 0;
+}
+
+/* Rewind @pkt_stream to its first packet and clear its Rx counter; NULL is ignored. */
+void pkt_stream_reset(struct pkt_stream *pkt_stream)
+{
+	if (!pkt_stream)
+		return;
+
+	pkt_stream->nb_rx_pkts = 0;
+	pkt_stream->current_pkt_nb = 0;
+}
+
+/* Return the packet at the stream cursor and advance it, or NULL at the end of the stream. */
+static struct pkt *pkt_stream_get_next_tx_pkt(struct pkt_stream *pkt_stream)
+{
+	if (pkt_stream->current_pkt_nb < pkt_stream->nb_pkts)
+		return &pkt_stream->pkts[pkt_stream->current_pkt_nb++];
+
+	return NULL;
+}
+
+/*
+ * Return the next valid packet at or after the stream cursor, leaving the
+ * cursor just past it; NULL once the stream is exhausted.
+ *
+ * *pkts_sent is incremented for every entry stepped over, valid or not.
+ */
+static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent)
+{
+	for (; pkt_stream->current_pkt_nb < pkt_stream->nb_pkts; pkt_stream->current_pkt_nb++) {
+		struct pkt *candidate = &pkt_stream->pkts[pkt_stream->current_pkt_nb];
+
+		(*pkts_sent)++;
+		if (candidate->valid) {
+			pkt_stream->current_pkt_nb++;
+			return candidate;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Free @pkt_stream together with its packet array.
+ *
+ * Like free(), a NULL @pkt_stream is accepted and ignored, so that cleanup
+ * paths do not crash when a stream pointer was never set.
+ */
+void pkt_stream_delete(struct pkt_stream *pkt_stream)
+{
+	if (!pkt_stream)
+		return;
+
+	free(pkt_stream->pkts);
+	free(pkt_stream);
+}
+
+/*
+ * Point the first Tx and Rx sockets back at the default packet streams,
+ * deleting whatever custom stream was installed in their place.
+ */
+void pkt_stream_restore_default(struct test_spec *test)
+{
+	struct pkt_stream *cur_tx = test->ifobj_tx->xsk->pkt_stream;
+	struct pkt_stream *cur_rx = test->ifobj_rx->xsk->pkt_stream;
+	bool tx_replaced = cur_tx != test->tx_pkt_stream_default;
+	bool rx_replaced = cur_rx != test->rx_pkt_stream_default;
+
+	if (tx_replaced) {
+		pkt_stream_delete(test->ifobj_tx->xsk->pkt_stream);
+		test->ifobj_tx->xsk->pkt_stream = test->tx_pkt_stream_default;
+	}
+
+	if (rx_replaced) {
+		pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream);
+		test->ifobj_rx->xsk->pkt_stream = test->rx_pkt_stream_default;
+	}
+}
+
+/*
+ * Allocate a zeroed packet stream with room for @nb_pkts packets.
+ * Returns NULL if either allocation fails.
+ */
+static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts)
+{
+	struct pkt_stream *stream = calloc(1, sizeof(*stream));
+
+	if (!stream)
+		return NULL;
+
+	stream->pkts = calloc(nb_pkts, sizeof(*stream->pkts));
+	if (stream->pkts) {
+		stream->nb_pkts = nb_pkts;
+		return stream;
+	}
+
+	free(stream);
+	return NULL;
+}
+
+/*
+ * Number of descriptors needed for @pkt.
+ *
+ * A NULL packet counts as one. For a regular stream the length is divided
+ * into @frame_size pieces (invalid or empty packets count as one). For a
+ * verbatim stream the following entries are walked until one does not
+ * continue the packet or is invalid.
+ */
+static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pkt *pkt)
+{
+	u32 count = 1, pos;
+
+	if (!pkt)
+		return 1;
+
+	if (pkt_stream->verbatim) {
+		/* Search for the end of the packet in verbatim mode */
+		if (!pkt_continues(pkt->options))
+			return count;
+
+		pkt++;
+		for (pos = pkt_stream->current_pkt_nb; pos < pkt_stream->nb_pkts; pos++) {
+			count++;
+			if (!pkt_continues(pkt->options) || !pkt->valid)
+				break;
+			pkt++;
+		}
+		return count;
+	}
+
+	if (!pkt->valid || !pkt->len)
+		return 1;
+
+	return ceil_u32(pkt->len, frame_size);
+}
+
+/* A packet is valid when its length does not exceed MAX_ETH_JUMBO_SIZE; @offset is not used. */
+static bool set_pkt_valid(int offset, u32 len)
+{
+	if (len > MAX_ETH_JUMBO_SIZE)
+		return false;
+
+	return true;
+}
+
+/* Store @offset and @len in @pkt and recompute its validity; @pkt_stream is not used. */
+static void pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len)
+{
+	pkt->valid = set_pkt_valid(offset, len);
+	pkt->len = len;
+	pkt->offset = offset;
+}
+
+/*
+ * Update @pkt via pkt_set() and keep the stream's count of valid entries in
+ * step with any change of the packet's validity.
+ */
+static void pkt_stream_pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len)
+{
+	bool was_valid = pkt->valid;
+	bool now_valid;
+
+	pkt_set(pkt_stream, pkt, offset, len);
+	now_valid = pkt->valid;
+
+	if (now_valid && !was_valid)
+		pkt_stream->nb_valid_entries++;
+	else if (!now_valid && was_valid)
+		pkt_stream->nb_valid_entries--;
+}
+
+/* Round @len up to a whole number of umem frames and return the size in bytes. */
+static u32 pkt_get_buffer_len(struct xsk_umem_info *umem, u32 len)
+{
+	u32 nb_frames = ceil_u32(len, umem->frame_size);
+
+	return nb_frames * umem->frame_size;
+}
+
+/*
+ * Build a stream of @nb_pkts packets, each @pkt_len bytes at offset 0.
+ *
+ * Packet i gets the number @nb_start + i * @nb_off. Returns NULL when the
+ * stream cannot be allocated.
+ */
+static struct pkt_stream *__pkt_stream_generate(u32 nb_pkts, u32 pkt_len, u32 nb_start, u32 nb_off)
+{
+	struct pkt_stream *stream = __pkt_stream_alloc(nb_pkts);
+	u32 n;
+
+	if (!stream)
+		return NULL;
+
+	stream->nb_pkts = nb_pkts;
+	stream->max_pkt_len = pkt_len;
+
+	for (n = 0; n < nb_pkts; n++) {
+		pkt_stream_pkt_set(stream, &stream->pkts[n], 0, pkt_len);
+		stream->pkts[n].pkt_nb = nb_start + n * nb_off;
+	}
+
+	return stream;
+}
+
+/* Build a stream of @nb_pkts packets of @pkt_len bytes, numbered 0, 1, 2, ... */
+struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len)
+{
+	const u32 first_nb = 0, step = 1;
+
+	return __pkt_stream_generate(nb_pkts, pkt_len, first_nb, step);
+}
+
+/*
+ * Generate a fresh stream with as many packets as @pkt_stream, all using
+ * the length of its first packet.
+ */
+static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream)
+{
+	u32 nb_pkts = pkt_stream->nb_pkts;
+	u32 pkt_len = pkt_stream->pkts[0].len;
+
+	return pkt_stream_generate(nb_pkts, pkt_len);
+}
+
+/*
+ * Install a newly generated stream (@nb_pkts packets of @pkt_len bytes) on
+ * the current socket of @ifobj.
+ *
+ * The socket's stream pointer is only updated once generation succeeded, so
+ * on failure the previously installed stream stays in place instead of
+ * being replaced by NULL.
+ *
+ * Returns 0 on success, -ENOMEM if the stream could not be allocated.
+ */
+static int pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len)
+{
+	struct pkt_stream *pkt_stream = pkt_stream_generate(nb_pkts, pkt_len);
+
+	if (!pkt_stream)
+		return -ENOMEM;
+
+	ifobj->xsk->pkt_stream = pkt_stream;
+
+	return 0;
+}
+
+/*
+ * Replace the streams of the Tx and then the Rx interface with generated
+ * ones. Stops at, and returns, the first error.
+ */
+static int pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len)
+{
+	int err = pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len);
+
+	if (!err)
+		err = pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len);
+
+	return err;
+}
+
+/*
+ * Install a clone of the current stream of @ifobj in which every
+ * odd-indexed packet is set to @pkt_len bytes at @offset.
+ *
+ * Returns 0 on success, -ENOMEM if the clone cannot be allocated.
+ */
+static int __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len,
+				     int offset)
+{
+	struct pkt_stream *old_stream = ifobj->xsk->pkt_stream;
+	struct pkt_stream *new_stream = pkt_stream_clone(old_stream);
+	u32 odd;
+
+	if (!new_stream)
+		return -ENOMEM;
+
+	for (odd = 1; odd < old_stream->nb_pkts; odd += 2)
+		pkt_stream_pkt_set(new_stream, &new_stream->pkts[odd], offset, pkt_len);
+
+	ifobj->xsk->pkt_stream = new_stream;
+
+	return 0;
+}
+
+/*
+ * Apply __pkt_stream_replace_half() to the Tx and then the Rx interface.
+ * Stops at, and returns, the first error.
+ */
+static int pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset)
+{
+	int err;
+
+	err = __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset);
+	if (!err)
+		err = __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset);
+
+	return err;
+}
+
+/*
+ * Give the Rx side a stream shaped like the Tx stream (same packet count,
+ * length of the first Tx packet) in which every odd-indexed packet is
+ * marked invalid, i.e. not expected to be received.
+ *
+ * The new stream is generated before the old one is released, so on
+ * allocation failure the Rx socket keeps its current stream. The count of
+ * valid entries is decremented once per packet actually invalidated, which
+ * stays correct for an odd number of packets.
+ *
+ * Returns 0 on success, -ENOMEM if the stream cannot be allocated.
+ */
+static int pkt_stream_receive_half(struct test_spec *test)
+{
+	struct pkt_stream *tx_stream = test->ifobj_tx->xsk->pkt_stream;
+	struct pkt_stream *old_stream = test->ifobj_rx->xsk->pkt_stream;
+	struct pkt_stream *pkt_stream;
+	u32 i;
+
+	pkt_stream = pkt_stream_generate(tx_stream->nb_pkts, tx_stream->pkts[0].len);
+	if (!pkt_stream)
+		return -ENOMEM;
+
+	if (old_stream != test->rx_pkt_stream_default)
+		/* Packet stream has already been replaced so we have to release this one.
+		 * The newly created one will be freed by the restore_default() at the
+		 * end of the test
+		 */
+		pkt_stream_delete(old_stream);
+
+	test->ifobj_rx->xsk->pkt_stream = pkt_stream;
+
+	for (i = 1; i < pkt_stream->nb_pkts; i += 2) {
+		if (!pkt_stream->pkts[i].valid)
+			continue;
+
+		pkt_stream->pkts[i].valid = false;
+		pkt_stream->nb_valid_entries--;
+	}
+
+	return 0;
+}
+
+/*
+ * For each active socket i, replace its Tx and Rx streams with streams half
+ * as long whose packet numbers are i, i + 2, i + 4, ...
+ *
+ * Returns 0 on success, -ENOMEM as soon as one stream cannot be generated.
+ */
+static int pkt_stream_even_odd_sequence(struct test_spec *test)
+{
+	struct ifobject *sides[] = { test->ifobj_tx, test->ifobj_rx };
+	unsigned int side;
+	u32 sock;
+
+	for (sock = 0; sock < test->nb_sockets; sock++) {
+		for (side = 0; side < sizeof(sides) / sizeof(sides[0]); side++) {
+			struct pkt_stream *cur = sides[side]->xsk_arr[sock].pkt_stream;
+			struct pkt_stream *half;
+
+			half = __pkt_stream_generate(cur->nb_pkts / 2, cur->pkts[0].len,
+						     sock, 2);
+			if (!half)
+				return -ENOMEM;
+			sides[side]->xsk_arr[sock].pkt_stream = half;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Delete the per-socket streams of all active sockets, except the ones
+ * currently referenced through ifobj->xsk on each side.
+ */
+static void release_even_odd_sequence(struct test_spec *test)
+{
+	struct pkt_stream *keep_tx = test->ifobj_tx->xsk->pkt_stream;
+	struct pkt_stream *keep_rx = test->ifobj_rx->xsk->pkt_stream;
+	u32 sock;
+
+	for (sock = 0; sock < test->nb_sockets; sock++) {
+		struct pkt_stream *tx_stream = test->ifobj_tx->xsk_arr[sock].pkt_stream;
+
+		/* keep_{rx/tx} will be freed by restore_default() */
+		if (tx_stream != keep_tx)
+			pkt_stream_delete(tx_stream);
+		if (test->ifobj_rx->xsk_arr[sock].pkt_stream != keep_rx)
+			pkt_stream_delete(test->ifobj_rx->xsk_arr[sock].pkt_stream);
+	}
+}
+
+/*
+ * Address to place in the descriptor for @pkt: a freshly allocated umem
+ * buffer plus the packet offset for valid packets, the bare offset (no
+ * buffer is allocated) for invalid ones.
+ */
+static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem)
+{
+	if (pkt->valid)
+		return pkt->offset + umem_alloc_buffer(umem);
+
+	return pkt->offset;
+}
+
+/* Step the stream cursor back by one packet. */
+static void pkt_stream_cancel(struct pkt_stream *pkt_stream)
+{
+	pkt_stream->current_pkt_nb -= 1;
+}
+
+/*
+ * Write the contents of one fragment into the umem at @addr.
+ *
+ * Fragments shorter than MIN_PKT_SIZE are left untouched. The first
+ * fragment of a packet (@bytes_written == 0) starts with the Ethernet
+ * header; for later fragments the header size is taken off @bytes_written
+ * before it is handed to write_payload().
+ */
+static void pkt_generate(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, u64 addr, u32 len,
+			 u32 pkt_nb, u32 bytes_written)
+{
+	void *dst = xsk_umem__get_data(umem->buffer, addr);
+
+	if (len < MIN_PKT_SIZE)
+		return;
+
+	if (bytes_written) {
+		bytes_written -= PKT_HDR_SIZE;
+	} else {
+		gen_eth_hdr(xsk, dst);
+
+		dst += PKT_HDR_SIZE;
+		len -= PKT_HDR_SIZE;
+	}
+
+	write_payload(dst, pkt_nb, bytes_written, len);
+}
+
+/*
+ * Build a packet stream from the caller supplied @frames array.
+ *
+ * In verbatim mode every frame is copied as its own entry and numbered by
+ * the packet it belongs to. Otherwise consecutive valid frames that
+ * continue a packet are merged into one entry whose length is the sum of
+ * the valid frame lengths. @ifobj is not used.
+ *
+ * Returns NULL when the stream cannot be allocated.
+ */
+static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames,
+						       u32 nb_frames, bool verbatim)
+{
+	u32 frame_idx, merged_len = 0, out_idx = 0, seq = 0;
+	struct pkt_stream *stream = __pkt_stream_alloc(nb_frames);
+
+	if (!stream)
+		return NULL;
+
+	for (frame_idx = 0; frame_idx < nb_frames; frame_idx++) {
+		struct pkt *src = &frames[frame_idx];
+		struct pkt *dst = &stream->pkts[out_idx];
+
+		dst->offset = src->offset;
+		if (verbatim) {
+			*dst = *src;
+			dst->pkt_nb = seq;
+			/* The packet number only advances once the packet ends */
+			if (!src->valid || !pkt_continues(src->options))
+				seq++;
+		} else {
+			if (src->valid) {
+				merged_len += src->len;
+				/* More frames of this packet follow, keep accumulating */
+				if (pkt_continues(src->options))
+					continue;
+			}
+
+			dst->pkt_nb = out_idx;
+			dst->len = merged_len;
+			dst->valid = src->valid;
+			dst->options = 0;
+
+			merged_len = 0;
+		}
+
+		print_verbose("offset: %d len: %u valid: %u options: %u pkt_nb: %u\n",
+			      dst->offset, dst->len, dst->valid, dst->options, dst->pkt_nb);
+
+		if (dst->valid) {
+			if (dst->len > stream->max_pkt_len)
+				stream->max_pkt_len = dst->len;
+			stream->nb_valid_entries++;
+		}
+
+		out_idx++;
+	}
+
+	stream->nb_pkts = out_idx;
+	stream->verbatim = verbatim;
+	return stream;
+}
+
+/*
+ * Install custom streams built from @pkts: a verbatim one on the Tx socket
+ * and a merged (non-verbatim) one on the Rx socket.
+ *
+ * Returns 0 on success, -ENOMEM if either stream cannot be allocated.
+ */
+static int pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts)
+{
+	struct pkt_stream *tx_stream, *rx_stream;
+
+	tx_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true);
+	if (!tx_stream)
+		return -ENOMEM;
+	test->ifobj_tx->xsk->pkt_stream = tx_stream;
+
+	rx_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false);
+	if (!rx_stream)
+		return -ENOMEM;
+	test->ifobj_rx->xsk->pkt_stream = rx_stream;
+
+	return 0;
+}
+
+/*
+ * Print @cnt payload words as "pkt_nb:seqnum" pairs. Each word is in
+ * network byte order, with the packet number in the upper 16 bits and the
+ * sequence number in the lower 16 bits.
+ */
+static void pkt_print_data(u32 *data, u32 cnt)
+{
+	u32 *end = data + cnt;
+
+	for (; data < end; data++) {
+		u32 word = ntohl(*data);
+
+		ksft_print_msg("%u:%u ", word >> 16, word & 0xffff);
+	}
+}
+
+/*
+ * Debug dump of a packet or fragment of @len bytes.
+ *
+ * When @eth_header is set the MAC addresses are printed and the payload is
+ * taken to start PKT_HDR_SIZE bytes in. The first PKT_DUMP_NB_TO_PRINT
+ * payload words are always printed; the last ones as well if @len is large
+ * enough.
+ */
+static void pkt_dump(void *pkt, u32 len, bool eth_header)
+{
+	u32 *payload = pkt;
+	u32 byte;
+
+	if (eth_header) {
+		struct ethhdr *eth = pkt;
+
+		/* extract L2 frame */
+		ksft_print_msg("DEBUG>> L2: dst mac: ");
+		for (byte = 0; byte < ETH_ALEN; byte++)
+			ksft_print_msg("%02X", eth->h_dest[byte]);
+
+		ksft_print_msg("\nDEBUG>> L2: src mac: ");
+		for (byte = 0; byte < ETH_ALEN; byte++)
+			ksft_print_msg("%02X", eth->h_source[byte]);
+
+		payload = pkt + PKT_HDR_SIZE;
+	}
+
+	/* extract L5 frame */
+	ksft_print_msg("\nDEBUG>> L5: seqnum: ");
+	pkt_print_data(payload, PKT_DUMP_NB_TO_PRINT);
+	ksft_print_msg("....");
+	if (len > PKT_DUMP_NB_TO_PRINT * sizeof(u32)) {
+		u32 *tail = payload + len / sizeof(u32) - PKT_DUMP_NB_TO_PRINT;
+
+		ksft_print_msg("\n.... ");
+		pkt_print_data(tail, PKT_DUMP_NB_TO_PRINT);
+	}
+	ksft_print_msg("\n---------------------------------------\n");
+}
+
+/*
+ * Check that @addr sits at the expected offset inside its umem frame.
+ *
+ * In aligned mode the expected offset is frame_headroom plus
+ * XDP_PACKET_HEADROOM. In unaligned mode it is the offset of a valid packet
+ * plus XDP_PACKET_HEADROOM. Prints both values and returns false on a
+ * mismatch.
+ */
+static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr)
+{
+	u32 got = addr % umem->frame_size, want;
+	u32 headroom = 0;
+	int pkt_offset = 0;
+
+	if (umem->unaligned_mode) {
+		if (pkt->valid)
+			pkt_offset = pkt->offset;
+	} else {
+		headroom = umem->frame_headroom;
+	}
+
+	want = (pkt_offset + headroom + XDP_PACKET_HEADROOM) % umem->frame_size;
+	if (got != want) {
+		ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, want, got);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Check the struct xdp_info stored immediately in front of the packet data
+ * at @addr: its count must equal the packet number of @pkt.
+ *
+ * The expected packet number is printed with %u, matching how pkt_nb is
+ * printed elsewhere in this file.
+ */
+static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr)
+{
+	void *data = xsk_umem__get_data(buffer, addr);
+	struct xdp_info *meta = data - sizeof(struct xdp_info);
+
+	if (meta->count != pkt->pkt_nb) {
+		ksft_print_msg("[%s] expected meta_count [%u], got meta_count [%llu]\n",
+			       __func__, pkt->pkt_nb,
+			       (unsigned long long)meta->count);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Look up element 0 of the "xsk_xdp_.bss" map of @skel_rx and report
+ * through @supported whether it differs from -EOPNOTSUPP.
+ *
+ * Returns 0 on success, -EINVAL if the map is missing or not internal, or
+ * the error from bpf_map_lookup_elem().
+ */
+static int is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx, bool *supported)
+{
+	struct bpf_map *bss_map;
+	int value = 0, key = 0, err;
+
+	bss_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss");
+	if (!bss_map || !bpf_map__is_internal(bss_map)) {
+		ksft_print_msg("Error: could not find bss section of XDP program\n");
+		return -EINVAL;
+	}
+
+	err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &value);
+	if (err) {
+		ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", err);
+		return err;
+	}
+
+	/* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail
+	 * helper is not supported. Skip the adjust_tail test case in this scenario.
+	 */
+	*supported = (value != -EOPNOTSUPP);
+
+	return 0;
+}
+
+/*
+ * Validate one received fragment.
+ *
+ * The fragment must lie inside the umem and, in aligned mode, inside a
+ * single frame. Its first payload word must carry @expected_pkt_nb and the
+ * sequence number implied by @bytes_processed, and its last word the
+ * matching end sequence number. The fragment is dumped on a content
+ * mismatch.
+ */
+static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb,
+			  u32 bytes_processed)
+{
+	void *frag = xsk_umem__get_data(umem->buffer, addr);
+	u32 got_seq, got_pkt_nb, want_seq, last_word;
+	u32 *word = frag;
+
+	addr -= umem->base_addr;
+
+	if (addr >= umem->num_frames * umem->frame_size ||
+	    addr + len > umem->num_frames * umem->frame_size) {
+		ksft_print_msg("Frag invalid addr: %llx len: %u\n",
+			       (unsigned long long)addr, len);
+		return false;
+	}
+	if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) {
+		ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n",
+			       (unsigned long long)addr, len);
+		return false;
+	}
+
+	if (bytes_processed) {
+		bytes_processed -= PKT_HDR_SIZE;
+	} else {
+		/* First fragment: the payload starts after the header */
+		word += PKT_HDR_SIZE / sizeof(*word);
+		len -= PKT_HDR_SIZE;
+	}
+
+	want_seq = bytes_processed / sizeof(*word);
+	got_seq = ntohl(*word) & 0xffff;
+	got_pkt_nb = ntohl(*word) >> 16;
+
+	if (expected_pkt_nb != got_pkt_nb) {
+		ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n",
+			       __func__, expected_pkt_nb, got_pkt_nb);
+		goto error;
+	}
+	if (want_seq != got_seq) {
+		ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n",
+			       __func__, want_seq, got_seq);
+		goto error;
+	}
+
+	last_word = len / sizeof(*word) - 1;
+	word += last_word;
+	want_seq += last_word;
+	got_seq = ntohl(*word) & 0xffff;
+	if (want_seq != got_seq) {
+		ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n",
+			       __func__, want_seq, got_seq);
+		goto error;
+	}
+
+	return true;
+
+error:
+	pkt_dump(frag, len, !bytes_processed);
+	return false;
+}
+
+/*
+ * Check that the total received length @len matches the expected packet
+ * length; the packet is dumped on a mismatch.
+ *
+ * Both lengths are unsigned and are therefore printed with %u.
+ */
+static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len)
+{
+	if (pkt->len != len) {
+		ksft_print_msg("[%s] expected packet length [%u], got length [%u]\n",
+			       __func__, pkt->len, len);
+		pkt_dump(xsk_umem__get_data(buffer, addr), len, true);
+		return false;
+	}
+
+	return true;
+}
+
+/* Atomically read *@counter with acquire ordering. */
+static u32 load_value(u32 *counter)
+{
+	u32 snapshot = __atomic_load_n(counter, __ATOMIC_ACQUIRE);
+
+	return snapshot;
+}
+
+/*
+ * Kick Tx with sendto() and check how far the Tx consumer pointer moved.
+ *
+ * The sendto() result is stored in *@ret. Returns false only when more than
+ * MAX_TX_BUDGET_DEFAULT descriptors were pending and the consumer did not
+ * advance by exactly that budget.
+ */
+static bool kick_tx_with_check(struct xsk_socket_info *xsk, int *ret)
+{
+	u32 budget = MAX_TX_BUDGET_DEFAULT;
+	u32 cons_before, pending;
+	int consumed;
+
+	cons_before = load_value(xsk->tx.consumer);
+	pending = load_value(xsk->tx.producer) - cons_before;
+	*ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
+
+	consumed = load_value(xsk->tx.consumer) - cons_before;
+	/* By default, xsk should consume exact @budget descs at one
+	 * send in this case where hitting the max budget limit in while
+	 * loop is triggered in __xsk_generic_xmit(). Please make sure that
+	 * the number of descs to be sent is larger than @budget, or
+	 * else the tx.consumer will be updated in xskq_cons_peek_desc()
+	 * in time which hides the issue we try to verify.
+	 */
+	return pending <= budget || consumed == budget;
+}
+
+/*
+ * Kick the Tx path of @xsk with a zero-length sendto().
+ *
+ * ENOBUFS, EAGAIN, EBUSY and ENETDOWN are tolerated after a 100us sleep.
+ * With check_consumer set, a failed consumer check is a test failure.
+ * Returns TEST_PASS or TEST_FAILURE.
+ */
+int kick_tx(struct xsk_socket_info *xsk)
+{
+	int ret;
+
+	if (!xsk->check_consumer)
+		ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
+	else if (!kick_tx_with_check(xsk, &ret))
+		return TEST_FAILURE;
+
+	if (ret >= 0)
+		return TEST_PASS;
+
+	switch (errno) {
+	case ENOBUFS:
+	case EAGAIN:
+	case EBUSY:
+	case ENETDOWN:
+		usleep(100);
+		return TEST_PASS;
+	default:
+		return TEST_FAILURE;
+	}
+}
+
+/*
+ * Kick the Rx path of @xsk with a zero-length non-blocking recvfrom().
+ * Returns TEST_FAILURE if the call reports an error, TEST_PASS otherwise.
+ */
+int kick_rx(struct xsk_socket_info *xsk)
+{
+	int fd = xsk_socket__fd(xsk->xsk);
+
+	if (recvfrom(fd, NULL, 0, MSG_DONTWAIT, NULL, NULL) < 0)
+		return TEST_FAILURE;
+
+	return TEST_PASS;
+}
+
+/*
+ * Reap up to @batch_size entries from the completion ring of @xsk.
+ *
+ * Tx is kicked first if the Tx ring asks for a wakeup. Completed entries
+ * are released and subtracted from outstanding_tx. Returns TEST_FAILURE if
+ * the kick fails or more entries completed than were outstanding.
+ */
+static int complete_pkts(struct xsk_socket_info *xsk, int batch_size)
+{
+	unsigned int completed;
+	u32 cq_idx;
+
+	if (xsk_ring_prod__needs_wakeup(&xsk->tx) && kick_tx(xsk))
+		return TEST_FAILURE;
+
+	completed = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &cq_idx);
+	if (!completed)
+		return TEST_PASS;
+
+	if (completed > xsk->outstanding_tx) {
+		u64 last = *xsk_ring_cons__comp_addr(&xsk->umem->cq, cq_idx + completed - 1);
+
+		ksft_print_msg("[%s] Too many packets completed\n", __func__);
+		ksft_print_msg("Last completion address: %llx\n",
+			       (unsigned long long)last);
+		return TEST_FAILURE;
+	}
+
+	xsk_ring_cons__release(&xsk->umem->cq, completed);
+	xsk->outstanding_tx -= completed;
+
+	return TEST_PASS;
+}
+
+/*
+ * Receive and validate one batch of descriptors from the Rx ring of @xsk.
+ *
+ * Every fragment is checked against the expected packet stream (content,
+ * offset and optionally metadata) and its buffer is handed back through the
+ * fill ring when use_fill_ring is set. If the batch ends in the middle of a
+ * multi-fragment packet, the fragments of that packet are cancelled so they
+ * are processed again on the next call.
+ *
+ * Returns TEST_FAILURE on any error or validation failure, TEST_PASS when
+ * poll() times out while the Tx umem is not valid, and TEST_CONTINUE in all
+ * other cases (including "nothing received").
+ */
+static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk)
+{
+	u32 frags_processed = 0, nb_frags = 0, pkt_len = 0;
+	u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0;
+	struct pkt_stream *pkt_stream = xsk->pkt_stream;
+	struct ifobject *ifobj = test->ifobj_rx;
+	struct xsk_umem_info *umem = xsk->umem;
+	struct pollfd fds = { };
+	struct pkt *pkt = NULL;
+	u64 first_addr = 0;
+	int ret;
+
+	fds.fd = xsk_socket__fd(xsk->xsk);
+	fds.events = POLLIN;
+
+	ret = kick_rx(xsk);
+	if (ret)
+		return TEST_FAILURE;
+
+	if (ifobj->use_poll) {
+		ret = poll(&fds, 1, POLL_TMOUT);
+		if (ret < 0)
+			return TEST_FAILURE;
+
+		if (!ret) {
+			if (!is_umem_valid(test->ifobj_tx))
+				return TEST_PASS;
+
+			ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__);
+			return TEST_CONTINUE;
+		}
+
+		if (!(fds.revents & POLLIN))
+			return TEST_CONTINUE;
+	}
+
+	rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx);
+	if (!rcvd)
+		return TEST_CONTINUE;
+
+	if (ifobj->use_fill_ring) {
+		/* Keep trying until the fill ring has room for every received buffer */
+		ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
+		while (ret != rcvd) {
+			if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
+				ret = poll(&fds, 1, POLL_TMOUT);
+				if (ret < 0)
+					return TEST_FAILURE;
+			}
+			ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
+		}
+	}
+
+	while (frags_processed < rcvd) {
+		const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++);
+		u64 addr = desc->addr, orig;
+
+		orig = xsk_umem__extract_addr(addr);
+		addr = xsk_umem__add_offset_to_addr(addr);
+
+		/* nb_frags == 0 means this descriptor starts a new packet */
+		if (!nb_frags) {
+			pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent);
+			if (!pkt) {
+				ksft_print_msg("[%s] received too many packets addr: %llx len %u\n",
+					       __func__, (unsigned long long)addr, desc->len);
+				return TEST_FAILURE;
+			}
+		}
+
+		print_verbose("Rx: addr: %llx len: %u options: %u pkt_nb: %u valid: %u\n",
+			      (unsigned long long)addr, desc->len, desc->options,
+			      pkt->pkt_nb, pkt->valid);
+
+		if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) ||
+		    !is_offset_correct(umem, pkt, addr) || (ifobj->use_metadata &&
+		    !is_metadata_correct(pkt, umem->buffer, addr)))
+			return TEST_FAILURE;
+
+		if (!nb_frags++)
+			first_addr = addr;
+		frags_processed++;
+		pkt_len += desc->len;
+		if (ifobj->use_fill_ring)
+			*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig;
+
+		if (pkt_continues(desc->options))
+			continue;
+
+		/* The complete packet has been received */
+		if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) ||
+		    !is_offset_correct(umem, pkt, addr))
+			return TEST_FAILURE;
+
+		pkt_stream->nb_rx_pkts++;
+		nb_frags = 0;
+		pkt_len = 0;
+	}
+
+	if (nb_frags) {
+		/* In the middle of a packet. Start over from beginning of packet. */
+		idx_rx -= nb_frags;
+		xsk_ring_cons__cancel(&xsk->rx, nb_frags);
+		if (ifobj->use_fill_ring) {
+			idx_fq -= nb_frags;
+			xsk_ring_prod__cancel(&umem->fq, nb_frags);
+		}
+		frags_processed -= nb_frags;
+	}
+
+	if (ifobj->use_fill_ring)
+		xsk_ring_prod__submit(&umem->fq, frags_processed);
+	if (ifobj->release_rx)
+		xsk_ring_cons__release(&xsk->rx, frags_processed);
+
+	pthread_mutex_lock(&pacing_mutex);
+	pkts_in_flight -= pkts_sent;
+	pthread_mutex_unlock(&pacing_mutex);
+
+	return TEST_CONTINUE;
+}
+
+/*
+ * Mark socket @sock_num in @bitmap when it has no stream or has received
+ * all of its valid packets.
+ *
+ * Returns true only when a socket with a stream has just been found
+ * complete and the bitmap now covers all of the test's sockets.
+ */
+bool all_packets_received(struct test_spec *test, struct xsk_socket_info *xsk, u32 sock_num,
+			  unsigned long *bitmap)
+{
+	struct pkt_stream *stream = xsk->pkt_stream;
+
+	if (!stream) {
+		__set_bit(sock_num, bitmap);
+		return false;
+	}
+
+	if (stream->nb_rx_pkts != stream->nb_valid_entries)
+		return false;
+
+	__set_bit(sock_num, bitmap);
+
+	return bitmap_full(bitmap, test->nb_sockets) ? true : false;
+}
+
+/*
+ * Receive on the Rx sockets in round-robin order until all of them have got
+ * their packets.
+ *
+ * Returns TEST_PASS when done, TEST_FAILURE after THREAD_TMOUT seconds or
+ * when the clock cannot be read, or whatever __receive_pkts() returned if
+ * that is neither TEST_PASS nor TEST_CONTINUE.
+ */
+static int receive_pkts(struct test_spec *test)
+{
+	struct timeval deadline, now, budget = {THREAD_TMOUT, 0};
+	DECLARE_BITMAP(bitmap, test->nb_sockets);
+	struct xsk_socket_info *xsk;
+	u32 sock_num = 0;
+	int res;
+
+	bitmap_zero(bitmap, test->nb_sockets);
+
+	if (gettimeofday(&now, NULL))
+		return TEST_FAILURE;
+
+	timeradd(&now, &budget, &deadline);
+
+	for (;;) {
+		xsk = &test->ifobj_rx->xsk_arr[sock_num];
+
+		if ((all_packets_received(test, xsk, sock_num, bitmap)))
+			return TEST_PASS;
+
+		res = __receive_pkts(test, xsk);
+		if (res != TEST_PASS && res != TEST_CONTINUE)
+			return res;
+
+		if (gettimeofday(&now, NULL))
+			return TEST_FAILURE;
+
+		if (timercmp(&now, &deadline, >)) {
+			ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__);
+			return TEST_FAILURE;
+		}
+		sock_num = (sock_num + 1) % test->nb_sockets;
+	}
+}
+
+/*
+ * Queue and submit up to one batch of Tx descriptors from the packet stream
+ * of @xsk.
+ *
+ * If too many packets are already in flight, Tx is only kicked and
+ * TEST_CONTINUE is returned. Otherwise a full batch of ring slots is
+ * reserved, descriptors are filled packet by packet (a packet that does not
+ * fit in the remaining slots is put back for the next call), and the used
+ * slots are submitted.
+ *
+ * With @timeout set, a poll() timeout yields TEST_PASS and a normal run
+ * yields TEST_CONTINUE. Without it, completions are reaped and TEST_PASS is
+ * returned. Errors yield TEST_FAILURE.
+ */
+static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, bool timeout)
+{
+	u32 used, idx = 0, good_pkts = 0, good_frags = 0, buf_len;
+	struct pkt_stream *stream = xsk->pkt_stream;
+	struct xsk_umem_info *umem = ifobject->umem;
+	bool use_poll = ifobject->use_poll;
+	struct pollfd pfd = { };
+	int ret;
+
+	buf_len = pkt_get_buffer_len(umem, stream->max_pkt_len);
+	/* pkts_in_flight might be negative if many invalid packets are sent */
+	if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buf_len) /
+				    buf_len)) {
+		if (kick_tx(xsk))
+			return TEST_FAILURE;
+		return TEST_CONTINUE;
+	}
+
+	pfd.fd = xsk_socket__fd(xsk->xsk);
+	pfd.events = POLLOUT;
+
+	while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) {
+		if (use_poll) {
+			ret = poll(&pfd, 1, POLL_TMOUT);
+			if (timeout) {
+				if (ret < 0) {
+					ksft_print_msg("ERROR: [%s] Poll error %d\n",
+						       __func__, errno);
+					return TEST_FAILURE;
+				}
+				if (ret == 0)
+					return TEST_PASS;
+				break;
+			}
+			if (ret <= 0) {
+				ksft_print_msg("ERROR: [%s] Poll error %d\n",
+					       __func__, errno);
+				return TEST_FAILURE;
+			}
+		}
+
+		complete_pkts(xsk, xsk->batch_size);
+	}
+
+	for (used = 0; used < xsk->batch_size; used++) {
+		struct pkt *pkt = pkt_stream_get_next_tx_pkt(stream);
+		u32 frags_left, nb_frags, written = 0;
+
+		if (!pkt)
+			break;
+
+		nb_frags = pkt_nb_frags(umem->frame_size, stream, pkt);
+		if (nb_frags > xsk->batch_size - used) {
+			/* Not enough slots left: put the packet back and drop the spare slots */
+			pkt_stream_cancel(stream);
+			xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - used);
+			break;
+		}
+
+		for (frags_left = nb_frags; frags_left--; ) {
+			struct xdp_desc *desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + used);
+
+			desc->addr = pkt_get_addr(pkt, ifobject->umem);
+			if (stream->verbatim) {
+				desc->len = pkt->len;
+				desc->options = pkt->options;
+			} else if (frags_left) {
+				desc->len = umem->frame_size;
+				desc->options = XDP_PKT_CONTD;
+			} else {
+				desc->len = pkt->len - written;
+				desc->options = 0;
+			}
+			if (pkt->valid)
+				pkt_generate(xsk, umem, desc->addr, desc->len, pkt->pkt_nb,
+					     written);
+			written += desc->len;
+
+			print_verbose("Tx addr: %llx len: %u options: %u pkt_nb: %u\n",
+				      desc->addr, desc->len, desc->options, pkt->pkt_nb);
+
+			if (frags_left) {
+				used++;
+				/* In verbatim mode each fragment is its own stream entry */
+				if (stream->verbatim)
+					pkt = pkt_stream_get_next_tx_pkt(stream);
+			}
+		}
+
+		if (pkt && pkt->valid) {
+			good_pkts++;
+			good_frags += nb_frags;
+		}
+	}
+
+	pthread_mutex_lock(&pacing_mutex);
+	pkts_in_flight += good_pkts;
+	pthread_mutex_unlock(&pacing_mutex);
+
+	xsk_ring_prod__submit(&xsk->tx, used);
+	xsk->outstanding_tx += good_frags;
+
+	if (use_poll) {
+		ret = poll(&pfd, 1, POLL_TMOUT);
+		if (ret <= 0) {
+			if (ret == 0 && timeout)
+				return TEST_PASS;
+
+			ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, ret);
+			return TEST_FAILURE;
+		}
+	}
+
+	if (timeout)
+		return TEST_CONTINUE;
+
+	if (complete_pkts(xsk, used))
+		return TEST_FAILURE;
+
+	usleep(10);
+	return TEST_PASS;
+}
+
+/*
+ * Reap completions until @xsk has no outstanding Tx descriptors left.
+ *
+ * Returns TEST_PASS once outstanding_tx reaches zero. Returns TEST_FAILURE
+ * after THREAD_TMOUT seconds, when the clock cannot be read, or as soon as
+ * complete_pkts() reports a failure (instead of spinning until the timeout
+ * expires).
+ */
+static int wait_for_tx_completion(struct xsk_socket_info *xsk)
+{
+	struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0};
+	int ret;
+
+	ret = gettimeofday(&tv_now, NULL);
+	if (ret)
+		return TEST_FAILURE;
+	timeradd(&tv_now, &tv_timeout, &tv_end);
+
+	while (xsk->outstanding_tx) {
+		ret = gettimeofday(&tv_now, NULL);
+		if (ret)
+			return TEST_FAILURE;
+		if (timercmp(&tv_now, &tv_end, >)) {
+			ksft_print_msg("ERROR: [%s] Transmission loop timed out\n", __func__);
+			return TEST_FAILURE;
+		}
+
+		if (complete_pkts(xsk, xsk->batch_size))
+			return TEST_FAILURE;
+	}
+
+	return TEST_PASS;
+}
+
+/* True once the first test->nb_sockets bits of @bitmap are all set. */
+bool all_packets_sent(struct test_spec *test, unsigned long *bitmap)
+{
+	u32 nb_sockets = test->nb_sockets;
+
+	return bitmap_full(bitmap, nb_sockets);
+}
+
+/*
+ * Send the packet streams of all active sockets of @ifobject.
+ *
+ * Sockets are served in turn; a socket is marked done when it has no stream
+ * or its stream is exhausted. "timeout" mode is used when the Rx umem is
+ * not valid: there, a TEST_PASS from __send_pkts() ends the function early.
+ */
+static int send_pkts(struct test_spec *test, struct ifobject *ifobject)
+{
+	bool timeout = !is_umem_valid(test->ifobj_rx);
+	DECLARE_BITMAP(bitmap, test->nb_sockets);
+	u32 sock, ret;
+
+	bitmap_zero(bitmap, test->nb_sockets);
+
+	while (!(all_packets_sent(test, bitmap))) {
+		for (sock = 0; sock < test->nb_sockets; sock++) {
+			struct xsk_socket_info *xsk = &ifobject->xsk_arr[sock];
+			struct pkt_stream *stream = xsk->pkt_stream;
+
+			if (!stream || stream->current_pkt_nb >= stream->nb_pkts) {
+				__set_bit(sock, bitmap);
+				continue;
+			}
+
+			ret = __send_pkts(ifobject, xsk, timeout);
+			if (ret == TEST_CONTINUE && !test->fail)
+				continue;
+
+			if (!timeout && (ret || test->fail))
+				return TEST_FAILURE;
+
+			if (timeout && ret == TEST_PASS)
+				return ret;
+
+			ret = wait_for_tx_completion(xsk);
+			if (ret)
+				return TEST_FAILURE;
+		}
+	}
+
+	return TEST_PASS;
+}
+
+/*
+ * Fetch the XDP_STATISTICS of @xsk into @stats.
+ *
+ * getsockopt() reports failure by returning -1 and setting errno, so the
+ * error message is built from errno rather than from the return value.
+ *
+ * Returns TEST_PASS on success, TEST_FAILURE if getsockopt() fails or
+ * returns an unexpected option length.
+ */
+static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats)
+{
+	int fd = xsk_socket__fd(xsk), err;
+	socklen_t optlen, expected_len;
+
+	optlen = sizeof(*stats);
+	err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen);
+	if (err) {
+		/* Save errno before any further call can overwrite it */
+		int saved_errno = errno;
+
+		ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n",
+			       __func__, saved_errno, strerror(saved_errno));
+		return TEST_FAILURE;
+	}
+
+	expected_len = sizeof(struct xdp_statistics);
+	if (optlen != expected_len) {
+		ksft_print_msg("[%s] getsockopt optlen error. Expected: %u got: %u\n",
+			       __func__, expected_len, optlen);
+		return TEST_FAILURE;
+	}
+
+	return TEST_PASS;
+}
+
+/*
+ * Pass when the rx_dropped statistic of the socket equals half the number
+ * of packets in its stream, or one less than that.
+ */
+static int validate_rx_dropped(struct ifobject *ifobject)
+{
+	u32 half = ifobject->xsk->pkt_stream->nb_pkts / 2;
+	struct xdp_statistics stats;
+
+	if (kick_rx(ifobject->xsk))
+		return TEST_FAILURE;
+
+	if (get_xsk_stats(ifobject->xsk->xsk, &stats))
+		return TEST_FAILURE;
+
+	/* The receiver calls getsockopt after receiving the last (valid)
+	 * packet which is not the final packet sent in this test (valid and
+	 * invalid packets are sent in alternating fashion with the final
+	 * packet being invalid). Since the last packet may or may not have
+	 * been dropped already, both outcomes must be allowed.
+	 */
+	if (stats.rx_dropped != half && stats.rx_dropped != half - 1)
+		return TEST_FAILURE;
+
+	return TEST_PASS;
+}
+
+/*
+ * Pass when, after a 1ms pause and an Rx kick, the rx_ring_full statistic
+ * of the socket is non-zero.
+ */
+static int validate_rx_full(struct ifobject *ifobject)
+{
+	struct xdp_statistics stats;
+
+	usleep(1000);
+
+	if (kick_rx(ifobject->xsk))
+		return TEST_FAILURE;
+
+	if (get_xsk_stats(ifobject->xsk->xsk, &stats))
+		return TEST_FAILURE;
+
+	return stats.rx_ring_full ? TEST_PASS : TEST_FAILURE;
+}
+
+/*
+ * Pass when, after a 1ms pause and an Rx kick, the
+ * rx_fill_ring_empty_descs statistic of the socket is non-zero.
+ */
+static int validate_fill_empty(struct ifobject *ifobject)
+{
+	struct xdp_statistics stats;
+
+	usleep(1000);
+
+	if (kick_rx(ifobject->xsk))
+		return TEST_FAILURE;
+
+	if (get_xsk_stats(ifobject->xsk->xsk, &stats))
+		return TEST_FAILURE;
+
+	return stats.rx_fill_ring_empty_descs ? TEST_PASS : TEST_FAILURE;
+}
+
+/*
+ * Pass when the tx_invalid_descs statistic of the socket equals half the
+ * number of packets in its stream.
+ *
+ * Statistics are fetched through get_xsk_stats(), like the other
+ * validators do, and the failure message reports the value that is
+ * actually compared against (nb_pkts / 2).
+ */
+static int validate_tx_invalid_descs(struct ifobject *ifobject)
+{
+	u32 expected = ifobject->xsk->pkt_stream->nb_pkts / 2;
+	struct xdp_statistics stats;
+
+	if (get_xsk_stats(ifobject->xsk->xsk, &stats))
+		return TEST_FAILURE;
+
+	if (stats.tx_invalid_descs != expected) {
+		ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n",
+			       __func__,
+			       (unsigned long long)stats.tx_invalid_descs,
+			       expected);
+		return TEST_FAILURE;
+	}
+
+	return TEST_PASS;
+}
+
+/*
+ * Create the sockets of @ifobject for every active socket slot.
+ *
+ * Socket creation is attempted up to SOCK_RECONF_CTR times, sleeping
+ * USLEEP_MAX between attempts. Socket 0 is "shared" only on the Tx side of
+ * a shared umem; all further sockets always are. Busy polling is enabled
+ * per socket when requested. Returns 0 or the first error.
+ */
+static int xsk_configure(struct test_spec *test, struct ifobject *ifobject,
+			 struct xsk_umem_info *umem, bool tx)
+{
+	int sock, ret;
+
+	for (sock = 0; sock < test->nb_sockets; sock++) {
+		bool shared = (ifobject->shared_umem && tx) || sock != 0;
+		u32 attempt;
+
+		for (attempt = 1; attempt <= SOCK_RECONF_CTR; attempt++) {
+			ret = xsk_configure_socket(&ifobject->xsk_arr[sock], umem,
+						   ifobject, shared);
+			if (!ret)
+				break;
+
+			/* Retry if it fails as xsk_socket__create() is asynchronous */
+			if (attempt >= SOCK_RECONF_CTR)
+				return ret;
+			usleep(USLEEP_MAX);
+		}
+
+		if (!ifobject->busy_poll)
+			continue;
+
+		ret = enable_busy_poll(&ifobject->xsk_arr[sock]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Tx-side setup used when the Tx interface shares the Rx umem: configure the
+ * Tx sockets on the Rx umem, select socket 0, reuse the Rx xskmap and copy
+ * the Rx umem description into the Tx one with base_addr reset to 0.
+ *
+ * Returns 0 on success or the error from xsk_configure().
+ */
+static int thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject)
+{
+	struct ifobject *rx = test->ifobj_rx;
+	int ret;
+
+	ret = xsk_configure(test, ifobject, rx->umem, true);
+	if (ret)
+		return ret;
+
+	ifobject->xsk = &ifobject->xsk_arr[0];
+	ifobject->xskmap = rx->xskmap;
+	memcpy(ifobject->umem, rx->umem, sizeof(*ifobject->umem));
+	ifobject->umem->base_addr = 0;
+
+	return 0;
+}
+
+/*
+ * Populate the fill ring of @umem with buffers for the Rx packets of
+ * @pkt_stream.
+ *
+ * The number of buffers to post is umem->num_frames when that is below
+ * XSK_RING_PROD__DEFAULT_NUM_DESCS, otherwise umem->fill_size. One address
+ * is written per fragment of every packet in the stream. Once the stream is
+ * exhausted, the remaining slots are padded with sequential frame addresses
+ * when @fill_up is true; when it is false, filling stops and the unused
+ * reservation is cancelled.
+ *
+ * The packet stream and the umem buffer allocator are reset before returning.
+ *
+ * Returns 0 on success, -ENOSPC if the fill ring cannot take all buffers.
+ */
+static int xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream,
+				  bool fill_up)
+{
+	u32 rx_frame_size = umem->frame_size - XDP_PACKET_HEADROOM;
+	u32 idx = 0, filled = 0, buffers_to_fill, nb_pkts;
+	int ret;
+
+	if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS)
+		buffers_to_fill = umem->num_frames;
+	else
+		buffers_to_fill = umem->fill_size;
+
+	ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx);
+	if (ret != buffers_to_fill)
+		return -ENOSPC;
+
+	while (filled < buffers_to_fill) {
+		struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &nb_pkts);
+		u64 addr;
+		u32 i;
+
+		for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) {
+			if (!pkt) {
+				if (!fill_up)
+					break;
+				addr = filled * umem->frame_size + umem->base_addr;
+			} else if (pkt->offset >= 0) {
+				addr = pkt->offset % umem->frame_size + umem_alloc_buffer(umem);
+			} else {
+				addr = pkt->offset + umem_alloc_buffer(umem);
+			}
+
+			*xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr;
+			if (++filled >= buffers_to_fill)
+				break;
+		}
+
+		/*
+		 * Stream exhausted and no padding requested: nothing more can
+		 * be filled, so leave instead of spinning on a NULL packet.
+		 */
+		if (!pkt && !fill_up)
+			break;
+	}
+	xsk_ring_prod__submit(&umem->fq, filled);
+	xsk_ring_prod__cancel(&umem->fq, buffers_to_fill - filled);
+
+	pkt_stream_reset(pkt_stream);
+	umem_reset_alloc(umem);
+
+	return 0;
+}
+
+/*
+ * Common per-interface setup run on the first step of a test.
+ *
+ * Maps an anonymous buffer for the umem (doubled when the umem is shared,
+ * backed by 2 MB huge pages in unaligned mode), registers it as the umem,
+ * configures the sockets and selects socket 0. When Rx is enabled on the
+ * interface, the fill ring is populated and every socket is inserted into
+ * the xskmap at its own index; ifobject->xsk is then left pointing at the
+ * last socket.
+ *
+ * Returns 0 on success, -errno if mmap() fails, or the error of the helper
+ * that failed.
+ */
+static int thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
+{
+	int mmap_flags;
+	u64 umem_sz;
+	void *bufs;
+	int ret;
+	u32 i;
+
+	/* Widen before multiplying: both factors are 32-bit */
+	umem_sz = (u64)ifobject->umem->num_frames * ifobject->umem->frame_size;
+	mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+
+	if (ifobject->umem->unaligned_mode)
+		mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB;
+
+	if (ifobject->shared_umem)
+		umem_sz *= 2;
+
+	bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+	if (bufs == MAP_FAILED)
+		return -errno;
+
+	/*
+	 * NOTE(review): the mapping is not released here if the umem setup
+	 * fails - confirm whether xsk_configure_umem() or the caller's
+	 * cleanup path takes care of it.
+	 */
+	ret = xsk_configure_umem(ifobject, ifobject->umem, bufs, umem_sz);
+	if (ret)
+		return ret;
+
+	ret = xsk_configure(test, ifobject, ifobject->umem, false);
+	if (ret)
+		return ret;
+
+	ifobject->xsk = &ifobject->xsk_arr[0];
+
+	if (!ifobject->rx_on)
+		return 0;
+
+	ret = xsk_populate_fill_ring(ifobject->umem, ifobject->xsk->pkt_stream,
+				     ifobject->use_fill_ring);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < test->nb_sockets; i++) {
+		ifobject->xsk = &ifobject->xsk_arr[i];
+		ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Tx worker thread entry point; @arg is the struct test_spec being run.
+ *
+ * On the first step the Tx interface is set up, through
+ * thread_common_ops_tx() when it shares the umem and thread_common_ops()
+ * otherwise. The packets are then sent and, if sending succeeded, the
+ * optional validation hook is run. Any failure sets test->fail. The thread
+ * always ends through pthread_exit(NULL).
+ */
+void *worker_testapp_validate_tx(void *arg)
+{
+	struct test_spec *test = arg;
+	struct ifobject *ifobject = test->ifobj_tx;
+	int err;
+
+	if (test->current_step == 1) {
+		int setup_err;
+
+		if (ifobject->shared_umem)
+			setup_err = thread_common_ops_tx(test, ifobject);
+		else
+			setup_err = thread_common_ops(test, ifobject);
+
+		if (setup_err) {
+			test->fail = true;
+			pthread_exit(NULL);
+		}
+	}
+
+	err = send_pkts(test, ifobject);
+	if (!err && ifobject->validation_func)
+		err = ifobject->validation_func(ifobject);
+
+	if (err)
+		test->fail = true;
+
+	pthread_exit(NULL);
+}
+
+/*
+ * Rx worker thread entry point; @arg is the struct test_spec being run.
+ *
+ * On the first step the Rx interface is fully set up; on later steps the
+ * xskmap is cleared and the current socket is re-inserted at index 0. The
+ * thread then waits on the barrier, receives the packets and runs the
+ * optional validation hook.
+ *
+ * A failure sets test->fail, except for adjust-tail tests where the XDP
+ * program reports the feature as unsupported: then only
+ * test->adjust_tail_support is cleared.
+ */
+void *worker_testapp_validate_rx(void *arg)
+{
+	struct test_spec *test = arg;
+	struct ifobject *ifobject = test->ifobj_rx;
+	int err;
+
+	if (test->current_step != 1) {
+		xsk_clear_xskmap(ifobject->xskmap);
+		err = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, 0);
+		if (err)
+			ksft_print_msg("Error: Failed to update xskmap, error %s\n",
+				       strerror(-err));
+	} else {
+		err = thread_common_ops(test, ifobject);
+	}
+
+	pthread_barrier_wait(&barr);
+
+	/* We leave only now in case of error to avoid getting stuck in the barrier */
+	if (err) {
+		test->fail = true;
+		pthread_exit(NULL);
+	}
+
+	err = receive_pkts(test);
+	if (!err && ifobject->validation_func)
+		err = ifobject->validation_func(ifobject);
+
+	if (err) {
+		bool supported;
+
+		/* Only a successful query reporting "unsupported" avoids the failure */
+		if (test->adjust_tail &&
+		    !is_adjust_tail_supported(ifobject->xdp_progs, &supported) &&
+		    !supported)
+			test->adjust_tail_support = false;
+		else
+			test->fail = true;
+	}
+
+	pthread_exit(NULL);
+}
+
+/*
+ * Delete the umem of @ifobj and unmap its buffer. The unmapped length is the
+ * umem size (doubled for a shared umem) rounded up to a multiple of
+ * HUGEPAGE_SIZE.
+ */
+static void testapp_clean_xsk_umem(struct ifobject *ifobj)
+{
+	struct xsk_umem_info *umem = ifobj->umem;
+	u64 len = umem->num_frames * umem->frame_size;
+
+	if (ifobj->shared_umem)
+		len *= 2;
+	len = ceil_u64(len, HUGEPAGE_SIZE) * HUGEPAGE_SIZE;
+
+	xsk_umem__delete(umem->umem);
+	munmap(umem->buffer, len);
+}
+
+/* Signal handler (installed for SIGUSR1): ends the thread that receives the signal. */
+static void handler(int signum)
+{
+	(void)signum;
+	pthread_exit(NULL);
+}
+
+/* True when the Rx interface's attached XDP program or mode differs from what @test asks for. */
+static bool xdp_prog_changed_rx(struct test_spec *test)
+{
+	struct ifobject *rx = test->ifobj_rx;
+
+	if (rx->xdp_prog != test->xdp_prog_rx)
+		return true;
+
+	return rx->mode != test->mode;
+}
+
+/* True when the Tx interface's attached XDP program or mode differs from what @test asks for. */
+static bool xdp_prog_changed_tx(struct test_spec *test)
+{
+	struct ifobject *tx = test->ifobj_tx;
+
+	if (tx->xdp_prog != test->xdp_prog_tx)
+		return true;
+
+	return tx->mode != test->mode;
+}
+
+/*
+ * Replace the XDP program attached to @ifobj.
+ *
+ * The program is detached using the flags of the interface's current mode
+ * and @xdp_prog is attached using the flags of @mode. When the mode changes
+ * to DRV or ZC, the attachment is additionally checked to be in driver mode.
+ * On success the interface records the new program, xskmap and mode.
+ *
+ * Returns 0 on success, the attach error, or -EINVAL if the program did not
+ * end up in driver mode.
+ */
+static int xsk_reattach_xdp(struct ifobject *ifobj, struct bpf_program *xdp_prog,
+			    struct bpf_map *xskmap, enum test_mode mode)
+{
+	bool wants_drv = mode == TEST_MODE_DRV || mode == TEST_MODE_ZC;
+	int err;
+
+	xsk_detach_xdp_program(ifobj->ifindex, mode_to_xdp_flags(ifobj->mode));
+	err = xsk_attach_xdp_program(xdp_prog, ifobj->ifindex, mode_to_xdp_flags(mode));
+	if (err) {
+		ksft_print_msg("Error attaching XDP program\n");
+		return err;
+	}
+
+	if (ifobj->mode != mode && wants_drv &&
+	    !xsk_is_in_mode(ifobj->ifindex, XDP_FLAGS_DRV_MODE)) {
+		ksft_print_msg("ERROR: XDP prog not in DRV mode\n");
+		return -EINVAL;
+	}
+
+	ifobj->xdp_prog = xdp_prog;
+	ifobj->xskmap = xskmap;
+	ifobj->mode = mode;
+
+	return 0;
+}
+
+/*
+ * Bring the attached XDP programs in line with @test.
+ *
+ * The Rx interface is re-attached when its program or mode changed. The Tx
+ * interface is handled the same way unless it is absent or shares the umem.
+ *
+ * Returns 0 on success or the error from xsk_reattach_xdp().
+ */
+static int xsk_attach_xdp_progs(struct test_spec *test, struct ifobject *ifobj_rx,
+				struct ifobject *ifobj_tx)
+{
+	if (xdp_prog_changed_rx(test)) {
+		int err = xsk_reattach_xdp(ifobj_rx, test->xdp_prog_rx, test->xskmap_rx,
+					   test->mode);
+
+		if (err)
+			return err;
+	}
+
+	if (!ifobj_tx || ifobj_tx->shared_umem)
+		return 0;
+
+	if (!xdp_prog_changed_tx(test))
+		return 0;
+
+	return xsk_reattach_xdp(ifobj_tx, test->xdp_prog_tx, test->xskmap_tx, test->mode);
+}
+
+/* Delete the first test->nb_sockets sockets of @ifobj; does nothing if either argument is NULL. */
+static void clean_sockets(struct test_spec *test, struct ifobject *ifobj)
+{
+	u32 i;
+
+	if (!test || !ifobj)
+		return;
+
+	for (i = 0; i < test->nb_sockets; i++) {
+		struct xsk_socket_info *sock = &ifobj->xsk_arr[i];
+
+		xsk_socket__delete(sock->xsk);
+	}
+}
+
+/*
+ * Release the umem of @ifobj1 and, when @ifobj2 is given and does not share
+ * the umem, the umem of @ifobj2 as well. Does nothing if @ifobj1 is NULL.
+ * @test is not used.
+ */
+static void clean_umem(struct test_spec *test, struct ifobject *ifobj1, struct ifobject *ifobj2)
+{
+	if (!ifobj1)
+		return;
+
+	testapp_clean_xsk_umem(ifobj1);
+
+	if (!ifobj2 || ifobj2->shared_umem)
+		return;
+
+	testapp_clean_xsk_umem(ifobj2);
+}
+
+/*
+ * Run one step of a test: spawn the worker thread of @ifobj1 and, when
+ * @ifobj2 is given, the worker thread of @ifobj2, then wait for completion.
+ *
+ * With two interfaces, the first worker is synchronised with this thread
+ * through the global barrier before the second worker is started, and both
+ * workers are joined. With a single interface (@ifobj2 == NULL) the worker
+ * is sent SIGUSR1 instead of being joined.
+ *
+ * Sockets and umems are cleaned up once the last step has run or as soon as
+ * the test has failed.
+ *
+ * Returns TEST_PASS, TEST_FAILURE, or TEST_SKIP when multi-buffer support
+ * needed for the configured MTU is missing.
+ */
+static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1,
+ struct ifobject *ifobj2)
+{
+ pthread_t t0, t1;
+ int err;
+
+ /* An MTU above a standard Ethernet frame requires multi-buffer support */
+ if (test->mtu > MAX_ETH_PKT_SIZE) {
+ if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp ||
+ (ifobj2 && !ifobj2->multi_buff_zc_supp))) {
+ ksft_print_msg("Multi buffer for zero-copy not supported.\n");
+ return TEST_SKIP;
+ }
+ if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp ||
+ (ifobj2 && !ifobj2->multi_buff_supp))) {
+ ksft_print_msg("Multi buffer not supported.\n");
+ return TEST_SKIP;
+ }
+ }
+ err = test_spec_set_mtu(test, test->mtu);
+ if (err) {
+ ksft_print_msg("Error, could not set mtu.\n");
+ return TEST_FAILURE;
+ }
+
+ /* The barrier is shared between this thread and the first worker */
+ if (ifobj2) {
+ if (pthread_barrier_init(&barr, NULL, 2))
+ return TEST_FAILURE;
+ pkt_stream_reset(ifobj2->xsk->pkt_stream);
+ }
+
+ test->current_step++;
+ pkt_stream_reset(ifobj1->xsk->pkt_stream);
+ pkts_in_flight = 0;
+
+ signal(SIGUSR1, handler);
+ /* Spawn the first worker (the Rx thread when two interfaces are used) */
+ /* NOTE(review): the pthread_create() return value is not checked */
+ pthread_create(&t0, NULL, ifobj1->func_ptr, test);
+
+ if (ifobj2) {
+ /* Wait until the first worker has reached the barrier */
+ pthread_barrier_wait(&barr);
+ if (pthread_barrier_destroy(&barr)) {
+ pthread_kill(t0, SIGUSR1);
+ clean_sockets(test, ifobj1);
+ clean_umem(test, ifobj1, NULL);
+ return TEST_FAILURE;
+ }
+
+ /* Spawn TX thread */
+ pthread_create(&t1, NULL, ifobj2->func_ptr, test);
+
+ pthread_join(t1, NULL);
+ }
+
+ /*
+ * Single-interface runs signal the worker instead of joining it.
+ * NOTE(review): the worker is never joined on this path - confirm this
+ * is intended.
+ */
+ if (!ifobj2)
+ pthread_kill(t0, SIGUSR1);
+ else
+ pthread_join(t0, NULL);
+
+ /* Tear everything down after the last step or on the first failure */
+ if (test->total_steps == test->current_step || test->fail) {
+ clean_sockets(test, ifobj1);
+ clean_sockets(test, ifobj2);
+ clean_umem(test, ifobj1, ifobj2);
+ }
+
+ if (test->fail)
+ return TEST_FAILURE;
+
+ return TEST_PASS;
+}
+
+/*
+ * Two-interface entry point for running one test step.
+ *
+ * Skips the test when unaligned mode is requested on an interface that does
+ * not support it, applies the requested HW ring size when test->set_ring is
+ * set (skipping if the Tx interface cannot change it), makes sure the right
+ * XDP programs are attached, then runs the traffic step.
+ *
+ * Returns TEST_PASS, TEST_FAILURE or TEST_SKIP.
+ */
+static int testapp_validate_traffic(struct test_spec *test)
+{
+	struct ifobject *rx = test->ifobj_rx;
+	struct ifobject *tx = test->ifobj_tx;
+	bool no_hugepages;
+
+	no_hugepages = rx->umem->unaligned_mode && !rx->unaligned_supp;
+	if (!no_hugepages)
+		no_hugepages = tx->umem->unaligned_mode && !tx->unaligned_supp;
+
+	if (no_hugepages) {
+		ksft_print_msg("No huge pages present.\n");
+		return TEST_SKIP;
+	}
+
+	if (test->set_ring) {
+		if (!tx->hw_ring_size_supp) {
+			ksft_print_msg("Changing HW ring size not supported.\n");
+			return TEST_SKIP;
+		}
+
+		if (set_ring_size(tx)) {
+			ksft_print_msg("Failed to change HW ring size.\n");
+			return TEST_FAILURE;
+		}
+	}
+
+	if (xsk_attach_xdp_progs(test, rx, tx))
+		return TEST_FAILURE;
+
+	return __testapp_validate_traffic(test, rx, tx);
+}
+
+/* Run one test step with a single worker thread for @ifobj and no peer interface. */
+static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj)
+{
+	struct ifobject *no_peer = NULL;
+
+	return __testapp_validate_traffic(test, ifobj, no_peer);
+}
+
+/*
+ * Run the default traffic test MAX_TEARDOWN_ITER times, resetting the test
+ * spec after every successful iteration.
+ *
+ * Returns TEST_PASS if every iteration passed; otherwise the result of the
+ * first iteration that did not pass, so that a TEST_SKIP is reported as a
+ * skip rather than as a failure.
+ */
+int testapp_teardown(struct test_spec *test)
+{
+	int i, ret;
+
+	for (i = 0; i < MAX_TEARDOWN_ITER; i++) {
+		ret = testapp_validate_traffic(test);
+		if (ret)
+			return ret;
+		test_spec_reset(test);
+	}
+
+	return TEST_PASS;
+}
+
+/*
+ * Exchange the roles of two interfaces: their worker functions are swapped
+ * and the two pointers passed by the caller are swapped as well.
+ */
+static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2)
+{
+	struct ifobject *first = *ifobj1;
+	struct ifobject *second = *ifobj2;
+	thread_func_t first_func = first->func_ptr;
+
+	first->func_ptr = second->func_ptr;
+	second->func_ptr = first_func;
+
+	*ifobj1 = second;
+	*ifobj2 = first;
+}
+
+/*
+ * Two-step test: send traffic in the default direction, then swap the Tx and
+ * Rx roles and send it the other way. Both interfaces have Rx and Tx enabled.
+ * The original roles are restored before returning from the second step.
+ *
+ * Returns the result of the first step when it did not pass (so a TEST_SKIP
+ * stays a skip), otherwise the result of the second step.
+ */
+int testapp_bidirectional(struct test_spec *test)
+{
+	int res;
+
+	test->ifobj_tx->rx_on = true;
+	test->ifobj_rx->tx_on = true;
+	test->total_steps = 2;
+	res = testapp_validate_traffic(test);
+	if (res)
+		return res;
+
+	print_verbose("Switching Tx/Rx direction\n");
+	swap_directions(&test->ifobj_rx, &test->ifobj_tx);
+	res = __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx);
+
+	swap_directions(&test->ifobj_rx, &test->ifobj_tx);
+	return res;
+}
+
+/*
+ * Move both interfaces from socket 0 to socket 1: socket 0 loses its packet
+ * stream, socket 1 receives the default Tx/Rx streams and becomes the
+ * current socket, and the Rx socket 1 is inserted at index 0 of the Rx xskmap.
+ *
+ * Returns TEST_PASS, or TEST_FAILURE if the xskmap update fails.
+ */
+static int swap_xsk_resources(struct test_spec *test)
+{
+	struct ifobject *tx = test->ifobj_tx;
+	struct ifobject *rx = test->ifobj_rx;
+
+	tx->xsk_arr[0].pkt_stream = NULL;
+	rx->xsk_arr[0].pkt_stream = NULL;
+	tx->xsk_arr[1].pkt_stream = test->tx_pkt_stream_default;
+	rx->xsk_arr[1].pkt_stream = test->rx_pkt_stream_default;
+	tx->xsk = &tx->xsk_arr[1];
+	rx->xsk = &rx->xsk_arr[1];
+
+	if (xsk_update_xskmap(rx->xskmap, rx->xsk->xsk, 0))
+		return TEST_FAILURE;
+
+	return TEST_PASS;
+}
+
+/*
+ * Two-step test with two sockets per interface: run traffic on the first
+ * socket pair, switch both interfaces to their second socket, then run
+ * traffic again.
+ *
+ * Returns the result of the first step when it did not pass (so a TEST_SKIP
+ * stays a skip), TEST_FAILURE if the socket switch fails (sockets and umems
+ * are cleaned up in that case), otherwise the result of the second step.
+ */
+int testapp_xdp_prog_cleanup(struct test_spec *test)
+{
+	int ret;
+
+	test->total_steps = 2;
+	test->nb_sockets = 2;
+	ret = testapp_validate_traffic(test);
+	if (ret)
+		return ret;
+
+	if (swap_xsk_resources(test)) {
+		clean_sockets(test, test->ifobj_rx);
+		clean_sockets(test, test->ifobj_tx);
+		clean_umem(test, test->ifobj_rx, test->ifobj_tx);
+		return TEST_FAILURE;
+	}
+
+	return testapp_validate_traffic(test);
+}
+
+/* Traffic test with the Rx umem frame headroom set to UMEM_HEADROOM_TEST_SIZE. */
+int testapp_headroom(struct test_spec *test)
+{
+	struct xsk_umem_info *rx_umem = test->ifobj_rx->umem;
+
+	rx_umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Rx-dropped statistics test (skipped in zero-copy mode).
+ *
+ * Half of the packets are replaced by packets of MIN_PKT_SIZE * 4 bytes and
+ * the Rx frame headroom is raised so that only MIN_PKT_SIZE * 3 bytes remain
+ * after XDP_PACKET_HEADROOM. The Rx stream is then set up through
+ * pkt_stream_receive_half() and the result is checked by
+ * validate_rx_dropped().
+ */
+int testapp_stats_rx_dropped(struct test_spec *test)
+{
+	struct xsk_umem_info *rx_umem;
+
+	if (test->mode == TEST_MODE_ZC) {
+		ksft_print_msg("Can not run RX_DROPPED test for ZC mode\n");
+		return TEST_SKIP;
+	}
+
+	if (pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0))
+		return TEST_FAILURE;
+
+	rx_umem = test->ifobj_rx->umem;
+	rx_umem->frame_headroom = rx_umem->frame_size -
+				  XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3;
+
+	if (pkt_stream_receive_half(test))
+		return TEST_FAILURE;
+
+	test->ifobj_rx->validation_func = validate_rx_dropped;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Tx invalid-descriptor statistics test: half of the packets are replaced by
+ * packets of XSK_UMEM__INVALID_FRAME_SIZE bytes and the Tx side is checked by
+ * validate_tx_invalid_descs().
+ */
+int testapp_stats_tx_invalid_descs(struct test_spec *test)
+{
+	int err = pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0);
+
+	if (err)
+		return TEST_FAILURE;
+
+	test->ifobj_tx->validation_func = validate_tx_invalid_descs;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Rx-ring-full statistics test: 1.5 * DEFAULT_UMEM_BUFFERS packets are sent
+ * while the Rx side expects DEFAULT_UMEM_BUFFERS packets, uses an Rx queue of
+ * DEFAULT_UMEM_BUFFERS entries and has release_rx cleared. The result is
+ * checked by validate_rx_full().
+ *
+ * NOTE(review): the result of pkt_stream_generate() is not checked here -
+ * confirm whether it can fail.
+ */
+int testapp_stats_rx_full(struct test_spec *test)
+{
+	struct ifobject *rx;
+
+	if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE))
+		return TEST_FAILURE;
+
+	rx = test->ifobj_rx;
+	rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE);
+
+	rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS;
+	rx->release_rx = false;
+	rx->validation_func = validate_rx_full;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Fill-ring-empty statistics test: 1.5 * DEFAULT_UMEM_BUFFERS packets are
+ * sent while the Rx side expects DEFAULT_UMEM_BUFFERS packets and has
+ * use_fill_ring cleared. The result is checked by validate_fill_empty().
+ *
+ * NOTE(review): the result of pkt_stream_generate() is not checked here -
+ * confirm whether it can fail.
+ */
+int testapp_stats_fill_empty(struct test_spec *test)
+{
+	struct ifobject *rx;
+
+	if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE))
+		return TEST_FAILURE;
+
+	rx = test->ifobj_rx;
+	rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE);
+
+	rx->use_fill_ring = false;
+	rx->validation_func = validate_fill_empty;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Send/receive test in unaligned mode on both interfaces, with half of the
+ * packets replaced by MIN_PKT_SIZE packets at offset -MIN_PKT_SIZE / 2.
+ */
+int testapp_send_receive_unaligned(struct test_spec *test)
+{
+	int err;
+
+	test->ifobj_tx->umem->unaligned_mode = true;
+	test->ifobj_rx->umem->unaligned_mode = true;
+
+	/* Let half of the packets straddle a 4K buffer boundary */
+	err = pkt_stream_replace_half(test, MIN_PKT_SIZE, -MIN_PKT_SIZE / 2);
+	if (err)
+		return TEST_FAILURE;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Multi-buffer send/receive test in unaligned mode: jumbo MTU and
+ * DEFAULT_PKT_CNT packets of MAX_ETH_JUMBO_SIZE bytes.
+ */
+int testapp_send_receive_unaligned_mb(struct test_spec *test)
+{
+	int err;
+
+	test->mtu = MAX_ETH_JUMBO_SIZE;
+	test->ifobj_tx->umem->unaligned_mode = true;
+	test->ifobj_rx->umem->unaligned_mode = true;
+
+	err = pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE);
+	if (err)
+		return TEST_FAILURE;
+
+	return testapp_validate_traffic(test);
+}
+
+/* Send/receive test with a custom stream made of one valid MIN_PKT_SIZE packet. */
+int testapp_single_pkt(struct test_spec *test)
+{
+	struct pkt single[] = {{0, MIN_PKT_SIZE, 0, true}};
+	int err;
+
+	err = pkt_stream_generate_custom(test, single, ARRAY_SIZE(single));
+	if (err)
+		return TEST_FAILURE;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Multi-buffer send/receive test: jumbo MTU and DEFAULT_PKT_CNT packets of
+ * MAX_ETH_JUMBO_SIZE bytes.
+ */
+int testapp_send_receive_mb(struct test_spec *test)
+{
+	int err;
+
+	test->mtu = MAX_ETH_JUMBO_SIZE;
+
+	err = pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE);
+	if (err)
+		return TEST_FAILURE;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Multi-buffer invalid-descriptor test.
+ *
+ * Builds a custom packet stream mixing valid and invalid multi-fragment
+ * packets, sets a jumbo MTU and runs the traffic validation. Each entry of
+ * the table is initialised positionally; the fourth value is the "valid"
+ * flag and the fifth the descriptor options, as shown by the assignments to
+ * pkts[].valid below and the XDP_PKT_CONTD values.
+ *
+ * Returns TEST_FAILURE if the stream cannot be generated, otherwise the
+ * result of testapp_validate_traffic().
+ */
+int testapp_invalid_desc_mb(struct test_spec *test)
+{
+ struct xsk_umem_info *umem = test->ifobj_tx->umem;
+ u64 umem_size = umem->num_frames * umem->frame_size;
+ struct pkt pkts[] = {
+ /* Valid packet for synch to start with */
+ {0, MIN_PKT_SIZE, 0, true, 0},
+ /* Zero frame len is not legal */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, 0, 0, false, 0},
+ /* Invalid address in the second frame */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ /* Invalid len in the middle */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ /* Invalid options in the middle */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION},
+ /* Transmit 2 frags, receive 3 */
+ {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD},
+ {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0},
+ /* Middle frame crosses chunk boundary with small length */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0},
+ /* Valid packet for synch so that something is received */
+ {0, MIN_PKT_SIZE, 0, true, 0}};
+
+ /* Entries 12 and 13 form the chunk-boundary-crossing packet above */
+ if (umem->unaligned_mode) {
+ /* Crossing a chunk boundary allowed */
+ pkts[12].valid = true;
+ pkts[13].valid = true;
+ }
+
+ test->mtu = MAX_ETH_JUMBO_SIZE;
+ if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)))
+ return TEST_FAILURE;
+ return testapp_validate_traffic(test);
+}
+
+/*
+ * Invalid-descriptor test.
+ *
+ * Builds a custom packet stream whose entries probe the limits of the umem
+ * (start, end, page and frame boundaries), adjusts which entries are expected
+ * to be valid according to the umem mode and frame size, and runs the traffic
+ * validation. Each entry is initialised positionally; the fourth value is
+ * the "valid" flag, as shown by the assignments to pkts[].valid below.
+ *
+ * Returns TEST_FAILURE if the stream cannot be generated, otherwise the
+ * result of testapp_validate_traffic().
+ */
+int testapp_invalid_desc(struct test_spec *test)
+{
+ struct xsk_umem_info *umem = test->ifobj_tx->umem;
+ u64 umem_size = umem->num_frames * umem->frame_size;
+ struct pkt pkts[] = {
+ /* Zero packet address allowed */
+ {0, MIN_PKT_SIZE, 0, true},
+ /* Allowed packet */
+ {0, MIN_PKT_SIZE, 0, true},
+ /* Straddling the start of umem */
+ {-2, MIN_PKT_SIZE, 0, false},
+ /* Packet too large */
+ {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false},
+ /* Up to end of umem allowed */
+ {umem_size - MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true},
+ /* After umem ends */
+ {umem_size, MIN_PKT_SIZE, 0, false},
+ /* Straddle the end of umem */
+ {umem_size - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false},
+ /* Straddle a 4K boundary */
+ {0x1000 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false},
+ /* Straddle a 2K boundary */
+ {0x800 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, true},
+ /* Valid packet for synch so that something is received */
+ {0, MIN_PKT_SIZE, 0, true}};
+
+ /* Entry 7 is the packet straddling a 4K boundary */
+ if (umem->unaligned_mode) {
+ /* Crossing a page boundary allowed */
+ pkts[7].valid = true;
+ }
+ /* Entry 8 is the packet straddling a 2K boundary */
+ if (umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) {
+ /* Crossing a 2K frame size boundary not allowed */
+ pkts[8].valid = false;
+ }
+
+ /* With a shared umem, the three end-of-umem entries are shifted by one umem size */
+ if (test->ifobj_tx->shared_umem) {
+ pkts[4].offset += umem_size;
+ pkts[5].offset += umem_size;
+ pkts[6].offset += umem_size;
+ }
+
+ if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)))
+ return TEST_FAILURE;
+ return testapp_validate_traffic(test);
+}
+
+/*
+ * Traffic test using the xsk_xdp_drop program on both interfaces; the Rx
+ * stream is set up through pkt_stream_receive_half().
+ */
+int testapp_xdp_drop(struct test_spec *test)
+{
+	struct xsk_xdp_progs *rx_skel = test->ifobj_rx->xdp_progs;
+	struct xsk_xdp_progs *tx_skel = test->ifobj_tx->xdp_progs;
+
+	test_spec_set_xdp_prog(test, rx_skel->progs.xsk_xdp_drop,
+			       tx_skel->progs.xsk_xdp_drop,
+			       rx_skel->maps.xsk, tx_skel->maps.xsk);
+
+	if (pkt_stream_receive_half(test))
+		return TEST_FAILURE;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Traffic test using the xsk_xdp_populate_metadata program on both
+ * interfaces, with metadata checking enabled on Rx and the Rx program's
+ * "count" variable reset to 0.
+ */
+int testapp_xdp_metadata_copy(struct test_spec *test)
+{
+	struct xsk_xdp_progs *rx_skel = test->ifobj_rx->xdp_progs;
+	struct xsk_xdp_progs *tx_skel = test->ifobj_tx->xdp_progs;
+
+	test_spec_set_xdp_prog(test, rx_skel->progs.xsk_xdp_populate_metadata,
+			       tx_skel->progs.xsk_xdp_populate_metadata,
+			       rx_skel->maps.xsk, tx_skel->maps.xsk);
+
+	test->ifobj_rx->use_metadata = true;
+	rx_skel->bss->count = 0;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Single-step test with two sockets per interface using the
+ * xsk_xdp_shared_umem program. The even/odd packet streams are created
+ * before the run and released after it, whatever the result.
+ */
+int testapp_xdp_shared_umem(struct test_spec *test)
+{
+	struct xsk_xdp_progs *rx_skel = test->ifobj_rx->xdp_progs;
+	struct xsk_xdp_progs *tx_skel = test->ifobj_tx->xdp_progs;
+	int result;
+
+	test->total_steps = 1;
+	test->nb_sockets = 2;
+
+	test_spec_set_xdp_prog(test, rx_skel->progs.xsk_xdp_shared_umem,
+			       tx_skel->progs.xsk_xdp_shared_umem,
+			       rx_skel->maps.xsk, tx_skel->maps.xsk);
+
+	if (pkt_stream_even_odd_sequence(test))
+		return TEST_FAILURE;
+
+	result = testapp_validate_traffic(test);
+	release_even_odd_sequence(test);
+
+	return result;
+}
+
+/*
+ * Tx poll timeout test, run with the Tx worker only: poll() is enabled on Tx
+ * and 2 * DEFAULT_PKT_CNT packets of 2048 bytes are queued on a umem whose
+ * frame size is also 2048.
+ */
+int testapp_poll_txq_tmout(struct test_spec *test)
+{
+	struct ifobject *tx = test->ifobj_tx;
+
+	tx->use_poll = true;
+	/* Frames become invalid when the umem frame size and the packet length are both 2048 */
+	tx->umem->frame_size = 2048;
+
+	if (pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048))
+		return TEST_FAILURE;
+
+	return testapp_validate_traffic_single_thread(test, test->ifobj_tx);
+}
+
+/* Rx poll timeout test: enables poll() on Rx and runs the Rx worker only. */
+int testapp_poll_rxq_tmout(struct test_spec *test)
+{
+	struct ifobject *rx = test->ifobj_rx;
+
+	rx->use_poll = true;
+
+	return testapp_validate_traffic_single_thread(test, rx);
+}
+
+/*
+ * Fragment-limit test.
+ *
+ * The maximum number of fragments is xdp_zc_max_segs of the Tx interface in
+ * zero-copy mode, otherwise the system MAX_SKB_FRAGS (17 if it cannot be
+ * read) plus one. The generated stream holds, in order: a valid
+ * synchronisation packet, a valid packet made of max_frags fragments, an
+ * invalid run of max_frags fragments that all signal a continuation, and a
+ * final valid synchronisation packet.
+ *
+ * Returns TEST_FAILURE on allocation or stream generation failure, otherwise
+ * the result of testapp_validate_traffic().
+ */
+int testapp_too_many_frags(struct test_spec *test)
+{
+	u32 max_frags, nb_pkts, last, i;
+	struct pkt *pkts;
+	int ret;
+
+	if (test->mode == TEST_MODE_ZC) {
+		max_frags = test->ifobj_tx->xdp_zc_max_segs;
+	} else {
+		max_frags = get_max_skb_frags();
+		if (!max_frags) {
+			ksft_print_msg("Can't get MAX_SKB_FRAGS from system, using default (17)\n");
+			max_frags = 17;
+		}
+		max_frags += 1;
+	}
+
+	nb_pkts = 2 * max_frags + 2;
+	last = 2 * max_frags + 1;
+
+	pkts = calloc(nb_pkts, sizeof(struct pkt));
+	if (!pkts)
+		return TEST_FAILURE;
+
+	test->mtu = MAX_ETH_JUMBO_SIZE;
+
+	/* Valid packet for synch */
+	pkts[0].len = MIN_PKT_SIZE;
+	pkts[0].valid = true;
+
+	/* One valid packet with the max amount of frags */
+	for (i = 1; i < max_frags + 1; i++) {
+		pkts[i].len = MIN_PKT_SIZE;
+		pkts[i].options = XDP_PKT_CONTD;
+		pkts[i].valid = true;
+	}
+	/* The last fragment of the valid packet ends it */
+	pkts[max_frags].options = 0;
+
+	/*
+	 * An invalid packet with the max amount of frags but which signals
+	 * that the packet continues on the last frag
+	 */
+	for (i = max_frags + 1; i < last; i++) {
+		pkts[i].len = MIN_PKT_SIZE;
+		pkts[i].options = XDP_PKT_CONTD;
+		pkts[i].valid = false;
+	}
+
+	/* Valid packet for synch */
+	pkts[last].len = MIN_PKT_SIZE;
+	pkts[last].valid = true;
+
+	if (pkt_stream_generate_custom(test, pkts, nb_pkts))
+		ret = TEST_FAILURE;
+	else
+		ret = testapp_validate_traffic(test);
+
+	free(pkts);
+	return ret;
+}
+
+/*
+ * Open and load the XDP program skeleton into ifobj->xdp_progs.
+ * Returns 0 on success or the error reported by libbpf_get_error().
+ */
+static int xsk_load_xdp_programs(struct ifobject *ifobj)
+{
+	struct xsk_xdp_progs *skel = xsk_xdp_progs__open_and_load();
+
+	ifobj->xdp_progs = skel;
+	if (!libbpf_get_error(skel))
+		return 0;
+
+	return libbpf_get_error(skel);
+}
+
+/*
+ * Probe for 2 MB huge page support by mapping, then unmapping, an anonymous
+ * huge page region large enough for two default-sized umems.
+ *
+ * MAP_HUGE_2MB is a mapping flag and is passed as such, so the probe asks
+ * for the same page size as the unaligned-mode umem mapping does.
+ *
+ * Returns true if the mapping succeeds, false otherwise.
+ */
+static bool hugepages_present(void)
+{
+	size_t mmap_sz = 2 * DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE;
+	void *bufs;
+
+	bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB, -1, 0);
+	if (bufs == MAP_FAILED)
+		return false;
+
+	/* munmap() on a huge page mapping needs a length aligned to the huge page size */
+	mmap_sz = ceil_u64(mmap_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE;
+	munmap(bufs, mmap_sz);
+	return true;
+}
+
+/*
+ * Initialise @ifobj: record its worker function, load the XDP programs and
+ * detect the capabilities of the interface (huge pages for unaligned mode,
+ * multi-buffer support, and multi-buffer zero-copy support together with the
+ * maximum number of zero-copy segments).
+ *
+ * Returns 0 on success, or the error from loading the programs or from
+ * querying the XDP features.
+ */
+int init_iface(struct ifobject *ifobj, thread_func_t func_ptr)
+{
+	LIBBPF_OPTS(bpf_xdp_query_opts, query_opts);
+	int err;
+
+	ifobj->func_ptr = func_ptr;
+
+	err = xsk_load_xdp_programs(ifobj);
+	if (err) {
+		ksft_print_msg("Error loading XDP program\n");
+		return err;
+	}
+
+	if (hugepages_present())
+		ifobj->unaligned_supp = true;
+
+	err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts);
+	if (err) {
+		ksft_print_msg("Error querying XDP capabilities\n");
+		return err;
+	}
+
+	if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG)
+		ifobj->multi_buff_supp = true;
+
+	if (!(query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY))
+		return 0;
+
+	if (query_opts.xdp_zc_max_segs > 1) {
+		ifobj->multi_buff_zc_supp = true;
+		ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs;
+	} else {
+		ifobj->xdp_zc_max_segs = 0;
+	}
+
+	return 0;
+}
+
+/* Basic send/receive test: runs the traffic validation on the unmodified test spec. */
+int testapp_send_receive(struct test_spec *test)
+{
+	int result = testapp_validate_traffic(test);
+
+	return result;
+}
+
+/* Send/receive test with 2048-byte umem frames and DEFAULT_PKT_CNT packets of MIN_PKT_SIZE bytes. */
+int testapp_send_receive_2k_frame(struct test_spec *test)
+{
+	int err;
+
+	test->ifobj_tx->umem->frame_size = 2048;
+	test->ifobj_rx->umem->frame_size = 2048;
+
+	err = pkt_stream_replace(test, DEFAULT_PKT_CNT, MIN_PKT_SIZE);
+	if (err)
+		return TEST_FAILURE;
+
+	return testapp_validate_traffic(test);
+}
+
+/* Send/receive test with poll() enabled on the Rx interface. */
+int testapp_poll_rx(struct test_spec *test)
+{
+	struct ifobject *rx = test->ifobj_rx;
+
+	rx->use_poll = true;
+
+	return testapp_validate_traffic(test);
+}
+
+/* Send/receive test with poll() enabled on the Tx interface. */
+int testapp_poll_tx(struct test_spec *test)
+{
+	struct ifobject *tx = test->ifobj_tx;
+
+	tx->use_poll = true;
+
+	return testapp_validate_traffic(test);
+}
+
+/* Invalid-descriptor test with the umem settings left untouched. */
+int testapp_aligned_inv_desc(struct test_spec *test)
+{
+	int result = testapp_invalid_desc(test);
+
+	return result;
+}
+
+/* Invalid-descriptor test with 2048-byte umem frames on both interfaces. */
+int testapp_aligned_inv_desc_2k_frame(struct test_spec *test)
+{
+	struct xsk_umem_info *tx_umem = test->ifobj_tx->umem;
+	struct xsk_umem_info *rx_umem = test->ifobj_rx->umem;
+
+	tx_umem->frame_size = 2048;
+	rx_umem->frame_size = 2048;
+
+	return testapp_invalid_desc(test);
+}
+
+/* Invalid-descriptor test with unaligned mode enabled on both umems. */
+int testapp_unaligned_inv_desc(struct test_spec *test)
+{
+	struct xsk_umem_info *tx_umem = test->ifobj_tx->umem;
+	struct xsk_umem_info *rx_umem = test->ifobj_rx->umem;
+
+	tx_umem->unaligned_mode = true;
+	rx_umem->unaligned_mode = true;
+
+	return testapp_invalid_desc(test);
+}
+
+/*
+ * Invalid-descriptor test in unaligned mode with 4001-byte frames.
+ *
+ * The odd frame size keeps the end of the umem away from a page boundary, so
+ * that descriptors straddling the end of the umem do not also straddle a
+ * page. The two assertions check that the umem end is more than
+ * MIN_PKT_SIZE bytes away from both neighbouring page boundaries.
+ */
+int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test)
+{
+	u64 page_size, umem_size;
+
+	/* Odd frame size so the UMEM doesn't end near a page boundary. */
+	test->ifobj_tx->umem->frame_size = 4001;
+	test->ifobj_rx->umem->frame_size = 4001;
+	test->ifobj_tx->umem->unaligned_mode = true;
+	test->ifobj_rx->umem->unaligned_mode = true;
+
+	/*
+	 * This test exists to exercise descriptors that straddle the end of
+	 * the UMEM but not a page.
+	 */
+	page_size = sysconf(_SC_PAGESIZE);
+	umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size;
+	assert(umem_size % page_size > MIN_PKT_SIZE);
+	assert(umem_size % page_size < page_size - MIN_PKT_SIZE);
+
+	return testapp_invalid_desc(test);
+}
+
+/* Multi-buffer invalid-descriptor test with the umem settings left untouched. */
+int testapp_aligned_inv_desc_mb(struct test_spec *test)
+{
+	int result = testapp_invalid_desc_mb(test);
+
+	return result;
+}
+
+/* Multi-buffer invalid-descriptor test with unaligned mode enabled on both umems. */
+int testapp_unaligned_inv_desc_mb(struct test_spec *test)
+{
+	struct xsk_umem_info *tx_umem = test->ifobj_tx->umem;
+	struct xsk_umem_info *rx_umem = test->ifobj_rx->umem;
+
+	tx_umem->unaligned_mode = true;
+	rx_umem->unaligned_mode = true;
+
+	return testapp_invalid_desc_mb(test);
+}
+
+/* XDP metadata test at the MTU already configured in the test spec. */
+int testapp_xdp_metadata(struct test_spec *test)
+{
+	int result = testapp_xdp_metadata_copy(test);
+
+	return result;
+}
+
+/* XDP metadata test with a jumbo MTU (multi-buffer). */
+int testapp_xdp_metadata_mb(struct test_spec *test)
+{
+	int result;
+
+	test->mtu = MAX_ETH_JUMBO_SIZE;
+	result = testapp_xdp_metadata_copy(test);
+
+	return result;
+}
+
+/*
+ * Two-step test with small HW rings: Tx pending is set to DEFAULT_BATCH_SIZE
+ * and Rx pending to twice that. The first step uses a batch size of 1, the
+ * second a batch size of DEFAULT_BATCH_SIZE - 1.
+ *
+ * Returns the result of the first step that did not pass, otherwise the
+ * result of the second step.
+ */
+int testapp_hw_sw_min_ring_size(struct test_spec *test)
+{
+	struct ifobject *tx = test->ifobj_tx;
+	struct ifobject *rx = test->ifobj_rx;
+	int ret;
+
+	test->set_ring = true;
+	test->total_steps = 2;
+	tx->ring.tx_pending = DEFAULT_BATCH_SIZE;
+	tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2;
+	tx->xsk->batch_size = 1;
+	rx->xsk->batch_size = 1;
+
+	ret = testapp_validate_traffic(test);
+	if (ret)
+		return ret;
+
+	/* Set batch size to hw_ring_size - 1 */
+	test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1;
+	test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Two-step test with the HW rings at their maximum size. The Rx umem and its
+ * fill/completion rings are sized to 4 * XSK_RING_PROD__DEFAULT_NUM_DESCS.
+ * The first step uses a batch of XSK_RING_PROD__DEFAULT_NUM_DESCS; the second
+ * uses tx_max_pending - 8 and a stream of max_descs packets.
+ *
+ * Returns the result of the first step that did not pass, TEST_FAILURE if the
+ * stream for the second step cannot be created (sockets and umems are
+ * cleaned up then), otherwise the result of the second step.
+ */
+int testapp_hw_sw_max_ring_size(struct test_spec *test)
+{
+	u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 4;
+	int ret;
+
+	test->set_ring = true;
+	test->total_steps = 2;
+
+	test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending;
+	test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending;
+
+	test->ifobj_rx->umem->num_frames = max_descs;
+	test->ifobj_rx->umem->fill_size = max_descs;
+	test->ifobj_rx->umem->comp_size = max_descs;
+
+	test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+	test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+
+	ret = testapp_validate_traffic(test);
+	if (ret)
+		return ret;
+
+	/*
+	 * Use a batch size 8 below the maximum Tx ring size, as the ice HW
+	 * ignores the 3 lowest bits when updating the Rx HW tail register.
+	 */
+	test->ifobj_tx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8;
+	test->ifobj_rx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8;
+
+	if (!pkt_stream_replace(test, max_descs, MIN_PKT_SIZE))
+		return testapp_validate_traffic(test);
+
+	clean_sockets(test, test->ifobj_tx);
+	clean_sockets(test, test->ifobj_rx);
+	clean_umem(test, test->ifobj_rx, test->ifobj_tx);
+
+	return TEST_FAILURE;
+}
+
+/*
+ * Run the traffic test with the xsk_xdp_adjust_tail program on both
+ * interfaces, after storing @adjust_value in the Rx program's adjust_value
+ * variable.
+ */
+static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value)
+{
+	struct xsk_xdp_progs *rx_skel = test->ifobj_rx->xdp_progs;
+	struct xsk_xdp_progs *tx_skel = test->ifobj_tx->xdp_progs;
+
+	test_spec_set_xdp_prog(test, rx_skel->progs.xsk_xdp_adjust_tail,
+			       tx_skel->progs.xsk_xdp_adjust_tail,
+			       rx_skel->maps.xsk, tx_skel->maps.xsk);
+
+	rx_skel->bss->adjust_value = adjust_value;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Common body of the adjust-tail tests.
+ *
+ * Tx sends DEFAULT_BATCH_SIZE packets of @pkt_len bytes while Rx expects
+ * packets of @pkt_len + @value bytes; @value is handed to the XDP program as
+ * the tail adjustment (callers pass negative values to shrink).
+ *
+ * Returns TEST_FAILURE if a stream cannot be created, the result of the
+ * traffic run when it did not pass, TEST_SKIP when the run cleared
+ * test->adjust_tail_support, and 0 otherwise.
+ */
+static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len)
+{
+	int ret;
+
+	test->adjust_tail_support = true;
+	test->adjust_tail = true;
+	test->total_steps = 1;
+
+	if (pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len))
+		return TEST_FAILURE;
+
+	if (pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value))
+		return TEST_FAILURE;
+
+	ret = testapp_xdp_adjust_tail(test, value);
+	if (ret)
+		return ret;
+
+	if (test->adjust_tail_support)
+		return 0;
+
+	ksft_print_msg("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n",
+		       mode_string(test), busy_poll_string(test));
+	return TEST_SKIP;
+}
+
+/* Adjust-tail test shrinking packets of MIN_PKT_SIZE * 2 bytes by 4 bytes. */
+int testapp_adjust_tail_shrink(struct test_spec *test)
+{
+	int result = testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2);
+
+	return result;
+}
+
+/*
+ * Multi-buffer adjust-tail test: jumbo MTU, packets of
+ * XSK_UMEM__LARGE_FRAME_SIZE * 2 bytes shrunk by XSK_UMEM__MAX_FRAME_SIZE.
+ */
+int testapp_adjust_tail_shrink_mb(struct test_spec *test)
+{
+	int result;
+
+	test->mtu = MAX_ETH_JUMBO_SIZE;
+	/* Shrink by the frag size */
+	result = testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE,
+				     XSK_UMEM__LARGE_FRAME_SIZE * 2);
+
+	return result;
+}
+
+/* Adjust-tail test growing packets of MIN_PKT_SIZE * 2 bytes by 4 bytes. */
+int testapp_adjust_tail_grow(struct test_spec *test)
+{
+	int result = testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2);
+
+	return result;
+}
+
+/*
+ * Multi-buffer adjust-tail test: jumbo MTU, packets of
+ * XSK_UMEM__LARGE_FRAME_SIZE * 2 bytes grown by
+ * XSK_UMEM__MAX_FRAME_SIZE / 2 - 1 bytes.
+ */
+int testapp_adjust_tail_grow_mb(struct test_spec *test)
+{
+	int result;
+
+	test->mtu = MAX_ETH_JUMBO_SIZE;
+	/* Grow by (frag_size - last_frag_size) - 1 to stay inside the last fragment */
+	result = testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1,
+				     XSK_UMEM__LARGE_FRAME_SIZE * 2);
+
+	return result;
+}
+
+/*
+ * Tx queue consumer test (skipped in zero-copy mode): sends
+ * MAX_TX_BUDGET_DEFAULT + 1 packets of MIN_PKT_SIZE bytes in a single batch
+ * with the consumer check enabled on the Tx socket.
+ */
+int testapp_tx_queue_consumer(struct test_spec *test)
+{
+	struct xsk_socket_info *tx_sock;
+	int nr_packets;
+
+	if (test->mode == TEST_MODE_ZC) {
+		ksft_print_msg("Can not run TX_QUEUE_CONSUMER test for ZC mode\n");
+		return TEST_SKIP;
+	}
+
+	nr_packets = MAX_TX_BUDGET_DEFAULT + 1;
+	if (pkt_stream_replace(test, nr_packets, MIN_PKT_SIZE))
+		return TEST_FAILURE;
+
+	tx_sock = test->ifobj_tx->xsk;
+	tx_sock->batch_size = nr_packets;
+	tx_sock->check_consumer = true;
+
+	return testapp_validate_traffic(test);
+}
+
+/*
+ * Allocate a zero-initialised interface object together with its array of
+ * MAX_SOCKETS sockets and its umem descriptor.
+ *
+ * Returns the new object, or NULL if any allocation fails (everything
+ * allocated so far is freed in that case). Release it with
+ * ifobject_delete().
+ */
+struct ifobject *ifobject_create(void)
+{
+	struct ifobject *obj = calloc(1, sizeof(struct ifobject));
+
+	if (!obj)
+		return NULL;
+
+	obj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*obj->xsk_arr));
+	if (!obj->xsk_arr) {
+		free(obj);
+		return NULL;
+	}
+
+	obj->umem = calloc(1, sizeof(*obj->umem));
+	if (!obj->umem) {
+		free(obj->xsk_arr);
+		free(obj);
+		return NULL;
+	}
+
+	return obj;
+}
+
+/*
+ * Free an interface object created by ifobject_create(), including its umem
+ * descriptor and socket array. Passing NULL is allowed and does nothing, so
+ * error paths can call it unconditionally.
+ */
+void ifobject_delete(struct ifobject *ifobj)
+{
+	if (!ifobj)
+		return;
+
+	free(ifobj->umem);
+	free(ifobj->xsk_arr);
+	free(ifobj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.h b/tools/testing/selftests/bpf/prog_tests/test_xsk.h
new file mode 100644
index 000000000000..8fc78a057de0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.h
@@ -0,0 +1,298 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef TEST_XSK_H_
+#define TEST_XSK_H_
+
+#include <linux/ethtool.h>
+#include <linux/if_xdp.h>
+
+#include "../kselftest.h"
+#include "xsk.h"
+
+#ifndef SO_PREFER_BUSY_POLL
+#define SO_PREFER_BUSY_POLL 69
+#endif
+
+#ifndef SO_BUSY_POLL_BUDGET
+#define SO_BUSY_POLL_BUDGET 70
+#endif
+
+#define TEST_PASS 0
+#define TEST_FAILURE -1
+#define TEST_CONTINUE 1
+#define TEST_SKIP 2
+
+#define DEFAULT_PKT_CNT (4 * 1024)
+#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4)
+#define HUGEPAGE_SIZE (2 * 1024 * 1024)
+#define MIN_PKT_SIZE 64
+#define MAX_ETH_PKT_SIZE 1518
+#define MAX_INTERFACE_NAME_CHARS 16
+#define MAX_TEST_NAME_SIZE 48
+#define SOCK_RECONF_CTR 10
+#define USLEEP_MAX 10000
+
+extern bool opt_verbose;
+#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0)
+
+
+/*
+ * Divide a by b, rounding up.
+ *
+ * Computed as quotient plus a remainder test instead of (a + b - 1) / b so
+ * the intermediate sum cannot wrap around for large a. b must be non-zero.
+ */
+static inline u32 ceil_u32(u32 a, u32 b)
+{
+	return a / b + (a % b != 0);
+}
+
+/*
+ * Divide a by b, rounding up.
+ *
+ * Computed as quotient plus a remainder test instead of (a + b - 1) / b so
+ * the intermediate sum cannot wrap around for large a. b must be non-zero.
+ */
+static inline u64 ceil_u64(u64 a, u64 b)
+{
+	return a / b + (a % b != 0);
+}
+
+/*
+ * Mode a test is run in. mode_string() gives the printable name of the
+ * first three values and reports anything else, including TEST_MODE_ALL,
+ * as "BOGUS".
+ */
+enum test_mode {
+ TEST_MODE_SKB,
+ TEST_MODE_DRV,
+ TEST_MODE_ZC,
+ TEST_MODE_ALL
+};
+
+struct ifobject;
+struct test_spec;
+typedef int (*validation_func_t)(struct ifobject *ifobj);
+typedef void *(*thread_func_t)(void *arg);
+typedef int (*test_func_t)(struct test_spec *test);
+
+struct xsk_socket_info {
+ struct xsk_ring_cons rx;
+ struct xsk_ring_prod tx;
+ struct xsk_umem_info *umem;
+ struct xsk_socket *xsk;
+ struct pkt_stream *pkt_stream;
+ u32 outstanding_tx;
+ u32 rxqsize;
+ u32 batch_size;
+ u8 dst_mac[ETH_ALEN];
+ u8 src_mac[ETH_ALEN];
+ bool check_consumer;
+};
+
+int kick_rx(struct xsk_socket_info *xsk);
+int kick_tx(struct xsk_socket_info *xsk);
+
+struct xsk_umem_info {
+ struct xsk_ring_prod fq;
+ struct xsk_ring_cons cq;
+ struct xsk_umem *umem;
+ u64 next_buffer;
+ u32 num_frames;
+ u32 frame_headroom;
+ void *buffer;
+ u32 frame_size;
+ u32 base_addr;
+ u32 fill_size;
+ u32 comp_size;
+ bool unaligned_mode;
+};
+
+struct set_hw_ring {
+ u32 default_tx;
+ u32 default_rx;
+};
+
+int hw_ring_size_reset(struct ifobject *ifobj);
+
+struct ifobject {
+ char ifname[MAX_INTERFACE_NAME_CHARS];
+ struct xsk_socket_info *xsk;
+ struct xsk_socket_info *xsk_arr;
+ struct xsk_umem_info *umem;
+ thread_func_t func_ptr;
+ validation_func_t validation_func;
+ struct xsk_xdp_progs *xdp_progs;
+ struct bpf_map *xskmap;
+ struct bpf_program *xdp_prog;
+ struct ethtool_ringparam ring;
+ struct set_hw_ring set_ring;
+ enum test_mode mode;
+ int ifindex;
+ int mtu;
+ u32 bind_flags;
+ u32 xdp_zc_max_segs;
+ bool tx_on;
+ bool rx_on;
+ bool use_poll;
+ bool busy_poll;
+ bool use_fill_ring;
+ bool release_rx;
+ bool shared_umem;
+ bool use_metadata;
+ bool unaligned_supp;
+ bool multi_buff_supp;
+ bool multi_buff_zc_supp;
+ bool hw_ring_size_supp;
+};
+struct ifobject *ifobject_create(void);
+void ifobject_delete(struct ifobject *ifobj);
+int init_iface(struct ifobject *ifobj, thread_func_t func_ptr);
+
+int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer, u64 size);
+int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem,
+ struct ifobject *ifobject, bool shared);
+
+
+struct pkt {
+ int offset;
+ u32 len;
+ u32 pkt_nb;
+ bool valid;
+ u16 options;
+};
+
+struct pkt_stream {
+ u32 nb_pkts;
+ u32 current_pkt_nb;
+ struct pkt *pkts;
+ u32 max_pkt_len;
+ u32 nb_rx_pkts;
+ u32 nb_valid_entries;
+ bool verbatim;
+};
+
+/* True when the given descriptor options have XDP_PKT_CONTD set. */
+static inline bool pkt_continues(u32 options)
+{
+	return (options & XDP_PKT_CONTD) != 0;
+}
+
+struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len);
+void pkt_stream_delete(struct pkt_stream *pkt_stream);
+void pkt_stream_reset(struct pkt_stream *pkt_stream);
+void pkt_stream_restore_default(struct test_spec *test);
+
+struct test_spec {
+ struct ifobject *ifobj_tx;
+ struct ifobject *ifobj_rx;
+ struct pkt_stream *tx_pkt_stream_default;
+ struct pkt_stream *rx_pkt_stream_default;
+ struct bpf_program *xdp_prog_rx;
+ struct bpf_program *xdp_prog_tx;
+ struct bpf_map *xskmap_rx;
+ struct bpf_map *xskmap_tx;
+ test_func_t test_func;
+ int mtu;
+ u16 total_steps;
+ u16 current_step;
+ u16 nb_sockets;
+ bool fail;
+ bool set_ring;
+ bool adjust_tail;
+ bool adjust_tail_support;
+ enum test_mode mode;
+ char name[MAX_TEST_NAME_SIZE];
+};
+
+/*
+ * Prefix for test names: "BUSY-POLL " when the TX interface uses busy
+ * polling, otherwise "". Fully parenthesized so the conditional cannot
+ * combine with operators at the expansion site.
+ */
+#define busy_poll_string(test) ((test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "")
+/*
+ * Printable name of the mode the test runs in: "SKB", "DRV" or "ZC";
+ * any other value yields "BOGUS".
+ */
+static inline char *mode_string(struct test_spec *test)
+{
+	const enum test_mode mode = test->mode;
+
+	if (mode == TEST_MODE_SKB)
+		return "SKB";
+	if (mode == TEST_MODE_DRV)
+		return "DRV";
+	if (mode == TEST_MODE_ZC)
+		return "ZC";
+
+	return "BOGUS";
+}
+
+void test_init(struct test_spec *test, struct ifobject *ifobj_tx,
+ struct ifobject *ifobj_rx, enum test_mode mode,
+ const struct test_spec *test_to_run);
+
+int testapp_adjust_tail_grow(struct test_spec *test);
+int testapp_adjust_tail_grow_mb(struct test_spec *test);
+int testapp_adjust_tail_shrink(struct test_spec *test);
+int testapp_adjust_tail_shrink_mb(struct test_spec *test);
+int testapp_aligned_inv_desc(struct test_spec *test);
+int testapp_aligned_inv_desc_2k_frame(struct test_spec *test);
+int testapp_aligned_inv_desc_mb(struct test_spec *test);
+int testapp_bidirectional(struct test_spec *test);
+int testapp_headroom(struct test_spec *test);
+int testapp_hw_sw_max_ring_size(struct test_spec *test);
+int testapp_hw_sw_min_ring_size(struct test_spec *test);
+int testapp_poll_rx(struct test_spec *test);
+int testapp_poll_rxq_tmout(struct test_spec *test);
+int testapp_poll_tx(struct test_spec *test);
+int testapp_poll_txq_tmout(struct test_spec *test);
+int testapp_send_receive(struct test_spec *test);
+int testapp_send_receive_2k_frame(struct test_spec *test);
+int testapp_send_receive_mb(struct test_spec *test);
+int testapp_send_receive_unaligned(struct test_spec *test);
+int testapp_send_receive_unaligned_mb(struct test_spec *test);
+int testapp_single_pkt(struct test_spec *test);
+int testapp_stats_fill_empty(struct test_spec *test);
+int testapp_stats_rx_dropped(struct test_spec *test);
+int testapp_stats_tx_invalid_descs(struct test_spec *test);
+int testapp_stats_rx_full(struct test_spec *test);
+int testapp_teardown(struct test_spec *test);
+int testapp_too_many_frags(struct test_spec *test);
+int testapp_tx_queue_consumer(struct test_spec *test);
+int testapp_unaligned_inv_desc(struct test_spec *test);
+int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test);
+int testapp_unaligned_inv_desc_mb(struct test_spec *test);
+int testapp_xdp_drop(struct test_spec *test);
+int testapp_xdp_metadata(struct test_spec *test);
+int testapp_xdp_metadata_mb(struct test_spec *test);
+int testapp_xdp_prog_cleanup(struct test_spec *test);
+int testapp_xdp_shared_umem(struct test_spec *test);
+
+void *worker_testapp_validate_rx(void *arg);
+void *worker_testapp_validate_tx(void *arg);
+
+static const struct test_spec tests[] = {
+ {.name = "SEND_RECEIVE", .test_func = testapp_send_receive},
+ {.name = "SEND_RECEIVE_2K_FRAME", .test_func = testapp_send_receive_2k_frame},
+ {.name = "SEND_RECEIVE_SINGLE_PKT", .test_func = testapp_single_pkt},
+ {.name = "POLL_RX", .test_func = testapp_poll_rx},
+ {.name = "POLL_TX", .test_func = testapp_poll_tx},
+ {.name = "POLL_RXQ_FULL", .test_func = testapp_poll_rxq_tmout},
+ {.name = "POLL_TXQ_FULL", .test_func = testapp_poll_txq_tmout},
+ {.name = "ALIGNED_INV_DESC", .test_func = testapp_aligned_inv_desc},
+ {.name = "ALIGNED_INV_DESC_2K_FRAME_SIZE", .test_func = testapp_aligned_inv_desc_2k_frame},
+ {.name = "UMEM_HEADROOM", .test_func = testapp_headroom},
+ {.name = "BIDIRECTIONAL", .test_func = testapp_bidirectional},
+ {.name = "STAT_RX_DROPPED", .test_func = testapp_stats_rx_dropped},
+ {.name = "STAT_TX_INVALID", .test_func = testapp_stats_tx_invalid_descs},
+ {.name = "STAT_RX_FULL", .test_func = testapp_stats_rx_full},
+ {.name = "STAT_FILL_EMPTY", .test_func = testapp_stats_fill_empty},
+ {.name = "XDP_PROG_CLEANUP", .test_func = testapp_xdp_prog_cleanup},
+ {.name = "XDP_DROP_HALF", .test_func = testapp_xdp_drop},
+ {.name = "XDP_SHARED_UMEM", .test_func = testapp_xdp_shared_umem},
+ {.name = "XDP_METADATA_COPY", .test_func = testapp_xdp_metadata},
+ {.name = "XDP_METADATA_COPY_MULTI_BUFF", .test_func = testapp_xdp_metadata_mb},
+ {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb},
+ {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags},
+ {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink},
+ {.name = "TX_QUEUE_CONSUMER", .test_func = testapp_tx_queue_consumer},
+ };
+
+static const struct test_spec ci_skip_tests[] = {
+ /* Flaky tests */
+ {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb},
+ {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow},
+ {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb},
+ {.name = "SEND_RECEIVE_9K_PACKETS", .test_func = testapp_send_receive_mb},
+ /* Tests with huge page dependency */
+ {.name = "SEND_RECEIVE_UNALIGNED", .test_func = testapp_send_receive_unaligned},
+ {.name = "UNALIGNED_INV_DESC", .test_func = testapp_unaligned_inv_desc},
+ {.name = "UNALIGNED_INV_DESC_4001_FRAME_SIZE",
+ .test_func = testapp_unaligned_inv_desc_4001_frame},
+ {.name = "SEND_RECEIVE_UNALIGNED_9K_PACKETS",
+ .test_func = testapp_send_receive_unaligned_mb},
+ {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb},
+ /* Test with HW ring size dependency */
+ {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size},
+ {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size},
+ /* Too long test */
+ {.name = "TEARDOWN", .test_func = testapp_teardown},
+};
+
+
+#endif /* TEST_XSK_H_ */
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 28e81161e6fc..4b4b081b46cc 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -7,6 +7,7 @@
#include "verifier_arena.skel.h"
#include "verifier_arena_large.skel.h"
#include "verifier_array_access.skel.h"
+#include "verifier_async_cb_context.skel.h"
#include "verifier_basic_stack.skel.h"
#include "verifier_bitfield_write.skel.h"
#include "verifier_bounds.skel.h"
@@ -34,6 +35,7 @@
#include "verifier_global_subprogs.skel.h"
#include "verifier_global_ptr_args.skel.h"
#include "verifier_gotol.skel.h"
+#include "verifier_gotox.skel.h"
#include "verifier_helper_access_var_len.skel.h"
#include "verifier_helper_packet_access.skel.h"
#include "verifier_helper_restricted.skel.h"
@@ -172,6 +174,7 @@ void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); }
void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); }
void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); }
void test_verifier_gotol(void) { RUN(verifier_gotol); }
+void test_verifier_gotox(void) { RUN(verifier_gotox); }
void test_verifier_helper_access_var_len(void) { RUN(verifier_helper_access_var_len); }
void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_access); }
void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); }
@@ -280,6 +283,7 @@ void test_verifier_array_access(void)
verifier_array_access__elf_bytes,
init_array_access_maps);
}
+void test_verifier_async_cb_context(void) { RUN(verifier_async_cb_context); }
static int init_value_ptr_arith_maps(struct bpf_object *obj)
{
diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c
index 99e438fe12ac..15c67d23128b 100644
--- a/tools/testing/selftests/bpf/prog_tests/wq.c
+++ b/tools/testing/selftests/bpf/prog_tests/wq.c
@@ -38,3 +38,59 @@ void serial_test_failures_wq(void)
{
RUN_TESTS(wq_failures);
}
+
+/*
+ * Check that a program using bpf_wq is rejected when its map has no BTF.
+ *
+ * The skeleton's "array" map is swapped for a plain array map created
+ * without BTF, the test_map_no_btf program is loaded by hand, and the
+ * load must fail with the expected message in the verifier log.
+ */
+static void test_failure_map_no_btf(void)
+{
+	struct wq *skel = NULL;
+	char log[8192];
+	const struct bpf_insn *insns;
+	size_t insn_cnt;
+	int ret, err, map_fd;
+	LIBBPF_OPTS(bpf_prog_load_opts, opts, .log_size = sizeof(log), .log_buf = log,
+		    .log_level = 2);
+
+	/* Keep the buffer a valid string even if nothing is ever logged. */
+	log[0] = '\0';
+
+	skel = wq__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	err = bpf_object__prepare(skel->obj);
+	if (!ASSERT_OK(err, "skel__prepare"))
+		goto out;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "map_no_btf", sizeof(__u32), sizeof(__u64), 100,
+				NULL);
+	if (!ASSERT_GT(map_fd, -1, "map create"))
+		goto out;
+
+	err = bpf_map__reuse_fd(skel->maps.array, map_fd);
+	if (!ASSERT_OK(err, "map reuse fd")) {
+		close(map_fd);
+		goto out;
+	}
+	/*
+	 * bpf_map__reuse_fd() works on its own duplicate of the descriptor,
+	 * so ours has to be closed here or it leaks.
+	 */
+	close(map_fd);
+
+	insns = bpf_program__insns(skel->progs.test_map_no_btf);
+	if (!ASSERT_OK_PTR(insns, "insns ptr"))
+		goto out;
+
+	insn_cnt = bpf_program__insn_cnt(skel->progs.test_map_no_btf);
+	if (!ASSERT_GT(insn_cnt, 0u, "insn cnt"))
+		goto out;
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
+	if (!ASSERT_LT(ret, 0, "prog load failed")) {
+		/* The load unexpectedly succeeded: ret is a valid fd, possibly 0. */
+		close(ret);
+		goto out;
+	}
+
+	ASSERT_HAS_SUBSTR(log, "map 'map_no_btf' has to have BTF in order to use bpf_wq",
+			  "log complains no map BTF");
+out:
+	wq__destroy(skel);
+}
+
+/* Entry point for the hand-written (non RUN_TESTS) wq subtests. */
+void test_wq_custom(void)
+{
+	if (!test__start_subtest("test_failure_map_no_btf"))
+		return;
+
+	test_failure_map_no_btf();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xsk.c b/tools/testing/selftests/bpf/prog_tests/xsk.c
new file mode 100644
index 000000000000..dd4c35c0e428
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xsk.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/if.h>
+#include <stdarg.h>
+
+#include "network_helpers.h"
+#include "test_progs.h"
+#include "test_xsk.h"
+#include "xsk_xdp_progs.skel.h"
+
+#define VETH_RX "veth0"
+#define VETH_TX "veth1"
+#define MTU 1500
+
+/*
+ * Create the VETH_RX/VETH_TX veth pair (4 TX and 4 RX queues per end),
+ * disable IPv6 on both ends, set their MTU to MTU and bring them up.
+ * With busy_poll set, napi_defer_hard_irqs and gro_flush_timeout are
+ * configured on both devices as well.
+ *
+ * Returns 0 on success, -1 otherwise. SYS() presumably jumps to the "fail"
+ * label when its command fails - confirm against test_progs.h. Devices
+ * created before a failure are not removed here.
+ */
+int setup_veth(bool busy_poll)
+{
+ SYS(fail,
+ "ip link add %s numtxqueues 4 numrxqueues 4 type veth peer name %s numtxqueues 4 numrxqueues 4",
+ VETH_RX, VETH_TX);
+ SYS(fail, "sysctl -wq net.ipv6.conf.%s.disable_ipv6=1", VETH_RX);
+ SYS(fail, "sysctl -wq net.ipv6.conf.%s.disable_ipv6=1", VETH_TX);
+
+ if (busy_poll) {
+ SYS(fail, "echo 2 > /sys/class/net/%s/napi_defer_hard_irqs", VETH_RX);
+ SYS(fail, "echo 200000 > /sys/class/net/%s/gro_flush_timeout", VETH_RX);
+ SYS(fail, "echo 2 > /sys/class/net/%s/napi_defer_hard_irqs", VETH_TX);
+ SYS(fail, "echo 200000 > /sys/class/net/%s/gro_flush_timeout", VETH_TX);
+ }
+
+ SYS(fail, "ip link set %s mtu %d", VETH_RX, MTU);
+ SYS(fail, "ip link set %s mtu %d", VETH_TX, MTU);
+ SYS(fail, "ip link set %s up", VETH_RX);
+ SYS(fail, "ip link set %s up", VETH_TX);
+
+ return 0;
+
+fail:
+ return -1;
+}
+
+/*
+ * Remove both veth devices created by setup_veth(). Failures are ignored
+ * (SYS_NOFAIL); presumably the second deletion is expected to fail because
+ * removing one end of a veth pair also removes its peer - TODO confirm.
+ */
+void delete_veth(void)
+{
+ SYS_NOFAIL("ip link del %s", VETH_RX);
+ SYS_NOFAIL("ip link del %s", VETH_TX);
+}
+
+/*
+ * Resolve the interface indexes of VETH_RX/VETH_TX into the two ifobjects
+ * and turn shared umem off for both.
+ *
+ * Returns 0 on success, -1 if either interface cannot be resolved.
+ */
+int configure_ifobj(struct ifobject *tx, struct ifobject *rx)
+{
+	/*
+	 * if_nametoindex() signals failure with 0, never with a negative
+	 * value, so a ">= 0" style check would accept the error case.
+	 */
+	rx->ifindex = if_nametoindex(VETH_RX);
+	if (!ASSERT_GT(rx->ifindex, 0, "get RX ifindex"))
+		return -1;
+
+	tx->ifindex = if_nametoindex(VETH_TX);
+	if (!ASSERT_GT(tx->ifindex, 0, "get TX ifindex"))
+		return -1;
+
+	tx->shared_umem = false;
+	rx->shared_umem = false;
+
+	return 0;
+}
+
+/*
+ * Run a single XSK test in the given mode on the veth pair.
+ *
+ * Creates and initialises the TX/RX ifobjects, generates the default packet
+ * streams, runs test_to_run->test_func and reports any result other than
+ * TEST_SKIP through ASSERT_OK(). Everything acquired along the way is
+ * released again, on the error paths as well.
+ */
+static void test_xsk(const struct test_spec *test_to_run, enum test_mode mode)
+{
+	struct ifobject *ifobj_tx, *ifobj_rx;
+	struct test_spec test;
+	int ret;
+
+	ifobj_tx = ifobject_create();
+	if (!ASSERT_OK_PTR(ifobj_tx, "create ifobj_tx"))
+		return;
+
+	ifobj_rx = ifobject_create();
+	if (!ASSERT_OK_PTR(ifobj_rx, "create ifobj_rx"))
+		goto delete_tx;
+
+	if (!ASSERT_OK(configure_ifobj(ifobj_tx, ifobj_rx), "configure ifobj"))
+		goto delete_rx;
+
+	/*
+	 * NOTE(review): ifname is still empty at this point (ifobject_create()
+	 * zero-fills it and configure_ifobj() only sets ifindex), so this
+	 * lookup presumably always fails - confirm whether ifname should be
+	 * filled in first.
+	 */
+	ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring);
+	if (!ret) {
+		ifobj_tx->hw_ring_size_supp = true;
+		ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending;
+		ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending;
+	}
+
+	if (!ASSERT_OK(init_iface(ifobj_rx, worker_testapp_validate_rx), "init RX"))
+		goto delete_rx;
+	/* From here on the RX side owns XDP programs that must be destroyed. */
+	if (!ASSERT_OK(init_iface(ifobj_tx, worker_testapp_validate_tx), "init TX"))
+		goto destroy_rx_progs;
+
+	test_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]);
+
+	test.tx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE);
+	if (!ASSERT_OK_PTR(test.tx_pkt_stream_default, "TX pkt generation"))
+		goto destroy_progs;
+	test.rx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE);
+	if (!ASSERT_OK_PTR(test.rx_pkt_stream_default, "RX pkt generation"))
+		goto delete_tx_stream;
+
+	test_init(&test, ifobj_tx, ifobj_rx, mode, test_to_run);
+	ret = test.test_func(&test);
+	if (ret != TEST_SKIP)
+		ASSERT_OK(ret, "Run test");
+	pkt_stream_restore_default(&test);
+
+	if (ifobj_tx->hw_ring_size_supp)
+		hw_ring_size_reset(ifobj_tx);
+
+	pkt_stream_delete(test.rx_pkt_stream_default);
+delete_tx_stream:
+	pkt_stream_delete(test.tx_pkt_stream_default);
+destroy_progs:
+	xsk_xdp_progs__destroy(ifobj_tx->xdp_progs);
+destroy_rx_progs:
+	xsk_xdp_progs__destroy(ifobj_rx->xdp_progs);
+delete_rx:
+	ifobject_delete(ifobj_rx);
+delete_tx:
+	ifobject_delete(ifobj_tx);
+}
+
+/* Run every entry of tests[] as a subtest in SKB mode on a fresh veth pair. */
+void test_ns_xsk_skb(void)
+{
+	const struct test_spec *spec;
+
+	if (!ASSERT_OK(setup_veth(false), "setup veth"))
+		return;
+
+	for (spec = tests; spec < tests + ARRAY_SIZE(tests); spec++) {
+		if (!test__start_subtest(spec->name))
+			continue;
+		test_xsk(spec, TEST_MODE_SKB);
+	}
+
+	delete_veth();
+}
+
+/* Run every entry of tests[] as a subtest in DRV mode on a fresh veth pair. */
+void test_ns_xsk_drv(void)
+{
+	const struct test_spec *spec;
+
+	if (!ASSERT_OK(setup_veth(false), "setup veth"))
+		return;
+
+	for (spec = tests; spec < tests + ARRAY_SIZE(tests); spec++) {
+		if (!test__start_subtest(spec->name))
+			continue;
+		test_xsk(spec, TEST_MODE_DRV);
+	}
+
+	delete_veth();
+}
+
diff --git a/tools/testing/selftests/bpf/progs/arena_strsearch.c b/tools/testing/selftests/bpf/progs/arena_strsearch.c
new file mode 100644
index 000000000000..ef6b76658f7f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/arena_strsearch.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include "bpf_experimental.h"
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 100); /* number of pages */
+} arena SEC(".maps");
+
+#include "bpf_arena_strsearch.h"
+
+struct glob_test {
+ char const __arena *pat, *str;
+ bool expected;
+};
+
+/* Returns true when glob_match(pat, str) gives the expected outcome. */
+static bool test(char const __arena *pat, char const __arena *str, bool expected)
+{
+	const bool matched = glob_match(pat, str);
+
+	return matched == expected;
+}
+
+/*
+ * The tests are all jammed together in one array to make it simpler
+ * to place that array in the .init.rodata section. The obvious
+ * "array of structures containing char *" has no way to force the
+ * pointed-to strings to be in a particular section.
+ *
+ * Anyway, a test consists of:
+ * 1. Expected glob_match result: '1' or '0'.
+ * 2. Pattern to match: null-terminated string
+ * 3. String to match against: null-terminated string
+ *
+ * The list of tests is terminated with a final '\0' instead of
+ * a glob_match result character.
+ */
+static const char __arena glob_tests[] =
+ /* Some basic tests */
+ "1" "a\0" "a\0"
+ "0" "a\0" "b\0"
+ "0" "a\0" "aa\0"
+ "0" "a\0" "\0"
+ "1" "\0" "\0"
+ "0" "\0" "a\0"
+ /* Simple character class tests */
+ "1" "[a]\0" "a\0"
+ "0" "[a]\0" "b\0"
+ "0" "[!a]\0" "a\0"
+ "1" "[!a]\0" "b\0"
+ "1" "[ab]\0" "a\0"
+ "1" "[ab]\0" "b\0"
+ "0" "[ab]\0" "c\0"
+ "1" "[!ab]\0" "c\0"
+ "1" "[a-c]\0" "b\0"
+ "0" "[a-c]\0" "d\0"
+ /* Corner cases in character class parsing */
+ "1" "[a-c-e-g]\0" "-\0"
+ "0" "[a-c-e-g]\0" "d\0"
+ "1" "[a-c-e-g]\0" "f\0"
+ "1" "[]a-ceg-ik[]\0" "a\0"
+ "1" "[]a-ceg-ik[]\0" "]\0"
+ "1" "[]a-ceg-ik[]\0" "[\0"
+ "1" "[]a-ceg-ik[]\0" "h\0"
+ "0" "[]a-ceg-ik[]\0" "f\0"
+ "0" "[!]a-ceg-ik[]\0" "h\0"
+ "0" "[!]a-ceg-ik[]\0" "]\0"
+ "1" "[!]a-ceg-ik[]\0" "f\0"
+ /* Simple wild cards */
+ "1" "?\0" "a\0"
+ "0" "?\0" "aa\0"
+ "0" "??\0" "a\0"
+ "1" "?x?\0" "axb\0"
+ "0" "?x?\0" "abx\0"
+ "0" "?x?\0" "xab\0"
+ /* Asterisk wild cards (backtracking) */
+ "0" "*??\0" "a\0"
+ "1" "*??\0" "ab\0"
+ "1" "*??\0" "abc\0"
+ "1" "*??\0" "abcd\0"
+ "0" "??*\0" "a\0"
+ "1" "??*\0" "ab\0"
+ "1" "??*\0" "abc\0"
+ "1" "??*\0" "abcd\0"
+ "0" "?*?\0" "a\0"
+ "1" "?*?\0" "ab\0"
+ "1" "?*?\0" "abc\0"
+ "1" "?*?\0" "abcd\0"
+ "1" "*b\0" "b\0"
+ "1" "*b\0" "ab\0"
+ "0" "*b\0" "ba\0"
+ "1" "*b\0" "bb\0"
+ "1" "*b\0" "abb\0"
+ "1" "*b\0" "bab\0"
+ "1" "*bc\0" "abbc\0"
+ "1" "*bc\0" "bc\0"
+ "1" "*bc\0" "bbc\0"
+ "1" "*bc\0" "bcbc\0"
+ /* Multiple asterisks (complex backtracking) */
+ "1" "*ac*\0" "abacadaeafag\0"
+ "1" "*ac*ae*ag*\0" "abacadaeafag\0"
+ "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0"
+ "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0"
+ "1" "*abcd*\0" "abcabcabcabcdefg\0"
+ "1" "*ab*cd*\0" "abcabcabcabcdefg\0"
+ "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0"
+ "0" "*abcd*\0" "abcabcabcabcefg\0"
+ "0" "*ab*cd*\0" "abcabcabcabcefg\0";
+
+bool skip = false;
+
+/*
+ * Walk glob_tests, run each pattern/string pair through test() and count
+ * how many did not produce the expected result.
+ *
+ * Returns 0 when every visited entry passed, -1 otherwise.
+ */
+SEC("syscall")
+int arena_strsearch(void *ctx)
+{
+ unsigned successes = 0;
+ unsigned n = 0;
+ char const __arena *p = glob_tests;
+
+ /*
+ * Tests are jammed together in a string. The first byte is '1'
+ * or '0' to indicate the expected outcome, or '\0' to indicate the
+ * end of the tests. Then come two null-terminated strings: the
+ * pattern and the string to match it against.
+ */
+ while (*p) {
+ /* '1' and '0' differ only in their lowest bit */
+ bool expected = *p++ & 1;
+ char const __arena *pat = p;
+
+ /* presumably leaves the loop early when it cannot continue - see bpf_experimental.h */
+ cond_break;
+ /* step over the pattern; p then points at the string to match */
+ p += bpf_arena_strlen(p) + 1;
+ successes += test(pat, p, expected);
+ /* step over the string to reach the next entry */
+ p += bpf_arena_strlen(p) + 1;
+ n++;
+ }
+
+ /* n now holds the number of failed entries */
+ n -= successes;
+ /* bpf_printk("glob: %u self-tests passed, %u failed\n", successes, n); */
+
+ return n ? -1 : 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c
index 4e51785e7606..9af19dfe4e80 100644
--- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c
+++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c
@@ -22,10 +22,6 @@
#define TCP_PACING_CA_RATIO (120)
#define TCP_REORDERING (12)
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#define after(seq2, seq1) before(seq1, seq2)
-
extern void cubictcp_init(struct sock *sk) __ksym;
extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
@@ -34,11 +30,6 @@ extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
-static bool before(__u32 seq1, __u32 seq2)
-{
- return (__s32)(seq1-seq2) < 0;
-}
-
static __u64 div64_u64(__u64 dividend, __u64 divisor)
{
return dividend / divisor;
diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c
index f089faa97ae6..46fb2b37d3a7 100644
--- a/tools/testing/selftests/bpf/progs/bpf_cubic.c
+++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c
@@ -20,13 +20,6 @@
char _license[] SEC("license") = "GPL";
#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#define max(a, b) ((a) > (b) ? (a) : (b))
-static bool before(__u32 seq1, __u32 seq2)
-{
- return (__s32)(seq1-seq2) < 0;
-}
-#define after(seq2, seq1) before(seq1, seq2)
extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;
extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
index 32c511bcd60b..1cc83140849f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -13,16 +13,10 @@
#ifndef EBUSY
#define EBUSY 16
#endif
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#define max(a, b) ((a) > (b) ? (a) : (b))
#define min_not_zero(x, y) ({ \
typeof(x) __x = (x); \
typeof(y) __y = (y); \
__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
-static bool before(__u32 seq1, __u32 seq2)
-{
- return (__s32)(seq1-seq2) < 0;
-}
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_gotox.c b/tools/testing/selftests/bpf/progs/bpf_gotox.c
new file mode 100644
index 000000000000..216c71b94c64
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_gotox.c
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_misc.h"
+
+__u64 in_user;
+__u64 ret_user;
+
+int pid;
+
+/*
+ * Skip all the tests if the compiler doesn't support indirect jumps.
+ *
+ * If the tests are skipped, all functions below are compiled as dummies,
+ * so that the skeleton looks the same and the userspace program needs no
+ * check other than whether data->skip is set.
+ */
+#ifdef __BPF_FEATURE_GOTOX
+__u64 skip SEC(".data") = 0;
+#else
+__u64 skip = 1;
+#endif
+
+struct simple_ctx {
+ __u64 x;
+};
+
+#ifdef __BPF_FEATURE_GOTOX
+__u64 some_var;
+
+/*
+ * This function adds code which will be replaced by a different
+ * number of instructions by the verifier. This adds additional
+ * stress on testing the insn_array maps corresponding to indirect jumps.
+ */
+static __always_inline void adjust_insns(__u64 x)
+{
+	__u64 noise = x + bpf_jiffies64();
+
+	some_var ^= noise;
+}
+
+/*
+ * Single switch on ctx->x. Every arm calls adjust_insns() and stores a
+ * distinct value in ret_user: 0 -> 2, 1 -> 3, 2 -> 4, 3 -> 5, 4 -> 7,
+ * anything else -> 19. Always returns 0.
+ *
+ * Presumably the consecutive case values are meant to make the compiler
+ * emit a jump table - confirm against the generated code.
+ */
+SEC("syscall")
+int one_switch(struct simple_ctx *ctx)
+{
+ switch (ctx->x) {
+ case 0:
+ adjust_insns(ctx->x + 1);
+ ret_user = 2;
+ break;
+ case 1:
+ adjust_insns(ctx->x + 7);
+ ret_user = 3;
+ break;
+ case 2:
+ adjust_insns(ctx->x + 9);
+ ret_user = 4;
+ break;
+ case 3:
+ adjust_insns(ctx->x + 11);
+ ret_user = 5;
+ break;
+ case 4:
+ adjust_insns(ctx->x + 17);
+ ret_user = 7;
+ break;
+ default:
+ adjust_insns(ctx->x + 177);
+ ret_user = 19;
+ break;
+ }
+
+ return 0;
+}
+
+SEC("syscall")
+int one_switch_non_zero_sec_off(struct simple_ctx *ctx)
+{
+ switch (ctx->x) {
+ case 0:
+ adjust_insns(ctx->x + 1);
+ ret_user = 2;
+ break;
+ case 1:
+ adjust_insns(ctx->x + 7);
+ ret_user = 3;
+ break;
+ case 2:
+ adjust_insns(ctx->x + 9);
+ ret_user = 4;
+ break;
+ case 3:
+ adjust_insns(ctx->x + 11);
+ ret_user = 5;
+ break;
+ case 4:
+ adjust_insns(ctx->x + 17);
+ ret_user = 7;
+ break;
+ default:
+ adjust_insns(ctx->x + 177);
+ ret_user = 19;
+ break;
+ }
+
+ return 0;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
+int simple_test_other_sec(struct pt_regs *ctx)
+{
+ __u64 x = in_user;
+
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ switch (x) {
+ case 0:
+ adjust_insns(x + 1);
+ ret_user = 2;
+ break;
+ case 1:
+ adjust_insns(x + 7);
+ ret_user = 3;
+ break;
+ case 2:
+ adjust_insns(x + 9);
+ ret_user = 4;
+ break;
+ case 3:
+ adjust_insns(x + 11);
+ ret_user = 5;
+ break;
+ case 4:
+ adjust_insns(x + 17);
+ ret_user = 7;
+ break;
+ default:
+ adjust_insns(x + 177);
+ ret_user = 19;
+ break;
+ }
+
+ return 0;
+}
+
+SEC("syscall")
+int two_switches(struct simple_ctx *ctx)
+{
+ switch (ctx->x) {
+ case 0:
+ adjust_insns(ctx->x + 1);
+ ret_user = 2;
+ break;
+ case 1:
+ adjust_insns(ctx->x + 7);
+ ret_user = 3;
+ break;
+ case 2:
+ adjust_insns(ctx->x + 9);
+ ret_user = 4;
+ break;
+ case 3:
+ adjust_insns(ctx->x + 11);
+ ret_user = 5;
+ break;
+ case 4:
+ adjust_insns(ctx->x + 17);
+ ret_user = 7;
+ break;
+ default:
+ adjust_insns(ctx->x + 177);
+ ret_user = 19;
+ break;
+ }
+
+ switch (ctx->x + !!ret_user) {
+ case 1:
+ adjust_insns(ctx->x + 7);
+ ret_user = 103;
+ break;
+ case 2:
+ adjust_insns(ctx->x + 9);
+ ret_user = 104;
+ break;
+ case 3:
+ adjust_insns(ctx->x + 11);
+ ret_user = 107;
+ break;
+ case 4:
+ adjust_insns(ctx->x + 11);
+ ret_user = 205;
+ break;
+ case 5:
+ adjust_insns(ctx->x + 11);
+ ret_user = 115;
+ break;
+ default:
+ adjust_insns(ctx->x + 177);
+ ret_user = 1019;
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Computed goto through an explicit 256-entry label table indexed by the
+ * low byte of ctx->x. Entries 0, 11, 27 and 31 have their own labels and
+ * set ret_user to 2, 3, 4 and 5; every other entry goes to default_label,
+ * which sets ret_user to 19. Always returns 0.
+ */
+SEC("syscall")
+int big_jump_table(struct simple_ctx *ctx __attribute__((unused)))
+{
+ const void *const jt[256] = {
+ /* default first; the designators below override single slots */
+ [0 ... 255] = &&default_label,
+ [0] = &&l0,
+ [11] = &&l11,
+ [27] = &&l27,
+ [31] = &&l31,
+ };
+
+ /* the mask keeps the index within the 256 entries */
+ goto *jt[ctx->x & 0xff];
+
+l0:
+ adjust_insns(ctx->x + 1);
+ ret_user = 2;
+ return 0;
+
+l11:
+ adjust_insns(ctx->x + 7);
+ ret_user = 3;
+ return 0;
+
+l27:
+ adjust_insns(ctx->x + 9);
+ ret_user = 4;
+ return 0;
+
+l31:
+ adjust_insns(ctx->x + 11);
+ ret_user = 5;
+ return 0;
+
+default_label:
+ adjust_insns(ctx->x + 177);
+ ret_user = 19;
+ return 0;
+}
+
+/*
+ * Two separate label tables, each used by its own computed goto.
+ *
+ * The labels fall through into each other, so the result depends on the
+ * entry point: l1 -> 16, l2 -> 15, l3 -> 12, l4 -> 7. Odd ctx->x jumps via
+ * jt1[a] (a is then 1, i.e. l2); even ctx->x jumps via jt2[b], where b is
+ * bit 1 of ctx->x. The result is stored in ret_user and returned. The
+ * early "return 19" guards the table indexes.
+ */
+SEC("syscall")
+int one_jump_two_maps(struct simple_ctx *ctx __attribute__((unused)))
+{
+ __label__ l1, l2, l3, l4;
+ void *jt1[2] = { &&l1, &&l2 };
+ void *jt2[2] = { &&l3, &&l4 };
+ unsigned int a = ctx->x % 2;
+ unsigned int b = (ctx->x / 2) % 2;
+ volatile int ret = 0;
+
+ if (!(a < 2 && b < 2))
+ return 19;
+
+ if (ctx->x % 2)
+ goto *jt1[a];
+ else
+ goto *jt2[b];
+
+ /* no breaks: each label also runs everything below it */
+ l1: ret += 1;
+ l2: ret += 3;
+ l3: ret += 5;
+ l4: ret += 7;
+
+ ret_user = ret;
+ return ret;
+}
+
+/*
+ * One label table shared by two computed gotos.
+ *
+ * The labels fall through into each other: l1 -> 15, l2 -> 12, l3 -> 7.
+ * Odd ctx->x jumps via jt[a]; otherwise, when ctx->x is not a multiple of
+ * 3, via jt[a + b]; otherwise execution simply falls into l1. a and b are
+ * bits 2 and 3 of ctx->x, so the index is at most 2. The result is stored
+ * in ret_user and returned.
+ */
+SEC("syscall")
+int one_map_two_jumps(struct simple_ctx *ctx __attribute__((unused)))
+{
+ __label__ l1, l2, l3;
+ void *jt[3] = { &&l1, &&l2, &&l3 };
+ unsigned int a = (ctx->x >> 2) & 1;
+ unsigned int b = (ctx->x >> 3) & 1;
+ volatile int ret = 0;
+
+ if (ctx->x % 2)
+ goto *jt[a];
+
+ if (ctx->x % 3)
+ goto *jt[a + b];
+
+ /* no breaks: each label also runs everything below it */
+ l1: ret += 3;
+ l2: ret += 5;
+ l3: ret += 7;
+
+ ret_user = ret;
+ return ret;
+}
+
+/* Just to introduce some non-zero offsets in .text */
+static __noinline int f0(volatile struct simple_ctx *ctx __arg_ctx)
+{
+ if (ctx)
+ return 1;
+ else
+ return 13;
+}
+
+SEC("syscall") int f1(struct simple_ctx *ctx)
+{
+ ret_user = 0;
+ return f0(ctx);
+}
+
+static __noinline int __static_global(__u64 x)
+{
+ switch (x) {
+ case 0:
+ adjust_insns(x + 1);
+ ret_user = 2;
+ break;
+ case 1:
+ adjust_insns(x + 7);
+ ret_user = 3;
+ break;
+ case 2:
+ adjust_insns(x + 9);
+ ret_user = 4;
+ break;
+ case 3:
+ adjust_insns(x + 11);
+ ret_user = 5;
+ break;
+ case 4:
+ adjust_insns(x + 17);
+ ret_user = 7;
+ break;
+ default:
+ adjust_insns(x + 177);
+ ret_user = 19;
+ break;
+ }
+
+ return 0;
+}
+
+SEC("syscall")
+int use_static_global1(struct simple_ctx *ctx)
+{
+ ret_user = 0;
+ return __static_global(ctx->x);
+}
+
+SEC("syscall")
+int use_static_global2(struct simple_ctx *ctx)
+{
+ ret_user = 0;
+ adjust_insns(ctx->x + 1);
+ return __static_global(ctx->x);
+}
+
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
+int use_static_global_other_sec(void *ctx)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ return __static_global(in_user);
+}
+
+__noinline int __nonstatic_global(__u64 x)
+{
+ switch (x) {
+ case 0:
+ adjust_insns(x + 1);
+ ret_user = 2;
+ break;
+ case 1:
+ adjust_insns(x + 7);
+ ret_user = 3;
+ break;
+ case 2:
+ adjust_insns(x + 9);
+ ret_user = 4;
+ break;
+ case 3:
+ adjust_insns(x + 11);
+ ret_user = 5;
+ break;
+ case 4:
+ adjust_insns(x + 17);
+ ret_user = 7;
+ break;
+ default:
+ adjust_insns(x + 177);
+ ret_user = 19;
+ break;
+ }
+
+ return 0;
+}
+
+SEC("syscall")
+int use_nonstatic_global1(struct simple_ctx *ctx)
+{
+ ret_user = 0;
+ return __nonstatic_global(ctx->x);
+}
+
+SEC("syscall")
+int use_nonstatic_global2(struct simple_ctx *ctx)
+{
+ ret_user = 0;
+ adjust_insns(ctx->x + 1);
+ return __nonstatic_global(ctx->x);
+}
+
+SEC("fentry/" SYS_PREFIX "sys_nanosleep")
+int use_nonstatic_global_other_sec(void *ctx)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ return __nonstatic_global(in_user);
+}
+
+#else /* __BPF_FEATURE_GOTOX */
+
+#define SKIP_TEST(TEST_NAME) \
+ SEC("syscall") int TEST_NAME(void *ctx) \
+ { \
+ return 0; \
+ }
+
+SKIP_TEST(one_switch);
+SKIP_TEST(one_switch_non_zero_sec_off);
+SKIP_TEST(simple_test_other_sec);
+SKIP_TEST(two_switches);
+SKIP_TEST(big_jump_table);
+SKIP_TEST(one_jump_two_maps);
+SKIP_TEST(one_map_two_jumps);
+SKIP_TEST(use_static_global1);
+SKIP_TEST(use_static_global2);
+SKIP_TEST(use_static_global_other_sec);
+SKIP_TEST(use_nonstatic_global1);
+SKIP_TEST(use_nonstatic_global2);
+SKIP_TEST(use_nonstatic_global_other_sec);
+
+#endif /* __BPF_FEATURE_GOTOX */
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c
index 774d4dbe8189..a8aa5a71d846 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c
@@ -18,23 +18,10 @@
unsigned short reuse_listen_hport = 0;
unsigned short listen_hport = 0;
-char cubic_cc[TCP_CA_NAME_MAX] = "bpf_cubic";
+const char cubic_cc[] = "bpf_cubic";
char dctcp_cc[TCP_CA_NAME_MAX] = "bpf_dctcp";
bool random_retry = false;
-static bool tcp_cc_eq(const char *a, const char *b)
-{
- int i;
-
- for (i = 0; i < TCP_CA_NAME_MAX; i++) {
- if (a[i] != b[i])
- return false;
- if (!a[i])
- break;
- }
-
- return true;
-}
SEC("iter/tcp")
int change_tcp_cc(struct bpf_iter__tcp *ctx)
@@ -58,7 +45,7 @@ int change_tcp_cc(struct bpf_iter__tcp *ctx)
cur_cc, sizeof(cur_cc)))
return 0;
- if (!tcp_cc_eq(cur_cc, cubic_cc))
+ if (bpf_strncmp(cur_cc, TCP_CA_NAME_MAX, cubic_cc))
return 0;
if (random_retry && bpf_get_prandom_u32() % 4 == 1)
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index a7a1a684eed1..c9bfbe1bafc1 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -126,6 +126,9 @@
* Several __arch_* annotations could be specified at once.
* When test case is not run on current arch it is marked as skipped.
* __caps_unpriv Specify the capabilities that should be set when running the test.
+ *
+ * __linear_size Specify the size of the linear area of non-linear skbs, or
+ * 0 for linear skbs.
*/
#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg)))
#define __not_msg(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg=" XSTR(__COUNTER__) "=" msg)))
@@ -159,6 +162,7 @@
#define __stderr_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr_unpriv=" XSTR(__COUNTER__) "=" msg)))
#define __stdout(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout=" XSTR(__COUNTER__) "=" msg)))
#define __stdout_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout_unpriv=" XSTR(__COUNTER__) "=" msg)))
+#define __linear_size(sz) __attribute__((btf_decl_tag("comment:test_linear_size=" XSTR(sz)))) /* tag text carries sz via XSTR(); no __COUNTER__ component, unlike the __msg-style tags */
/* Define common capabilities tested using __caps_unpriv */
#define CAP_NET_ADMIN 12
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index 17db400f0e0d..d8dacef37c16 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -146,6 +146,20 @@
#define tcp_jiffies32 ((__u32)bpf_jiffies64())
+#ifndef min /* only defined when the including file has not already provided min */
+#define min(a, b) ((a) < (b) ? (a) : (b)) /* plain ternary: the selected argument is evaluated twice */
+#endif
+#ifndef max /* same guard for max */
+#define max(a, b) ((a) > (b) ? (a) : (b)) /* plain ternary: the selected argument is evaluated twice */
+#endif
+
+static inline bool before(__u32 seq1, __u32 seq2) /* true when seq1 precedes seq2 in wrapping 32-bit sequence space */
+{
+ return (__s32)(seq1 - seq2) < 0; /* unsigned difference reinterpreted as signed, so the comparison survives wraparound */
+}
+
+#define after(seq2, seq1) before(seq1, seq2) /* after(a, b) is before(b, a); note the swapped parameter names */
+
static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
return (struct inet_connection_sock *)sk;
diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c
index 9e9ebf27b878..9d158cfad981 100644
--- a/tools/testing/selftests/bpf/progs/connect4_prog.c
+++ b/tools/testing/selftests/bpf/progs/connect4_prog.c
@@ -34,6 +34,9 @@
#define SOL_TCP 6
#endif
+const char reno[] = "reno";
+const char cubic[] = "cubic";
+
__attribute__ ((noinline)) __weak
int do_bind(struct bpf_sock_addr *ctx)
{
@@ -50,35 +53,27 @@ int do_bind(struct bpf_sock_addr *ctx)
}
static __inline int verify_cc(struct bpf_sock_addr *ctx,
- char expected[TCP_CA_NAME_MAX])
+ const char expected[])
{
char buf[TCP_CA_NAME_MAX];
- int i;
if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf)))
return 1;
- for (i = 0; i < TCP_CA_NAME_MAX; i++) {
- if (buf[i] != expected[i])
- return 1;
- if (buf[i] == 0)
- break;
- }
+ if (bpf_strncmp(buf, TCP_CA_NAME_MAX, expected))
+ return 1;
return 0;
}
static __inline int set_cc(struct bpf_sock_addr *ctx)
{
- char reno[TCP_CA_NAME_MAX] = "reno";
- char cubic[TCP_CA_NAME_MAX] = "cubic";
-
- if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno)))
+ if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, (void *)reno, sizeof(reno)))
return 1;
if (verify_cc(ctx, reno))
return 1;
- if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic)))
+ if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, (void *)cubic, sizeof(cubic)))
return 1;
if (verify_cc(ctx, cubic))
return 1;
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index 127dea342e5a..e0d672d93adf 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -914,8 +914,8 @@ void *user_ptr;
char expected_str[384];
__u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257};
-typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void *unsafe_ptr);
+typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr);
/* Returns the offset just before the end of the maximum sized xdp fragment.
* Any write larger than 32 bytes will be split between 2 fragments.
@@ -1106,16 +1106,16 @@ int test_copy_from_user_str_dynptr(void *ctx)
return 0;
}
-static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void *unsafe_ptr)
+static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr)
{
struct task_struct *task = bpf_get_current_task_btf();
return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task);
}
-static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void *unsafe_ptr)
+static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr)
{
struct task_struct *task = bpf_get_current_task_btf();
diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c
new file mode 100644
index 000000000000..4d756b623557
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/file_reader.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <string.h>
+#include <stdbool.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "errno.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct { /* one-slot array map whose value type is struct elem */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct elem); /* struct elem is named here before its definition further down */
+} arrmap SEC(".maps");
+
+struct elem { /* value type of arrmap */
+ struct file *file; /* not referenced by any program in this file */
+ struct bpf_task_work tw; /* passed to bpf_task_work_schedule_signal_impl() in on_open_validate_file_read() */
+};
+
+char user_buf[256000]; /* reference bytes that BPF-side reads are compared against; filled outside this file (by user space, per the comment in verify_dynptr_read) */
+char tmp_buf[256000]; /* destination of every bpf_dynptr_read() in this file */
+
+int pid = 0; /* both LSM programs return early unless the upper 32 bits of bpf_get_current_pid_tgid() equal this */
+int err, run_success = 0; /* err: failure code recorded by the programs; run_success: set to 1 when a check passes */
+
+static int validate_file_read(struct file *file);
+static int task_work_callback(struct bpf_map *map, void *key, void *value);
+
+SEC("lsm/file_open")
+int on_open_expect_fault(void *c) /* sets run_success only when the dynptr read below returns -EFAULT */
+{
+ struct bpf_dynptr dynptr;
+ struct file *file;
+ int local_err = 1; /* stays 1 if bpf_dynptr_from_file() fails */
+ __u32 user_buf_sz = sizeof(user_buf);
+
+ if (bpf_get_current_pid_tgid() >> 32 != pid) /* ignore every caller except the one identified by the global pid */
+ return 0;
+
+ file = bpf_get_task_exe_file(bpf_get_current_task_btf());
+ if (!file)
+ return 0;
+
+ if (bpf_dynptr_from_file(file, 0, &dynptr))
+ goto out;
+
+ local_err = bpf_dynptr_read(tmp_buf, user_buf_sz, &dynptr, user_buf_sz, 0); /* length and offset are both sizeof(user_buf) */
+ if (local_err == -EFAULT) { /* Expect page fault */
+ local_err = 0;
+ run_success = 1;
+ }
+out:
+ bpf_dynptr_file_discard(&dynptr); /* also reached when bpf_dynptr_from_file() failed */
+ if (local_err)
+ err = local_err; /* non-zero results other than -EFAULT are published through err */
+ bpf_put_file(file); /* drop the reference taken by bpf_get_task_exe_file() */
+ return 0;
+}
+
+SEC("lsm/file_open")
+int on_open_validate_file_read(void *c) /* schedules task_work_callback() for the current task; the checks themselves run in that callback */
+{
+ struct task_struct *task = bpf_get_current_task_btf();
+ struct elem *work;
+ int key = 0;
+
+ if (bpf_get_current_pid_tgid() >> 32 != pid) /* ignore every caller except the one identified by the global pid */
+ return 0;
+
+ work = bpf_map_lookup_elem(&arrmap, &key); /* the single arrmap slot supplies the bpf_task_work storage */
+ if (!work) {
+ err = 1;
+ return 0;
+ }
+ bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); /* return value is not checked */
+ return 0;
+}
+
+/* Called in a sleepable context, read 256K bytes, cross check with user space read data */
+static int task_work_callback(struct bpf_map *map, void *key, void *value) /* map, key and value are unused here */
+{
+ struct task_struct *task = bpf_get_current_task_btf();
+ struct file *file = bpf_get_task_exe_file(task);
+
+ if (!file) /* neither err nor run_success is touched on this path */
+ return 0;
+
+ err = validate_file_read(file); /* overwrites err unconditionally; 0 means every check passed */
+ if (!err)
+ run_success = 1;
+ bpf_put_file(file); /* drop the reference taken by bpf_get_task_exe_file() */
+ return 0;
+}
+
+static int verify_dynptr_read(struct bpf_dynptr *ptr, u32 off, char *user_buf, u32 len) /* 0 if len bytes read at off equal user_buf[0..len), otherwise 1; the user_buf parameter shadows the global of the same name */
+{
+ int i;
+
+ if (bpf_dynptr_read(tmp_buf, len, ptr, off, 0)) /* any read error is folded into the generic result 1 */
+ return 1;
+
+ /* Verify file contents read from BPF is the same as the one read from userspace */
+ bpf_for(i, 0, len)
+ {
+ if (tmp_buf[i] != user_buf[i]) /* stop at the first mismatching byte */
+ return 1;
+ }
+ return 0;
+}
+
+static int validate_file_read(struct file *file) /* runs the read checks in order; returns 0 only if all of them pass */
+{
+ struct bpf_dynptr dynptr;
+ int loc_err = 1, off; /* loc_err stays 1 if the dynptr cannot be created */
+ __u32 user_buf_sz = sizeof(user_buf);
+
+ if (bpf_dynptr_from_file(file, 0, &dynptr))
+ goto cleanup;
+
+ loc_err = verify_dynptr_read(&dynptr, 0, user_buf, user_buf_sz); /* whole buffer, starting at offset 0 */
+ off = 1;
+ loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, user_buf_sz - off); /* "a ?: b" runs b only while loc_err is still 0 */
+ off = user_buf_sz - 1;
+ loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, user_buf_sz - off); /* final byte only */
+ /* Read file with an arbitrary fixed offset (4097) and length (100) */
+ off = 4097;
+ loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, 100);
+
+ /* Adjust dynptr, verify read */
+ loc_err = loc_err ?: bpf_dynptr_adjust(&dynptr, off, off + 1); /* off is still 4097 here */
+ loc_err = loc_err ?: verify_dynptr_read(&dynptr, 0, user_buf + off, 1);
+ /* Can't read more than 1 byte */
+ loc_err = loc_err ?: verify_dynptr_read(&dynptr, 0, user_buf + off, 2) == 0; /* "== 0" inverts the result: verify_dynptr_read() returning 0 is the error case */
+ /* Can't read with far offset */
+ loc_err = loc_err ?: verify_dynptr_read(&dynptr, 1, user_buf + off, 1) == 0; /* same inversion as the previous check */
+
+cleanup:
+ bpf_dynptr_file_discard(&dynptr); /* also reached when bpf_dynptr_from_file() failed */
+ return loc_err;
+}
diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c
new file mode 100644
index 000000000000..32fe28ed2439
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <string.h>
+#include <stdbool.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+int err;
+void *user_ptr;
+
+SEC("lsm/file_open")
+__failure
+__msg("Unreleased reference id=") /* message expected alongside the __failure annotation */
+int on_nanosleep_unreleased_ref(void *ctx) /* NOTE(review): the name says "nanosleep" but the section is lsm/file_open */
+{
+ struct task_struct *task = bpf_get_current_task_btf();
+ struct file *file = bpf_get_task_exe_file(task);
+ struct bpf_dynptr dynptr;
+
+ if (!file)
+ return 0;
+
+ err = bpf_dynptr_from_file(file, 0, &dynptr);
+ return err ? 1 : 0; /* returns without bpf_put_file(file) and without bpf_dynptr_file_discard(&dynptr) */
+}
+
+SEC("xdp")
+__failure
+__msg("Expected a dynptr of type file as arg #0") /* message expected alongside the __failure annotation */
+int xdp_wrong_dynptr_type(struct xdp_md *xdp) /* hands a dynptr created by bpf_dynptr_from_xdp() to the file discard kfunc */
+{
+ struct bpf_dynptr dynptr;
+
+ bpf_dynptr_from_xdp(xdp, 0, &dynptr); /* return value is not checked */
+ bpf_dynptr_file_discard(&dynptr);
+ return 0;
+}
+
+SEC("xdp")
+__failure
+__msg("Expected an initialized dynptr as arg #0") /* message expected alongside the __failure annotation */
+int xdp_no_dynptr_type(struct xdp_md *xdp) /* discards a dynptr that no constructor ever initialized */
+{
+ struct bpf_dynptr dynptr; /* deliberately left uninitialized */
+
+ bpf_dynptr_file_discard(&dynptr);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c
index 7481bb30b29b..195d3b2fba00 100644
--- a/tools/testing/selftests/bpf/progs/htab_update.c
+++ b/tools/testing/selftests/bpf/progs/htab_update.c
@@ -6,24 +6,31 @@
char _license[] SEC("license") = "GPL";
+/* Map value type: has BTF-managed field (bpf_timer) */
+struct val {
+ struct bpf_timer t; /* left zero-initialized by the update in this file */
+ __u64 payload; /* the fentry program in this file stores 1 here */
+};
+
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1);
- __uint(key_size, sizeof(__u32));
- __uint(value_size, sizeof(__u32));
+ __type(key, __u32);
+ __type(value, struct val);
} htab SEC(".maps");
int pid = 0;
int update_err = 0;
-SEC("?fentry/lookup_elem_raw")
-int lookup_elem_raw(void *ctx)
+SEC("?fentry/bpf_obj_free_fields")
+int bpf_obj_free_fields(void *ctx)
{
- __u32 key = 0, value = 1;
+ __u32 key = 0;
+ struct val value = { .payload = 1 };
if ((bpf_get_current_pid_tgid() >> 32) != pid)
return 0;
- update_err = bpf_map_update_elem(&htab, &key, &value, 0);
+ update_err = bpf_map_update_elem(&htab, &key, &value, BPF_ANY);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/ip_check_defrag.c b/tools/testing/selftests/bpf/progs/ip_check_defrag.c
index 645b2c9f7867..0e87ad1ebcfa 100644
--- a/tools/testing/selftests/bpf/progs/ip_check_defrag.c
+++ b/tools/testing/selftests/bpf/progs/ip_check_defrag.c
@@ -12,11 +12,6 @@
#define IP_OFFSET 0x1FFF
#define NEXTHDR_FRAGMENT 44
-extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
- struct bpf_dynptr *ptr__uninit) __ksym;
-extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, uint32_t offset,
- void *buffer, uint32_t buffer__sz) __ksym;
-
volatile int shootdowns = 0;
static bool is_frag_v4(struct iphdr *iph)
diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c
index 0c13b7409947..7de173daf27b 100644
--- a/tools/testing/selftests/bpf/progs/lsm.c
+++ b/tools/testing/selftests/bpf/progs/lsm.c
@@ -89,14 +89,16 @@ SEC("lsm/file_mprotect")
int BPF_PROG(test_int_hook, struct vm_area_struct *vma,
unsigned long reqprot, unsigned long prot, int ret)
{
- if (ret != 0)
+ struct mm_struct *mm = vma->vm_mm;
+
+ if (ret != 0 || !mm)
return ret;
__s32 pid = bpf_get_current_pid_tgid() >> 32;
int is_stack = 0;
- is_stack = (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack);
+ is_stack = (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack);
if (is_stack && monitored_pid == pid) {
mprotect_count++;
diff --git a/tools/testing/selftests/bpf/progs/lsm_tailcall.c b/tools/testing/selftests/bpf/progs/lsm_tailcall.c
index 49c075ce2d4c..6e7e58051e64 100644
--- a/tools/testing/selftests/bpf/progs/lsm_tailcall.c
+++ b/tools/testing/selftests/bpf/progs/lsm_tailcall.c
@@ -20,14 +20,14 @@ int lsm_file_permission_prog(void *ctx)
return 0;
}
-SEC("lsm/file_alloc_security")
-int lsm_file_alloc_security_prog(void *ctx)
+SEC("lsm/kernfs_init_security")
+int lsm_kernfs_init_security_prog(void *ctx)
{
return 0;
}
-SEC("lsm/file_alloc_security")
-int lsm_file_alloc_security_entry(void *ctx)
+SEC("lsm/kernfs_init_security")
+int lsm_kernfs_init_security_entry(void *ctx)
{
bpf_tail_call_static(ctx, &jmp_table, 0);
return 0;
diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
index 3a868a199349..d70c28824bbe 100644
--- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
@@ -278,6 +278,46 @@ out:
return 0;
}
+SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep")
+int nested_rcu_region_unbalanced_1(void *ctx) /* two bpf_rcu_read_lock() calls followed by three bpf_rcu_read_unlock() calls */
+{
+ struct task_struct *task, *real_parent;
+
+ /* nested rcu read lock regions */
+ task = bpf_get_current_task_btf();
+ bpf_rcu_read_lock();
+ bpf_rcu_read_lock();
+ real_parent = task->real_parent;
+ if (!real_parent)
+ goto out;
+ (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); /* result intentionally discarded */
+out:
+ bpf_rcu_read_unlock();
+ bpf_rcu_read_unlock();
+ bpf_rcu_read_unlock(); /* has no matching lock: this is the imbalance the function is named for */
+ return 0;
+}
+
+SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep")
+int nested_rcu_region_unbalanced_2(void *ctx) /* three bpf_rcu_read_lock() calls followed by only two bpf_rcu_read_unlock() calls */
+{
+ struct task_struct *task, *real_parent;
+
+ /* nested rcu read lock regions */
+ task = bpf_get_current_task_btf();
+ bpf_rcu_read_lock();
+ bpf_rcu_read_lock();
+ bpf_rcu_read_lock(); /* never matched by an unlock: this is the imbalance the function is named for */
+ real_parent = task->real_parent;
+ if (!real_parent)
+ goto out;
+ (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); /* result intentionally discarded */
+out:
+ bpf_rcu_read_unlock();
+ bpf_rcu_read_unlock();
+ return 0;
+}
+
SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
int task_trusted_non_rcuptr(void *ctx)
{
diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
index 893a4fdb4b6e..1aca85d86aeb 100644
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
@@ -568,4 +568,64 @@ err_out:
return 0;
}
+private(kptr_ref) u64 ref; /* address of a node's ref field, stored by __insert_in_list() and read back by probe_read_refcount() */
+
+static int probe_read_refcount(void) /* returns the 32-bit value found at the kernel address held in ref */
+{
+ u32 refcount;
+
+ bpf_probe_read_kernel(&refcount, sizeof(refcount), (void *) ref); /* helper's return value is not checked */
+ return refcount; /* u32 converted to the int return type */
+}
+
+static int __insert_in_list(struct bpf_list_head *head, struct bpf_spin_lock *lock,
+ struct node_data __kptr **node) /* -1: allocation failed; -2: *node was already populated; otherwise the value read back through ref */
+{
+ struct node_data *node_new, *node_ref, *node_old;
+
+ node_new = bpf_obj_new(typeof(*node_new));
+ if (!node_new)
+ return -1;
+
+ node_ref = bpf_refcount_acquire(node_new); /* second reference to the same object */
+ node_old = bpf_kptr_xchg(node, node_new); /* node_new goes into *node; the previous occupant comes back */
+ if (node_old) {
+ bpf_obj_drop(node_old);
+ bpf_obj_drop(node_ref); /* give back the extra reference before bailing out */
+ return -2;
+ }
+
+ bpf_spin_lock(lock);
+ bpf_list_push_front(head, &node_ref->l); /* the second reference is handed to the list */
+ ref = (u64)(void *) &node_ref->ref; /* remember where the node's ref field lives for probe_read_refcount() */
+ bpf_spin_unlock(lock);
+ return probe_read_refcount();
+}
+
+struct { /* per-CPU hash map used by percpu_hash_refcount_leak() */
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __type(key, int);
+ __type(value, struct map_value); /* struct map_value is defined outside this hunk; its node member is used below */
+ __uint(max_entries, 1);
+} percpu_hash SEC(".maps");
+
+SEC("tc")
+int percpu_hash_refcount_leak(void *ctx) /* inserts a new node through the kptr slot of percpu_hash[0] */
+{
+ struct map_value *v;
+ int key = 0;
+
+ v = bpf_map_lookup_elem(&percpu_hash, &key);
+ if (!v) /* the program does not create the element itself */
+ return 0;
+
+ return __insert_in_list(&head, &lock, &v->node); /* head and lock are globals defined outside this hunk */
+}
+
+SEC("tc")
+int check_percpu_hash_refcount(void *ctx) /* re-reads the value at the address saved in ref by an earlier __insert_in_list() */
+{
+ return probe_read_refcount();
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c b/tools/testing/selftests/bpf/progs/ringbuf_bench.c
index 6a468496f539..d96c7d1e8fc2 100644
--- a/tools/testing/selftests/bpf/progs/ringbuf_bench.c
+++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
+#include <stdbool.h>
#include <linux/bpf.h>
#include <stdint.h>
#include <bpf/bpf_helpers.h>
@@ -14,9 +15,11 @@ struct {
const volatile int batch_cnt = 0;
const volatile long use_output = 0;
+const volatile bool bench_producer = false;
long sample_val = 42;
long dropped __attribute__((aligned(128))) = 0;
+long hits __attribute__((aligned(128))) = 0;
const volatile long wakeup_data_size = 0;
@@ -24,6 +27,9 @@ static __always_inline long get_flags()
{
long sz;
+ if (bench_producer)
+ return BPF_RB_NO_WAKEUP;
+
if (!wakeup_data_size)
return 0;
@@ -47,6 +53,8 @@ int bench_ringbuf(void *ctx)
*sample = sample_val;
flags = get_flags();
bpf_ringbuf_submit(sample, flags);
+ if (bench_producer)
+ __sync_add_and_fetch(&hits, 1);
}
}
} else {
@@ -55,6 +63,9 @@ int bench_ringbuf(void *ctx)
if (bpf_ringbuf_output(&ringbuf, &sample_val,
sizeof(sample_val), flags))
__sync_add_and_fetch(&dropped, 1);
+ else if (bench_producer)
+ __sync_add_and_fetch(&hits, 1);
+
}
}
return 0;
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c
index 99d72c68f76a..826e6b6aff7e 100644
--- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c
@@ -45,8 +45,12 @@ SEC("syscall") __retval(USER_PTR_ERR)int test_strcspn_null1(void *ctx) { return
SEC("syscall") __retval(USER_PTR_ERR)int test_strcspn_null2(void *ctx) { return bpf_strcspn("hello", NULL); }
SEC("syscall") __retval(USER_PTR_ERR)int test_strstr_null1(void *ctx) { return bpf_strstr(NULL, "hello"); }
SEC("syscall") __retval(USER_PTR_ERR)int test_strstr_null2(void *ctx) { return bpf_strstr("hello", NULL); }
+SEC("syscall") __retval(USER_PTR_ERR)int test_strcasestr_null1(void *ctx) { return bpf_strcasestr(NULL, "hello"); } /* NULL as first argument */
+SEC("syscall") __retval(USER_PTR_ERR)int test_strcasestr_null2(void *ctx) { return bpf_strcasestr("hello", NULL); } /* NULL as second argument */
SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null1(void *ctx) { return bpf_strnstr(NULL, "hello", 1); }
SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null2(void *ctx) { return bpf_strnstr("hello", NULL, 1); }
+SEC("syscall") __retval(USER_PTR_ERR)int test_strncasestr_null1(void *ctx) { return bpf_strncasestr(NULL, "hello", 1); } /* NULL as first argument */
+SEC("syscall") __retval(USER_PTR_ERR)int test_strncasestr_null2(void *ctx) { return bpf_strncasestr("hello", NULL, 1); } /* NULL as second argument */
/* Passing userspace ptr to string kfuncs */
SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { return bpf_strcmp(user_ptr, "hello"); }
@@ -65,8 +69,12 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcspn_user_ptr1(void *ctx) { re
SEC("syscall") __retval(USER_PTR_ERR) int test_strcspn_user_ptr2(void *ctx) { return bpf_strcspn("hello", user_ptr); }
SEC("syscall") __retval(USER_PTR_ERR) int test_strstr_user_ptr1(void *ctx) { return bpf_strstr(user_ptr, "hello"); }
SEC("syscall") __retval(USER_PTR_ERR) int test_strstr_user_ptr2(void *ctx) { return bpf_strstr("hello", user_ptr); }
+SEC("syscall") __retval(USER_PTR_ERR) int test_strcasestr_user_ptr1(void *ctx) { return bpf_strcasestr(user_ptr, "hello"); } /* user_ptr as first argument */
+SEC("syscall") __retval(USER_PTR_ERR) int test_strcasestr_user_ptr2(void *ctx) { return bpf_strcasestr("hello", user_ptr); } /* user_ptr as second argument */
SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr1(void *ctx) { return bpf_strnstr(user_ptr, "hello", 1); }
SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr2(void *ctx) { return bpf_strnstr("hello", user_ptr, 1); }
+SEC("syscall") __retval(USER_PTR_ERR) int test_strncasestr_user_ptr1(void *ctx) { return bpf_strncasestr(user_ptr, "hello", 1); } /* user_ptr as first argument */
+SEC("syscall") __retval(USER_PTR_ERR) int test_strncasestr_user_ptr2(void *ctx) { return bpf_strncasestr("hello", user_ptr, 1); } /* user_ptr as second argument */
#endif /* __TARGET_ARCH_s390 */
@@ -87,7 +95,11 @@ SEC("syscall") __retval(-EFAULT) int test_strcspn_pagefault1(void *ctx) { return
SEC("syscall") __retval(-EFAULT) int test_strcspn_pagefault2(void *ctx) { return bpf_strcspn("hello", invalid_kern_ptr); }
SEC("syscall") __retval(-EFAULT) int test_strstr_pagefault1(void *ctx) { return bpf_strstr(invalid_kern_ptr, "hello"); }
SEC("syscall") __retval(-EFAULT) int test_strstr_pagefault2(void *ctx) { return bpf_strstr("hello", invalid_kern_ptr); }
+SEC("syscall") __retval(-EFAULT) int test_strcasestr_pagefault1(void *ctx) { return bpf_strcasestr(invalid_kern_ptr, "hello"); } /* invalid_kern_ptr as first argument */
+SEC("syscall") __retval(-EFAULT) int test_strcasestr_pagefault2(void *ctx) { return bpf_strcasestr("hello", invalid_kern_ptr); } /* invalid_kern_ptr as second argument */
SEC("syscall") __retval(-EFAULT) int test_strnstr_pagefault1(void *ctx) { return bpf_strnstr(invalid_kern_ptr, "hello", 1); }
SEC("syscall") __retval(-EFAULT) int test_strnstr_pagefault2(void *ctx) { return bpf_strnstr("hello", invalid_kern_ptr, 1); }
+SEC("syscall") __retval(-EFAULT) int test_strncasestr_pagefault1(void *ctx) { return bpf_strncasestr(invalid_kern_ptr, "hello", 1); } /* invalid_kern_ptr as first argument */
+SEC("syscall") __retval(-EFAULT) int test_strncasestr_pagefault2(void *ctx) { return bpf_strncasestr("hello", invalid_kern_ptr, 1); } /* invalid_kern_ptr as second argument */
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c
index e41cc5601994..05e1da1f250f 100644
--- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c
@@ -19,6 +19,8 @@ SEC("syscall") int test_strspn_accept_too_long(void *ctx) { return bpf_strspn("b
SEC("syscall") int test_strcspn_str_too_long(void *ctx) { return bpf_strcspn(long_str, "b"); }
SEC("syscall") int test_strcspn_reject_too_long(void *ctx) { return bpf_strcspn("b", long_str); }
SEC("syscall") int test_strstr_too_long(void *ctx) { return bpf_strstr(long_str, "hello"); }
+SEC("syscall") int test_strcasestr_too_long(void *ctx) { return bpf_strcasestr(long_str, "hello"); } /* long_str is defined outside this hunk; the expected result is checked elsewhere */
SEC("syscall") int test_strnstr_too_long(void *ctx) { return bpf_strnstr(long_str, "hello", sizeof(long_str)); }
+SEC("syscall") int test_strncasestr_too_long(void *ctx) { return bpf_strncasestr(long_str, "hello", sizeof(long_str)); } /* length limit is the full size of long_str */
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
index 2e3498e37b9c..a8513964516b 100644
--- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
@@ -33,8 +33,11 @@ __test(11) int test_strnlen(void *ctx) { return bpf_strnlen(str, 12); }
__test(5) int test_strspn(void *ctx) { return bpf_strspn(str, "ehlo"); }
__test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); }
__test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); }
+__test(6) int test_strcasestr_found(void *ctx) { return bpf_strcasestr(str, "woRLD"); } /* same expected value (6) as test_strstr_found, with a mixed-case needle */
__test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); }
+__test(-ENOENT) int test_strcasestr_notfound(void *ctx) { return bpf_strcasestr(str, "hi"); } /* same needle and expectation as test_strstr_notfound */
__test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); }
+__test(0) int test_strcasestr_empty(void *ctx) { return bpf_strcasestr(str, ""); } /* empty needle: expected result 0 */
__test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); }
__test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); }
__test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); }
@@ -42,5 +45,12 @@ __test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str,
__test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); }
__test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); }
__test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); }
+__test(0) int test_strncasestr_found1(void *ctx) { return bpf_strncasestr("", "", 0); } /* empty haystack, empty needle, length 0 */
+__test(0) int test_strncasestr_found2(void *ctx) { return bpf_strncasestr(str, "heLLO", 5); } /* length equals the 5-character needle */
+__test(0) int test_strncasestr_found3(void *ctx) { return bpf_strncasestr(str, "heLLO", 6); } /* length one more than the needle */
+__test(-ENOENT) int test_strncasestr_notfound1(void *ctx) { return bpf_strncasestr(str, "hi", 10); } /* needle expected to be absent within the first 10 bytes */
+__test(-ENOENT) int test_strncasestr_notfound2(void *ctx) { return bpf_strncasestr(str, "hello", 4); } /* length 4 is shorter than the 5-character needle */
+__test(-ENOENT) int test_strncasestr_notfound3(void *ctx) { return bpf_strncasestr("", "a", 0); } /* non-empty needle against an empty haystack */
+__test(0) int test_strncasestr_empty(void *ctx) { return bpf_strncasestr(str, "", 1); } /* empty needle: expected result 0 */
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h
index a5c74d31a244..6e1918deaf26 100644
--- a/tools/testing/selftests/bpf/progs/strobemeta.h
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -330,9 +330,9 @@ static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
}
bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
/* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
- return tls_ptr && tls_ptr != (void *)-1
- ? tls_ptr + tls_index.offset
- : NULL;
+ if (!tls_ptr || tls_ptr == (void *)-1)
+ return NULL;
+ return tls_ptr + tls_index.offset;
}
#ifdef SUBPROGS
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c
index a58b5194fc89..022291f21dfb 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c
@@ -8,8 +8,6 @@ char _license[] SEC("license") = "GPL";
#define USEC_PER_SEC 1000000UL
-#define min(a, b) ((a) < (b) ? (a) : (b))
-
static unsigned int tcp_left_out(const struct tcp_sock *tp)
{
return tp->sacked_out + tp->lost_out;
diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c
index 2ec1de11a3ae..7b6b2b342c1d 100644
--- a/tools/testing/selftests/bpf/progs/test_check_mtu.c
+++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c
@@ -7,6 +7,7 @@
#include <stddef.h>
#include <stdint.h>
+#include <errno.h>
char _license[] SEC("license") = "GPL";
@@ -288,3 +289,14 @@ int tc_input_len_exceed(struct __sk_buff *ctx)
global_bpf_mtu_xdp = mtu_len;
return retval;
}
+
+SEC("tc")
+int tc_chk_segs_flag(struct __sk_buff *ctx) /* BPF_OK only if bpf_check_mtu() answers -EINVAL when given BPF_MTU_CHK_SEGS */
+{
+ __u32 mtu_len = 0;
+ int err;
+
+ err = bpf_check_mtu(ctx, GLOBAL_USER_IFINDEX, &mtu_len, 0, BPF_MTU_CHK_SEGS);
+
+ return err == -EINVAL ? BPF_OK : BPF_DROP; /* every other return value, including 0, yields BPF_DROP */
+}
diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c
index a1ccc831c882..05ac9410cd68 100644
--- a/tools/testing/selftests/bpf/progs/test_perf_branches.c
+++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c
@@ -8,6 +8,7 @@
#include <bpf/bpf_tracing.h>
int valid = 0;
+int run_cnt = 0;
int required_size_out = 0;
int written_stack_out = 0;
int written_global_out = 0;
@@ -24,6 +25,8 @@ int perf_branches(void *ctx)
__u64 entries[4 * 3] = {0};
int required_size, written_stack, written_global;
+ ++run_cnt;
+
/* write to stack */
written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0);
/* ignore spurious events */
diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c b/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c
new file mode 100644
index 000000000000..ff4aa67ddacc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct { /* ring buffer map created with the BPF_F_RB_OVERWRITE flag */
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(map_flags, BPF_F_RB_OVERWRITE);
+} ringbuf SEC(".maps"); /* no max_entries given here; presumably set by the loader before load -- confirm */
+
+int pid; /* test_overwrite_ringbuf() does nothing unless the caller's id (upper 32 bits) equals this */
+
+const volatile unsigned long LEN1; /* LEN1..LEN5: sizes passed to the first..fifth bpf_ringbuf_reserve() call */
+const volatile unsigned long LEN2;
+const volatile unsigned long LEN3;
+const volatile unsigned long LEN4;
+const volatile unsigned long LEN5;
+
+long reserve1_fail = 0; /* reserveN_fail is set to 1 when the N-th reserve returns NULL */
+long reserve2_fail = 0;
+long reserve3_fail = 0; /* the program's comment marks this one as the expected failure */
+long reserve4_fail = 0;
+long reserve5_fail = 0;
+
+unsigned long avail_data = 0; /* this and the four below receive bpf_ringbuf_query() results at the end of a full run */
+unsigned long ring_size = 0;
+unsigned long cons_pos = 0;
+unsigned long prod_pos = 0;
+unsigned long over_pos = 0;
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int test_overwrite_ringbuf(void *ctx) /* fixed reserve/submit sequence on the overwrite-mode ring buffer; outcomes are published through the reserveN_fail and query-result globals */
+{
+ char *rec1, *rec2, *rec3, *rec4, *rec5;
+ int cur_pid = bpf_get_current_pid_tgid() >> 32;
+
+ if (cur_pid != pid) /* only the process identified by the global pid drives the sequence */
+ return 0;
+
+ rec1 = bpf_ringbuf_reserve(&ringbuf, LEN1, 0);
+ if (!rec1) {
+ reserve1_fail = 1;
+ return 0;
+ }
+
+ rec2 = bpf_ringbuf_reserve(&ringbuf, LEN2, 0);
+ if (!rec2) {
+ bpf_ringbuf_discard(rec1, 0);
+ reserve2_fail = 1;
+ return 0;
+ }
+
+ rec3 = bpf_ringbuf_reserve(&ringbuf, LEN3, 0); /* attempted while rec1 and rec2 are still reserved and unsubmitted */
+ /* expect failure */
+ if (!rec3) {
+ reserve3_fail = 1;
+ } else {
+ bpf_ringbuf_discard(rec1, 0); /* unexpected success: release all three records and stop */
+ bpf_ringbuf_discard(rec2, 0);
+ bpf_ringbuf_discard(rec3, 0);
+ return 0;
+ }
+
+ rec4 = bpf_ringbuf_reserve(&ringbuf, LEN4, 0);
+ if (!rec4) {
+ reserve4_fail = 1;
+ bpf_ringbuf_discard(rec1, 0);
+ bpf_ringbuf_discard(rec2, 0);
+ return 0;
+ }
+
+ bpf_ringbuf_submit(rec1, 0);
+ bpf_ringbuf_submit(rec2, 0);
+ bpf_ringbuf_submit(rec4, 0);
+
+ rec5 = bpf_ringbuf_reserve(&ringbuf, LEN5, 0); /* made only after rec1, rec2 and rec4 were submitted */
+ if (!rec5) {
+ reserve5_fail = 1;
+ return 0;
+ }
+
+ for (int i = 0; i < LEN5; i++) /* fill the whole record: rec5 was reserved with LEN5, so bound the loop by LEN5 rather than LEN3 */
+ rec5[i] = 0xdd;
+
+ bpf_ringbuf_submit(rec5, 0);
+
+ ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); /* snapshot of the buffer state for the caller to inspect */
+ avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA);
+ cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS);
+ prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS);
+ over_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_OVERWRITE_POS);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_tc_edt.c b/tools/testing/selftests/bpf/progs/test_tc_edt.c
index 950a70b61e74..4f6f03122d61 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_edt.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_edt.c
@@ -14,7 +14,6 @@
#define TIME_HORIZON_NS (2000 * 1000 * 1000)
#define NS_PER_SEC 1000000000
#define ECN_HORIZON_NS 5000000
-#define THROTTLE_RATE_BPS (5 * 1000 * 1000)
/* flow_key => last_tstamp timestamp used */
struct {
@@ -24,12 +23,13 @@ struct {
__uint(max_entries, 1);
} flow_map SEC(".maps");
+__uint64_t target_rate;
+
static inline int throttle_flow(struct __sk_buff *skb)
{
int key = 0;
uint64_t *last_tstamp = bpf_map_lookup_elem(&flow_map, &key);
- uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC /
- THROTTLE_RATE_BPS;
+ uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC / target_rate;
uint64_t now = bpf_ktime_get_ns();
uint64_t tstamp, next_tstamp = 0;
@@ -70,7 +70,7 @@ static inline int handle_tcp(struct __sk_buff *skb, struct tcphdr *tcp)
if ((void *)(tcp + 1) > data_end)
return TC_ACT_SHOT;
- if (tcp->dest == bpf_htons(9000))
+ if (tcp->source == bpf_htons(9000))
return throttle_flow(skb);
return TC_ACT_OK;
@@ -99,7 +99,8 @@ static inline int handle_ipv4(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("cls_test") int tc_prog(struct __sk_buff *skb)
+SEC("tc")
+int tc_prog(struct __sk_buff *skb)
{
if (skb->protocol == bpf_htons(ETH_P_IP))
return handle_ipv4(skb);
diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
index 404124a93892..7330c61b5730 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -2,23 +2,11 @@
/* In-place tunneling */
-#include <stdbool.h>
-#include <string.h>
-
-#include <linux/stddef.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/mpls.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/pkt_cls.h>
-#include <linux/types.h>
+#include <vmlinux.h>
-#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "bpf_tracing_net.h"
#include "bpf_compiler.h"
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
@@ -27,6 +15,14 @@ static const int cfg_port = 8000;
static const int cfg_udp_src = 20000;
+#define ETH_P_MPLS_UC 0x8847
+#define ETH_P_TEB 0x6558
+
+#define MPLS_LS_S_MASK 0x00000100
+#define BPF_F_ADJ_ROOM_ENCAP_L2(len) /* (len) is parenthesized so an expression argument is cast and masked as a whole */ \
+ (((__u64)(len) & BPF_ADJ_ROOM_ENCAP_L2_MASK) \
+ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
+
#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN)
#define UDP_PORT 5555
@@ -36,10 +32,9 @@ static const int cfg_udp_src = 20000;
#define EXTPROTO_VXLAN 0x1
-#define VXLAN_N_VID (1u << 24)
-#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8)
-#define VXLAN_FLAGS 0x8
-#define VXLAN_VNI 1
+#define VXLAN_FLAGS bpf_htonl(1<<27)
+#define VNI_ID 1
+#define VXLAN_VNI bpf_htonl(VNI_ID << 8)
#ifndef NEXTHDR_DEST
#define NEXTHDR_DEST 60
@@ -48,12 +43,6 @@ static const int cfg_udp_src = 20000;
/* MPLS label 1000 with S bit (last label) set and ttl of 255. */
static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
MPLS_LS_S_MASK | 0xff);
-
-struct vxlanhdr {
- __be32 vx_flags;
- __be32 vx_vni;
-} __attribute__((packed));
-
struct gre_hdr {
__be16 flags;
__be16 protocol;
@@ -94,8 +83,8 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
__u16 l2_proto, __u16 ext_proto)
{
+ struct iphdr iph_inner = {0};
__u16 udp_dst = UDP_PORT;
- struct iphdr iph_inner;
struct v4hdr h_outer;
struct tcphdr tcph;
int olen, l2_len;
@@ -122,7 +111,6 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
return TC_ACT_OK;
/* Derive the IPv4 header fields from the IPv6 header */
- memset(&iph_inner, 0, sizeof(iph_inner));
iph_inner.version = 4;
iph_inner.ihl = 5;
iph_inner.tot_len = bpf_htons(sizeof(iph6_inner) +
@@ -210,7 +198,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
vxlan_hdr->vx_flags = VXLAN_FLAGS;
- vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+ vxlan_hdr->vx_vni = VXLAN_VNI;
l2_hdr += sizeof(struct vxlanhdr);
}
@@ -340,7 +328,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
vxlan_hdr->vx_flags = VXLAN_FLAGS;
- vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+ vxlan_hdr->vx_vni = VXLAN_VNI;
l2_hdr += sizeof(struct vxlanhdr);
}
@@ -372,8 +360,8 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
static int encap_ipv6_ipip6(struct __sk_buff *skb)
{
+ struct v6hdr h_outer = {0};
struct iphdr iph_inner;
- struct v6hdr h_outer;
struct tcphdr tcph;
struct ethhdr eth;
__u64 flags;
@@ -400,13 +388,12 @@ static int encap_ipv6_ipip6(struct __sk_buff *skb)
return TC_ACT_SHOT;
/* prepare new outer network header */
- memset(&h_outer.ip, 0, sizeof(h_outer.ip));
h_outer.ip.version = 6;
h_outer.ip.hop_limit = iph_inner.ttl;
- h_outer.ip.saddr.s6_addr[1] = 0xfd;
- h_outer.ip.saddr.s6_addr[15] = 1;
- h_outer.ip.daddr.s6_addr[1] = 0xfd;
- h_outer.ip.daddr.s6_addr[15] = 2;
+ h_outer.ip.saddr.in6_u.u6_addr8[1] = 0xfd;
+ h_outer.ip.saddr.in6_u.u6_addr8[15] = 1;
+ h_outer.ip.daddr.in6_u.u6_addr8[1] = 0xfd;
+ h_outer.ip.daddr.in6_u.u6_addr8[15] = 2;
h_outer.ip.payload_len = iph_inner.tot_len;
h_outer.ip.nexthdr = IPPROTO_IPIP;
@@ -431,7 +418,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
return __encap_ipv6(skb, encap_proto, l2_proto, 0);
}
-SEC("encap_ipip_none")
+SEC("tc")
int __encap_ipip_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -440,7 +427,7 @@ int __encap_ipip_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_gre_none")
+SEC("tc")
int __encap_gre_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -449,7 +436,7 @@ int __encap_gre_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_gre_mpls")
+SEC("tc")
int __encap_gre_mpls(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -458,7 +445,7 @@ int __encap_gre_mpls(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_gre_eth")
+SEC("tc")
int __encap_gre_eth(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -467,7 +454,7 @@ int __encap_gre_eth(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_udp_none")
+SEC("tc")
int __encap_udp_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -476,7 +463,7 @@ int __encap_udp_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_udp_mpls")
+SEC("tc")
int __encap_udp_mpls(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -485,7 +472,7 @@ int __encap_udp_mpls(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_udp_eth")
+SEC("tc")
int __encap_udp_eth(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -494,7 +481,7 @@ int __encap_udp_eth(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_vxlan_eth")
+SEC("tc")
int __encap_vxlan_eth(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -505,7 +492,7 @@ int __encap_vxlan_eth(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_sit_none")
+SEC("tc")
int __encap_sit_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -514,7 +501,7 @@ int __encap_sit_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6tnl_none")
+SEC("tc")
int __encap_ip6tnl_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -523,7 +510,7 @@ int __encap_ip6tnl_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ipip6_none")
+SEC("tc")
int __encap_ipip6_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
@@ -532,7 +519,7 @@ int __encap_ipip6_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6gre_none")
+SEC("tc")
int __encap_ip6gre_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -541,7 +528,7 @@ int __encap_ip6gre_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6gre_mpls")
+SEC("tc")
int __encap_ip6gre_mpls(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -550,7 +537,7 @@ int __encap_ip6gre_mpls(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6gre_eth")
+SEC("tc")
int __encap_ip6gre_eth(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -559,7 +546,7 @@ int __encap_ip6gre_eth(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6udp_none")
+SEC("tc")
int __encap_ip6udp_none(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -568,7 +555,7 @@ int __encap_ip6udp_none(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6udp_mpls")
+SEC("tc")
int __encap_ip6udp_mpls(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -577,7 +564,7 @@ int __encap_ip6udp_mpls(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6udp_eth")
+SEC("tc")
int __encap_ip6udp_eth(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -586,7 +573,7 @@ int __encap_ip6udp_eth(struct __sk_buff *skb)
return TC_ACT_OK;
}
-SEC("encap_ip6vxlan_eth")
+SEC("tc")
int __encap_ip6vxlan_eth(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
@@ -693,7 +680,7 @@ static int decap_ipv6(struct __sk_buff *skb)
iph_outer.nexthdr);
}
-SEC("decap")
+SEC("tc")
int decap_f(struct __sk_buff *skb)
{
switch (skb->protocol) {
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 3d5f30c29ae3..2898b3749d07 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -42,12 +42,14 @@ int bench_trigger_uprobe_multi(void *ctx)
const volatile int batch_iters = 0;
SEC("?raw_tp")
-int trigger_count(void *ctx)
+int trigger_kernel_count(void *ctx)
{
int i;
- for (i = 0; i < batch_iters; i++)
+ for (i = 0; i < batch_iters; i++) {
inc_counter();
+ bpf_get_numa_node_id();
+ }
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
new file mode 100644
index 000000000000..7efa9521105e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Timer tests */
+
+struct timer_elem {
+ struct bpf_timer t;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct timer_elem);
+} timer_map SEC(".maps");
+
+static int timer_cb(void *map, int *key, struct bpf_timer *timer)
+{
+ u32 data;
+ /* Timer callbacks are never sleepable, even when set from a sleepable program, so this call must be rejected */
+ bpf_copy_from_user(&data, sizeof(data), NULL);
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+__failure __msg("helper call might sleep in a non-sleepable prog")
+int timer_non_sleepable_prog(void *ctx)
+{
+ struct timer_elem *val;
+ int key = 0;
+
+ val = bpf_map_lookup_elem(&timer_map, &key);
+ if (!val)
+ return 0;
+
+ bpf_timer_init(&val->t, &timer_map, 0);
+ bpf_timer_set_callback(&val->t, timer_cb);
+ return 0;
+}
+
+SEC("lsm.s/file_open")
+__failure __msg("helper call might sleep in a non-sleepable prog")
+int timer_sleepable_prog(void *ctx)
+{
+ struct timer_elem *val;
+ int key = 0;
+
+ val = bpf_map_lookup_elem(&timer_map, &key);
+ if (!val)
+ return 0;
+
+ bpf_timer_init(&val->t, &timer_map, 0);
+ bpf_timer_set_callback(&val->t, timer_cb);
+ return 0;
+}
+
+/* Workqueue tests */
+
+struct wq_elem {
+ struct bpf_wq w;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct wq_elem);
+} wq_map SEC(".maps");
+
+static int wq_cb(void *map, int *key, void *value)
+{
+ u32 data;
+ /* Workqueue callbacks are always sleepable, even from non-sleepable programs, so this call must be accepted */
+ bpf_copy_from_user(&data, sizeof(data), NULL);
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+__success
+int wq_non_sleepable_prog(void *ctx)
+{
+ struct wq_elem *val;
+ int key = 0;
+
+ val = bpf_map_lookup_elem(&wq_map, &key);
+ if (!val)
+ return 0;
+
+ if (bpf_wq_init(&val->w, &wq_map, 0) != 0)
+ return 0;
+ if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0)
+ return 0;
+ return 0;
+}
+
+SEC("lsm.s/file_open")
+__success
+int wq_sleepable_prog(void *ctx)
+{
+ struct wq_elem *val;
+ int key = 0;
+
+ val = bpf_map_lookup_elem(&wq_map, &key);
+ if (!val)
+ return 0;
+
+ if (bpf_wq_init(&val->w, &wq_map, 0) != 0)
+ return 0;
+ if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0)
+ return 0;
+ return 0;
+}
+
+/* Task work tests */
+
+struct task_work_elem {
+ struct bpf_task_work tw;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct task_work_elem);
+} task_work_map SEC(".maps");
+
+static int task_work_cb(struct bpf_map *map, void *key, void *value)
+{
+ u32 data;
+ /* Task work callbacks are always sleepable, even from non-sleepable programs, so this call must be accepted */
+ bpf_copy_from_user(&data, sizeof(data), NULL);
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+__success
+int task_work_non_sleepable_prog(void *ctx)
+{
+ struct task_work_elem *val;
+ struct task_struct *task;
+ int key = 0;
+
+ val = bpf_map_lookup_elem(&task_work_map, &key);
+ if (!val)
+ return 0;
+
+ task = bpf_get_current_task_btf();
+ if (!task)
+ return 0;
+
+ bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL);
+ return 0;
+}
+
+SEC("lsm.s/file_open")
+__success
+int task_work_sleepable_prog(void *ctx)
+{
+ struct task_work_elem *val;
+ struct task_struct *task;
+ int key = 0;
+
+ val = bpf_map_lookup_elem(&task_work_map, &key);
+ if (!val)
+ return 0;
+
+ task = bpf_get_current_task_btf();
+ if (!task)
+ return 0;
+
+ bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index 0a72e0228ea9..411a18437d7e 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -1709,4 +1709,158 @@ __naked void jeq_disagreeing_tnums(void *ctx)
: __clobber_all);
}
+SEC("socket")
+__description("conditional jump on same register, branch taken")
+__not_msg("20: (b7) r0 = 1 {{.*}} R0=1")
+__success __log_level(2)
+__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void condition_jump_on_same_register(void *ctx)
+{
+ asm volatile(" \
+ call %[bpf_get_prandom_u32]; \
+ w8 = 0x80000000; \
+ r0 &= r8; \
+ if r0 == r0 goto +1; \
+ goto l1_%=; \
+ if r0 >= r0 goto +1; \
+ goto l1_%=; \
+ if r0 s>= r0 goto +1; \
+ goto l1_%=; \
+ if r0 <= r0 goto +1; \
+ goto l1_%=; \
+ if r0 s<= r0 goto +1; \
+ goto l1_%=; \
+ if r0 != r0 goto l1_%=; \
+ if r0 > r0 goto l1_%=; \
+ if r0 s> r0 goto l1_%=; \
+ if r0 < r0 goto l1_%=; \
+ if r0 s< r0 goto l1_%=; \
+l0_%=: r0 = 0; \
+ exit; \
+l1_%=: r0 = 1; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_all);
+}
+
+SEC("socket")
+__description("jset on same register, constant value branch taken")
+__not_msg("7: (b7) r0 = 1 {{.*}} R0=1")
+__success __log_level(2)
+__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void jset_on_same_register_1(void *ctx)
+{
+ asm volatile(" \
+ r0 = 0; \
+ if r0 & r0 goto l1_%=; \
+ r0 = 1; \
+ if r0 & r0 goto +1; \
+ goto l1_%=; \
+l0_%=: r0 = 0; \
+ exit; \
+l1_%=: r0 = 1; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_all);
+}
+
+SEC("socket")
+__description("jset on same register, scalar value branch taken")
+__not_msg("12: (b7) r0 = 1 {{.*}} R0=1")
+__success __log_level(2)
+__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void jset_on_same_register_2(void *ctx)
+{
+ asm volatile(" \
+ /* range [1;2] */ \
+ call %[bpf_get_prandom_u32]; \
+ r0 &= 0x1; \
+ r0 += 1; \
+ if r0 & r0 goto +1; \
+ goto l1_%=; \
+ /* range [-2;-1] */ \
+ call %[bpf_get_prandom_u32]; \
+ r0 &= 0x1; \
+ r0 -= 2; \
+ if r0 & r0 goto +1; \
+ goto l1_%=; \
+l0_%=: r0 = 0; \
+ exit; \
+l1_%=: r0 = 1; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_all);
+}
+
+SEC("socket")
+__description("jset on same register, scalar value unknown branch 1")
+__msg("3: (b7) r0 = 0 {{.*}} R0=0")
+__msg("5: (b7) r0 = 1 {{.*}} R0=1")
+__success __log_level(2)
+__flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void jset_on_same_register_3(void *ctx)
+{
+ asm volatile(" \
+ /* range [0;1] */ \
+ call %[bpf_get_prandom_u32]; \
+ r0 &= 0x1; \
+ if r0 & r0 goto l1_%=; \
+l0_%=: r0 = 0; \
+ exit; \
+l1_%=: r0 = 1; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_all);
+}
+
+SEC("socket")
+__description("jset on same register, scalar value unknown branch 2")
+__msg("4: (b7) r0 = 0 {{.*}} R0=0")
+__msg("6: (b7) r0 = 1 {{.*}} R0=1")
+__success __log_level(2)
+__flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void jset_on_same_register_4(void *ctx)
+{
+ asm volatile(" \
+ /* range [-1;0] */ \
+ call %[bpf_get_prandom_u32]; \
+ r0 &= 0x1; \
+ r0 -= 1; \
+ if r0 & r0 goto l1_%=; \
+l0_%=: r0 = 0; \
+ exit; \
+l1_%=: r0 = 1; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_all);
+}
+
+SEC("socket")
+__description("jset on same register, scalar value unknown branch 3")
+__msg("4: (b7) r0 = 0 {{.*}} R0=0")
+__msg("6: (b7) r0 = 1 {{.*}} R0=1")
+__success __log_level(2)
+__flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void jset_on_same_register_5(void *ctx)
+{
+ asm volatile(" \
+ /* range [-1;1] */ \
+ call %[bpf_get_prandom_u32]; \
+ r0 &= 0x2; \
+ r0 -= 1; \
+ if r0 & r0 goto l1_%=; \
+l0_%=: r0 = 0; \
+ exit; \
+l1_%=: r0 = 1; \
+ exit; \
+" :
+ : __imm(bpf_get_prandom_u32)
+ : __clobber_all);
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c
index 28b602ac9cbe..911caa8fd1b7 100644
--- a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c
+++ b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Converted from tools/testing/selftests/bpf/verifier/direct_packet_access.c */
+#include <linux/if_ether.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
@@ -800,4 +801,62 @@ l0_%=: /* exit(0) */ \
: __clobber_all);
}
+#define access_test_non_linear(name, type, desc, retval, linear_sz, off) \
+ SEC(type) \
+ __description("direct packet access: " #name " (non-linear, " type ", " desc ")") \
+ __success __retval(retval) \
+ __linear_size(linear_sz) \
+ __naked void access_non_linear_##name(void) \
+ { \
+ asm volatile (" \
+ r2 = *(u32*)(r1 + %[skb_data]); \
+ r3 = *(u32*)(r1 + %[skb_data_end]); \
+ r0 = r2; \
+ r0 += %[offset]; \
+ if r0 > r3 goto l0_%=; \
+ r0 = *(u8*)(r0 - 1); \
+ r0 = 0; \
+ exit; \
+ l0_%=: r0 = 1; \
+ exit; \
+ " : \
+ : __imm_const(skb_data, offsetof(struct __sk_buff, data)), \
+ __imm_const(skb_data_end, offsetof(struct __sk_buff, data_end)), \
+ __imm_const(offset, off) \
+ : __clobber_all); \
+ }
+
+access_test_non_linear(test31, "tc", "too short eth", 1, ETH_HLEN, 22);
+access_test_non_linear(test32, "tc", "too short 1", 1, 1, 22);
+access_test_non_linear(test33, "tc", "long enough", 0, 22, 22);
+access_test_non_linear(test34, "cgroup_skb/ingress", "too short eth", 1, ETH_HLEN, 8);
+access_test_non_linear(test35, "cgroup_skb/ingress", "too short 1", 1, 1, 8);
+access_test_non_linear(test36, "cgroup_skb/ingress", "long enough", 0, 22, 8);
+
+SEC("tc")
+__description("direct packet access: test37 (non-linear, linearized)")
+__success __retval(0)
+__linear_size(ETH_HLEN)
+__naked void access_non_linear_linearized(void)
+{
+ asm volatile (" \
+ r6 = r1; \
+ r2 = 22; \
+ call %[bpf_skb_pull_data]; \
+ r2 = *(u32*)(r6 + %[skb_data]); \
+ r3 = *(u32*)(r6 + %[skb_data_end]); \
+ r0 = r2; \
+ r0 += 22; \
+ if r0 > r3 goto l0_%=; \
+ r0 = *(u8*)(r0 - 1); \
+ exit; \
+l0_%=: r0 = 1; \
+ exit; \
+" :
+ : __imm(bpf_skb_pull_data),
+ __imm_const(skb_data, offsetof(struct __sk_buff, data)),
+ __imm_const(skb_data_end, offsetof(struct __sk_buff, data_end))
+ : __clobber_all);
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_gotox.c b/tools/testing/selftests/bpf/progs/verifier_gotox.c
new file mode 100644
index 000000000000..607dad058ca1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_gotox.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Isovalent */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "../../../include/linux/filter.h"
+
+#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)
+
+#define DEFINE_SIMPLE_JUMP_TABLE_PROG(NAME, SRC_REG, OFF, IMM, OUTCOME) \
+ \
+ SEC("socket") \
+ OUTCOME \
+ __naked void jump_table_ ## NAME(void) \
+ { \
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+ jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 8; \
+ r0 = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+ " : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, (SRC_REG), (OFF) , (IMM))) \
+ : __clobber_all); \
+ }
+
+/*
+ * Only the first program leaves the reserved fields (src_reg, off, imm) zeroed,
+ * so it loads and runs; each of the others sets one of them and must fail to load.
+ */
+DEFINE_SIMPLE_JUMP_TABLE_PROG(ok, BPF_REG_0, 0, 0, __success __retval(1))
+DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_src_reg, BPF_REG_1, 0, 0, __failure __msg("BPF_JA|BPF_X uses reserved fields"))
+DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_non_zero_off, BPF_REG_0, 1, 0, __failure __msg("BPF_JA|BPF_X uses reserved fields"))
+DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_non_zero_imm, BPF_REG_0, 0, 1, __failure __msg("BPF_JA|BPF_X uses reserved fields"))
+
+/*
+ * Gotox is forbidden when there is no jump table loaded
+ * which points to the sub-function where the gotox is used
+ */
+SEC("socket")
+__failure __msg("no jump tables found for subprog starting at 0")
+__naked void jump_table_no_jump_table(void)
+{
+ asm volatile (" \
+ .8byte %[gotox_r0]; \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+/*
+ * The gotox target register (r1) holds a scalar; only PTR_TO_INSN is allowed.
+ */
+SEC("socket")
+__failure __msg("R1 has type scalar, expected PTR_TO_INSN")
+__naked void jump_table_incorrect_dst_reg_type(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 8; \
+ r0 = *(u64 *)(r0 + 0); \
+ r1 = 42; \
+ .8byte %[gotox_r1]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r1, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0 , 0))
+ : __clobber_all);
+}
+
+#define DEFINE_INVALID_SIZE_PROG(READ_SIZE, OUTCOME) \
+ \
+ SEC("socket") \
+ OUTCOME \
+ __naked void jump_table_invalid_read_size_ ## READ_SIZE(void) \
+ { \
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+ jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 8; \
+ r0 = *(" #READ_SIZE " *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+ " : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) \
+ : __clobber_all); \
+ }
+
+DEFINE_INVALID_SIZE_PROG(u32, __failure __msg("Invalid read of 4 bytes from insn_array"))
+DEFINE_INVALID_SIZE_PROG(u16, __failure __msg("Invalid read of 2 bytes from insn_array"))
+DEFINE_INVALID_SIZE_PROG(u8, __failure __msg("Invalid read of 1 bytes from insn_array"))
+
+SEC("socket")
+__failure __msg("misaligned value access off 0+1+0 size 8")
+__naked void jump_table_misaligned_access(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 1; \
+ r0 = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+SEC("socket")
+__failure __msg("invalid access to map value, value_size=16 off=24 size=8")
+__naked void jump_table_invalid_mem_acceess_pos(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 24; \
+ r0 = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+SEC("socket")
+__failure __msg("invalid access to map value, value_size=16 off=-24 size=8")
+__naked void jump_table_invalid_mem_acceess_neg(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 -= 24; \
+ r0 = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+SEC("socket")
+__success __retval(1)
+__naked void jump_table_add_sub_ok(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 -= 24; \
+ r0 += 32; \
+ r0 = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+SEC("socket")
+__failure __msg("write into map forbidden, value_size=16 off=8 size=8")
+__naked void jump_table_no_writes(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 8; \
+ r1 = 0xbeef; \
+ *(u64 *)(r0 + 0) = r1; \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+#define DEFINE_JUMP_TABLE_USE_REG(REG) \
+ SEC("socket") \
+ __success __retval(1) \
+ __naked void jump_table_use_reg_r ## REG(void) \
+ { \
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+ jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 16; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 8; \
+ r" #REG " = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_rX]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+ " : \
+ : __imm_insn(gotox_rX, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_ ## REG, 0, 0 , 0)) \
+ : __clobber_all); \
+ }
+
+DEFINE_JUMP_TABLE_USE_REG(0)
+DEFINE_JUMP_TABLE_USE_REG(1)
+DEFINE_JUMP_TABLE_USE_REG(2)
+DEFINE_JUMP_TABLE_USE_REG(3)
+DEFINE_JUMP_TABLE_USE_REG(4)
+DEFINE_JUMP_TABLE_USE_REG(5)
+DEFINE_JUMP_TABLE_USE_REG(6)
+DEFINE_JUMP_TABLE_USE_REG(7)
+DEFINE_JUMP_TABLE_USE_REG(8)
+DEFINE_JUMP_TABLE_USE_REG(9)
+
+__used static int test_subprog(void)
+{
+ return 0;
+}
+
+SEC("socket")
+__failure __msg("jump table for insn 4 points outside of the subprog [0,10]")
+__naked void jump_table_outside_subprog(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .quad ret_out_%= - socket; \
+ .size jt0_%=, 24; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 8; \
+ r0 = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ call test_subprog; \
+ exit; \
+ ret_out_%=: \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+SEC("socket")
+__success __retval(1)
+__naked void jump_table_contains_non_unique_values(void)
+{
+ asm volatile (" \
+ .pushsection .jumptables,\"\",@progbits; \
+jt0_%=: \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .quad ret0_%= - socket; \
+ .quad ret1_%= - socket; \
+ .size jt0_%=, 80; \
+ .global jt0_%=; \
+ .popsection; \
+ \
+ r0 = jt0_%= ll; \
+ r0 += 8; \
+ r0 = *(u64 *)(r0 + 0); \
+ .8byte %[gotox_r0]; \
+ ret0_%=: \
+ r0 = 0; \
+ exit; \
+ ret1_%=: \
+ r0 = 1; \
+ exit; \
+" : \
+ : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0))
+ : __clobber_all);
+}
+
+#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c
index c0e808509268..2de105057bbc 100644
--- a/tools/testing/selftests/bpf/progs/verifier_live_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c
@@ -292,3 +292,53 @@ __naked void syzbot_postorder_bug1(void)
"exit;"
::: __clobber_all);
}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} map_array SEC(".maps");
+
+SEC("socket")
+__failure __msg("invalid read from stack R2 off=-1024 size=8")
+__flag(BPF_F_TEST_STATE_FREQ)
+__naked unsigned long caller_stack_write_tail_call(void)
+{
+ asm volatile (
+ "r6 = r1;"
+ "*(u64 *)(r10 - 8) = -8;"
+ "call %[bpf_get_prandom_u32];"
+ "if r0 != 42 goto 1f;"
+ "goto 2f;"
+ "1:"
+ "*(u64 *)(r10 - 8) = -1024;"
+ "2:"
+ "r1 = r6;"
+ "r2 = r10;"
+ "r2 += -8;"
+ "call write_tail_call;"
+ "r1 = *(u64 *)(r10 - 8);"
+ "r2 = r10;"
+ "r2 += r1;"
+ "r0 = *(u64 *)(r2 + 0);"
+ "exit;"
+ :: __imm(bpf_get_prandom_u32)
+ : __clobber_all);
+}
+
+static __used __naked unsigned long write_tail_call(void)
+{
+ asm volatile (
+ "r6 = r2;"
+ "r2 = %[map_array] ll;"
+ "r3 = 0;"
+ "call %[bpf_tail_call];"
+ "*(u64 *)(r6 + 0) = -16;"
+ "r0 = 0;"
+ "exit;"
+ :
+ : __imm(bpf_tail_call),
+ __imm_addr(map_array)
+ : __clobber_all);
+}
diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c
index 32e5e779cb96..6af9100a37ff 100644
--- a/tools/testing/selftests/bpf/progs/verifier_lsm.c
+++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c
@@ -4,7 +4,7 @@
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
-SEC("lsm/file_alloc_security")
+SEC("lsm/file_permission")
__description("lsm bpf prog with -4095~0 retval. test 1")
__success
__naked int errno_zero_retval_test1(void *ctx)
@@ -15,7 +15,7 @@ __naked int errno_zero_retval_test1(void *ctx)
::: __clobber_all);
}
-SEC("lsm/file_alloc_security")
+SEC("lsm/file_permission")
__description("lsm bpf prog with -4095~0 retval. test 2")
__success
__naked int errno_zero_retval_test2(void *ctx)
diff --git a/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c b/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c
index ab9f9f2620ed..e2cbc5bda65e 100644
--- a/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c
+++ b/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c
@@ -79,11 +79,6 @@ int with_invalid_ctx_access_test5(struct bpf_nf_ctx *ctx)
return NF_ACCEPT;
}
-extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
- struct bpf_dynptr *ptr__uninit) __ksym;
-extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, uint32_t offset,
- void *buffer, uint32_t buffer__sz) __ksym;
-
SEC("netfilter")
__description("netfilter test prog with skb and state read access")
__success __failure_unpriv
diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c
index 2b4610b53382..a2132c72d3b8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_sock.c
+++ b/tools/testing/selftests/bpf/progs/verifier_sock.c
@@ -1117,10 +1117,17 @@ int tail_call(struct __sk_buff *sk)
return 0;
}
-/* Tail calls invalidate packet pointers. */
+static __noinline /* static sub-program whose body is a tail call */
+int static_tail_call(struct __sk_buff *sk)
+{
+ bpf_tail_call_static(sk, &jmp_table, 0); /* slot 0 of jmp_table */
+ return 0;
+}
+
+/* Tail calls in sub-programs invalidate packet pointers. */
SEC("tc")
__failure __msg("invalid mem access")
-int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk)
+int invalidate_pkt_pointers_by_global_tail_call(struct __sk_buff *sk)
{
int *p = (void *)(long)sk->data;
@@ -1131,4 +1138,32 @@ int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk)
return TCX_PASS;
}
+/* Tail calls in static sub-programs invalidate packet pointers. */
+SEC("tc")
+__failure __msg("invalid mem access")
+int invalidate_pkt_pointers_by_static_tail_call(struct __sk_buff *sk)
+{
+ int *p = (void *)(long)sk->data; /* packet pointer taken before the call */
+
+ if ((void *)(p + 1) > (void *)(long)sk->data_end) /* bounds check for one int */
+ return TCX_DROP;
+ static_tail_call(sk); /* static sub-program that performs the tail call */
+ *p = 42; /* this is unsafe: p is reused after the call, "invalid mem access" expected */
+ return TCX_PASS;
+}
+
+/* Direct tail calls do not invalidate packet pointers. */
+SEC("tc")
+__success /* unlike the sub-program variants, this program is expected to load */
+int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk)
+{
+ int *p = (void *)(long)sk->data; /* packet pointer taken before the tail call */
+
+ if ((void *)(p + 1) > (void *)(long)sk->data_end) /* bounds check for one int */
+ return TCX_DROP;
+ bpf_tail_call_static(sk, &jmp_table, 0); /* tail call issued directly from the main program */
+ *p = 42; /* this is NOT unsafe: tail calls don't return */
+ return TCX_PASS;
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index ac3e418c2a96..61886ed554de 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -793,4 +793,57 @@ __naked int stack_slot_aliases_precision(void)
);
}
+struct { /* program array that identity_tail_call passes to bpf_tail_call */
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 1); /* a single slot; the test only uses index 0 */
+ __type(key, __u32);
+ __type(value, __u32);
+} map_array SEC(".maps");
+
+__naked __noinline __used
+static unsigned long identity_tail_call(void)
+{
+ /* the simplest identity function involving a tail call */
+ asm volatile (
+ "r6 = r2;" /* remember the value received in r2 */
+ "r2 = %[map_array] ll;"
+ "r3 = 0;" /* index 0 in map_array */
+ "call %[bpf_tail_call];"
+ "r0 = r6;" /* return the remembered value */
+ "exit;"
+ :
+ : __imm(bpf_tail_call),
+ __imm_addr(map_array)
+ : __clobber_all);
+}
+
+SEC("?raw_tp") /* precision test: result of a tail-calling subprog used as a map value offset */
+__failure __log_level(2)
+__msg("13: (85) call bpf_tail_call#12")
+__msg("mark_precise: frame1: last_idx 13 first_idx 0 subseq_idx -1 ")
+__msg("returning from callee:")
+__msg("frame1: R0=scalar() R6=3 R10=fp0")
+__msg("to caller at 4:")
+__msg("R0=scalar() R6=map_value(map=.data.vals,ks=4,vs=16) R10=fp0")
+__msg("6: (0f) r1 += r0")
+__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4")
+__msg("mark_precise: frame0: parent state regs=r0 stack=: R0=Pscalar() R6=map_value(map=.data.vals,ks=4,vs=16) R10=fp0")
+__msg("math between map_value pointer and register with unbounded min value is not allowed")
+__naked int subprog_result_tail_call(void)
+{
+ asm volatile (
+ "r2 = 3;" /* value handed to identity_tail_call in r2 */
+ "call identity_tail_call;"
+ "r0 *= 4;" /* scale the subprog result */
+ "r1 = %[vals];"
+ "r1 += r0;" /* pointer arithmetic the final __msg expects to be rejected */
+ "r0 = *(u32 *)(r1 + 0);"
+ "exit;"
+ :
+ : __imm_ptr(vals)
+ : __clobber_common
+ );
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c
index 2f1ba08c293e..25be2cd9d42c 100644
--- a/tools/testing/selftests/bpf/progs/wq.c
+++ b/tools/testing/selftests/bpf/progs/wq.c
@@ -187,3 +187,20 @@ long test_call_lru_sleepable(void *ctx)
return test_elem_callback(&lru, &key, wq_callback);
}
+
+SEC("tc") /* NOTE(review): name suggests the map has no BTF when this runs - confirm in the userspace test */
+long test_map_no_btf(void *ctx)
+{
+ struct elem *val;
+ struct bpf_wq *wq;
+ int key = 42;
+
+ val = bpf_map_lookup_elem(&array, &key);
+ if (!val)
+ return -2; /* lookup of key 42 in array failed */
+
+ wq = &val->w; /* the bpf_wq member of the looked-up element */
+ if (bpf_wq_init(wq, &array, 0) != 0)
+ return -3; /* bpf_wq_init() returned non-zero */
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c
index 4240211a1900..d06f6d40594a 100644
--- a/tools/testing/selftests/bpf/progs/wq_failures.c
+++ b/tools/testing/selftests/bpf/progs/wq_failures.c
@@ -142,3 +142,26 @@ long test_wrong_wq_pointer_offset(void *ctx)
return -22;
}
+
+SEC("tc") /* negative test: bpf_wq_init() is given a bpf_wq pointer with a variable offset */
+__log_level(2)
+__failure
+__msg(": (85) call bpf_wq_init#")
+__msg("R1 doesn't have constant offset. bpf_wq has to be at the constant offset")
+long test_bad_wq_off(void *ctx)
+{
+ struct elem *val;
+ struct bpf_wq *wq;
+ int key = 42;
+ u64 unknown;
+
+ val = bpf_map_lookup_elem(&array, &key);
+ if (!val)
+ return -2; /* lookup of key 42 in array failed */
+
+ unknown = bpf_get_prandom_u32(); /* value that is not a compile-time constant */
+ wq = &val->w + unknown; /* shift the bpf_wq pointer by that variable amount */
+ if (bpf_wq_init(wq, &array, 0) != 0) /* the call the __msg lines expect to be rejected */
+ return -3;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh
index 1453a53ed547..b03a87571592 100755
--- a/tools/testing/selftests/bpf/test_bpftool_build.sh
+++ b/tools/testing/selftests/bpf/test_bpftool_build.sh
@@ -90,10 +90,6 @@ echo -e "... through kbuild\n"
if [ -f ".config" ] ; then
make_and_clean tools/bpf
- ## "make tools/bpf" sets $(OUTPUT) to ...tools/bpf/runqslower for
- ## runqslower, but the default (used for the "clean" target) is .output.
- ## Let's make sure we clean runqslower's directory properly.
- make -C tools/bpf/runqslower OUTPUT=${KDIR_ROOT_DIR}/tools/bpf/runqslower/ clean
## $OUTPUT is overwritten in kbuild Makefile, and thus cannot be passed
## down from toplevel Makefile to bpftool's Makefile.
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
index 769206fc70e4..7b4ae5e81d32 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
@@ -5,6 +5,7 @@
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/prandom.h>
+#include <linux/ktime.h>
#include <asm/rqspinlock.h>
#include <linux/perf_event.h>
#include <linux/kthread.h>
@@ -22,48 +23,146 @@ static struct perf_event_attr hw_attr = {
static rqspinlock_t lock_a;
static rqspinlock_t lock_b;
+static rqspinlock_t lock_c;
+
+#define RQSL_SLOW_THRESHOLD_MS 10 /* hits in buckets above this (ms) trigger the per-bucket printout */
+static const unsigned int rqsl_hist_ms[] = { /* inclusive upper bound of each latency bucket, in ms */
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 12, 14, 16, 18, 20, 25, 30, 40, 50, 75,
+ 100, 150, 200, 250, 1000, /* the last bucket also absorbs anything larger */
+};
+#define RQSL_NR_HIST_BUCKETS ARRAY_SIZE(rqsl_hist_ms)
+
+enum rqsl_context { /* which call site recorded a lock attempt */
+ RQSL_CTX_NORMAL = 0, /* passed by rqspinlock_worker_fn() */
+ RQSL_CTX_NMI, /* passed by nmi_cb() */
+ RQSL_CTX_MAX, /* number of contexts; sizes the per-context arrays */
+};
+
+struct rqsl_cpu_hist { /* per-CPU lock acquisition statistics */
+ atomic64_t hist[RQSL_CTX_MAX][RQSL_NR_HIST_BUCKETS]; /* attempts per context and latency bucket */
+ atomic64_t success[RQSL_CTX_MAX]; /* attempts whose lock call returned 0 */
+ atomic64_t failure[RQSL_CTX_MAX]; /* attempts whose lock call returned non-zero */
+};
+
+static DEFINE_PER_CPU(struct rqsl_cpu_hist, rqsl_cpu_hists);
+
+enum rqsl_mode { /* values accepted by the test_mode module parameter */
+ RQSL_MODE_AA = 0, /* worker and NMI both take lock_a */
+ RQSL_MODE_ABBA, /* lock_a/lock_b, order swapped on odd CPUs */
+ RQSL_MODE_ABBCCA, /* lock_a/lock_b/lock_c rotated by cpu % 3 */
+};
+
+static int test_mode = RQSL_MODE_AA;
+module_param(test_mode, int, 0644);
+MODULE_PARM_DESC(test_mode,
+ "rqspinlock test mode: 0 = AA, 1 = ABBA, 2 = ABBCCA");
+
+static int normal_delay = 20;
+module_param(normal_delay, int, 0644);
+MODULE_PARM_DESC(normal_delay,
+ "rqspinlock critical section length for normal context (20ms default)");
+
+static int nmi_delay = 10;
+module_param(nmi_delay, int, 0644);
+MODULE_PARM_DESC(nmi_delay,
+ "rqspinlock critical section length for NMI context (10ms default)");
static struct perf_event **rqsl_evts;
static int rqsl_nevts;
-static bool test_ab = false;
-module_param(test_ab, bool, 0644);
-MODULE_PARM_DESC(test_ab, "Test ABBA situations instead of AA situations");
-
static struct task_struct **rqsl_threads;
static int rqsl_nthreads;
static atomic_t rqsl_ready_cpus = ATOMIC_INIT(0);
static int pause = 0;
-static bool nmi_locks_a(int cpu)
+static const char *rqsl_mode_names[] = {
+ [RQSL_MODE_AA] = "AA",
+ [RQSL_MODE_ABBA] = "ABBA",
+ [RQSL_MODE_ABBCCA] = "ABBCCA",
+};
+
+struct rqsl_lock_pair { /* locks chosen for one CPU by rqsl_get_lock_pair() */
+ rqspinlock_t *worker_lock; /* taken in rqspinlock_worker_fn() */
+ rqspinlock_t *nmi_lock; /* taken in nmi_cb() */
+};
+
+static struct rqsl_lock_pair rqsl_get_lock_pair(int cpu) /* returns { worker lock, NMI lock } for cpu */
{
- return (cpu & 1) && test_ab;
+ int mode = READ_ONCE(test_mode); /* test_mode is a 0644 module parameter */
+
+ switch (mode) {
+ default: /* unrecognized values are handled like AA */
+ case RQSL_MODE_AA:
+ return (struct rqsl_lock_pair){ &lock_a, &lock_a };
+ case RQSL_MODE_ABBA:
+ if (cpu & 1) /* odd CPUs use the opposite order */
+ return (struct rqsl_lock_pair){ &lock_b, &lock_a };
+ return (struct rqsl_lock_pair){ &lock_a, &lock_b };
+ case RQSL_MODE_ABBCCA:
+ switch (cpu % 3) { /* rotate through the three A->B, B->C, C->A pairs */
+ case 0:
+ return (struct rqsl_lock_pair){ &lock_a, &lock_b };
+ case 1:
+ return (struct rqsl_lock_pair){ &lock_b, &lock_c };
+ default:
+ return (struct rqsl_lock_pair){ &lock_c, &lock_a };
+ }
+ }
+}
+
+static u32 rqsl_hist_bucket_idx(u32 delta_ms) /* maps a latency in ms to its index in rqsl_hist_ms[] */
+{
+ int i;
+
+ for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) {
+ if (delta_ms <= rqsl_hist_ms[i]) /* first bucket whose upper bound covers delta_ms */
+ return i;
+ }
+
+ return RQSL_NR_HIST_BUCKETS - 1; /* larger than every bound: clamp to the last bucket */
+}
+
+static void rqsl_record_lock_result(u64 delta_ns, enum rqsl_context ctx, int ret) /* account one lock attempt */
+{
+ struct rqsl_cpu_hist *hist = this_cpu_ptr(&rqsl_cpu_hists); /* stats slot of the executing CPU */
+ u32 delta_ms = DIV_ROUND_UP_ULL(delta_ns, NSEC_PER_MSEC); /* round up to whole milliseconds */
+ u32 bucket = rqsl_hist_bucket_idx(delta_ms);
+ atomic64_t *buckets = hist->hist[ctx];
+
+ atomic64_inc(&buckets[bucket]); /* counted in the histogram whether or not the lock was taken */
+ if (!ret) /* ret == 0 counts as success, anything else as failure */
+ atomic64_inc(&hist->success[ctx]);
+ else
+ atomic64_inc(&hist->failure[ctx]);
}
static int rqspinlock_worker_fn(void *arg)
{
int cpu = smp_processor_id();
unsigned long flags;
+ u64 start_ns;
int ret;
if (cpu) {
atomic_inc(&rqsl_ready_cpus);
while (!kthread_should_stop()) {
+ struct rqsl_lock_pair locks = rqsl_get_lock_pair(cpu);
+ rqspinlock_t *worker_lock = locks.worker_lock;
+
if (READ_ONCE(pause)) {
msleep(1000);
continue;
}
- if (nmi_locks_a(cpu))
- ret = raw_res_spin_lock_irqsave(&lock_b, flags);
- else
- ret = raw_res_spin_lock_irqsave(&lock_a, flags);
- mdelay(20);
- if (nmi_locks_a(cpu) && !ret)
- raw_res_spin_unlock_irqrestore(&lock_b, flags);
- else if (!ret)
- raw_res_spin_unlock_irqrestore(&lock_a, flags);
+ start_ns = ktime_get_mono_fast_ns();
+ ret = raw_res_spin_lock_irqsave(worker_lock, flags);
+ rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+ RQSL_CTX_NORMAL, ret);
+ mdelay(normal_delay);
+ if (!ret)
+ raw_res_spin_unlock_irqrestore(worker_lock, flags);
cpu_relax();
}
return 0;
@@ -91,24 +190,25 @@ static int rqspinlock_worker_fn(void *arg)
static void nmi_cb(struct perf_event *event, struct perf_sample_data *data,
struct pt_regs *regs)
{
+ struct rqsl_lock_pair locks;
int cpu = smp_processor_id();
unsigned long flags;
+ u64 start_ns;
int ret;
if (!cpu || READ_ONCE(pause))
return;
- if (nmi_locks_a(cpu))
- ret = raw_res_spin_lock_irqsave(&lock_a, flags);
- else
- ret = raw_res_spin_lock_irqsave(test_ab ? &lock_b : &lock_a, flags);
+ locks = rqsl_get_lock_pair(cpu);
+ start_ns = ktime_get_mono_fast_ns();
+ ret = raw_res_spin_lock_irqsave(locks.nmi_lock, flags);
+ rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+ RQSL_CTX_NMI, ret);
- mdelay(10);
+ mdelay(nmi_delay);
- if (nmi_locks_a(cpu) && !ret)
- raw_res_spin_unlock_irqrestore(&lock_a, flags);
- else if (!ret)
- raw_res_spin_unlock_irqrestore(test_ab ? &lock_b : &lock_a, flags);
+ if (!ret)
+ raw_res_spin_unlock_irqrestore(locks.nmi_lock, flags);
}
static void free_rqsl_threads(void)
@@ -142,13 +242,19 @@ static int bpf_test_rqspinlock_init(void)
int i, ret;
int ncpus = num_online_cpus();
- pr_err("Mode = %s\n", test_ab ? "ABBA" : "AA");
+ if (test_mode < RQSL_MODE_AA || test_mode > RQSL_MODE_ABBCCA) {
+ pr_err("Invalid mode %d\n", test_mode);
+ return -EINVAL;
+ }
+
+ pr_err("Mode = %s\n", rqsl_mode_names[test_mode]);
- if (ncpus < 3)
+ if (ncpus < test_mode + 2)
return -ENOTSUPP;
raw_res_spin_lock_init(&lock_a);
raw_res_spin_lock_init(&lock_b);
+ raw_res_spin_lock_init(&lock_c);
rqsl_evts = kcalloc(ncpus - 1, sizeof(*rqsl_evts), GFP_KERNEL);
if (!rqsl_evts)
@@ -196,10 +302,88 @@ err_perf_events:
module_init(bpf_test_rqspinlock_init);
+static void rqsl_print_histograms(void) /* prints the per-CPU latency statistics via pr_err() */
+{
+ int cpu, i;
+
+ pr_err("rqspinlock acquisition latency histogram (ms):\n");
+
+ for_each_online_cpu(cpu) {
+ struct rqsl_cpu_hist *hist = per_cpu_ptr(&rqsl_cpu_hists, cpu);
+ u64 norm_counts[RQSL_NR_HIST_BUCKETS];
+ u64 nmi_counts[RQSL_NR_HIST_BUCKETS];
+ u64 total_counts[RQSL_NR_HIST_BUCKETS];
+ u64 norm_success, nmi_success, success_total;
+ u64 norm_failure, nmi_failure, failure_total;
+ u64 norm_total = 0, nmi_total = 0, total = 0;
+ bool has_slow = false;
+
+ for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { /* snapshot the counters and sum them */
+ norm_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NORMAL][i]);
+ nmi_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NMI][i]);
+ total_counts[i] = norm_counts[i] + nmi_counts[i];
+ norm_total += norm_counts[i];
+ nmi_total += nmi_counts[i];
+ total += total_counts[i];
+ if (rqsl_hist_ms[i] > RQSL_SLOW_THRESHOLD_MS && /* any hit in a bucket above the threshold */
+ total_counts[i])
+ has_slow = true;
+ }
+
+ norm_success = atomic64_read(&hist->success[RQSL_CTX_NORMAL]);
+ nmi_success = atomic64_read(&hist->success[RQSL_CTX_NMI]);
+ norm_failure = atomic64_read(&hist->failure[RQSL_CTX_NORMAL]);
+ nmi_failure = atomic64_read(&hist->failure[RQSL_CTX_NMI]);
+ success_total = norm_success + nmi_success;
+ failure_total = norm_failure + nmi_failure;
+
+ if (!total) /* nothing recorded on this CPU: print nothing for it */
+ continue;
+
+ if (!has_slow) { /* only fast buckets hit: one summary line, no per-bucket rows */
+ pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+ "success %llu (normal %llu, nmi %llu) | "
+ "failure %llu (normal %llu, nmi %llu), all within 0-%ums\n",
+ cpu, total, norm_total, nmi_total,
+ success_total, norm_success, nmi_success,
+ failure_total, norm_failure, nmi_failure,
+ RQSL_SLOW_THRESHOLD_MS);
+ continue;
+ }
+
+ pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+ "success %llu (normal %llu, nmi %llu) | "
+ "failure %llu (normal %llu, nmi %llu)\n",
+ cpu, total, norm_total, nmi_total,
+ success_total, norm_success, nmi_success,
+ failure_total, norm_failure, nmi_failure);
+ for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { /* one row per non-empty bucket */
+ unsigned int start_ms;
+
+ if (!total_counts[i])
+ continue;
+
+ start_ms = i == 0 ? 0 : rqsl_hist_ms[i - 1] + 1; /* lower bound = previous upper bound + 1 */
+ if (i == RQSL_NR_HIST_BUCKETS - 1) { /* last bucket is printed as open-ended */
+ pr_err(" >= %ums: total %llu (normal %llu, nmi %llu)\n",
+ start_ms, total_counts[i],
+ norm_counts[i], nmi_counts[i]);
+ } else {
+ pr_err(" %u-%ums: total %llu (normal %llu, nmi %llu)\n",
+ start_ms, rqsl_hist_ms[i],
+ total_counts[i],
+ norm_counts[i], nmi_counts[i]);
+ }
+ }
+ }
+}
+
static void bpf_test_rqspinlock_exit(void)
{
+ WRITE_ONCE(pause, 1); /* the worker loop and nmi_cb() both check pause before locking */
free_rqsl_threads();
free_rqsl_evts();
+ rqsl_print_histograms(); /* report once the threads and events have been freed */
}
module_exit(bpf_test_rqspinlock_exit);
diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c
index 74ecc281bb8c..338c035c3688 100644
--- a/tools/testing/selftests/bpf/test_loader.c
+++ b/tools/testing/selftests/bpf/test_loader.c
@@ -43,6 +43,7 @@
#define TEST_TAG_EXPECT_STDERR_PFX_UNPRIV "comment:test_expect_stderr_unpriv="
#define TEST_TAG_EXPECT_STDOUT_PFX "comment:test_expect_stdout="
#define TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV "comment:test_expect_stdout_unpriv="
+#define TEST_TAG_LINEAR_SIZE "comment:test_linear_size="
/* Warning: duplicated in bpf_misc.h */
#define POINTER_VALUE 0xbadcafe
@@ -89,6 +90,7 @@ struct test_spec {
int mode_mask;
int arch_mask;
int load_mask;
+ int linear_sz;
bool auxiliary;
bool valid;
};
@@ -633,6 +635,21 @@ static int parse_test_spec(struct test_loader *tester,
&spec->unpriv.stdout);
if (err)
goto cleanup;
+ } else if (str_has_pfx(s, TEST_TAG_LINEAR_SIZE)) {
+ switch (bpf_program__type(prog)) {
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ val = s + sizeof(TEST_TAG_LINEAR_SIZE) - 1;
+ err = parse_int(val, &spec->linear_sz, "test linear size");
+ if (err)
+ goto cleanup;
+ break;
+ default:
+ PRINT_FAIL("__linear_size for unsupported program type");
+ err = -EINVAL;
+ goto cleanup;
+ }
}
}
@@ -1007,10 +1024,11 @@ static bool is_unpriv_capable_map(struct bpf_map *map)
}
}
-static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts)
+static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts, int linear_sz)
{
__u8 tmp_out[TEST_DATA_LEN << 2] = {};
__u8 tmp_in[TEST_DATA_LEN] = {};
+ struct __sk_buff ctx = {};
int err, saved_errno;
LIBBPF_OPTS(bpf_test_run_opts, topts,
.data_in = tmp_in,
@@ -1020,6 +1038,12 @@ static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts)
.repeat = 1,
);
+ if (linear_sz) {
+ ctx.data_end = linear_sz;
+ topts.ctx_in = &ctx;
+ topts.ctx_size_in = sizeof(ctx);
+ }
+
if (empty_opts) {
memset(&topts, 0, sizeof(struct bpf_test_run_opts));
topts.sz = sizeof(struct bpf_test_run_opts);
@@ -1269,7 +1293,8 @@ void run_subtest(struct test_loader *tester,
}
err = do_prog_test_run(bpf_program__fd(tprog), &retval,
- bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false);
+ bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false,
+ spec->linear_sz);
if (!err && retval != subspec->retval && subspec->retval != POINTER_VALUE) {
PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval);
goto tobj_cleanup;
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 3fae9ce46ca9..ccc5acd55ff9 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -1399,7 +1399,8 @@ static void test_map_stress(void)
static bool can_retry(int err)
{
return (err == EAGAIN || err == EBUSY ||
- (err == ENOMEM && map_opts.map_flags == BPF_F_NO_PREALLOC));
+ ((err == ENOMEM || err == E2BIG) &&
+ map_opts.map_flags == BPF_F_NO_PREALLOC));
}
int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts,
diff --git a/tools/testing/selftests/bpf/test_tag.c b/tools/testing/selftests/bpf/test_tag.c
index 5546b05a0486..f1300047c1e0 100644
--- a/tools/testing/selftests/bpf/test_tag.c
+++ b/tools/testing/selftests/bpf/test_tag.c
@@ -116,7 +116,7 @@ static void tag_from_alg(int insns, uint8_t *tag, uint32_t len)
static const struct sockaddr_alg alg = {
.salg_family = AF_ALG,
.salg_type = "hash",
- .salg_name = "sha1",
+ .salg_name = "sha256",
};
int fd_base, fd_alg, ret;
ssize_t size;
diff --git a/tools/testing/selftests/bpf/test_tc_edt.sh b/tools/testing/selftests/bpf/test_tc_edt.sh
deleted file mode 100755
index 76f0bd17061f..000000000000
--- a/tools/testing/selftests/bpf/test_tc_edt.sh
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# This test installs a TC bpf program that throttles a TCP flow
-# with dst port = 9000 down to 5MBps. Then it measures actual
-# throughput of the flow.
-
-BPF_FILE="test_tc_edt.bpf.o"
-if [[ $EUID -ne 0 ]]; then
- echo "This script must be run as root"
- echo "FAIL"
- exit 1
-fi
-
-# check that nc, dd, and timeout are present
-command -v nc >/dev/null 2>&1 || \
- { echo >&2 "nc is not available"; exit 1; }
-command -v dd >/dev/null 2>&1 || \
- { echo >&2 "nc is not available"; exit 1; }
-command -v timeout >/dev/null 2>&1 || \
- { echo >&2 "timeout is not available"; exit 1; }
-
-readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
-readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
-
-readonly IP_SRC="172.16.1.100"
-readonly IP_DST="172.16.2.100"
-
-cleanup()
-{
- ip netns del ${NS_SRC}
- ip netns del ${NS_DST}
-}
-
-trap cleanup EXIT
-
-set -e # exit on error
-
-ip netns add "${NS_SRC}"
-ip netns add "${NS_DST}"
-ip link add veth_src type veth peer name veth_dst
-ip link set veth_src netns ${NS_SRC}
-ip link set veth_dst netns ${NS_DST}
-
-ip -netns ${NS_SRC} addr add ${IP_SRC}/24 dev veth_src
-ip -netns ${NS_DST} addr add ${IP_DST}/24 dev veth_dst
-
-ip -netns ${NS_SRC} link set dev veth_src up
-ip -netns ${NS_DST} link set dev veth_dst up
-
-ip -netns ${NS_SRC} route add ${IP_DST}/32 dev veth_src
-ip -netns ${NS_DST} route add ${IP_SRC}/32 dev veth_dst
-
-# set up TC on TX
-ip netns exec ${NS_SRC} tc qdisc add dev veth_src root fq
-ip netns exec ${NS_SRC} tc qdisc add dev veth_src clsact
-ip netns exec ${NS_SRC} tc filter add dev veth_src egress \
- bpf da obj ${BPF_FILE} sec cls_test
-
-
-# start the listener
-ip netns exec ${NS_DST} bash -c \
- "nc -4 -l -p 9000 >/dev/null &"
-declare -i NC_PID=$!
-sleep 1
-
-declare -ir TIMEOUT=20
-declare -ir EXPECTED_BPS=5000000
-
-# run the load, capture RX bytes on DST
-declare -ir RX_BYTES_START=$( ip netns exec ${NS_DST} \
- cat /sys/class/net/veth_dst/statistics/rx_bytes )
-
-set +e
-ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero \
- bs=1000 count=1000000 > /dev/tcp/${IP_DST}/9000 2>/dev/null"
-set -e
-
-declare -ir RX_BYTES_END=$( ip netns exec ${NS_DST} \
- cat /sys/class/net/veth_dst/statistics/rx_bytes )
-
-declare -ir ACTUAL_BPS=$(( ($RX_BYTES_END - $RX_BYTES_START) / $TIMEOUT ))
-
-echo $TIMEOUT $ACTUAL_BPS $EXPECTED_BPS | \
- awk '{printf "elapsed: %d sec; bps difference: %.2f%%\n",
- $1, ($2-$3)*100.0/$3}'
-
-# Pass the test if the actual bps is within 1% of the expected bps.
-# The difference is usually about 0.1% on a 20-sec test, and ==> zero
-# the longer the test runs.
-declare -ir RES=$( echo $ACTUAL_BPS $EXPECTED_BPS | \
- awk 'function abs(x){return ((x < 0.0) ? -x : x)}
- {if (abs(($1-$2)*100.0/$2) > 1.0) { print "1" }
- else { print "0"} }' )
-if [ "${RES}" == "0" ] ; then
- echo "PASS"
-else
- echo "FAIL"
- exit 1
-fi
diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
deleted file mode 100755
index cb55a908bb0d..000000000000
--- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
+++ /dev/null
@@ -1,320 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# In-place tunneling
-
-BPF_FILE="test_tc_tunnel.bpf.o"
-# must match the port that the bpf program filters on
-readonly port=8000
-
-readonly ns_prefix="ns-$$-"
-readonly ns1="${ns_prefix}1"
-readonly ns2="${ns_prefix}2"
-
-readonly ns1_v4=192.168.1.1
-readonly ns2_v4=192.168.1.2
-readonly ns1_v6=fd::1
-readonly ns2_v6=fd::2
-
-# Must match port used by bpf program
-readonly udpport=5555
-# MPLSoverUDP
-readonly mplsudpport=6635
-readonly mplsproto=137
-
-readonly infile="$(mktemp)"
-readonly outfile="$(mktemp)"
-
-setup() {
- ip netns add "${ns1}"
- ip netns add "${ns2}"
-
- ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \
- peer name veth2 mtu 1500 netns "${ns2}"
-
- ip netns exec "${ns1}" ethtool -K veth1 tso off
-
- ip -netns "${ns1}" link set veth1 up
- ip -netns "${ns2}" link set veth2 up
-
- ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1
- ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2
- ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad
- ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad
-
- # clamp route to reserve room for tunnel headers
- ip -netns "${ns1}" -4 route flush table main
- ip -netns "${ns1}" -6 route flush table main
- ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1
- ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1
-
- sleep 1
-
- dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
-}
-
-cleanup() {
- ip netns del "${ns2}"
- ip netns del "${ns1}"
-
- if [[ -f "${outfile}" ]]; then
- rm "${outfile}"
- fi
- if [[ -f "${infile}" ]]; then
- rm "${infile}"
- fi
-
- if [[ -n $server_pid ]]; then
- kill $server_pid 2> /dev/null
- fi
-}
-
-server_listen() {
- ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" &
- server_pid=$!
-}
-
-client_connect() {
- ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}"
- echo $?
-}
-
-verify_data() {
- wait "${server_pid}"
- server_pid=
- # sha1sum returns two fields [sha1] [filepath]
- # convert to bash array and access first elem
- insum=($(sha1sum ${infile}))
- outsum=($(sha1sum ${outfile}))
- if [[ "${insum[0]}" != "${outsum[0]}" ]]; then
- echo "data mismatch"
- exit 1
- fi
-}
-
-wait_for_port() {
- for i in $(seq 20); do
- if ip netns exec "${ns2}" ss ${2:--4}OHntl | grep -q "$1"; then
- return 0
- fi
- sleep 0.1
- done
- return 1
-}
-
-set -e
-
-# no arguments: automated test, run all
-if [[ "$#" -eq "0" ]]; then
- echo "ipip"
- $0 ipv4 ipip none 100
-
- echo "ipip6"
- $0 ipv4 ipip6 none 100
-
- echo "ip6ip6"
- $0 ipv6 ip6tnl none 100
-
- echo "sit"
- $0 ipv6 sit none 100
-
- echo "ip4 vxlan"
- $0 ipv4 vxlan eth 2000
-
- echo "ip6 vxlan"
- $0 ipv6 ip6vxlan eth 2000
-
- for mac in none mpls eth ; do
- echo "ip gre $mac"
- $0 ipv4 gre $mac 100
-
- echo "ip6 gre $mac"
- $0 ipv6 ip6gre $mac 100
-
- echo "ip gre $mac gso"
- $0 ipv4 gre $mac 2000
-
- echo "ip6 gre $mac gso"
- $0 ipv6 ip6gre $mac 2000
-
- echo "ip udp $mac"
- $0 ipv4 udp $mac 100
-
- echo "ip6 udp $mac"
- $0 ipv6 ip6udp $mac 100
-
- echo "ip udp $mac gso"
- $0 ipv4 udp $mac 2000
-
- echo "ip6 udp $mac gso"
- $0 ipv6 ip6udp $mac 2000
- done
-
- echo "OK. All tests passed"
- exit 0
-fi
-
-if [[ "$#" -ne "4" ]]; then
- echo "Usage: $0"
- echo " or: $0 <ipv4|ipv6> <tuntype> <none|mpls|eth> <data_len>"
- exit 1
-fi
-
-case "$1" in
-"ipv4")
- readonly addr1="${ns1_v4}"
- readonly addr2="${ns2_v4}"
- readonly ipproto=4
- readonly netcat_opt=-${ipproto}
- readonly foumod=fou
- readonly foutype=ipip
- readonly fouproto=4
- readonly fouproto_mpls=${mplsproto}
- readonly gretaptype=gretap
- ;;
-"ipv6")
- readonly addr1="${ns1_v6}"
- readonly addr2="${ns2_v6}"
- readonly ipproto=6
- readonly netcat_opt=-${ipproto}
- readonly foumod=fou6
- readonly foutype=ip6tnl
- readonly fouproto="41 -6"
- readonly fouproto_mpls="${mplsproto} -6"
- readonly gretaptype=ip6gretap
- ;;
-*)
- echo "unknown arg: $1"
- exit 1
- ;;
-esac
-
-readonly tuntype=$2
-readonly mac=$3
-readonly datalen=$4
-
-echo "encap ${addr1} to ${addr2}, type ${tuntype}, mac ${mac} len ${datalen}"
-
-trap cleanup EXIT
-
-setup
-
-# basic communication works
-echo "test basic connectivity"
-server_listen
-wait_for_port ${port} ${netcat_opt}
-client_connect
-verify_data
-
-# clientside, insert bpf program to encap all TCP to port ${port}
-# client can no longer connect
-ip netns exec "${ns1}" tc qdisc add dev veth1 clsact
-ip netns exec "${ns1}" tc filter add dev veth1 egress \
- bpf direct-action object-file ${BPF_FILE} \
- section "encap_${tuntype}_${mac}"
-echo "test bpf encap without decap (expect failure)"
-server_listen
-wait_for_port ${port} ${netcat_opt}
-! client_connect
-
-if [[ "$tuntype" =~ "udp" ]]; then
- # Set up fou tunnel.
- ttype="${foutype}"
- targs="encap fou encap-sport auto encap-dport $udpport"
- # fou may be a module; allow this to fail.
- modprobe "${foumod}" ||true
- if [[ "$mac" == "mpls" ]]; then
- dport=${mplsudpport}
- dproto=${fouproto_mpls}
- tmode="mode any ttl 255"
- else
- dport=${udpport}
- dproto=${fouproto}
- fi
- ip netns exec "${ns2}" ip fou add port $dport ipproto ${dproto}
- targs="encap fou encap-sport auto encap-dport $dport"
-elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
- ttype=$gretaptype
-elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
- ttype="vxlan"
- targs="id 1 dstport 8472 udp6zerocsumrx"
-elif [[ "$tuntype" == "ipip6" ]]; then
- ttype="ip6tnl"
- targs=""
-else
- ttype=$tuntype
- targs=""
-fi
-
-# tunnel address family differs from inner for SIT
-if [[ "${tuntype}" == "sit" ]]; then
- link_addr1="${ns1_v4}"
- link_addr2="${ns2_v4}"
-elif [[ "${tuntype}" == "ipip6" ]]; then
- link_addr1="${ns1_v6}"
- link_addr2="${ns2_v6}"
-else
- link_addr1="${addr1}"
- link_addr2="${addr2}"
-fi
-
-# serverside, insert decap module
-# server is still running
-# client can connect again
-ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \
- ${tmode} remote "${link_addr1}" local "${link_addr2}" $targs
-
-expect_tun_fail=0
-
-if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
- # No support for MPLS IPv6 fou tunnel; expect failure.
- expect_tun_fail=1
-elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
- # No support for TEB fou tunnel; expect failure.
- expect_tun_fail=1
-elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then
- # Share ethernet address between tunnel/veth2 so L2 decap works.
- ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
- awk '/ether/ { print $2 }')
- ip netns exec "${ns2}" ip link set testtun0 address $ethaddr
-elif [[ "$mac" == "mpls" ]]; then
- modprobe mpls_iptunnel ||true
- modprobe mpls_gso ||true
- ip netns exec "${ns2}" sysctl -qw net.mpls.platform_labels=65536
- ip netns exec "${ns2}" ip -f mpls route add 1000 dev lo
- ip netns exec "${ns2}" ip link set lo up
- ip netns exec "${ns2}" sysctl -qw net.mpls.conf.testtun0.input=1
- ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.lo.rp_filter=0
-fi
-
-# Because packets are decapped by the tunnel they arrive on testtun0 from
-# the IP stack perspective. Ensure reverse path filtering is disabled
-# otherwise we drop the TCP SYN as arriving on testtun0 instead of the
-# expected veth2 (veth2 is where 192.168.1.2 is configured).
-ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
-# rp needs to be disabled for both all and testtun0 as the rp value is
-# selected as the max of the "all" and device-specific values.
-ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0
-ip netns exec "${ns2}" ip link set dev testtun0 up
-if [[ "$expect_tun_fail" == 1 ]]; then
- # This tunnel mode is not supported, so we expect failure.
- echo "test bpf encap with tunnel device decap (expect failure)"
- ! client_connect
-else
- echo "test bpf encap with tunnel device decap"
- client_connect
- verify_data
- server_listen
- wait_for_port ${port} ${netcat_opt}
-fi
-
-# serverside, use BPF for decap
-ip netns exec "${ns2}" ip link del dev testtun0
-ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
-ip netns exec "${ns2}" tc filter add dev veth2 ingress \
- bpf direct-action object-file ${BPF_FILE} section decap
-echo "test bpf encap with bpf decap"
-client_connect
-verify_data
-
-echo OK
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 352adc8df2d1..9234a58b0a97 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -74,31 +74,23 @@
#define _GNU_SOURCE
#include <assert.h>
#include <fcntl.h>
-#include <errno.h>
#include <getopt.h>
#include <linux/if_link.h>
#include <linux/if_ether.h>
#include <linux/mman.h>
#include <linux/netdev.h>
-#include <linux/bitmap.h>
#include <linux/ethtool.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <locale.h>
-#include <poll.h>
-#include <pthread.h>
-#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <libgen.h>
-#include <string.h>
#include <stddef.h>
#include <sys/mman.h>
-#include <sys/socket.h>
-#include <sys/time.h>
#include <sys/types.h>
-#include <unistd.h>
+#include "prog_tests/test_xsk.h"
#include "xsk_xdp_progs.skel.h"
#include "xsk.h"
#include "xskxceiver.h"
@@ -109,9 +101,6 @@
#include <network_helpers.h>
-#define MAX_TX_BUDGET_DEFAULT 32
-
-static bool opt_verbose;
static bool opt_print_tests;
static enum test_mode opt_mode = TEST_MODE_ALL;
static u32 opt_run_test = RUN_ALL_TESTS;
@@ -120,169 +109,12 @@ void test__fail(void) { /* for network_helpers.c */ }
static void __exit_with_error(int error, const char *file, const char *func, int line)
{
- ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error,
- strerror(error));
+ ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line,
+ error, strerror(error));
ksft_exit_xfail();
}
#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
-#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : ""
-static char *mode_string(struct test_spec *test)
-{
- switch (test->mode) {
- case TEST_MODE_SKB:
- return "SKB";
- case TEST_MODE_DRV:
- return "DRV";
- case TEST_MODE_ZC:
- return "ZC";
- default:
- return "BOGUS";
- }
-}
-
-static void report_failure(struct test_spec *test)
-{
- if (test->fail)
- return;
-
- ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test),
- test->name);
- test->fail = true;
-}
-
-/* The payload is a word consisting of a packet sequence number in the upper
- * 16-bits and a intra packet data sequence number in the lower 16 bits. So the 3rd packet's
- * 5th word of data will contain the number (2<<16) | 4 as they are numbered from 0.
- */
-static void write_payload(void *dest, u32 pkt_nb, u32 start, u32 size)
-{
- u32 *ptr = (u32 *)dest, i;
-
- start /= sizeof(*ptr);
- size /= sizeof(*ptr);
- for (i = 0; i < size; i++)
- ptr[i] = htonl(pkt_nb << 16 | (i + start));
-}
-
-static void gen_eth_hdr(struct xsk_socket_info *xsk, struct ethhdr *eth_hdr)
-{
- memcpy(eth_hdr->h_dest, xsk->dst_mac, ETH_ALEN);
- memcpy(eth_hdr->h_source, xsk->src_mac, ETH_ALEN);
- eth_hdr->h_proto = htons(ETH_P_LOOPBACK);
-}
-
-static bool is_umem_valid(struct ifobject *ifobj)
-{
- return !!ifobj->umem->umem;
-}
-
-static u32 mode_to_xdp_flags(enum test_mode mode)
-{
- return (mode == TEST_MODE_SKB) ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE;
-}
-
-static u64 umem_size(struct xsk_umem_info *umem)
-{
- return umem->num_frames * umem->frame_size;
-}
-
-static int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer,
- u64 size)
-{
- struct xsk_umem_config cfg = {
- .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
- .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
- .frame_size = umem->frame_size,
- .frame_headroom = umem->frame_headroom,
- .flags = XSK_UMEM__DEFAULT_FLAGS
- };
- int ret;
-
- if (umem->fill_size)
- cfg.fill_size = umem->fill_size;
-
- if (umem->comp_size)
- cfg.comp_size = umem->comp_size;
-
- if (umem->unaligned_mode)
- cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
-
- ret = xsk_umem__create(&umem->umem, buffer, size,
- &umem->fq, &umem->cq, &cfg);
- if (ret)
- return ret;
-
- umem->buffer = buffer;
- if (ifobj->shared_umem && ifobj->rx_on) {
- umem->base_addr = umem_size(umem);
- umem->next_buffer = umem_size(umem);
- }
-
- return 0;
-}
-
-static u64 umem_alloc_buffer(struct xsk_umem_info *umem)
-{
- u64 addr;
-
- addr = umem->next_buffer;
- umem->next_buffer += umem->frame_size;
- if (umem->next_buffer >= umem->base_addr + umem_size(umem))
- umem->next_buffer = umem->base_addr;
-
- return addr;
-}
-
-static void umem_reset_alloc(struct xsk_umem_info *umem)
-{
- umem->next_buffer = 0;
-}
-
-static void enable_busy_poll(struct xsk_socket_info *xsk)
-{
- int sock_opt;
-
- sock_opt = 1;
- if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL,
- (void *)&sock_opt, sizeof(sock_opt)) < 0)
- exit_with_error(errno);
-
- sock_opt = 20;
- if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL,
- (void *)&sock_opt, sizeof(sock_opt)) < 0)
- exit_with_error(errno);
-
- sock_opt = xsk->batch_size;
- if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET,
- (void *)&sock_opt, sizeof(sock_opt)) < 0)
- exit_with_error(errno);
-}
-
-static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem,
- struct ifobject *ifobject, bool shared)
-{
- struct xsk_socket_config cfg = {};
- struct xsk_ring_cons *rxr;
- struct xsk_ring_prod *txr;
-
- xsk->umem = umem;
- cfg.rx_size = xsk->rxqsize;
- cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
- cfg.bind_flags = ifobject->bind_flags;
- if (shared)
- cfg.bind_flags |= XDP_SHARED_UMEM;
- if (ifobject->mtu > MAX_ETH_PKT_SIZE)
- cfg.bind_flags |= XDP_USE_SG;
- if (umem->comp_size)
- cfg.tx_size = umem->comp_size;
- if (umem->fill_size)
- cfg.rx_size = umem->fill_size;
-
- txr = ifobject->tx_on ? &xsk->tx : NULL;
- rxr = ifobject->rx_on ? &xsk->rx : NULL;
- return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem, rxr, txr, &cfg);
-}
static bool ifobj_zc_avail(struct ifobject *ifobject)
{
@@ -314,7 +146,7 @@ static bool ifobj_zc_avail(struct ifobject *ifobject)
ifobject->bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
ifobject->rx_on = true;
xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
- ret = __xsk_configure_socket(xsk, umem, ifobject, false);
+ ret = xsk_configure_socket(xsk, umem, ifobject, false);
if (!ret)
zc_avail = true;
@@ -327,25 +159,6 @@ out:
return zc_avail;
}
-#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags"
-static unsigned int get_max_skb_frags(void)
-{
- unsigned int max_skb_frags = 0;
- FILE *file;
-
- file = fopen(MAX_SKB_FRAGS_PATH, "r");
- if (!file) {
- ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH);
- return 0;
- }
-
- if (fscanf(file, "%u", &max_skb_frags) != 1)
- ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH);
-
- fclose(file);
- return max_skb_frags;
-}
-
static struct option long_options[] = {
{"interface", required_argument, 0, 'i'},
{"busy-poll", no_argument, 0, 'b'},
@@ -446,2256 +259,36 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj
}
}
-static int set_ring_size(struct ifobject *ifobj)
-{
- int ret;
- u32 ctr = 0;
-
- while (ctr++ < SOCK_RECONF_CTR) {
- ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring);
- if (!ret)
- break;
-
- /* Retry if it fails */
- if (ctr >= SOCK_RECONF_CTR || errno != EBUSY)
- return -errno;
-
- usleep(USLEEP_MAX);
- }
-
- return ret;
-}
-
-static int hw_ring_size_reset(struct ifobject *ifobj)
-{
- ifobj->ring.tx_pending = ifobj->set_ring.default_tx;
- ifobj->ring.rx_pending = ifobj->set_ring.default_rx;
- return set_ring_size(ifobj);
-}
-
-static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
- struct ifobject *ifobj_rx)
-{
- u32 i, j;
-
- for (i = 0; i < MAX_INTERFACES; i++) {
- struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx;
-
- ifobj->xsk = &ifobj->xsk_arr[0];
- ifobj->use_poll = false;
- ifobj->use_fill_ring = true;
- ifobj->release_rx = true;
- ifobj->validation_func = NULL;
- ifobj->use_metadata = false;
-
- if (i == 0) {
- ifobj->rx_on = false;
- ifobj->tx_on = true;
- } else {
- ifobj->rx_on = true;
- ifobj->tx_on = false;
- }
-
- memset(ifobj->umem, 0, sizeof(*ifobj->umem));
- ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS;
- ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
-
- for (j = 0; j < MAX_SOCKETS; j++) {
- memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j]));
- ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
- ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE;
- if (i == 0)
- ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default;
- else
- ifobj->xsk_arr[j].pkt_stream = test->rx_pkt_stream_default;
-
- memcpy(ifobj->xsk_arr[j].src_mac, g_mac, ETH_ALEN);
- memcpy(ifobj->xsk_arr[j].dst_mac, g_mac, ETH_ALEN);
- ifobj->xsk_arr[j].src_mac[5] += ((j * 2) + 0);
- ifobj->xsk_arr[j].dst_mac[5] += ((j * 2) + 1);
- }
- }
-
- if (ifobj_tx->hw_ring_size_supp)
- hw_ring_size_reset(ifobj_tx);
-
- test->ifobj_tx = ifobj_tx;
- test->ifobj_rx = ifobj_rx;
- test->current_step = 0;
- test->total_steps = 1;
- test->nb_sockets = 1;
- test->fail = false;
- test->set_ring = false;
- test->adjust_tail = false;
- test->adjust_tail_support = false;
- test->mtu = MAX_ETH_PKT_SIZE;
- test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog;
- test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk;
- test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog;
- test->xskmap_tx = ifobj_tx->xdp_progs->maps.xsk;
-}
-
-static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
- struct ifobject *ifobj_rx, enum test_mode mode,
- const struct test_spec *test_to_run)
-{
- struct pkt_stream *tx_pkt_stream;
- struct pkt_stream *rx_pkt_stream;
- u32 i;
-
- tx_pkt_stream = test->tx_pkt_stream_default;
- rx_pkt_stream = test->rx_pkt_stream_default;
- memset(test, 0, sizeof(*test));
- test->tx_pkt_stream_default = tx_pkt_stream;
- test->rx_pkt_stream_default = rx_pkt_stream;
-
- for (i = 0; i < MAX_INTERFACES; i++) {
- struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx;
-
- ifobj->bind_flags = XDP_USE_NEED_WAKEUP;
- if (mode == TEST_MODE_ZC)
- ifobj->bind_flags |= XDP_ZEROCOPY;
- else
- ifobj->bind_flags |= XDP_COPY;
- }
-
- strncpy(test->name, test_to_run->name, MAX_TEST_NAME_SIZE);
- test->test_func = test_to_run->test_func;
- test->mode = mode;
- __test_spec_init(test, ifobj_tx, ifobj_rx);
-}
-
-static void test_spec_reset(struct test_spec *test)
-{
- __test_spec_init(test, test->ifobj_tx, test->ifobj_rx);
-}
-
-static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx,
- struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx,
- struct bpf_map *xskmap_tx)
-{
- test->xdp_prog_rx = xdp_prog_rx;
- test->xdp_prog_tx = xdp_prog_tx;
- test->xskmap_rx = xskmap_rx;
- test->xskmap_tx = xskmap_tx;
-}
-
-static int test_spec_set_mtu(struct test_spec *test, int mtu)
-{
- int err;
-
- if (test->ifobj_rx->mtu != mtu) {
- err = xsk_set_mtu(test->ifobj_rx->ifindex, mtu);
- if (err)
- return err;
- test->ifobj_rx->mtu = mtu;
- }
- if (test->ifobj_tx->mtu != mtu) {
- err = xsk_set_mtu(test->ifobj_tx->ifindex, mtu);
- if (err)
- return err;
- test->ifobj_tx->mtu = mtu;
- }
-
- return 0;
-}
-
-static void pkt_stream_reset(struct pkt_stream *pkt_stream)
-{
- if (pkt_stream) {
- pkt_stream->current_pkt_nb = 0;
- pkt_stream->nb_rx_pkts = 0;
- }
-}
-
-static struct pkt *pkt_stream_get_next_tx_pkt(struct pkt_stream *pkt_stream)
-{
- if (pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts)
- return NULL;
-
- return &pkt_stream->pkts[pkt_stream->current_pkt_nb++];
-}
-
-static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent)
-{
- while (pkt_stream->current_pkt_nb < pkt_stream->nb_pkts) {
- (*pkts_sent)++;
- if (pkt_stream->pkts[pkt_stream->current_pkt_nb].valid)
- return &pkt_stream->pkts[pkt_stream->current_pkt_nb++];
- pkt_stream->current_pkt_nb++;
- }
- return NULL;
-}
-
-static void pkt_stream_delete(struct pkt_stream *pkt_stream)
-{
- free(pkt_stream->pkts);
- free(pkt_stream);
-}
-
-static void pkt_stream_restore_default(struct test_spec *test)
-{
- struct pkt_stream *tx_pkt_stream = test->ifobj_tx->xsk->pkt_stream;
- struct pkt_stream *rx_pkt_stream = test->ifobj_rx->xsk->pkt_stream;
-
- if (tx_pkt_stream != test->tx_pkt_stream_default) {
- pkt_stream_delete(test->ifobj_tx->xsk->pkt_stream);
- test->ifobj_tx->xsk->pkt_stream = test->tx_pkt_stream_default;
- }
-
- if (rx_pkt_stream != test->rx_pkt_stream_default) {
- pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream);
- test->ifobj_rx->xsk->pkt_stream = test->rx_pkt_stream_default;
- }
-}
-
-static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts)
-{
- struct pkt_stream *pkt_stream;
-
- pkt_stream = calloc(1, sizeof(*pkt_stream));
- if (!pkt_stream)
- return NULL;
-
- pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts));
- if (!pkt_stream->pkts) {
- free(pkt_stream);
- return NULL;
- }
-
- pkt_stream->nb_pkts = nb_pkts;
- return pkt_stream;
-}
-
-static bool pkt_continues(u32 options)
-{
- return options & XDP_PKT_CONTD;
-}
-
-static u32 ceil_u32(u32 a, u32 b)
-{
- return (a + b - 1) / b;
-}
-
-static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pkt *pkt)
-{
- u32 nb_frags = 1, next_frag;
-
- if (!pkt)
- return 1;
-
- if (!pkt_stream->verbatim) {
- if (!pkt->valid || !pkt->len)
- return 1;
- return ceil_u32(pkt->len, frame_size);
- }
-
- /* Search for the end of the packet in verbatim mode */
- if (!pkt_continues(pkt->options))
- return nb_frags;
-
- next_frag = pkt_stream->current_pkt_nb;
- pkt++;
- while (next_frag++ < pkt_stream->nb_pkts) {
- nb_frags++;
- if (!pkt_continues(pkt->options) || !pkt->valid)
- break;
- pkt++;
- }
- return nb_frags;
-}
-
-static bool set_pkt_valid(int offset, u32 len)
-{
- return len <= MAX_ETH_JUMBO_SIZE;
-}
-
-static void pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len)
-{
- pkt->offset = offset;
- pkt->len = len;
- pkt->valid = set_pkt_valid(offset, len);
-}
-
-static void pkt_stream_pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len)
-{
- bool prev_pkt_valid = pkt->valid;
-
- pkt_set(pkt_stream, pkt, offset, len);
- pkt_stream->nb_valid_entries += pkt->valid - prev_pkt_valid;
-}
-
-static u32 pkt_get_buffer_len(struct xsk_umem_info *umem, u32 len)
-{
- return ceil_u32(len, umem->frame_size) * umem->frame_size;
-}
-
-static struct pkt_stream *__pkt_stream_generate(u32 nb_pkts, u32 pkt_len, u32 nb_start, u32 nb_off)
-{
- struct pkt_stream *pkt_stream;
- u32 i;
-
- pkt_stream = __pkt_stream_alloc(nb_pkts);
- if (!pkt_stream)
- exit_with_error(ENOMEM);
-
- pkt_stream->nb_pkts = nb_pkts;
- pkt_stream->max_pkt_len = pkt_len;
- for (i = 0; i < nb_pkts; i++) {
- struct pkt *pkt = &pkt_stream->pkts[i];
-
- pkt_stream_pkt_set(pkt_stream, pkt, 0, pkt_len);
- pkt->pkt_nb = nb_start + i * nb_off;
- }
-
- return pkt_stream;
-}
-
-static struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len)
-{
- return __pkt_stream_generate(nb_pkts, pkt_len, 0, 1);
-}
-
-static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream)
-{
- return pkt_stream_generate(pkt_stream->nb_pkts, pkt_stream->pkts[0].len);
-}
-
-static void pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len)
-{
- ifobj->xsk->pkt_stream = pkt_stream_generate(nb_pkts, pkt_len);
-}
-
-static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len)
-{
- pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len);
- pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len);
-}
-
-static void __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len,
- int offset)
-{
- struct pkt_stream *pkt_stream;
- u32 i;
-
- pkt_stream = pkt_stream_clone(ifobj->xsk->pkt_stream);
- for (i = 1; i < ifobj->xsk->pkt_stream->nb_pkts; i += 2)
- pkt_stream_pkt_set(pkt_stream, &pkt_stream->pkts[i], offset, pkt_len);
-
- ifobj->xsk->pkt_stream = pkt_stream;
-}
-
-static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset)
-{
- __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset);
- __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset);
-}
-
-static void pkt_stream_receive_half(struct test_spec *test)
-{
- struct pkt_stream *pkt_stream = test->ifobj_tx->xsk->pkt_stream;
- u32 i;
-
- test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(pkt_stream->nb_pkts,
- pkt_stream->pkts[0].len);
- pkt_stream = test->ifobj_rx->xsk->pkt_stream;
- for (i = 1; i < pkt_stream->nb_pkts; i += 2)
- pkt_stream->pkts[i].valid = false;
-
- pkt_stream->nb_valid_entries /= 2;
-}
-
-static void pkt_stream_even_odd_sequence(struct test_spec *test)
-{
- struct pkt_stream *pkt_stream;
- u32 i;
-
- for (i = 0; i < test->nb_sockets; i++) {
- pkt_stream = test->ifobj_tx->xsk_arr[i].pkt_stream;
- pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2,
- pkt_stream->pkts[0].len, i, 2);
- test->ifobj_tx->xsk_arr[i].pkt_stream = pkt_stream;
-
- pkt_stream = test->ifobj_rx->xsk_arr[i].pkt_stream;
- pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2,
- pkt_stream->pkts[0].len, i, 2);
- test->ifobj_rx->xsk_arr[i].pkt_stream = pkt_stream;
- }
-}
-
-static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem)
-{
- if (!pkt->valid)
- return pkt->offset;
- return pkt->offset + umem_alloc_buffer(umem);
-}
-
-static void pkt_stream_cancel(struct pkt_stream *pkt_stream)
-{
- pkt_stream->current_pkt_nb--;
-}
-
-static void pkt_generate(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, u64 addr, u32 len,
- u32 pkt_nb, u32 bytes_written)
-{
- void *data = xsk_umem__get_data(umem->buffer, addr);
-
- if (len < MIN_PKT_SIZE)
- return;
-
- if (!bytes_written) {
- gen_eth_hdr(xsk, data);
-
- len -= PKT_HDR_SIZE;
- data += PKT_HDR_SIZE;
- } else {
- bytes_written -= PKT_HDR_SIZE;
- }
-
- write_payload(data, pkt_nb, bytes_written, len);
-}
-
-static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames,
- u32 nb_frames, bool verbatim)
-{
- u32 i, len = 0, pkt_nb = 0, payload = 0;
- struct pkt_stream *pkt_stream;
-
- pkt_stream = __pkt_stream_alloc(nb_frames);
- if (!pkt_stream)
- exit_with_error(ENOMEM);
-
- for (i = 0; i < nb_frames; i++) {
- struct pkt *pkt = &pkt_stream->pkts[pkt_nb];
- struct pkt *frame = &frames[i];
-
- pkt->offset = frame->offset;
- if (verbatim) {
- *pkt = *frame;
- pkt->pkt_nb = payload;
- if (!frame->valid || !pkt_continues(frame->options))
- payload++;
- } else {
- if (frame->valid)
- len += frame->len;
- if (frame->valid && pkt_continues(frame->options))
- continue;
-
- pkt->pkt_nb = pkt_nb;
- pkt->len = len;
- pkt->valid = frame->valid;
- pkt->options = 0;
-
- len = 0;
- }
-
- print_verbose("offset: %d len: %u valid: %u options: %u pkt_nb: %u\n",
- pkt->offset, pkt->len, pkt->valid, pkt->options, pkt->pkt_nb);
-
- if (pkt->valid && pkt->len > pkt_stream->max_pkt_len)
- pkt_stream->max_pkt_len = pkt->len;
-
- if (pkt->valid)
- pkt_stream->nb_valid_entries++;
-
- pkt_nb++;
- }
-
- pkt_stream->nb_pkts = pkt_nb;
- pkt_stream->verbatim = verbatim;
- return pkt_stream;
-}
-
-static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts)
-{
- struct pkt_stream *pkt_stream;
-
- pkt_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true);
- test->ifobj_tx->xsk->pkt_stream = pkt_stream;
-
- pkt_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false);
- test->ifobj_rx->xsk->pkt_stream = pkt_stream;
-}
-
-static void pkt_print_data(u32 *data, u32 cnt)
-{
- u32 i;
-
- for (i = 0; i < cnt; i++) {
- u32 seqnum, pkt_nb;
-
- seqnum = ntohl(*data) & 0xffff;
- pkt_nb = ntohl(*data) >> 16;
- ksft_print_msg("%u:%u ", pkt_nb, seqnum);
- data++;
- }
-}
-
-static void pkt_dump(void *pkt, u32 len, bool eth_header)
-{
- struct ethhdr *ethhdr = pkt;
- u32 i, *data;
-
- if (eth_header) {
- /*extract L2 frame */
- ksft_print_msg("DEBUG>> L2: dst mac: ");
- for (i = 0; i < ETH_ALEN; i++)
- ksft_print_msg("%02X", ethhdr->h_dest[i]);
-
- ksft_print_msg("\nDEBUG>> L2: src mac: ");
- for (i = 0; i < ETH_ALEN; i++)
- ksft_print_msg("%02X", ethhdr->h_source[i]);
-
- data = pkt + PKT_HDR_SIZE;
- } else {
- data = pkt;
- }
-
- /*extract L5 frame */
- ksft_print_msg("\nDEBUG>> L5: seqnum: ");
- pkt_print_data(data, PKT_DUMP_NB_TO_PRINT);
- ksft_print_msg("....");
- if (len > PKT_DUMP_NB_TO_PRINT * sizeof(u32)) {
- ksft_print_msg("\n.... ");
- pkt_print_data(data + len / sizeof(u32) - PKT_DUMP_NB_TO_PRINT,
- PKT_DUMP_NB_TO_PRINT);
- }
- ksft_print_msg("\n---------------------------------------\n");
-}
-
-static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr)
-{
- u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom;
- u32 offset = addr % umem->frame_size, expected_offset;
- int pkt_offset = pkt->valid ? pkt->offset : 0;
-
- if (!umem->unaligned_mode)
- pkt_offset = 0;
-
- expected_offset = (pkt_offset + headroom + XDP_PACKET_HEADROOM) % umem->frame_size;
-
- if (offset == expected_offset)
- return true;
-
- ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, expected_offset, offset);
- return false;
-}
-
-static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr)
-{
- void *data = xsk_umem__get_data(buffer, addr);
- struct xdp_info *meta = data - sizeof(struct xdp_info);
-
- if (meta->count != pkt->pkt_nb) {
- ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n",
- __func__, pkt->pkt_nb,
- (unsigned long long)meta->count);
- return false;
- }
-
- return true;
-}
-
-static bool is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx)
-{
- struct bpf_map *data_map;
- int adjust_value = 0;
- int key = 0;
- int ret;
-
- data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss");
- if (!data_map || !bpf_map__is_internal(data_map)) {
- ksft_print_msg("Error: could not find bss section of XDP program\n");
- exit_with_error(errno);
- }
-
- ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &key, &adjust_value);
- if (ret) {
- ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", ret);
- exit_with_error(errno);
- }
-
- /* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail
- * helper is not supported. Skip the adjust_tail test case in this scenario.
- */
- return adjust_value != -EOPNOTSUPP;
-}
-
-static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb,
- u32 bytes_processed)
-{
- u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum;
- void *data = xsk_umem__get_data(umem->buffer, addr);
-
- addr -= umem->base_addr;
-
- if (addr >= umem->num_frames * umem->frame_size ||
- addr + len > umem->num_frames * umem->frame_size) {
- ksft_print_msg("Frag invalid addr: %llx len: %u\n",
- (unsigned long long)addr, len);
- return false;
- }
- if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) {
- ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n",
- (unsigned long long)addr, len);
- return false;
- }
-
- pkt_data = data;
- if (!bytes_processed) {
- pkt_data += PKT_HDR_SIZE / sizeof(*pkt_data);
- len -= PKT_HDR_SIZE;
- } else {
- bytes_processed -= PKT_HDR_SIZE;
- }
-
- expected_seqnum = bytes_processed / sizeof(*pkt_data);
- seqnum = ntohl(*pkt_data) & 0xffff;
- pkt_nb = ntohl(*pkt_data) >> 16;
-
- if (expected_pkt_nb != pkt_nb) {
- ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n",
- __func__, expected_pkt_nb, pkt_nb);
- goto error;
- }
- if (expected_seqnum != seqnum) {
- ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n",
- __func__, expected_seqnum, seqnum);
- goto error;
- }
-
- words_to_end = len / sizeof(*pkt_data) - 1;
- pkt_data += words_to_end;
- seqnum = ntohl(*pkt_data) & 0xffff;
- expected_seqnum += words_to_end;
- if (expected_seqnum != seqnum) {
- ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n",
- __func__, expected_seqnum, seqnum);
- goto error;
- }
-
- return true;
-
-error:
- pkt_dump(data, len, !bytes_processed);
- return false;
-}
-
-static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len)
-{
- if (pkt->len != len) {
- ksft_print_msg("[%s] expected packet length [%d], got length [%d]\n",
- __func__, pkt->len, len);
- pkt_dump(xsk_umem__get_data(buffer, addr), len, true);
- return false;
- }
-
- return true;
-}
-
-static u32 load_value(u32 *counter)
-{
- return __atomic_load_n(counter, __ATOMIC_ACQUIRE);
-}
-
-static bool kick_tx_with_check(struct xsk_socket_info *xsk, int *ret)
-{
- u32 max_budget = MAX_TX_BUDGET_DEFAULT;
- u32 cons, ready_to_send;
- int delta;
-
- cons = load_value(xsk->tx.consumer);
- ready_to_send = load_value(xsk->tx.producer) - cons;
- *ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
-
- delta = load_value(xsk->tx.consumer) - cons;
- /* By default, xsk should consume exact @max_budget descs at one
- * send in this case where hitting the max budget limit in while
- * loop is triggered in __xsk_generic_xmit(). Please make sure that
- * the number of descs to be sent is larger than @max_budget, or
- * else the tx.consumer will be updated in xskq_cons_peek_desc()
- * in time which hides the issue we try to verify.
- */
- if (ready_to_send > max_budget && delta != max_budget)
- return false;
-
- return true;
-}
-
-static int kick_tx(struct xsk_socket_info *xsk)
-{
- int ret;
-
- if (xsk->check_consumer) {
- if (!kick_tx_with_check(xsk, &ret))
- return TEST_FAILURE;
- } else {
- ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
- }
- if (ret >= 0)
- return TEST_PASS;
- if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) {
- usleep(100);
- return TEST_PASS;
- }
- return TEST_FAILURE;
-}
-
-static int kick_rx(struct xsk_socket_info *xsk)
-{
- int ret;
-
- ret = recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL);
- if (ret < 0)
- return TEST_FAILURE;
-
- return TEST_PASS;
-}
-
-static int complete_pkts(struct xsk_socket_info *xsk, int batch_size)
-{
- unsigned int rcvd;
- u32 idx;
- int ret;
-
- if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
- ret = kick_tx(xsk);
- if (ret)
- return TEST_FAILURE;
- }
-
- rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
- if (rcvd) {
- if (rcvd > xsk->outstanding_tx) {
- u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1);
-
- ksft_print_msg("[%s] Too many packets completed\n", __func__);
- ksft_print_msg("Last completion address: %llx\n",
- (unsigned long long)addr);
- return TEST_FAILURE;
- }
-
- xsk_ring_cons__release(&xsk->umem->cq, rcvd);
- xsk->outstanding_tx -= rcvd;
- }
-
- return TEST_PASS;
-}
-
-static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk)
-{
- u32 frags_processed = 0, nb_frags = 0, pkt_len = 0;
- u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0;
- struct pkt_stream *pkt_stream = xsk->pkt_stream;
- struct ifobject *ifobj = test->ifobj_rx;
- struct xsk_umem_info *umem = xsk->umem;
- struct pollfd fds = { };
- struct pkt *pkt;
- u64 first_addr = 0;
- int ret;
-
- fds.fd = xsk_socket__fd(xsk->xsk);
- fds.events = POLLIN;
-
- ret = kick_rx(xsk);
- if (ret)
- return TEST_FAILURE;
-
- if (ifobj->use_poll) {
- ret = poll(&fds, 1, POLL_TMOUT);
- if (ret < 0)
- return TEST_FAILURE;
-
- if (!ret) {
- if (!is_umem_valid(test->ifobj_tx))
- return TEST_PASS;
-
- ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__);
- return TEST_CONTINUE;
- }
-
- if (!(fds.revents & POLLIN))
- return TEST_CONTINUE;
- }
-
- rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx);
- if (!rcvd)
- return TEST_CONTINUE;
-
- if (ifobj->use_fill_ring) {
- ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
- while (ret != rcvd) {
- if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
- ret = poll(&fds, 1, POLL_TMOUT);
- if (ret < 0)
- return TEST_FAILURE;
- }
- ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
- }
- }
-
- while (frags_processed < rcvd) {
- const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++);
- u64 addr = desc->addr, orig;
-
- orig = xsk_umem__extract_addr(addr);
- addr = xsk_umem__add_offset_to_addr(addr);
-
- if (!nb_frags) {
- pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent);
- if (!pkt) {
- ksft_print_msg("[%s] received too many packets addr: %lx len %u\n",
- __func__, addr, desc->len);
- return TEST_FAILURE;
- }
- }
-
- print_verbose("Rx: addr: %lx len: %u options: %u pkt_nb: %u valid: %u\n",
- addr, desc->len, desc->options, pkt->pkt_nb, pkt->valid);
-
- if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) ||
- !is_offset_correct(umem, pkt, addr) || (ifobj->use_metadata &&
- !is_metadata_correct(pkt, umem->buffer, addr)))
- return TEST_FAILURE;
-
- if (!nb_frags++)
- first_addr = addr;
- frags_processed++;
- pkt_len += desc->len;
- if (ifobj->use_fill_ring)
- *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig;
-
- if (pkt_continues(desc->options))
- continue;
-
- /* The complete packet has been received */
- if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) ||
- !is_offset_correct(umem, pkt, addr))
- return TEST_FAILURE;
-
- pkt_stream->nb_rx_pkts++;
- nb_frags = 0;
- pkt_len = 0;
- }
-
- if (nb_frags) {
- /* In the middle of a packet. Start over from beginning of packet. */
- idx_rx -= nb_frags;
- xsk_ring_cons__cancel(&xsk->rx, nb_frags);
- if (ifobj->use_fill_ring) {
- idx_fq -= nb_frags;
- xsk_ring_prod__cancel(&umem->fq, nb_frags);
- }
- frags_processed -= nb_frags;
- }
-
- if (ifobj->use_fill_ring)
- xsk_ring_prod__submit(&umem->fq, frags_processed);
- if (ifobj->release_rx)
- xsk_ring_cons__release(&xsk->rx, frags_processed);
-
- pthread_mutex_lock(&pacing_mutex);
- pkts_in_flight -= pkts_sent;
- pthread_mutex_unlock(&pacing_mutex);
- pkts_sent = 0;
-
-return TEST_CONTINUE;
-}
-
-bool all_packets_received(struct test_spec *test, struct xsk_socket_info *xsk, u32 sock_num,
- unsigned long *bitmap)
-{
- struct pkt_stream *pkt_stream = xsk->pkt_stream;
-
- if (!pkt_stream) {
- __set_bit(sock_num, bitmap);
- return false;
- }
-
- if (pkt_stream->nb_rx_pkts == pkt_stream->nb_valid_entries) {
- __set_bit(sock_num, bitmap);
- if (bitmap_full(bitmap, test->nb_sockets))
- return true;
- }
-
- return false;
-}
-
-static int receive_pkts(struct test_spec *test)
-{
- struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0};
- DECLARE_BITMAP(bitmap, test->nb_sockets);
- struct xsk_socket_info *xsk;
- u32 sock_num = 0;
- int res, ret;
-
- ret = gettimeofday(&tv_now, NULL);
- if (ret)
- exit_with_error(errno);
-
- timeradd(&tv_now, &tv_timeout, &tv_end);
-
- while (1) {
- xsk = &test->ifobj_rx->xsk_arr[sock_num];
-
- if ((all_packets_received(test, xsk, sock_num, bitmap)))
- break;
-
- res = __receive_pkts(test, xsk);
- if (!(res == TEST_PASS || res == TEST_CONTINUE))
- return res;
-
- ret = gettimeofday(&tv_now, NULL);
- if (ret)
- exit_with_error(errno);
-
- if (timercmp(&tv_now, &tv_end, >)) {
- ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__);
- return TEST_FAILURE;
- }
- sock_num = (sock_num + 1) % test->nb_sockets;
- }
-
- return TEST_PASS;
-}
-
-static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, bool timeout)
-{
- u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len;
- struct pkt_stream *pkt_stream = xsk->pkt_stream;
- struct xsk_umem_info *umem = ifobject->umem;
- bool use_poll = ifobject->use_poll;
- struct pollfd fds = { };
- int ret;
-
- buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len);
- /* pkts_in_flight might be negative if many invalid packets are sent */
- if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) /
- buffer_len)) {
- ret = kick_tx(xsk);
- if (ret)
- return TEST_FAILURE;
- return TEST_CONTINUE;
- }
-
- fds.fd = xsk_socket__fd(xsk->xsk);
- fds.events = POLLOUT;
-
- while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) {
- if (use_poll) {
- ret = poll(&fds, 1, POLL_TMOUT);
- if (timeout) {
- if (ret < 0) {
- ksft_print_msg("ERROR: [%s] Poll error %d\n",
- __func__, errno);
- return TEST_FAILURE;
- }
- if (ret == 0)
- return TEST_PASS;
- break;
- }
- if (ret <= 0) {
- ksft_print_msg("ERROR: [%s] Poll error %d\n",
- __func__, errno);
- return TEST_FAILURE;
- }
- }
-
- complete_pkts(xsk, xsk->batch_size);
- }
-
- for (i = 0; i < xsk->batch_size; i++) {
- struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream);
- u32 nb_frags_left, nb_frags, bytes_written = 0;
-
- if (!pkt)
- break;
-
- nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt);
- if (nb_frags > xsk->batch_size - i) {
- pkt_stream_cancel(pkt_stream);
- xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i);
- break;
- }
- nb_frags_left = nb_frags;
-
- while (nb_frags_left--) {
- struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i);
-
- tx_desc->addr = pkt_get_addr(pkt, ifobject->umem);
- if (pkt_stream->verbatim) {
- tx_desc->len = pkt->len;
- tx_desc->options = pkt->options;
- } else if (nb_frags_left) {
- tx_desc->len = umem->frame_size;
- tx_desc->options = XDP_PKT_CONTD;
- } else {
- tx_desc->len = pkt->len - bytes_written;
- tx_desc->options = 0;
- }
- if (pkt->valid)
- pkt_generate(xsk, umem, tx_desc->addr, tx_desc->len, pkt->pkt_nb,
- bytes_written);
- bytes_written += tx_desc->len;
-
- print_verbose("Tx addr: %llx len: %u options: %u pkt_nb: %u\n",
- tx_desc->addr, tx_desc->len, tx_desc->options, pkt->pkt_nb);
-
- if (nb_frags_left) {
- i++;
- if (pkt_stream->verbatim)
- pkt = pkt_stream_get_next_tx_pkt(pkt_stream);
- }
- }
-
- if (pkt && pkt->valid) {
- valid_pkts++;
- valid_frags += nb_frags;
- }
- }
-
- pthread_mutex_lock(&pacing_mutex);
- pkts_in_flight += valid_pkts;
- pthread_mutex_unlock(&pacing_mutex);
-
- xsk_ring_prod__submit(&xsk->tx, i);
- xsk->outstanding_tx += valid_frags;
-
- if (use_poll) {
- ret = poll(&fds, 1, POLL_TMOUT);
- if (ret <= 0) {
- if (ret == 0 && timeout)
- return TEST_PASS;
-
- ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, ret);
- return TEST_FAILURE;
- }
- }
-
- if (!timeout) {
- if (complete_pkts(xsk, i))
- return TEST_FAILURE;
-
- usleep(10);
- return TEST_PASS;
- }
-
- return TEST_CONTINUE;
-}
-
-static int wait_for_tx_completion(struct xsk_socket_info *xsk)
-{
- struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0};
- int ret;
-
- ret = gettimeofday(&tv_now, NULL);
- if (ret)
- exit_with_error(errno);
- timeradd(&tv_now, &tv_timeout, &tv_end);
-
- while (xsk->outstanding_tx) {
- ret = gettimeofday(&tv_now, NULL);
- if (ret)
- exit_with_error(errno);
- if (timercmp(&tv_now, &tv_end, >)) {
- ksft_print_msg("ERROR: [%s] Transmission loop timed out\n", __func__);
- return TEST_FAILURE;
- }
-
- complete_pkts(xsk, xsk->batch_size);
- }
-
- return TEST_PASS;
-}
-
-bool all_packets_sent(struct test_spec *test, unsigned long *bitmap)
-{
- return bitmap_full(bitmap, test->nb_sockets);
-}
-
-static int send_pkts(struct test_spec *test, struct ifobject *ifobject)
-{
- bool timeout = !is_umem_valid(test->ifobj_rx);
- DECLARE_BITMAP(bitmap, test->nb_sockets);
- u32 i, ret;
-
- while (!(all_packets_sent(test, bitmap))) {
- for (i = 0; i < test->nb_sockets; i++) {
- struct pkt_stream *pkt_stream;
-
- pkt_stream = ifobject->xsk_arr[i].pkt_stream;
- if (!pkt_stream || pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) {
- __set_bit(i, bitmap);
- continue;
- }
- ret = __send_pkts(ifobject, &ifobject->xsk_arr[i], timeout);
- if (ret == TEST_CONTINUE && !test->fail)
- continue;
-
- if ((ret || test->fail) && !timeout)
- return TEST_FAILURE;
-
- if (ret == TEST_PASS && timeout)
- return ret;
-
- ret = wait_for_tx_completion(&ifobject->xsk_arr[i]);
- if (ret)
- return TEST_FAILURE;
- }
- }
-
- return TEST_PASS;
-}
-
-static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats)
-{
- int fd = xsk_socket__fd(xsk), err;
- socklen_t optlen, expected_len;
-
- optlen = sizeof(*stats);
- err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen);
- if (err) {
- ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n",
- __func__, -err, strerror(-err));
- return TEST_FAILURE;
- }
-
- expected_len = sizeof(struct xdp_statistics);
- if (optlen != expected_len) {
- ksft_print_msg("[%s] getsockopt optlen error. Expected: %u got: %u\n",
- __func__, expected_len, optlen);
- return TEST_FAILURE;
- }
-
- return TEST_PASS;
-}
-
-static int validate_rx_dropped(struct ifobject *ifobject)
-{
- struct xsk_socket *xsk = ifobject->xsk->xsk;
- struct xdp_statistics stats;
- int err;
-
- err = kick_rx(ifobject->xsk);
- if (err)
- return TEST_FAILURE;
-
- err = get_xsk_stats(xsk, &stats);
- if (err)
- return TEST_FAILURE;
-
- /* The receiver calls getsockopt after receiving the last (valid)
- * packet which is not the final packet sent in this test (valid and
- * invalid packets are sent in alternating fashion with the final
- * packet being invalid). Since the last packet may or may not have
- * been dropped already, both outcomes must be allowed.
- */
- if (stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 ||
- stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 - 1)
- return TEST_PASS;
-
- return TEST_FAILURE;
-}
-
-static int validate_rx_full(struct ifobject *ifobject)
-{
- struct xsk_socket *xsk = ifobject->xsk->xsk;
- struct xdp_statistics stats;
- int err;
-
- usleep(1000);
- err = kick_rx(ifobject->xsk);
- if (err)
- return TEST_FAILURE;
-
- err = get_xsk_stats(xsk, &stats);
- if (err)
- return TEST_FAILURE;
-
- if (stats.rx_ring_full)
- return TEST_PASS;
-
- return TEST_FAILURE;
-}
-
-static int validate_fill_empty(struct ifobject *ifobject)
-{
- struct xsk_socket *xsk = ifobject->xsk->xsk;
- struct xdp_statistics stats;
- int err;
-
- usleep(1000);
- err = kick_rx(ifobject->xsk);
- if (err)
- return TEST_FAILURE;
-
- err = get_xsk_stats(xsk, &stats);
- if (err)
- return TEST_FAILURE;
-
- if (stats.rx_fill_ring_empty_descs)
- return TEST_PASS;
-
- return TEST_FAILURE;
-}
-
-static int validate_tx_invalid_descs(struct ifobject *ifobject)
-{
- struct xsk_socket *xsk = ifobject->xsk->xsk;
- int fd = xsk_socket__fd(xsk);
- struct xdp_statistics stats;
- socklen_t optlen;
- int err;
-
- optlen = sizeof(stats);
- err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
- if (err) {
- ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n",
- __func__, -err, strerror(-err));
- return TEST_FAILURE;
- }
-
- if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) {
- ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n",
- __func__,
- (unsigned long long)stats.tx_invalid_descs,
- ifobject->xsk->pkt_stream->nb_pkts);
- return TEST_FAILURE;
- }
-
- return TEST_PASS;
-}
-
-static void xsk_configure_socket(struct test_spec *test, struct ifobject *ifobject,
- struct xsk_umem_info *umem, bool tx)
-{
- int i, ret;
-
- for (i = 0; i < test->nb_sockets; i++) {
- bool shared = (ifobject->shared_umem && tx) ? true : !!i;
- u32 ctr = 0;
-
- while (ctr++ < SOCK_RECONF_CTR) {
- ret = __xsk_configure_socket(&ifobject->xsk_arr[i], umem,
- ifobject, shared);
- if (!ret)
- break;
-
- /* Retry if it fails as xsk_socket__create() is asynchronous */
- if (ctr >= SOCK_RECONF_CTR)
- exit_with_error(-ret);
- usleep(USLEEP_MAX);
- }
- if (ifobject->busy_poll)
- enable_busy_poll(&ifobject->xsk_arr[i]);
- }
-}
-
-static void thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject)
-{
- xsk_configure_socket(test, ifobject, test->ifobj_rx->umem, true);
- ifobject->xsk = &ifobject->xsk_arr[0];
- ifobject->xskmap = test->ifobj_rx->xskmap;
- memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info));
- ifobject->umem->base_addr = 0;
-}
-
-static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream,
- bool fill_up)
-{
- u32 rx_frame_size = umem->frame_size - XDP_PACKET_HEADROOM;
- u32 idx = 0, filled = 0, buffers_to_fill, nb_pkts;
- int ret;
-
- if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS)
- buffers_to_fill = umem->num_frames;
- else
- buffers_to_fill = umem->fill_size;
-
- ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx);
- if (ret != buffers_to_fill)
- exit_with_error(ENOSPC);
-
- while (filled < buffers_to_fill) {
- struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &nb_pkts);
- u64 addr;
- u32 i;
-
- for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) {
- if (!pkt) {
- if (!fill_up)
- break;
- addr = filled * umem->frame_size + umem->base_addr;
- } else if (pkt->offset >= 0) {
- addr = pkt->offset % umem->frame_size + umem_alloc_buffer(umem);
- } else {
- addr = pkt->offset + umem_alloc_buffer(umem);
- }
-
- *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr;
- if (++filled >= buffers_to_fill)
- break;
- }
- }
- xsk_ring_prod__submit(&umem->fq, filled);
- xsk_ring_prod__cancel(&umem->fq, buffers_to_fill - filled);
-
- pkt_stream_reset(pkt_stream);
- umem_reset_alloc(umem);
-}
-
-static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
-{
- u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size;
- int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
- LIBBPF_OPTS(bpf_xdp_query_opts, opts);
- void *bufs;
- int ret;
- u32 i;
-
- if (ifobject->umem->unaligned_mode)
- mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB;
-
- if (ifobject->shared_umem)
- umem_sz *= 2;
-
- bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
- if (bufs == MAP_FAILED)
- exit_with_error(errno);
-
- ret = xsk_configure_umem(ifobject, ifobject->umem, bufs, umem_sz);
- if (ret)
- exit_with_error(-ret);
-
- xsk_configure_socket(test, ifobject, ifobject->umem, false);
-
- ifobject->xsk = &ifobject->xsk_arr[0];
-
- if (!ifobject->rx_on)
- return;
-
- xsk_populate_fill_ring(ifobject->umem, ifobject->xsk->pkt_stream, ifobject->use_fill_ring);
-
- for (i = 0; i < test->nb_sockets; i++) {
- ifobject->xsk = &ifobject->xsk_arr[i];
- ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, i);
- if (ret)
- exit_with_error(errno);
- }
-}
-
-static void *worker_testapp_validate_tx(void *arg)
-{
- struct test_spec *test = (struct test_spec *)arg;
- struct ifobject *ifobject = test->ifobj_tx;
- int err;
-
- if (test->current_step == 1) {
- if (!ifobject->shared_umem)
- thread_common_ops(test, ifobject);
- else
- thread_common_ops_tx(test, ifobject);
- }
-
- err = send_pkts(test, ifobject);
-
- if (!err && ifobject->validation_func)
- err = ifobject->validation_func(ifobject);
- if (err)
- report_failure(test);
-
- pthread_exit(NULL);
-}
-
-static void *worker_testapp_validate_rx(void *arg)
-{
- struct test_spec *test = (struct test_spec *)arg;
- struct ifobject *ifobject = test->ifobj_rx;
- int err;
-
- if (test->current_step == 1) {
- thread_common_ops(test, ifobject);
- } else {
- xsk_clear_xskmap(ifobject->xskmap);
- err = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, 0);
- if (err) {
- ksft_print_msg("Error: Failed to update xskmap, error %s\n",
- strerror(-err));
- exit_with_error(-err);
- }
- }
-
- pthread_barrier_wait(&barr);
-
- err = receive_pkts(test);
-
- if (!err && ifobject->validation_func)
- err = ifobject->validation_func(ifobject);
-
- if (err) {
- if (test->adjust_tail && !is_adjust_tail_supported(ifobject->xdp_progs))
- test->adjust_tail_support = false;
- else
- report_failure(test);
- }
-
- pthread_exit(NULL);
-}
-
-static u64 ceil_u64(u64 a, u64 b)
-{
- return (a + b - 1) / b;
-}
-
-static void testapp_clean_xsk_umem(struct ifobject *ifobj)
-{
- u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size;
-
- if (ifobj->shared_umem)
- umem_sz *= 2;
-
- umem_sz = ceil_u64(umem_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE;
- xsk_umem__delete(ifobj->umem->umem);
- munmap(ifobj->umem->buffer, umem_sz);
-}
-
-static void handler(int signum)
-{
- pthread_exit(NULL);
-}
-
-static bool xdp_prog_changed_rx(struct test_spec *test)
-{
- struct ifobject *ifobj = test->ifobj_rx;
-
- return ifobj->xdp_prog != test->xdp_prog_rx || ifobj->mode != test->mode;
-}
-
-static bool xdp_prog_changed_tx(struct test_spec *test)
-{
- struct ifobject *ifobj = test->ifobj_tx;
-
- return ifobj->xdp_prog != test->xdp_prog_tx || ifobj->mode != test->mode;
-}
-
-static void xsk_reattach_xdp(struct ifobject *ifobj, struct bpf_program *xdp_prog,
- struct bpf_map *xskmap, enum test_mode mode)
-{
- int err;
-
- xsk_detach_xdp_program(ifobj->ifindex, mode_to_xdp_flags(ifobj->mode));
- err = xsk_attach_xdp_program(xdp_prog, ifobj->ifindex, mode_to_xdp_flags(mode));
- if (err) {
- ksft_print_msg("Error attaching XDP program\n");
- exit_with_error(-err);
- }
-
- if (ifobj->mode != mode && (mode == TEST_MODE_DRV || mode == TEST_MODE_ZC))
- if (!xsk_is_in_mode(ifobj->ifindex, XDP_FLAGS_DRV_MODE)) {
- ksft_print_msg("ERROR: XDP prog not in DRV mode\n");
- exit_with_error(EINVAL);
- }
-
- ifobj->xdp_prog = xdp_prog;
- ifobj->xskmap = xskmap;
- ifobj->mode = mode;
-}
-
-static void xsk_attach_xdp_progs(struct test_spec *test, struct ifobject *ifobj_rx,
- struct ifobject *ifobj_tx)
-{
- if (xdp_prog_changed_rx(test))
- xsk_reattach_xdp(ifobj_rx, test->xdp_prog_rx, test->xskmap_rx, test->mode);
-
- if (!ifobj_tx || ifobj_tx->shared_umem)
- return;
-
- if (xdp_prog_changed_tx(test))
- xsk_reattach_xdp(ifobj_tx, test->xdp_prog_tx, test->xskmap_tx, test->mode);
-}
-
-static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1,
- struct ifobject *ifobj2)
-{
- pthread_t t0, t1;
- int err;
-
- if (test->mtu > MAX_ETH_PKT_SIZE) {
- if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp ||
- (ifobj2 && !ifobj2->multi_buff_zc_supp))) {
- ksft_test_result_skip("Multi buffer for zero-copy not supported.\n");
- return TEST_SKIP;
- }
- if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp ||
- (ifobj2 && !ifobj2->multi_buff_supp))) {
- ksft_test_result_skip("Multi buffer not supported.\n");
- return TEST_SKIP;
- }
- }
- err = test_spec_set_mtu(test, test->mtu);
- if (err) {
- ksft_print_msg("Error, could not set mtu.\n");
- exit_with_error(err);
- }
-
- if (ifobj2) {
- if (pthread_barrier_init(&barr, NULL, 2))
- exit_with_error(errno);
- pkt_stream_reset(ifobj2->xsk->pkt_stream);
- }
-
- test->current_step++;
- pkt_stream_reset(ifobj1->xsk->pkt_stream);
- pkts_in_flight = 0;
-
- signal(SIGUSR1, handler);
- /*Spawn RX thread */
- pthread_create(&t0, NULL, ifobj1->func_ptr, test);
-
- if (ifobj2) {
- pthread_barrier_wait(&barr);
- if (pthread_barrier_destroy(&barr))
- exit_with_error(errno);
-
- /*Spawn TX thread */
- pthread_create(&t1, NULL, ifobj2->func_ptr, test);
-
- pthread_join(t1, NULL);
- }
-
- if (!ifobj2)
- pthread_kill(t0, SIGUSR1);
- else
- pthread_join(t0, NULL);
-
- if (test->total_steps == test->current_step || test->fail) {
- u32 i;
-
- if (ifobj2)
- for (i = 0; i < test->nb_sockets; i++)
- xsk_socket__delete(ifobj2->xsk_arr[i].xsk);
-
- for (i = 0; i < test->nb_sockets; i++)
- xsk_socket__delete(ifobj1->xsk_arr[i].xsk);
-
- testapp_clean_xsk_umem(ifobj1);
- if (ifobj2 && !ifobj2->shared_umem)
- testapp_clean_xsk_umem(ifobj2);
- }
-
- return !!test->fail;
-}
-
-static int testapp_validate_traffic(struct test_spec *test)
-{
- struct ifobject *ifobj_rx = test->ifobj_rx;
- struct ifobject *ifobj_tx = test->ifobj_tx;
-
- if ((ifobj_rx->umem->unaligned_mode && !ifobj_rx->unaligned_supp) ||
- (ifobj_tx->umem->unaligned_mode && !ifobj_tx->unaligned_supp)) {
- ksft_test_result_skip("No huge pages present.\n");
- return TEST_SKIP;
- }
-
- if (test->set_ring) {
- if (ifobj_tx->hw_ring_size_supp) {
- if (set_ring_size(ifobj_tx)) {
- ksft_test_result_skip("Failed to change HW ring size.\n");
- return TEST_FAILURE;
- }
- } else {
- ksft_test_result_skip("Changing HW ring size not supported.\n");
- return TEST_SKIP;
- }
- }
-
- xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx);
- return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx);
-}
-
-static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj)
-{
- return __testapp_validate_traffic(test, ifobj, NULL);
-}
-
-static int testapp_teardown(struct test_spec *test)
-{
- int i;
-
- for (i = 0; i < MAX_TEARDOWN_ITER; i++) {
- if (testapp_validate_traffic(test))
- return TEST_FAILURE;
- test_spec_reset(test);
- }
-
- return TEST_PASS;
-}
-
-static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2)
-{
- thread_func_t tmp_func_ptr = (*ifobj1)->func_ptr;
- struct ifobject *tmp_ifobj = (*ifobj1);
-
- (*ifobj1)->func_ptr = (*ifobj2)->func_ptr;
- (*ifobj2)->func_ptr = tmp_func_ptr;
-
- *ifobj1 = *ifobj2;
- *ifobj2 = tmp_ifobj;
-}
-
-static int testapp_bidirectional(struct test_spec *test)
-{
- int res;
-
- test->ifobj_tx->rx_on = true;
- test->ifobj_rx->tx_on = true;
- test->total_steps = 2;
- if (testapp_validate_traffic(test))
- return TEST_FAILURE;
-
- print_verbose("Switching Tx/Rx direction\n");
- swap_directions(&test->ifobj_rx, &test->ifobj_tx);
- res = __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx);
-
- swap_directions(&test->ifobj_rx, &test->ifobj_tx);
- return res;
-}
-
-static int swap_xsk_resources(struct test_spec *test)
-{
- int ret;
-
- test->ifobj_tx->xsk_arr[0].pkt_stream = NULL;
- test->ifobj_rx->xsk_arr[0].pkt_stream = NULL;
- test->ifobj_tx->xsk_arr[1].pkt_stream = test->tx_pkt_stream_default;
- test->ifobj_rx->xsk_arr[1].pkt_stream = test->rx_pkt_stream_default;
- test->ifobj_tx->xsk = &test->ifobj_tx->xsk_arr[1];
- test->ifobj_rx->xsk = &test->ifobj_rx->xsk_arr[1];
-
- ret = xsk_update_xskmap(test->ifobj_rx->xskmap, test->ifobj_rx->xsk->xsk, 0);
- if (ret)
- return TEST_FAILURE;
-
- return TEST_PASS;
-}
-
-static int testapp_xdp_prog_cleanup(struct test_spec *test)
-{
- test->total_steps = 2;
- test->nb_sockets = 2;
- if (testapp_validate_traffic(test))
- return TEST_FAILURE;
-
- if (swap_xsk_resources(test))
- return TEST_FAILURE;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_headroom(struct test_spec *test)
-{
- test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_stats_rx_dropped(struct test_spec *test)
-{
- if (test->mode == TEST_MODE_ZC) {
- ksft_test_result_skip("Can not run RX_DROPPED test for ZC mode\n");
- return TEST_SKIP;
- }
-
- pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0);
- test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size -
- XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3;
- pkt_stream_receive_half(test);
- test->ifobj_rx->validation_func = validate_rx_dropped;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_stats_tx_invalid_descs(struct test_spec *test)
-{
- pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0);
- test->ifobj_tx->validation_func = validate_tx_invalid_descs;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_stats_rx_full(struct test_spec *test)
-{
- pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE);
- test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE);
-
- test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS;
- test->ifobj_rx->release_rx = false;
- test->ifobj_rx->validation_func = validate_rx_full;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_stats_fill_empty(struct test_spec *test)
-{
- pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE);
- test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE);
-
- test->ifobj_rx->use_fill_ring = false;
- test->ifobj_rx->validation_func = validate_fill_empty;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_send_receive_unaligned(struct test_spec *test)
-{
- test->ifobj_tx->umem->unaligned_mode = true;
- test->ifobj_rx->umem->unaligned_mode = true;
- /* Let half of the packets straddle a 4K buffer boundary */
- pkt_stream_replace_half(test, MIN_PKT_SIZE, -MIN_PKT_SIZE / 2);
-
- return testapp_validate_traffic(test);
-}
-
-static int testapp_send_receive_unaligned_mb(struct test_spec *test)
-{
- test->mtu = MAX_ETH_JUMBO_SIZE;
- test->ifobj_tx->umem->unaligned_mode = true;
- test->ifobj_rx->umem->unaligned_mode = true;
- pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE);
- return testapp_validate_traffic(test);
-}
-
-static int testapp_single_pkt(struct test_spec *test)
-{
- struct pkt pkts[] = {{0, MIN_PKT_SIZE, 0, true}};
-
- pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
- return testapp_validate_traffic(test);
-}
-
-static int testapp_send_receive_mb(struct test_spec *test)
-{
- test->mtu = MAX_ETH_JUMBO_SIZE;
- pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE);
-
- return testapp_validate_traffic(test);
-}
-
-static int testapp_invalid_desc_mb(struct test_spec *test)
-{
- struct xsk_umem_info *umem = test->ifobj_tx->umem;
- u64 umem_size = umem->num_frames * umem->frame_size;
- struct pkt pkts[] = {
- /* Valid packet for synch to start with */
- {0, MIN_PKT_SIZE, 0, true, 0},
- /* Zero frame len is not legal */
- {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- {0, 0, 0, false, 0},
- /* Invalid address in the second frame */
- {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- /* Invalid len in the middle */
- {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- /* Invalid options in the middle */
- {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION},
- /* Transmit 2 frags, receive 3 */
- {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD},
- {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0},
- /* Middle frame crosses chunk boundary with small length */
- {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
- {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0},
- /* Valid packet for synch so that something is received */
- {0, MIN_PKT_SIZE, 0, true, 0}};
-
- if (umem->unaligned_mode) {
- /* Crossing a chunk boundary allowed */
- pkts[12].valid = true;
- pkts[13].valid = true;
- }
-
- test->mtu = MAX_ETH_JUMBO_SIZE;
- pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
- return testapp_validate_traffic(test);
-}
-
-static int testapp_invalid_desc(struct test_spec *test)
-{
- struct xsk_umem_info *umem = test->ifobj_tx->umem;
- u64 umem_size = umem->num_frames * umem->frame_size;
- struct pkt pkts[] = {
- /* Zero packet address allowed */
- {0, MIN_PKT_SIZE, 0, true},
- /* Allowed packet */
- {0, MIN_PKT_SIZE, 0, true},
- /* Straddling the start of umem */
- {-2, MIN_PKT_SIZE, 0, false},
- /* Packet too large */
- {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false},
- /* Up to end of umem allowed */
- {umem_size - MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true},
- /* After umem ends */
- {umem_size, MIN_PKT_SIZE, 0, false},
- /* Straddle the end of umem */
- {umem_size - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false},
- /* Straddle a 4K boundary */
- {0x1000 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false},
- /* Straddle a 2K boundary */
- {0x800 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, true},
- /* Valid packet for synch so that something is received */
- {0, MIN_PKT_SIZE, 0, true}};
-
- if (umem->unaligned_mode) {
- /* Crossing a page boundary allowed */
- pkts[7].valid = true;
- }
- if (umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) {
- /* Crossing a 2K frame size boundary not allowed */
- pkts[8].valid = false;
- }
-
- if (test->ifobj_tx->shared_umem) {
- pkts[4].offset += umem_size;
- pkts[5].offset += umem_size;
- pkts[6].offset += umem_size;
- }
-
- pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
- return testapp_validate_traffic(test);
-}
-
-static int testapp_xdp_drop(struct test_spec *test)
-{
- struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs;
- struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs;
-
- test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_drop, skel_tx->progs.xsk_xdp_drop,
- skel_rx->maps.xsk, skel_tx->maps.xsk);
-
- pkt_stream_receive_half(test);
- return testapp_validate_traffic(test);
-}
-
-static int testapp_xdp_metadata_copy(struct test_spec *test)
-{
- struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs;
- struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs;
-
- test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata,
- skel_tx->progs.xsk_xdp_populate_metadata,
- skel_rx->maps.xsk, skel_tx->maps.xsk);
- test->ifobj_rx->use_metadata = true;
-
- skel_rx->bss->count = 0;
-
- return testapp_validate_traffic(test);
-}
-
-static int testapp_xdp_shared_umem(struct test_spec *test)
-{
- struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs;
- struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs;
-
- test->total_steps = 1;
- test->nb_sockets = 2;
-
- test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_shared_umem,
- skel_tx->progs.xsk_xdp_shared_umem,
- skel_rx->maps.xsk, skel_tx->maps.xsk);
-
- pkt_stream_even_odd_sequence(test);
-
- return testapp_validate_traffic(test);
-}
-
-static int testapp_poll_txq_tmout(struct test_spec *test)
-{
- test->ifobj_tx->use_poll = true;
- /* create invalid frame by set umem frame_size and pkt length equal to 2048 */
- test->ifobj_tx->umem->frame_size = 2048;
- pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048);
- return testapp_validate_traffic_single_thread(test, test->ifobj_tx);
-}
-
-static int testapp_poll_rxq_tmout(struct test_spec *test)
-{
- test->ifobj_rx->use_poll = true;
- return testapp_validate_traffic_single_thread(test, test->ifobj_rx);
-}
-
-static int testapp_too_many_frags(struct test_spec *test)
-{
- struct pkt *pkts;
- u32 max_frags, i;
- int ret;
-
- if (test->mode == TEST_MODE_ZC) {
- max_frags = test->ifobj_tx->xdp_zc_max_segs;
- } else {
- max_frags = get_max_skb_frags();
- if (!max_frags) {
- ksft_print_msg("Couldn't retrieve MAX_SKB_FRAGS from system, using default (17) value\n");
- max_frags = 17;
- }
- max_frags += 1;
- }
-
- pkts = calloc(2 * max_frags + 2, sizeof(struct pkt));
- if (!pkts)
- return TEST_FAILURE;
-
- test->mtu = MAX_ETH_JUMBO_SIZE;
-
- /* Valid packet for synch */
- pkts[0].len = MIN_PKT_SIZE;
- pkts[0].valid = true;
-
- /* One valid packet with the max amount of frags */
- for (i = 1; i < max_frags + 1; i++) {
- pkts[i].len = MIN_PKT_SIZE;
- pkts[i].options = XDP_PKT_CONTD;
- pkts[i].valid = true;
- }
- pkts[max_frags].options = 0;
-
- /* An invalid packet with the max amount of frags but signals packet
- * continues on the last frag
- */
- for (i = max_frags + 1; i < 2 * max_frags + 1; i++) {
- pkts[i].len = MIN_PKT_SIZE;
- pkts[i].options = XDP_PKT_CONTD;
- pkts[i].valid = false;
- }
-
- /* Valid packet for synch */
- pkts[2 * max_frags + 1].len = MIN_PKT_SIZE;
- pkts[2 * max_frags + 1].valid = true;
-
- pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2);
- ret = testapp_validate_traffic(test);
-
- free(pkts);
- return ret;
-}
-
-static int xsk_load_xdp_programs(struct ifobject *ifobj)
-{
- ifobj->xdp_progs = xsk_xdp_progs__open_and_load();
- if (libbpf_get_error(ifobj->xdp_progs))
- return libbpf_get_error(ifobj->xdp_progs);
-
- return 0;
-}
-
static void xsk_unload_xdp_programs(struct ifobject *ifobj)
{
xsk_xdp_progs__destroy(ifobj->xdp_progs);
}
-/* Simple test */
-static bool hugepages_present(void)
-{
- size_t mmap_sz = 2 * DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE;
- void *bufs;
-
- bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, MAP_HUGE_2MB);
- if (bufs == MAP_FAILED)
- return false;
-
- mmap_sz = ceil_u64(mmap_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE;
- munmap(bufs, mmap_sz);
- return true;
-}
-
-static void init_iface(struct ifobject *ifobj, thread_func_t func_ptr)
-{
- LIBBPF_OPTS(bpf_xdp_query_opts, query_opts);
- int err;
-
- ifobj->func_ptr = func_ptr;
-
- err = xsk_load_xdp_programs(ifobj);
- if (err) {
- ksft_print_msg("Error loading XDP program\n");
- exit_with_error(err);
- }
-
- if (hugepages_present())
- ifobj->unaligned_supp = true;
-
- err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts);
- if (err) {
- ksft_print_msg("Error querying XDP capabilities\n");
- exit_with_error(-err);
- }
- if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG)
- ifobj->multi_buff_supp = true;
- if (query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
- if (query_opts.xdp_zc_max_segs > 1) {
- ifobj->multi_buff_zc_supp = true;
- ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs;
- } else {
- ifobj->xdp_zc_max_segs = 0;
- }
- }
-}
-
-static int testapp_send_receive(struct test_spec *test)
-{
- return testapp_validate_traffic(test);
-}
-
-static int testapp_send_receive_2k_frame(struct test_spec *test)
-{
- test->ifobj_tx->umem->frame_size = 2048;
- test->ifobj_rx->umem->frame_size = 2048;
- pkt_stream_replace(test, DEFAULT_PKT_CNT, MIN_PKT_SIZE);
- return testapp_validate_traffic(test);
-}
-
-static int testapp_poll_rx(struct test_spec *test)
-{
- test->ifobj_rx->use_poll = true;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_poll_tx(struct test_spec *test)
-{
- test->ifobj_tx->use_poll = true;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_aligned_inv_desc(struct test_spec *test)
-{
- return testapp_invalid_desc(test);
-}
-
-static int testapp_aligned_inv_desc_2k_frame(struct test_spec *test)
-{
- test->ifobj_tx->umem->frame_size = 2048;
- test->ifobj_rx->umem->frame_size = 2048;
- return testapp_invalid_desc(test);
-}
-
-static int testapp_unaligned_inv_desc(struct test_spec *test)
-{
- test->ifobj_tx->umem->unaligned_mode = true;
- test->ifobj_rx->umem->unaligned_mode = true;
- return testapp_invalid_desc(test);
-}
-
-static int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test)
-{
- u64 page_size, umem_size;
-
- /* Odd frame size so the UMEM doesn't end near a page boundary. */
- test->ifobj_tx->umem->frame_size = 4001;
- test->ifobj_rx->umem->frame_size = 4001;
- test->ifobj_tx->umem->unaligned_mode = true;
- test->ifobj_rx->umem->unaligned_mode = true;
- /* This test exists to test descriptors that staddle the end of
- * the UMEM but not a page.
- */
- page_size = sysconf(_SC_PAGESIZE);
- umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size;
- assert(umem_size % page_size > MIN_PKT_SIZE);
- assert(umem_size % page_size < page_size - MIN_PKT_SIZE);
-
- return testapp_invalid_desc(test);
-}
-
-static int testapp_aligned_inv_desc_mb(struct test_spec *test)
-{
- return testapp_invalid_desc_mb(test);
-}
-
-static int testapp_unaligned_inv_desc_mb(struct test_spec *test)
-{
- test->ifobj_tx->umem->unaligned_mode = true;
- test->ifobj_rx->umem->unaligned_mode = true;
- return testapp_invalid_desc_mb(test);
-}
-
-static int testapp_xdp_metadata(struct test_spec *test)
-{
- return testapp_xdp_metadata_copy(test);
-}
-
-static int testapp_xdp_metadata_mb(struct test_spec *test)
-{
- test->mtu = MAX_ETH_JUMBO_SIZE;
- return testapp_xdp_metadata_copy(test);
-}
-
-static int testapp_hw_sw_min_ring_size(struct test_spec *test)
-{
- int ret;
-
- test->set_ring = true;
- test->total_steps = 2;
- test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE;
- test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2;
- test->ifobj_tx->xsk->batch_size = 1;
- test->ifobj_rx->xsk->batch_size = 1;
- ret = testapp_validate_traffic(test);
- if (ret)
- return ret;
-
- /* Set batch size to hw_ring_size - 1 */
- test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1;
- test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1;
- return testapp_validate_traffic(test);
-}
-
-static int testapp_hw_sw_max_ring_size(struct test_spec *test)
-{
- u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 4;
- int ret;
-
- test->set_ring = true;
- test->total_steps = 2;
- test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending;
- test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending;
- test->ifobj_rx->umem->num_frames = max_descs;
- test->ifobj_rx->umem->fill_size = max_descs;
- test->ifobj_rx->umem->comp_size = max_descs;
- test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
- test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
-
- ret = testapp_validate_traffic(test);
- if (ret)
- return ret;
-
- /* Set batch_size to 8152 for testing, as the ice HW ignores the 3 lowest bits when
- * updating the Rx HW tail register.
- */
- test->ifobj_tx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8;
- test->ifobj_rx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8;
- pkt_stream_replace(test, max_descs, MIN_PKT_SIZE);
- return testapp_validate_traffic(test);
-}
-
-static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value)
-{
- struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs;
- struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs;
-
- test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_adjust_tail,
- skel_tx->progs.xsk_xdp_adjust_tail,
- skel_rx->maps.xsk, skel_tx->maps.xsk);
-
- skel_rx->bss->adjust_value = adjust_value;
-
- return testapp_validate_traffic(test);
-}
-
-static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len)
-{
- int ret;
-
- test->adjust_tail_support = true;
- test->adjust_tail = true;
- test->total_steps = 1;
-
- pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len);
- pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value);
-
- ret = testapp_xdp_adjust_tail(test, value);
- if (ret)
- return ret;
-
- if (!test->adjust_tail_support) {
- ksft_test_result_skip("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n",
- mode_string(test), busy_poll_string(test));
- return TEST_SKIP;
- }
-
- return 0;
-}
-
-static int testapp_adjust_tail_shrink(struct test_spec *test)
-{
- /* Shrink by 4 bytes for testing purpose */
- return testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2);
-}
-
-static int testapp_adjust_tail_shrink_mb(struct test_spec *test)
-{
- test->mtu = MAX_ETH_JUMBO_SIZE;
- /* Shrink by the frag size */
- return testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE, XSK_UMEM__LARGE_FRAME_SIZE * 2);
-}
-
-static int testapp_adjust_tail_grow(struct test_spec *test)
-{
- /* Grow by 4 bytes for testing purpose */
- return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2);
-}
-
-static int testapp_adjust_tail_grow_mb(struct test_spec *test)
-{
- test->mtu = MAX_ETH_JUMBO_SIZE;
- /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */
- return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1,
- XSK_UMEM__LARGE_FRAME_SIZE * 2);
-}
-
-static int testapp_tx_queue_consumer(struct test_spec *test)
-{
- int nr_packets;
-
- if (test->mode == TEST_MODE_ZC) {
- ksft_test_result_skip("Can not run TX_QUEUE_CONSUMER test for ZC mode\n");
- return TEST_SKIP;
- }
-
- nr_packets = MAX_TX_BUDGET_DEFAULT + 1;
- pkt_stream_replace(test, nr_packets, MIN_PKT_SIZE);
- test->ifobj_tx->xsk->batch_size = nr_packets;
- test->ifobj_tx->xsk->check_consumer = true;
-
- return testapp_validate_traffic(test);
-}
-
static void run_pkt_test(struct test_spec *test)
{
int ret;
ret = test->test_func(test);
- if (ret == TEST_PASS)
+ switch (ret) {
+ case TEST_PASS:
ksft_test_result_pass("PASS: %s %s%s\n", mode_string(test), busy_poll_string(test),
test->name);
- pkt_stream_restore_default(test);
-}
-
-static struct ifobject *ifobject_create(void)
-{
- struct ifobject *ifobj;
-
- ifobj = calloc(1, sizeof(struct ifobject));
- if (!ifobj)
- return NULL;
-
- ifobj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->xsk_arr));
- if (!ifobj->xsk_arr)
- goto out_xsk_arr;
-
- ifobj->umem = calloc(1, sizeof(*ifobj->umem));
- if (!ifobj->umem)
- goto out_umem;
-
- return ifobj;
-
-out_umem:
- free(ifobj->xsk_arr);
-out_xsk_arr:
- free(ifobj);
- return NULL;
-}
+ break;
+ case TEST_SKIP:
+ ksft_test_result_skip("SKIP: %s %s%s\n", mode_string(test), busy_poll_string(test),
+ test->name);
+ break;
+ case TEST_FAILURE:
+ ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test),
+ test->name);
+ break;
+ default:
+ ksft_test_result_fail("FAIL: %s %s%s -- Unexpected returned value (%d)\n",
+ mode_string(test), busy_poll_string(test), test->name, ret);
+ }
-static void ifobject_delete(struct ifobject *ifobj)
-{
- free(ifobj->umem);
- free(ifobj->xsk_arr);
- free(ifobj);
+ pkt_stream_restore_default(test);
}
static bool is_xdp_supported(int ifindex)
@@ -2726,47 +319,6 @@ static bool is_xdp_supported(int ifindex)
return true;
}
-static const struct test_spec tests[] = {
- {.name = "SEND_RECEIVE", .test_func = testapp_send_receive},
- {.name = "SEND_RECEIVE_2K_FRAME", .test_func = testapp_send_receive_2k_frame},
- {.name = "SEND_RECEIVE_SINGLE_PKT", .test_func = testapp_single_pkt},
- {.name = "POLL_RX", .test_func = testapp_poll_rx},
- {.name = "POLL_TX", .test_func = testapp_poll_tx},
- {.name = "POLL_RXQ_FULL", .test_func = testapp_poll_rxq_tmout},
- {.name = "POLL_TXQ_FULL", .test_func = testapp_poll_txq_tmout},
- {.name = "SEND_RECEIVE_UNALIGNED", .test_func = testapp_send_receive_unaligned},
- {.name = "ALIGNED_INV_DESC", .test_func = testapp_aligned_inv_desc},
- {.name = "ALIGNED_INV_DESC_2K_FRAME_SIZE", .test_func = testapp_aligned_inv_desc_2k_frame},
- {.name = "UNALIGNED_INV_DESC", .test_func = testapp_unaligned_inv_desc},
- {.name = "UNALIGNED_INV_DESC_4001_FRAME_SIZE",
- .test_func = testapp_unaligned_inv_desc_4001_frame},
- {.name = "UMEM_HEADROOM", .test_func = testapp_headroom},
- {.name = "TEARDOWN", .test_func = testapp_teardown},
- {.name = "BIDIRECTIONAL", .test_func = testapp_bidirectional},
- {.name = "STAT_RX_DROPPED", .test_func = testapp_stats_rx_dropped},
- {.name = "STAT_TX_INVALID", .test_func = testapp_stats_tx_invalid_descs},
- {.name = "STAT_RX_FULL", .test_func = testapp_stats_rx_full},
- {.name = "STAT_FILL_EMPTY", .test_func = testapp_stats_fill_empty},
- {.name = "XDP_PROG_CLEANUP", .test_func = testapp_xdp_prog_cleanup},
- {.name = "XDP_DROP_HALF", .test_func = testapp_xdp_drop},
- {.name = "XDP_SHARED_UMEM", .test_func = testapp_xdp_shared_umem},
- {.name = "XDP_METADATA_COPY", .test_func = testapp_xdp_metadata},
- {.name = "XDP_METADATA_COPY_MULTI_BUFF", .test_func = testapp_xdp_metadata_mb},
- {.name = "SEND_RECEIVE_9K_PACKETS", .test_func = testapp_send_receive_mb},
- {.name = "SEND_RECEIVE_UNALIGNED_9K_PACKETS",
- .test_func = testapp_send_receive_unaligned_mb},
- {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb},
- {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb},
- {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags},
- {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size},
- {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size},
- {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink},
- {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb},
- {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow},
- {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb},
- {.name = "TX_QUEUE_CONSUMER", .test_func = testapp_tx_queue_consumer},
- };
-
static void print_tests(void)
{
u32 i;
@@ -2774,10 +326,13 @@ static void print_tests(void)
printf("Tests:\n");
for (i = 0; i < ARRAY_SIZE(tests); i++)
printf("%u: %s\n", i, tests[i].name);
+ for (i = ARRAY_SIZE(tests); i < ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests); i++)
+ printf("%u: %s\n", i, ci_skip_tests[i - ARRAY_SIZE(tests)].name);
}
int main(int argc, char **argv)
{
+ const size_t total_tests = ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests);
struct pkt_stream *rx_pkt_stream_default;
struct pkt_stream *tx_pkt_stream_default;
struct ifobject *ifobj_tx, *ifobj_rx;
@@ -2805,7 +360,7 @@ int main(int argc, char **argv)
print_tests();
ksft_exit_xpass();
}
- if (opt_run_test != RUN_ALL_TESTS && opt_run_test >= ARRAY_SIZE(tests)) {
+ if (opt_run_test != RUN_ALL_TESTS && opt_run_test >= total_tests) {
ksft_print_msg("Error: test %u does not exist.\n", opt_run_test);
ksft_exit_xfail();
}
@@ -2830,10 +385,13 @@ int main(int argc, char **argv)
ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending;
}
- init_iface(ifobj_rx, worker_testapp_validate_rx);
- init_iface(ifobj_tx, worker_testapp_validate_tx);
+ if (init_iface(ifobj_rx, worker_testapp_validate_rx) ||
+ init_iface(ifobj_tx, worker_testapp_validate_tx)) {
+ ksft_print_msg("Error : can't initialize interfaces\n");
+ ksft_exit_xfail();
+ }
- test_spec_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]);
+ test_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]);
tx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE);
rx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE);
if (!tx_pkt_stream_default || !rx_pkt_stream_default)
@@ -2842,7 +400,7 @@ int main(int argc, char **argv)
test.rx_pkt_stream_default = rx_pkt_stream_default;
if (opt_run_test == RUN_ALL_TESTS)
- nb_tests = ARRAY_SIZE(tests);
+ nb_tests = total_tests;
else
nb_tests = 1;
if (opt_mode == TEST_MODE_ALL) {
@@ -2864,11 +422,15 @@ int main(int argc, char **argv)
if (opt_mode != TEST_MODE_ALL && i != opt_mode)
continue;
- for (j = 0; j < ARRAY_SIZE(tests); j++) {
+ for (j = 0; j < total_tests; j++) {
if (opt_run_test != RUN_ALL_TESTS && j != opt_run_test)
continue;
- test_spec_init(&test, ifobj_tx, ifobj_rx, i, &tests[j]);
+ if (j < ARRAY_SIZE(tests))
+ test_init(&test, ifobj_tx, ifobj_rx, i, &tests[j]);
+ else
+ test_init(&test, ifobj_tx, ifobj_rx, i,
+ &ci_skip_tests[j - ARRAY_SIZE(tests)]);
run_pkt_test(&test);
usleep(USLEEP_MAX);
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index 4df3a5d329ac..3ca518df23ad 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -22,169 +22,13 @@
#define PF_XDP AF_XDP
#endif
-#ifndef SO_BUSY_POLL_BUDGET
-#define SO_BUSY_POLL_BUDGET 70
-#endif
-
-#ifndef SO_PREFER_BUSY_POLL
-#define SO_PREFER_BUSY_POLL 69
-#endif
-
-#define TEST_PASS 0
-#define TEST_FAILURE -1
-#define TEST_CONTINUE 1
-#define TEST_SKIP 2
-#define MAX_INTERFACES 2
-#define MAX_INTERFACE_NAME_CHARS 16
-#define MAX_TEST_NAME_SIZE 48
#define MAX_TEARDOWN_ITER 10
-#define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2) /* Just to align the data in the packet */
-#define MIN_PKT_SIZE 64
-#define MAX_ETH_PKT_SIZE 1518
#define MAX_ETH_JUMBO_SIZE 9000
-#define USLEEP_MAX 10000
#define SOCK_RECONF_CTR 10
-#define DEFAULT_BATCH_SIZE 64
-#define POLL_TMOUT 1000
-#define THREAD_TMOUT 3
-#define DEFAULT_PKT_CNT (4 * 1024)
-#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4)
#define RX_FULL_RXQSIZE 32
#define UMEM_HEADROOM_TEST_SIZE 128
#define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1)
-#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024)
-#define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024)
-#define XSK_DESC__INVALID_OPTION (0xffff)
-#define HUGEPAGE_SIZE (2 * 1024 * 1024)
-#define PKT_DUMP_NB_TO_PRINT 16
#define RUN_ALL_TESTS UINT_MAX
#define NUM_MAC_ADDRESSES 4
-#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0)
-
-enum test_mode {
- TEST_MODE_SKB,
- TEST_MODE_DRV,
- TEST_MODE_ZC,
- TEST_MODE_ALL
-};
-
-struct xsk_umem_info {
- struct xsk_ring_prod fq;
- struct xsk_ring_cons cq;
- struct xsk_umem *umem;
- u64 next_buffer;
- u32 num_frames;
- u32 frame_headroom;
- void *buffer;
- u32 frame_size;
- u32 base_addr;
- u32 fill_size;
- u32 comp_size;
- bool unaligned_mode;
-};
-
-struct xsk_socket_info {
- struct xsk_ring_cons rx;
- struct xsk_ring_prod tx;
- struct xsk_umem_info *umem;
- struct xsk_socket *xsk;
- struct pkt_stream *pkt_stream;
- u32 outstanding_tx;
- u32 rxqsize;
- u32 batch_size;
- u8 dst_mac[ETH_ALEN];
- u8 src_mac[ETH_ALEN];
- bool check_consumer;
-};
-
-struct pkt {
- int offset;
- u32 len;
- u32 pkt_nb;
- bool valid;
- u16 options;
-};
-
-struct pkt_stream {
- u32 nb_pkts;
- u32 current_pkt_nb;
- struct pkt *pkts;
- u32 max_pkt_len;
- u32 nb_rx_pkts;
- u32 nb_valid_entries;
- bool verbatim;
-};
-
-struct set_hw_ring {
- u32 default_tx;
- u32 default_rx;
-};
-
-struct ifobject;
-struct test_spec;
-typedef int (*validation_func_t)(struct ifobject *ifobj);
-typedef void *(*thread_func_t)(void *arg);
-typedef int (*test_func_t)(struct test_spec *test);
-
-struct ifobject {
- char ifname[MAX_INTERFACE_NAME_CHARS];
- struct xsk_socket_info *xsk;
- struct xsk_socket_info *xsk_arr;
- struct xsk_umem_info *umem;
- thread_func_t func_ptr;
- validation_func_t validation_func;
- struct xsk_xdp_progs *xdp_progs;
- struct bpf_map *xskmap;
- struct bpf_program *xdp_prog;
- struct ethtool_ringparam ring;
- struct set_hw_ring set_ring;
- enum test_mode mode;
- int ifindex;
- int mtu;
- u32 bind_flags;
- u32 xdp_zc_max_segs;
- bool tx_on;
- bool rx_on;
- bool use_poll;
- bool busy_poll;
- bool use_fill_ring;
- bool release_rx;
- bool shared_umem;
- bool use_metadata;
- bool unaligned_supp;
- bool multi_buff_supp;
- bool multi_buff_zc_supp;
- bool hw_ring_size_supp;
-};
-
-struct test_spec {
- struct ifobject *ifobj_tx;
- struct ifobject *ifobj_rx;
- struct pkt_stream *tx_pkt_stream_default;
- struct pkt_stream *rx_pkt_stream_default;
- struct bpf_program *xdp_prog_rx;
- struct bpf_program *xdp_prog_tx;
- struct bpf_map *xskmap_rx;
- struct bpf_map *xskmap_tx;
- test_func_t test_func;
- int mtu;
- u16 total_steps;
- u16 current_step;
- u16 nb_sockets;
- bool fail;
- bool set_ring;
- bool adjust_tail;
- bool adjust_tail_support;
- enum test_mode mode;
- char name[MAX_TEST_NAME_SIZE];
-};
-
-pthread_barrier_t barr;
-pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-int pkts_in_flight;
-
-static const u8 g_mac[ETH_ALEN] = {0x55, 0x44, 0x33, 0x22, 0x11, 0x00};
-
#endif /* XSKXCEIVER_H_ */
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index a360e2eb2eef..1d778c8b7764 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -923,8 +923,10 @@ struct corecg_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) {
if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME))
ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n");
@@ -946,12 +948,11 @@ post_v2_setup:
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
cleanup_named_v1_root(root);
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
index d54e2317efff..b1b30e82dd7c 100644
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -796,8 +796,10 @@ struct cpucg_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -814,11 +816,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c
index 4034d14ba69a..8086d2ea394f 100644
--- a/tools/testing/selftests/cgroup/test_cpuset.c
+++ b/tools/testing/selftests/cgroup/test_cpuset.c
@@ -247,8 +247,10 @@ struct cpuset_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -265,11 +267,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c
index dfb763819581..465cdad2bfca 100644
--- a/tools/testing/selftests/cgroup/test_freezer.c
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -1488,8 +1488,10 @@ struct cgfreezer_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
@@ -1501,11 +1503,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c
index 0e5bb6c7307a..ed590b150a17 100644
--- a/tools/testing/selftests/cgroup/test_kill.c
+++ b/tools/testing/selftests/cgroup/test_kill.c
@@ -274,8 +274,10 @@ struct cgkill_test {
int main(int argc, char *argv[])
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
@@ -287,11 +289,10 @@ int main(int argc, char *argv[])
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
index 63b3c9aad399..d4c4a514ee43 100644
--- a/tools/testing/selftests/cgroup/test_kmem.c
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -421,8 +421,10 @@ struct kmem_test {
int main(int argc, char **argv)
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -446,11 +448,10 @@ int main(int argc, char **argv)
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index a680f773f2d5..b117325c0439 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -1650,8 +1650,10 @@ struct memcg_test {
int main(int argc, char **argv)
{
char root[PATH_MAX];
- int i, proc_status, ret = EXIT_SUCCESS;
+ int i, proc_status;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -1685,11 +1687,10 @@ int main(int argc, char **argv)
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
index e1f578ca2841..86a8930b47e3 100644
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -597,8 +597,10 @@ static bool zswap_configured(void)
int main(int argc, char **argv)
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i;
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
@@ -625,11 +627,10 @@ int main(int argc, char **argv)
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
- ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
- return ret;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/coredump/.gitignore b/tools/testing/selftests/coredump/.gitignore
new file mode 100644
index 000000000000..097f52db0be9
--- /dev/null
+++ b/tools/testing/selftests/coredump/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+stackdump_test
+coredump_socket_test
+coredump_socket_protocol_test
diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile
index 77b3665c73c7..dece1a31d561 100644
--- a/tools/testing/selftests/coredump/Makefile
+++ b/tools/testing/selftests/coredump/Makefile
@@ -1,7 +1,13 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
-TEST_GEN_PROGS := stackdump_test
+TEST_GEN_PROGS := stackdump_test \
+ coredump_socket_test \
+ coredump_socket_protocol_test
TEST_FILES := stackdump
include ../lib.mk
+
+$(OUTPUT)/stackdump_test: coredump_test_helpers.c
+$(OUTPUT)/coredump_socket_test: coredump_test_helpers.c
+$(OUTPUT)/coredump_socket_protocol_test: coredump_test_helpers.c
diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c
new file mode 100644
index 000000000000..d19b6717c53e
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c
@@ -0,0 +1,1568 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/stat.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "coredump_test.h"
+
+#define NUM_CRASHING_COREDUMPS 5
+
+FIXTURE_SETUP(coredump)
+{
+ FILE *file;
+ int ret;
+
+ self->pid_coredump_server = -ESRCH;
+ self->fd_tmpfs_detached = -1;
+ file = fopen("/proc/sys/kernel/core_pattern", "r");
+ ASSERT_NE(NULL, file);
+
+ ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
+ ASSERT_TRUE(ret || feof(file));
+ ASSERT_LT(ret, sizeof(self->original_core_pattern));
+
+ self->original_core_pattern[ret] = '\0';
+ self->fd_tmpfs_detached = create_detached_tmpfs();
+ ASSERT_GE(self->fd_tmpfs_detached, 0);
+
+ ret = fclose(file);
+ ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(coredump)
+{
+ const char *reason;
+ FILE *file;
+ int ret, status;
+
+ if (self->pid_coredump_server > 0) {
+ kill(self->pid_coredump_server, SIGTERM);
+ waitpid(self->pid_coredump_server, &status, 0);
+ }
+ unlink("/tmp/coredump.file");
+ unlink("/tmp/coredump.socket");
+
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ if (!file) {
+ reason = "Unable to open core_pattern";
+ goto fail;
+ }
+
+ ret = fprintf(file, "%s", self->original_core_pattern);
+ if (ret < 0) {
+ reason = "Unable to write to core_pattern";
+ goto fail;
+ }
+
+ ret = fclose(file);
+ if (ret) {
+ reason = "Unable to close core_pattern";
+ goto fail;
+ }
+
+ if (self->fd_tmpfs_detached >= 0) {
+ ret = close(self->fd_tmpfs_detached);
+ if (ret < 0) {
+ reason = "Unable to close detached tmpfs";
+ goto fail;
+ }
+ self->fd_tmpfs_detached = -1;
+ }
+
+ return;
+fail:
+ /* This should never happen */
+ fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason);
+}
+
+TEST_F(coredump, socket_request_kernel)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_kernel: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_kernel: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_kernel: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_kernel: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_kernel: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_kernel: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_kernel: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ fd_core_file = creat("/tmp/coredump.file", 0644);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_request_kernel: creat coredump file failed: %m\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_kernel: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_kernel: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_kernel: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_kernel: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_kernel: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ if (bytes_write < 0 && errno == ENOSPC)
+ continue;
+ fprintf(stderr, "socket_request_kernel: write to core file failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_kernel: completed successfully\n");
+out:
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+ ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_GT(st.st_size, 0);
+ system("file /tmp/coredump.file");
+}
+
+TEST_F(coredump, socket_request_userspace)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_userspace: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_userspace: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_userspace: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_userspace: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_userspace: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_userspace: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_userspace: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_userspace: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_userspace: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_userspace: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_userspace: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read > 0) {
+ fprintf(stderr, "socket_request_userspace: unexpected data received (expected no coredump data)\n");
+ goto out;
+ }
+
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_userspace: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_userspace: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_reject)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_reject: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_reject: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_reject: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_reject: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_reject: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_reject: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_reject: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_reject: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_reject: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_reject: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_reject: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read > 0) {
+ fprintf(stderr, "socket_request_reject: unexpected data received (expected no coredump data for REJECT)\n");
+ goto out;
+ }
+
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_reject: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_reject: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: conflicting flags in the coredump ack.
+ *
+ * The coredump server answers the kernel's request with
+ * COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT and expects to read
+ * the COREDUMP_MARK_CONFLICTING marker back. The parent verifies that the
+ * crashing child was killed by a signal without WCOREDUMP being reported,
+ * while its pidfd info still carries PIDFD_COREDUMPED.
+ */
+TEST_F(coredump, socket_request_invalid_flag_combination)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		struct coredump_req req = {};
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing a child. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		if (!read_coredump_req(fd_coredump, &req)) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: read_coredump_req failed\n");
+			goto out;
+		}
+
+		if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+					COREDUMP_KERNEL | COREDUMP_USERSPACE |
+					COREDUMP_REJECT | COREDUMP_WAIT)) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: check_coredump_req failed\n");
+			goto out;
+		}
+
+		/* KERNEL and REJECT together form the flag combination under test. */
+		if (!send_coredump_ack(fd_coredump, &req,
+				       COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: send_coredump_ack failed\n");
+			goto out;
+		}
+
+		if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) {
+			fprintf(stderr, "socket_request_invalid_flag_combination: read_marker COREDUMP_MARK_CONFLICTING failed\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_request_invalid_flag_combination: completed successfully\n");
+out:
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Check waitpid() itself: on failure status would be read uninitialized. */
+	ASSERT_GE(waitpid(pid, &status, 0), 0);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_FALSE(WCOREDUMP(status));
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+	ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: unknown flag bit in the coredump ack.
+ *
+ * The coredump server answers the kernel's request with only bit 63 set
+ * in the ack mask and expects to read the COREDUMP_MARK_UNSUPPORTED
+ * marker back. The parent verifies that the crashing child was killed by
+ * a signal without WCOREDUMP being reported, while its pidfd info still
+ * carries PIDFD_COREDUMPED.
+ */
+TEST_F(coredump, socket_request_unknown_flag)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		struct coredump_req req = {};
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_request_unknown_flag: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing a child. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_request_unknown_flag: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_request_unknown_flag: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_request_unknown_flag: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_request_unknown_flag: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_request_unknown_flag: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_request_unknown_flag: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		if (!read_coredump_req(fd_coredump, &req)) {
+			fprintf(stderr, "socket_request_unknown_flag: read_coredump_req failed\n");
+			goto out;
+		}
+
+		if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+					COREDUMP_KERNEL | COREDUMP_USERSPACE |
+					COREDUMP_REJECT | COREDUMP_WAIT)) {
+			fprintf(stderr, "socket_request_unknown_flag: check_coredump_req failed\n");
+			goto out;
+		}
+
+		/* Bit 63 is not part of the mask validated above; it is the flag under test. */
+		if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) {
+			fprintf(stderr, "socket_request_unknown_flag: send_coredump_ack failed\n");
+			goto out;
+		}
+
+		if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) {
+			fprintf(stderr, "socket_request_unknown_flag: read_marker COREDUMP_MARK_UNSUPPORTED failed\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_request_unknown_flag: completed successfully\n");
+out:
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Check waitpid() itself: on failure status would be read uninitialized. */
+	ASSERT_GE(waitpid(pid, &status, 0), 0);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_FALSE(WCOREDUMP(status));
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+	ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: coredump ack with a size that is too small.
+ *
+ * The coredump server answers the kernel's request with an ack size of
+ * COREDUMP_ACK_SIZE_VER0 / 2 and expects to read the COREDUMP_MARK_MINSIZE
+ * marker back. The parent verifies that the crashing child was killed by
+ * a signal without WCOREDUMP being reported, while its pidfd info still
+ * carries PIDFD_COREDUMPED.
+ */
+TEST_F(coredump, socket_request_invalid_size_small)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		struct coredump_req req = {};
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_request_invalid_size_small: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing a child. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_request_invalid_size_small: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_request_invalid_size_small: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_request_invalid_size_small: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_request_invalid_size_small: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_request_invalid_size_small: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_request_invalid_size_small: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		if (!read_coredump_req(fd_coredump, &req)) {
+			fprintf(stderr, "socket_request_invalid_size_small: read_coredump_req failed\n");
+			goto out;
+		}
+
+		if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+					COREDUMP_KERNEL | COREDUMP_USERSPACE |
+					COREDUMP_REJECT | COREDUMP_WAIT)) {
+			fprintf(stderr, "socket_request_invalid_size_small: check_coredump_req failed\n");
+			goto out;
+		}
+
+		/* Half of the version 0 ack size is the undersized value under test. */
+		if (!send_coredump_ack(fd_coredump, &req,
+				       COREDUMP_REJECT | COREDUMP_WAIT,
+				       COREDUMP_ACK_SIZE_VER0 / 2)) {
+			fprintf(stderr, "socket_request_invalid_size_small: send_coredump_ack failed\n");
+			goto out;
+		}
+
+		if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) {
+			fprintf(stderr, "socket_request_invalid_size_small: read_marker COREDUMP_MARK_MINSIZE failed\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_request_invalid_size_small: completed successfully\n");
+out:
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Check waitpid() itself: on failure status would be read uninitialized. */
+	ASSERT_GE(waitpid(pid, &status, 0), 0);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_FALSE(WCOREDUMP(status));
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+	ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: coredump ack with a size that is too large.
+ *
+ * The coredump server answers the kernel's request with an ack size of
+ * COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE and expects to read the
+ * COREDUMP_MARK_MAXSIZE marker back. The parent verifies that the
+ * crashing child was killed by a signal without WCOREDUMP being reported,
+ * while its pidfd info still carries PIDFD_COREDUMPED.
+ */
+TEST_F(coredump, socket_request_invalid_size_large)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		struct coredump_req req = {};
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_request_invalid_size_large: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing a child. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_request_invalid_size_large: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_request_invalid_size_large: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_request_invalid_size_large: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_request_invalid_size_large: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_request_invalid_size_large: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_request_invalid_size_large: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		if (!read_coredump_req(fd_coredump, &req)) {
+			fprintf(stderr, "socket_request_invalid_size_large: read_coredump_req failed\n");
+			goto out;
+		}
+
+		if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+					COREDUMP_KERNEL | COREDUMP_USERSPACE |
+					COREDUMP_REJECT | COREDUMP_WAIT)) {
+			fprintf(stderr, "socket_request_invalid_size_large: check_coredump_req failed\n");
+			goto out;
+		}
+
+		/* One page beyond the version 0 ack size is the oversized value under test. */
+		if (!send_coredump_ack(fd_coredump, &req,
+				       COREDUMP_REJECT | COREDUMP_WAIT,
+				       COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) {
+			fprintf(stderr, "socket_request_invalid_size_large: send_coredump_ack failed\n");
+			goto out;
+		}
+
+		if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) {
+			fprintf(stderr, "socket_request_invalid_size_large: read_marker COREDUMP_MARK_MAXSIZE failed\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_request_invalid_size_large: completed successfully\n");
+out:
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Check waitpid() itself: on failure status would be read uninitialized. */
+	ASSERT_GE(waitpid(pid, &status, 0), 0);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_FALSE(WCOREDUMP(status));
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+	ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGSEGV
+ *
+ * Verify that when using socket-based coredump protocol,
+ * the coredump_signal field is correctly exposed as SIGSEGV.
+ *
+ * The check is made twice: by the coredump server on the peer pidfd of
+ * the accepted connection, and by the parent on its own pidfd after the
+ * child has been reaped. The server rejects the dump
+ * (COREDUMP_REJECT | COREDUMP_WAIT) once it has checked the signal.
+ */
+TEST_F(coredump, socket_coredump_signal_sigsegv)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		struct coredump_req req = {};
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing a child. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		/* Verify coredump_signal is available and correct */
+		if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+			goto out;
+		}
+
+		if (info.coredump_signal != SIGSEGV) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n",
+				info.coredump_signal, SIGSEGV);
+			goto out;
+		}
+
+		if (!read_coredump_req(fd_coredump, &req)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n");
+			goto out;
+		}
+
+		if (!send_coredump_ack(fd_coredump, &req,
+				       COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: send_coredump_ack failed\n");
+			goto out;
+		}
+
+		if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: read_marker COREDUMP_MARK_REQACK failed\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n");
+out:
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Check waitpid() itself: on failure status would be read uninitialized. */
+	ASSERT_GE(waitpid(pid, &status, 0), 0);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGSEGV);
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+	ASSERT_EQ(info.coredump_signal, SIGSEGV);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGABRT
+ *
+ * Verify that when using socket-based coredump protocol,
+ * the coredump_signal field is correctly exposed as SIGABRT.
+ *
+ * The check is made twice: by the coredump server on the peer pidfd of
+ * the accepted connection, and by the parent on its own pidfd after the
+ * child has been reaped. The child crashes by calling abort(). The server
+ * rejects the dump (COREDUMP_REJECT | COREDUMP_WAIT) once it has checked
+ * the signal.
+ */
+TEST_F(coredump, socket_coredump_signal_sigabrt)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		struct coredump_req req = {};
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing a child. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		/* Verify coredump_signal is available and correct */
+		if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+			goto out;
+		}
+
+		if (info.coredump_signal != SIGABRT) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n",
+				info.coredump_signal, SIGABRT);
+			goto out;
+		}
+
+		if (!read_coredump_req(fd_coredump, &req)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n");
+			goto out;
+		}
+
+		if (!send_coredump_ack(fd_coredump, &req,
+				       COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: send_coredump_ack failed\n");
+			goto out;
+		}
+
+		if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: read_marker COREDUMP_MARK_REQACK failed\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n");
+out:
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		abort();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Check waitpid() itself: on failure status would be read uninitialized. */
+	ASSERT_GE(waitpid(pid, &status, 0), 0);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGABRT);
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+	ASSERT_EQ(info.coredump_signal, SIGABRT);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: several children crash while one server handles them in turn.
+ *
+ * The coredump server accepts NUM_CRASHING_COREDUMPS connections one
+ * after another. For each it validates the peer's pidfd info, acks the
+ * request with COREDUMP_KERNEL | COREDUMP_WAIT, and copies the data read
+ * from the socket into a tmpfile opened via open_coredump_tmpfile(). The
+ * parent forks the same number of crashing children and verifies that
+ * each one is reported as signaled with WCOREDUMP set and with
+ * PIDFD_COREDUMPED in its pidfd info.
+ */
+TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500)
+{
+	int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
+	pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+		int exit_code = EXIT_FAILURE;
+		struct coredump_req req = {};
+
+		close(ipc_sockets[0]);
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "Failed to create and listen on unix socket\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing children. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "Failed to notify parent via ipc socket\n");
+			goto out;
+		}
+		close(ipc_sockets[1]);
+
+		for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+			fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+			if (fd_coredump < 0) {
+				fprintf(stderr, "accept4 failed: %m\n");
+				goto out;
+			}
+
+			fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+			if (fd_peer_pidfd < 0) {
+				fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump);
+				goto out;
+			}
+
+			if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+				fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd);
+				goto out;
+			}
+
+			if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+				fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd);
+				goto out;
+			}
+			if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+				fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd);
+				goto out;
+			}
+
+			if (!read_coredump_req(fd_coredump, &req)) {
+				fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+						COREDUMP_KERNEL | COREDUMP_USERSPACE |
+						COREDUMP_REJECT | COREDUMP_WAIT)) {
+				fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			if (!send_coredump_ack(fd_coredump, &req,
+					       COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+				fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+				fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+			if (fd_core_file < 0) {
+				fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			/* Copy the socket stream into the tmpfile until EOF. */
+			for (;;) {
+				char buffer[4096];
+				ssize_t bytes_read, bytes_write;
+
+				bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+				if (bytes_read < 0) {
+					fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump);
+					goto out;
+				}
+
+				if (bytes_read == 0)
+					break;
+
+				bytes_write = write(fd_core_file, buffer, bytes_read);
+				if (bytes_read != bytes_write) {
+					/* On ENOSPC keep reading from the socket; the chunk is not retried. */
+					if (bytes_write < 0 && errno == ENOSPC)
+						continue;
+					fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file);
+					goto out;
+				}
+			}
+
+			close(fd_core_file);
+			close(fd_peer_pidfd);
+			close(fd_coredump);
+			/*
+			 * Reset all three so that a failure in a later iteration
+			 * does not close a stale (possibly reused) descriptor
+			 * again at the out label.
+			 */
+			fd_core_file = -1;
+			fd_peer_pidfd = -1;
+			fd_coredump = -1;
+		}
+
+		exit_code = EXIT_SUCCESS;
+out:
+		if (fd_core_file >= 0)
+			close(fd_core_file);
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		pid[i] = fork();
+		ASSERT_GE(pid[i], 0);
+		if (pid[i] == 0)
+			crashing_child();
+		pidfd[i] = sys_pidfd_open(pid[i], 0);
+		ASSERT_GE(pidfd[i], 0);
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		/* Check waitpid() itself: on failure status[i] would be read uninitialized. */
+		ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
+		ASSERT_TRUE(WIFSIGNALED(status[i]));
+		ASSERT_TRUE(WCOREDUMP(status[i]));
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+		ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
+		ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+		ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+	}
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: several children crash while the server hands each connection to
+ * a forked worker.
+ *
+ * The coredump server accepts NUM_CRASHING_COREDUMPS connections. For
+ * each it validates the peer's pidfd info, acks the request with
+ * COREDUMP_KERNEL | COREDUMP_WAIT, opens a tmpfile, and forks a worker
+ * that runs process_coredump_worker() on the three descriptors. The
+ * server then reaps every worker and fails if any did not exit with
+ * EXIT_SUCCESS. The parent forks the crashing children and verifies that
+ * each is reported as signaled with WCOREDUMP set and with
+ * PIDFD_COREDUMPED in its pidfd info.
+ */
+TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500)
+{
+	int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
+	pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS];
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0;
+
+		close(ipc_sockets[0]);
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent the socket is listening; it waits for this byte before crashing children. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+		close(ipc_sockets[1]);
+
+		while (n_conns < NUM_CRASHING_COREDUMPS) {
+			int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+			struct coredump_req req = {};
+			pid_t worker;
+
+			fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+			if (fd_coredump < 0) {
+				if (errno == EAGAIN || errno == EWOULDBLOCK)
+					continue;
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: accept4 failed: %m\n");
+				goto out;
+			}
+			fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+			if (fd_peer_pidfd < 0) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_peer_pidfd failed\n");
+				goto out;
+			}
+			if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_pidfd_info failed\n");
+				goto out;
+			}
+			if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: missing PIDFD_INFO_COREDUMP or PIDFD_COREDUMPED\n");
+				goto out;
+			}
+			if (!read_coredump_req(fd_coredump, &req)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_coredump_req failed\n");
+				goto out;
+			}
+			if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+						COREDUMP_KERNEL | COREDUMP_USERSPACE |
+						COREDUMP_REJECT | COREDUMP_WAIT)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: check_coredump_req failed\n");
+				goto out;
+			}
+			if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: send_coredump_ack failed\n");
+				goto out;
+			}
+			if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_marker failed\n");
+				goto out;
+			}
+			fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+			if (fd_core_file < 0) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: open_coredump_tmpfile failed: %m\n");
+				goto out;
+			}
+
+			worker = fork();
+			if (worker < 0) {
+				/*
+				 * Do not record -1 in worker_pids: waitpid(-1)
+				 * below would reap an arbitrary child instead.
+				 */
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: fork failed: %m\n");
+				goto out;
+			}
+			if (worker == 0) {
+				close(fd_server);
+				/* NOTE(review): presumably does not return (exits the worker) - confirm. */
+				process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file);
+			}
+			worker_pids[n_conns] = worker;
+
+			/* The worker holds its own copies; drop the server's. */
+			if (fd_coredump >= 0)
+				close(fd_coredump);
+			if (fd_peer_pidfd >= 0)
+				close(fd_peer_pidfd);
+			if (fd_core_file >= 0)
+				close(fd_core_file);
+			n_conns++;
+		}
+		exit_code = EXIT_SUCCESS;
+out:
+		if (fd_server >= 0)
+			close(fd_server);
+
+		/* Reap all worker processes; n_conns counts those successfully forked. */
+		for (int i = 0; i < n_conns; i++) {
+			int wstatus;
+
+			if (waitpid(worker_pids[i], &wstatus, 0) < 0) {
+				fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]);
+			} else if (!WIFEXITED(wstatus)) {
+				/* A worker killed by a signal must not count as success. */
+				fprintf(stderr, "Worker %d terminated abnormally (wait status 0x%x)\n", worker_pids[i], (unsigned int)wstatus);
+				exit_code = EXIT_FAILURE;
+			} else if (WEXITSTATUS(wstatus) != EXIT_SUCCESS) {
+				fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus));
+				exit_code = EXIT_FAILURE;
+			}
+		}
+
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		pid[i] = fork();
+		ASSERT_GE(pid[i], 0);
+		if (pid[i] == 0)
+			crashing_child();
+		pidfd[i] = sys_pidfd_open(pid[i], 0);
+		ASSERT_GE(pidfd[i], 0);
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
+		ASSERT_TRUE(WIFSIGNALED(status[i]));
+		ASSERT_TRUE(WCOREDUMP(status[i]));
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+		ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
+		ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+		ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+	}
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/* NOTE(review): presumably expands to the kselftest harness entry point that runs the tests above - confirm in kselftest_harness.h */
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c
new file mode 100644
index 000000000000..7e26d4a6a15d
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_socket_test.c
@@ -0,0 +1,742 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/stat.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "coredump_test.h"
+
+/*
+ * Per-test setup: remember the current core_pattern so that teardown can
+ * restore it, and create the detached tmpfs stored in the fixture.
+ */
+FIXTURE_SETUP(coredump)
+{
+ FILE *file;
+ int ret;
+
+ /* Sentinels that FIXTURE_TEARDOWN checks before killing/closing anything. */
+ self->pid_coredump_server = -ESRCH;
+ self->fd_tmpfs_detached = -1;
+ file = fopen("/proc/sys/kernel/core_pattern", "r");
+ ASSERT_NE(NULL, file);
+
+ ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
+ /* An empty pattern is acceptable as long as we really hit end-of-file. */
+ ASSERT_TRUE(ret || feof(file));
+ /* One byte must remain for the NUL terminator written below. */
+ ASSERT_LT(ret, sizeof(self->original_core_pattern));
+
+ self->original_core_pattern[ret] = '\0';
+ self->fd_tmpfs_detached = create_detached_tmpfs();
+ ASSERT_GE(self->fd_tmpfs_detached, 0);
+
+ ret = fclose(file);
+ ASSERT_EQ(0, ret);
+}
+
+/*
+ * Per-test cleanup: reap a still-running coredump server, remove the files
+ * the tests create under /tmp, restore the saved core_pattern and close the
+ * detached tmpfs.
+ *
+ * Every step is attempted even if an earlier one failed so that a failure to
+ * restore core_pattern neither leaks the FILE handle nor the tmpfs fd. Only
+ * the first failure is reported.
+ */
+FIXTURE_TEARDOWN(coredump)
+{
+	const char *reason = NULL;
+	FILE *file;
+	int status;
+
+	if (self->pid_coredump_server > 0) {
+		kill(self->pid_coredump_server, SIGTERM);
+		waitpid(self->pid_coredump_server, &status, 0);
+	}
+	unlink("/tmp/coredump.file");
+	unlink("/tmp/coredump.socket");
+
+	file = fopen("/proc/sys/kernel/core_pattern", "w");
+	if (!file) {
+		reason = "Unable to open core_pattern";
+	} else {
+		if (fprintf(file, "%s", self->original_core_pattern) < 0)
+			reason = "Unable to write to core_pattern";
+
+		/* Always close the stream, even when the write failed. */
+		if (fclose(file) && !reason)
+			reason = "Unable to close core_pattern";
+	}
+
+	if (self->fd_tmpfs_detached >= 0) {
+		if (close(self->fd_tmpfs_detached) < 0 && !reason)
+			reason = "Unable to close detached tmpfs";
+		self->fd_tmpfs_detached = -1;
+	}
+
+	/* This should never happen */
+	if (reason)
+		fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason);
+}
+
+/*
+ * Basic socket coredump: a forked server listens on /tmp/coredump.socket,
+ * accepts the connection made for the crashing child, checks via the peer
+ * pidfd that the peer coredumped, and copies the dump to /tmp/coredump.file.
+ * The parent verifies the child's wait status, its pidfd info, the server's
+ * exit status and that the core file is non-empty.
+ */
+TEST_F(coredump, socket)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct stat st;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket test: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent that the socket is ready to be connected to. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket test: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket test: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket test: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket test: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket test: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket test: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		fd_core_file = creat("/tmp/coredump.file", 0644);
+		if (fd_core_file < 0) {
+			fprintf(stderr, "socket test: creat coredump file failed: %m\n");
+			goto out;
+		}
+
+		/* Copy the dump until the peer closes its end of the socket. */
+		for (;;) {
+			char buffer[4096];
+			ssize_t bytes_read, bytes_write;
+
+			bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+			if (bytes_read < 0) {
+				fprintf(stderr, "socket test: read from coredump socket failed: %m\n");
+				goto out;
+			}
+
+			if (bytes_read == 0)
+				break;
+
+			bytes_write = write(fd_core_file, buffer, bytes_read);
+			if (bytes_read != bytes_write) {
+				/* Keep draining the socket when the disk is full. */
+				if (bytes_write < 0 && errno == ENOSPC)
+					continue;
+				fprintf(stderr, "socket test: write to core file failed (read=%zd, write=%zd): %m\n", bytes_read, bytes_write);
+				goto out;
+			}
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket test: completed successfully\n");
+out:
+		if (fd_core_file >= 0)
+			close(fd_core_file);
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Only inspect status once we know the child was actually reaped. */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_TRUE(WCOREDUMP(status));
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+	ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+	EXPECT_EQ(close(pidfd), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+	ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
+	ASSERT_GT(st.st_size, 0);
+}
+
+/*
+ * A regular (non-crashing) process connects to the coredump socket. The
+ * server checks through the peer pidfd that PIDFD_COREDUMPED is NOT set for
+ * such a client, and the parent checks the same through its own pidfd before
+ * killing the client. No core file may have been created.
+ */
+TEST_F(coredump, socket_detect_userspace_client)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct stat st;
+	struct pidfd_info info = {
+		.mask = PIDFD_INFO_COREDUMP,
+	};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_detect_userspace_client: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent that the socket is ready to be connected to. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_detect_userspace_client: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_detect_userspace_client: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_detect_userspace_client: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_detect_userspace_client: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_detect_userspace_client: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (info.coredump_mask & PIDFD_COREDUMPED) {
+			fprintf(stderr, "socket_detect_userspace_client: PIDFD_COREDUMPED incorrectly set (should be userspace client)\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_detect_userspace_client: completed successfully\n");
+out:
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0) {
+		int fd_socket;
+		int conn_ret;
+		const struct sockaddr_un coredump_sk = {
+			.sun_family = AF_UNIX,
+			.sun_path = "/tmp/coredump.socket",
+		};
+		size_t coredump_sk_len =
+			offsetof(struct sockaddr_un, sun_path) +
+			sizeof("/tmp/coredump.socket");
+
+		fd_socket = socket(AF_UNIX, SOCK_STREAM, 0);
+		if (fd_socket < 0) {
+			fprintf(stderr, "socket_detect_userspace_client (client): socket failed: %m\n");
+			_exit(EXIT_FAILURE);
+		}
+
+		conn_ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+		if (conn_ret < 0) {
+			fprintf(stderr, "socket_detect_userspace_client (client): connect failed: %m\n");
+			_exit(EXIT_FAILURE);
+		}
+
+		close(fd_socket);
+		/* Stay alive until the parent kills us with SIGKILL. */
+		pause();
+		fprintf(stderr, "socket_detect_userspace_client (client): completed successfully\n");
+		_exit(EXIT_SUCCESS);
+	}
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+	ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
+	ASSERT_EQ(close(pidfd), 0);
+
+	/* Only inspect status once we know the client was actually reaped. */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+	ASSERT_NE(stat("/tmp/coredump.file", &st), 0);
+	ASSERT_EQ(errno, ENOENT);
+}
+
+/*
+ * core_pattern points at a socket path that nothing has bound: the crashing
+ * child must still die from a signal, but without a core dump.
+ */
+TEST_F(coredump, socket_enoent)
+{
+	int pidfd, status;
+	pid_t pid;
+
+	ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Only inspect status once we know the child was actually reaped. */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_FALSE(WCOREDUMP(status));
+
+	EXPECT_EQ(close(pidfd), 0);
+}
+
+/*
+ * The socket path is bound but never listen()ed on: the crashing child must
+ * die from a signal without a core dump, and the helper that bound the socket
+ * must exit successfully.
+ */
+TEST_F(coredump, socket_no_listener)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	int ipc_sockets[2];
+	char c;
+	const struct sockaddr_un coredump_sk = {
+		.sun_family = AF_UNIX,
+		.sun_path = "/tmp/coredump.socket",
+	};
+	size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
+				 sizeof("/tmp/coredump.socket");
+
+	ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		int fd_server = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_no_listener: socket failed: %m\n");
+			goto out;
+		}
+
+		/* Bind only; deliberately no listen() on this socket. */
+		ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+		if (ret < 0) {
+			fprintf(stderr, "socket_no_listener: bind failed: %m\n");
+			goto out;
+		}
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_no_listener: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_no_listener: completed successfully\n");
+out:
+		if (fd_server >= 0)
+			close(fd_server);
+		close(ipc_sockets[1]);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Only inspect status once we know the child was actually reaped. */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_FALSE(WCOREDUMP(status));
+	EXPECT_EQ(close(pidfd), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump
+ *
+ * Verify that when using simple socket-based coredump (@ pattern),
+ * the coredump_signal field is correctly exposed as SIGSEGV, both through
+ * the peer pidfd seen by the coredump server and through the parent's pidfd.
+ */
+TEST_F(coredump, socket_coredump_signal_sigsegv)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent that the socket is ready to be connected to. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		/* Verify coredump_signal is available and correct */
+		if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+			goto out;
+		}
+
+		if (info.coredump_signal != SIGSEGV) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n",
+				info.coredump_signal, SIGSEGV);
+			goto out;
+		}
+
+		fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+		if (fd_core_file < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n");
+			goto out;
+		}
+
+		/* Copy the dump until the peer closes its end of the socket. */
+		for (;;) {
+			char buffer[4096];
+			ssize_t bytes_read, bytes_write;
+
+			bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+			if (bytes_read < 0) {
+				fprintf(stderr, "socket_coredump_signal_sigsegv: read from coredump socket failed: %m\n");
+				goto out;
+			}
+
+			if (bytes_read == 0)
+				break;
+
+			bytes_write = write(fd_core_file, buffer, bytes_read);
+			if (bytes_read != bytes_write) {
+				fprintf(stderr, "socket_coredump_signal_sigsegv: write to core file failed (read=%zd, write=%zd): %m\n",
+					bytes_read, bytes_write);
+				goto out;
+			}
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n");
+out:
+		if (fd_core_file >= 0)
+			close(fd_core_file);
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		crashing_child();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Only inspect status once we know the child was actually reaped. */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGSEGV);
+	ASSERT_TRUE(WCOREDUMP(status));
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+	ASSERT_EQ(info.coredump_signal, SIGSEGV);
+	EXPECT_EQ(close(pidfd), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump with SIGABRT
+ *
+ * Verify that when using simple socket-based coredump (@ pattern),
+ * the coredump_signal field is correctly exposed as SIGABRT, both through
+ * the peer pidfd seen by the coredump server and through the parent's pidfd.
+ */
+TEST_F(coredump, socket_coredump_signal_sigabrt)
+{
+	int pidfd, ret, status;
+	pid_t pid, pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+		int exit_code = EXIT_FAILURE;
+
+		close(ipc_sockets[0]);
+
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Tell the parent that the socket is ready to be connected to. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+
+		close(ipc_sockets[1]);
+
+		fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+		if (fd_coredump < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n");
+			goto out;
+		}
+
+		fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+		if (fd_peer_pidfd < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n");
+			goto out;
+		}
+
+		if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n");
+			goto out;
+		}
+
+		if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n");
+			goto out;
+		}
+
+		if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n");
+			goto out;
+		}
+
+		/* Verify coredump_signal is available and correct */
+		if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+			goto out;
+		}
+
+		if (info.coredump_signal != SIGABRT) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n",
+				info.coredump_signal, SIGABRT);
+			goto out;
+		}
+
+		fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+		if (fd_core_file < 0) {
+			fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n");
+			goto out;
+		}
+
+		/* Copy the dump until the peer closes its end of the socket. */
+		for (;;) {
+			char buffer[4096];
+			ssize_t bytes_read, bytes_write;
+
+			bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+			if (bytes_read < 0) {
+				fprintf(stderr, "socket_coredump_signal_sigabrt: read from coredump socket failed: %m\n");
+				goto out;
+			}
+
+			if (bytes_read == 0)
+				break;
+
+			bytes_write = write(fd_core_file, buffer, bytes_read);
+			if (bytes_read != bytes_write) {
+				fprintf(stderr, "socket_coredump_signal_sigabrt: write to core file failed (read=%zd, write=%zd): %m\n",
+					bytes_read, bytes_write);
+				goto out;
+			}
+		}
+
+		exit_code = EXIT_SUCCESS;
+		fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n");
+out:
+		if (fd_core_file >= 0)
+			close(fd_core_file);
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	self->pid_coredump_server = pid_coredump_server;
+
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0)
+		abort();
+
+	pidfd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	/* Only inspect status once we know the child was actually reaped. */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGABRT);
+	ASSERT_TRUE(WCOREDUMP(status));
+
+	ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+	ASSERT_EQ(info.coredump_signal, SIGABRT);
+	EXPECT_EQ(close(pidfd), 0);
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Each of these socket-style core_pattern strings is expected to be rejected,
+ * i.e. set_core_pattern() must return false for all of them.
+ */
+TEST_F(coredump, socket_invalid_paths)
+{
+ /* Single '@' prefix: space after the prefix or ".." in the path. */
+ ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/.."));
+ ASSERT_FALSE(set_core_pattern("@.."));
+
+ /* Same malformed paths with the '@@' prefix. */
+ ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/.."));
+ ASSERT_FALSE(set_core_pattern("@@.."));
+
+ /* Three '@' characters, even with an otherwise plain path. */
+ ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket"));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/coredump/coredump_test.h b/tools/testing/selftests/coredump/coredump_test.h
new file mode 100644
index 000000000000..ed47f01fa53c
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_test.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __COREDUMP_TEST_H
+#define __COREDUMP_TEST_H
+
+#include <stdbool.h>
+#include <sys/types.h>
+#include <linux/coredump.h>
+
+#include "../kselftest_harness.h"
+#include "../pidfd/pidfd.h"
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#define NUM_THREAD_SPAWN 128
+
+/* Coredump fixture */
+FIXTURE(coredump)
+{
+ /* core_pattern contents saved at setup so teardown can write them back. */
+ char original_core_pattern[256];
+ /* PID of the forked coredump server; -ESRCH while none is outstanding. */
+ pid_t pid_coredump_server;
+ /* fd returned by create_detached_tmpfs(); -1 while not open. */
+ int fd_tmpfs_detached;
+};
+
+/* Shared helper function declarations */
+void *do_nothing(void *arg);
+void crashing_child(void);
+int create_detached_tmpfs(void);
+int create_and_listen_unix_socket(const char *path);
+bool set_core_pattern(const char *pattern);
+int get_peer_pidfd(int fd);
+bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info);
+
+/* Inline helper that uses harness types */
+/*
+ * Reap the coredump server and assert that it exited with status 0.
+ *
+ * @pid_coredump_server: PID of the forked server to wait for.
+ * @_metadata:           harness metadata used by the ASSERT_* macros.
+ * @self:                fixture data; pid_coredump_server is reset to -ESRCH
+ *                       once the server has been reaped so that teardown does
+ *                       not signal or wait for it again.
+ *
+ * If waitpid() itself fails the fixture keeps the PID, so teardown can still
+ * clean the server up, and the test is aborted before status is inspected.
+ */
+static inline void wait_and_check_coredump_server(pid_t pid_coredump_server,
+						  struct __test_metadata *const _metadata,
+						  FIXTURE_DATA(coredump) *self)
+{
+	int status = 0;
+	pid_t reaped;
+
+	reaped = waitpid(pid_coredump_server, &status, 0);
+	if (reaped == pid_coredump_server)
+		self->pid_coredump_server = -ESRCH;
+	ASSERT_EQ(reaped, pid_coredump_server);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+/* Protocol helper function declarations */
+ssize_t recv_marker(int fd);
+bool read_marker(int fd, enum coredump_mark mark);
+bool read_coredump_req(int fd, struct coredump_req *req);
+bool send_coredump_ack(int fd, const struct coredump_req *req,
+ __u64 mask, size_t size_ack);
+bool check_coredump_req(const struct coredump_req *req, size_t min_size,
+ __u64 required_mask);
+int open_coredump_tmpfile(int fd_tmpfs_detached);
+void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file);
+
+#endif /* __COREDUMP_TEST_H */
diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
new file mode 100644
index 000000000000..a6f6d5f2ae07
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/coredump.h>
+#include <linux/fs.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../filesystems/wrappers.h"
+#include "../pidfd/pidfd.h"
+
+/* Forward declarations to avoid including harness header */
+struct __test_metadata;
+
+/* Match the fixture definition from coredump_test.h */
+struct _fixture_coredump_data {
+ /* Saved core_pattern contents. */
+ char original_core_pattern[256];
+ /* PID of the coredump server process. */
+ pid_t pid_coredump_server;
+ /* File descriptor of the detached tmpfs. */
+ int fd_tmpfs_detached;
+};
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#define NUM_THREAD_SPAWN 128
+
+/*
+ * Thread body that never returns: it just sleeps in pause() forever.
+ * @arg is unused.
+ */
+void *do_nothing(void *arg)
+{
+	(void)arg;
+
+	for (;;)
+		pause();
+
+	return NULL;
+}
+
+/*
+ * Spawn NUM_THREAD_SPAWN idle threads and then crash the calling process on
+ * purpose by reading through a NULL pointer.
+ *
+ * The read goes through a volatile-qualified pointer so the compiler cannot
+ * drop the otherwise dead load and let the function return without crashing.
+ */
+void crashing_child(void)
+{
+	pthread_t thread;
+	int i;
+
+	for (i = 0; i < NUM_THREAD_SPAWN; ++i)
+		pthread_create(&thread, NULL, do_nothing, NULL);
+
+	/* crash on purpose */
+	i = *(volatile int *)NULL;
+	(void)i;
+}
+
+/*
+ * Create a tmpfs instance through fsopen()/fsconfig()/fsmount().
+ *
+ * Return: the file descriptor returned by sys_fsmount(), or -1 if creating
+ * the filesystem context or the superblock failed. The filesystem context fd
+ * is closed on every path.
+ */
+int create_detached_tmpfs(void)
+{
+	int fd_context, fd_tmpfs;
+
+	fd_context = sys_fsopen("tmpfs", 0);
+	if (fd_context < 0)
+		return -1;
+
+	if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+		/* Do not leak the context fd when superblock creation fails. */
+		close(fd_context);
+		return -1;
+	}
+
+	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+	close(fd_context);
+	return fd_tmpfs;
+}
+
+/*
+ * Create a close-on-exec AF_UNIX stream socket, bind it to @path and start
+ * listening with a backlog of 128.
+ *
+ * @path must fit into sun_path with room to spare (enforced with assert()).
+ *
+ * Return: the listening socket fd, or -1 on failure (the socket is closed).
+ */
+int create_and_listen_unix_socket(const char *path)
+{
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	size_t addr_len;
+	int fd;
+
+	assert(strlen(path) < sizeof(addr.sun_path) - 1);
+	strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
+	/* Address length covers the path including its NUL terminator. */
+	addr_len = offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1;
+
+	fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+	if (fd < 0)
+		return -1;
+
+	if (bind(fd, (const struct sockaddr *)&addr, addr_len) < 0 ||
+	    listen(fd, 128) < 0) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Write @pattern to /proc/sys/kernel/core_pattern.
+ *
+ * Return: true only if the file could be opened and the whole pattern was
+ * written in one write() call; false otherwise.
+ */
+bool set_core_pattern(const char *pattern)
+{
+	size_t len = strlen(pattern);
+	ssize_t ret;
+	int fd;
+
+	fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC);
+	if (fd < 0)
+		return false;
+
+	ret = write(fd, pattern, len);
+	close(fd);
+	if (ret < 0)
+		return false;
+
+	/* ret is an ssize_t, so it needs %zd rather than %zu. */
+	fprintf(stderr, "Set core_pattern to '%s' | %zd == %zu\n", pattern, ret, len);
+	return (size_t)ret == len;
+}
+
+/*
+ * Query SO_PEERPIDFD on socket @fd.
+ *
+ * Return: the pidfd reported by getsockopt(), or -1 if the call failed.
+ */
+int get_peer_pidfd(int fd)
+{
+	int pidfd;
+	socklen_t optlen = sizeof(pidfd);
+
+	if (getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &optlen) < 0) {
+		fprintf(stderr, "get_peer_pidfd: getsockopt(SO_PEERPIDFD) failed: %m\n");
+		return -1;
+	}
+
+	fprintf(stderr, "get_peer_pidfd: successfully retrieved pidfd %d\n", pidfd);
+	return pidfd;
+}
+
+/*
+ * Fill @info via the PIDFD_GET_INFO ioctl on @fd_peer_pidfd.
+ *
+ * @info is zeroed first and the request mask is set to exit, coredump and
+ * coredump-signal information; whatever the caller stored in it is discarded.
+ *
+ * Return: true on success, false if the ioctl failed.
+ */
+bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info)
+{
+	memset(info, 0, sizeof(*info));
+	info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
+
+	if (ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) < 0) {
+		fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n");
+		return false;
+	}
+
+	fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n",
+		(unsigned long long)info->mask, info->coredump_mask, info->coredump_signal);
+	return true;
+}
+
+/* Protocol helper functions */
+
+/*
+ * Receive one coredump marker from @fd and log which one it was.
+ *
+ * Return: the marker value for the five known markers, or -1 if a full
+ * marker could not be received or its value is not recognised.
+ */
+ssize_t recv_marker(int fd)
+{
+	enum coredump_mark mark = COREDUMP_MARK_REQACK;
+	ssize_t nread;
+
+	nread = recv(fd, &mark, sizeof(mark), MSG_WAITALL);
+	if (nread != sizeof(mark))
+		return -1;
+
+	if (mark == COREDUMP_MARK_REQACK) {
+		fprintf(stderr, "Received marker: ReqAck\n");
+		return COREDUMP_MARK_REQACK;
+	}
+	if (mark == COREDUMP_MARK_MINSIZE) {
+		fprintf(stderr, "Received marker: MinSize\n");
+		return COREDUMP_MARK_MINSIZE;
+	}
+	if (mark == COREDUMP_MARK_MAXSIZE) {
+		fprintf(stderr, "Received marker: MaxSize\n");
+		return COREDUMP_MARK_MAXSIZE;
+	}
+	if (mark == COREDUMP_MARK_UNSUPPORTED) {
+		fprintf(stderr, "Received marker: Unsupported\n");
+		return COREDUMP_MARK_UNSUPPORTED;
+	}
+	if (mark == COREDUMP_MARK_CONFLICTING) {
+		fprintf(stderr, "Received marker: Conflicting\n");
+		return COREDUMP_MARK_CONFLICTING;
+	}
+
+	fprintf(stderr, "Received unknown marker: %u\n", mark);
+	return -1;
+}
+
+/*
+ * Receive one marker from @fd and compare it with the expected @mark.
+ *
+ * Return: true only if a marker was received and it equals @mark.
+ */
+bool read_marker(int fd, enum coredump_mark mark)
+{
+	ssize_t received = recv_marker(fd);
+
+	return received >= 0 && received == mark;
+}
+
+/*
+ * Read a coredump request from @fd into @req.
+ *
+ * The leading size field is peeked first to learn how large the sender's
+ * request is. Then min(sizeof(*req), that size) bytes are read into @req, and
+ * any bytes of the sender's request beyond what this struct knows about are
+ * read and discarded. When the sender's request is smaller than our struct,
+ * nothing is left on the socket and the tail of @req stays zeroed.
+ *
+ * Return: true if a complete request was consumed, false on a short read or
+ * when the advertised size is below COREDUMP_ACK_SIZE_VER0 or not below
+ * PAGE_SIZE.
+ */
+bool read_coredump_req(int fd, struct coredump_req *req)
+{
+	ssize_t ret;
+	size_t field_size, user_size, read_size, kernel_size, remaining_size;
+
+	memset(req, 0, sizeof(*req));
+	field_size = sizeof(req->size);
+
+	/* Peek the size of the coredump request. */
+	ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL);
+	if (ret < 0 || (size_t)ret != field_size) {
+		fprintf(stderr, "read_coredump_req: peek failed (got %zd, expected %zu): %m\n",
+			ret, field_size);
+		return false;
+	}
+	kernel_size = req->size;
+
+	if (kernel_size < COREDUMP_ACK_SIZE_VER0) {
+		fprintf(stderr, "read_coredump_req: kernel_size %zu < min %d\n",
+			kernel_size, COREDUMP_ACK_SIZE_VER0);
+		return false;
+	}
+	if (kernel_size >= PAGE_SIZE) {
+		fprintf(stderr, "read_coredump_req: kernel_size %zu >= PAGE_SIZE %d\n",
+			kernel_size, PAGE_SIZE);
+		return false;
+	}
+
+	/* Use the minimum of user and kernel size to read the full request. */
+	user_size = sizeof(struct coredump_req);
+	read_size = user_size < kernel_size ? user_size : kernel_size;
+	ret = recv(fd, req, read_size, MSG_WAITALL);
+	if (ret < 0 || (size_t)ret != read_size)
+		return false;
+
+	fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n",
+		req->size, (unsigned long long)req->mask);
+
+	/*
+	 * Only a request larger than our struct leaves unread bytes behind. A
+	 * smaller request has been consumed completely, so there is nothing
+	 * to wait for.
+	 */
+	remaining_size = kernel_size > user_size ? kernel_size - user_size : 0;
+
+	if (PAGE_SIZE <= remaining_size)
+		return false;
+
+	/*
+	 * Discard any additional data if the kernel's request was larger than
+	 * what we knew about or cared about. Ask for exactly the outstanding
+	 * byte count: with MSG_WAITALL a larger length would block waiting for
+	 * data that is never sent.
+	 */
+	if (remaining_size) {
+		char buffer[PAGE_SIZE];
+
+		ret = recv(fd, buffer, remaining_size, MSG_WAITALL);
+		if (ret < 0 || (size_t)ret != remaining_size)
+			return false;
+		fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size);
+	}
+
+	return true;
+}
+
+/*
+ * Send a coredump ack carrying @mask on @fd.
+ *
+ * @size_ack is both the value stored in the ack's size field and the number
+ * of bytes sent. When it is 0, the smaller of sizeof(struct coredump_ack) and
+ * @req->size_ack is used instead.
+ *
+ * Return: true if exactly @size_ack bytes were sent, false otherwise.
+ */
+bool send_coredump_ack(int fd, const struct coredump_req *req,
+		       __u64 mask, size_t size_ack)
+{
+	/*
+	 * Wrap struct coredump_ack in a larger struct so we can
+	 * simulate sending too much data to the kernel.
+	 */
+	struct large_ack_for_size_testing {
+		struct coredump_ack ack;
+		char buffer[PAGE_SIZE];
+	} large_ack = {};
+	ssize_t sent;
+
+	if (!size_ack) {
+		size_ack = req->size_ack;
+		if (sizeof(struct coredump_ack) < req->size_ack)
+			size_ack = sizeof(struct coredump_ack);
+	}
+
+	large_ack.ack.mask = mask;
+	large_ack.ack.size = size_ack;
+
+	sent = send(fd, &large_ack, size_ack, MSG_NOSIGNAL);
+	if (sent != size_ack)
+		return false;
+
+	fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n",
+		size_ack, (unsigned long long)mask);
+	return true;
+}
+
+/*
+ * Validate a received coredump request.
+ *
+ * Return: true if @req->size is at least @min_size and @req->mask has exactly
+ * the bits of @required_mask set (none missing, none extra).
+ */
+bool check_coredump_req(const struct coredump_req *req, size_t min_size,
+			__u64 required_mask)
+{
+	bool size_ok = !(req->size < min_size);
+	/* "All required bits set" plus "no other bits set" means equality. */
+	bool mask_ok = req->mask == required_mask;
+
+	return size_ok && mask_ok;
+}
+
+/*
+ * Open an unnamed O_TMPFILE file for reading and writing, mode 0600, in the
+ * directory referred to by @fd_tmpfs_detached.
+ *
+ * Return: whatever openat() returns.
+ */
+int open_coredump_tmpfile(int fd_tmpfs_detached)
+{
+	const int open_flags = O_TMPFILE | O_RDWR | O_EXCL;
+
+	return openat(fd_tmpfs_detached, ".", open_flags, 0600);
+}
+
+/*
+ * Worker body: copy everything readable from @fd_coredump into @fd_core_file
+ * using a non-blocking socket and edge-triggered epoll, then exit.
+ *
+ * @fd_coredump:   connected coredump socket to drain.
+ * @fd_peer_pidfd: pidfd of the peer; only closed here.
+ * @fd_core_file:  destination file for the dump.
+ *
+ * Never returns: all three descriptors are closed (when >= 0) and the process
+ * calls _exit() with EXIT_SUCCESS only if end-of-file was reached on the
+ * socket. Any fcntl/epoll/read failure, or a write failure other than ENOSPC,
+ * exits with EXIT_FAILURE.
+ */
+void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file)
+{
+	int epfd = -1;
+	int exit_code = EXIT_FAILURE;
+	struct epoll_event ev;
+	int flags;
+
+	/* Set socket to non-blocking mode for edge-triggered epoll */
+	flags = fcntl(fd_coredump, F_GETFL, 0);
+	if (flags < 0) {
+		fprintf(stderr, "Worker: fcntl(F_GETFL) failed: %m\n");
+		goto out;
+	}
+	if (fcntl(fd_coredump, F_SETFL, flags | O_NONBLOCK) < 0) {
+		fprintf(stderr, "Worker: fcntl(F_SETFL, O_NONBLOCK) failed: %m\n");
+		goto out;
+	}
+
+	epfd = epoll_create1(0);
+	if (epfd < 0) {
+		fprintf(stderr, "Worker: epoll_create1() failed: %m\n");
+		goto out;
+	}
+
+	ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
+	ev.data.fd = fd_coredump;
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) {
+		fprintf(stderr, "Worker: epoll_ctl(EPOLL_CTL_ADD) failed: %m\n");
+		goto out;
+	}
+
+	for (;;) {
+		struct epoll_event events[1];
+		int n = epoll_wait(epfd, events, 1, -1);
+
+		if (n < 0) {
+			/* A signal merely interrupted the wait; try again. */
+			if (errno == EINTR)
+				continue;
+			/* A real epoll failure must not be reported as success. */
+			fprintf(stderr, "Worker: epoll_wait() failed: %m\n");
+			goto out;
+		}
+
+		/* events[0] is only valid when an event was returned. */
+		if (n == 0)
+			continue;
+
+		if (!(events[0].events & (EPOLLIN | EPOLLRDHUP)))
+			continue;
+
+		/* Edge-triggered: drain the socket until it would block. */
+		for (;;) {
+			char buffer[4096];
+			ssize_t bytes_read, bytes_write;
+
+			bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+			if (bytes_read < 0) {
+				if (errno == EINTR)
+					continue;
+				if (errno == EAGAIN || errno == EWOULDBLOCK)
+					break;
+				fprintf(stderr, "Worker: read() failed: %m\n");
+				goto out;
+			}
+			if (bytes_read == 0)
+				goto done;
+
+			bytes_write = write(fd_core_file, buffer, bytes_read);
+			if (bytes_write != bytes_read) {
+				/* Keep draining the socket when the disk is full. */
+				if (bytes_write < 0 && errno == ENOSPC)
+					continue;
+				fprintf(stderr, "Worker: write() failed (read=%zd, write=%zd): %m\n",
+					bytes_read, bytes_write);
+				goto out;
+			}
+		}
+	}
+
+done:
+	exit_code = EXIT_SUCCESS;
+	fprintf(stderr, "Worker: completed successfully\n");
+out:
+	if (epfd >= 0)
+		close(epfd);
+	if (fd_core_file >= 0)
+		close(fd_core_file);
+	if (fd_peer_pidfd >= 0)
+		close(fd_peer_pidfd);
+	if (fd_coredump >= 0)
+		close(fd_coredump);
+	_exit(exit_code);
+}
diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c
index a4ac80bb1003..c2e895bcc160 100644
--- a/tools/testing/selftests/coredump/stackdump_test.c
+++ b/tools/testing/selftests/coredump/stackdump_test.c
@@ -23,57 +23,15 @@
#include "../filesystems/wrappers.h"
#include "../pidfd/pidfd.h"
+#include "coredump_test.h"
+
#define STACKDUMP_FILE "stack_values"
#define STACKDUMP_SCRIPT "stackdump"
-#define NUM_THREAD_SPAWN 128
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
-static void *do_nothing(void *)
-{
- while (1)
- pause();
-
- return NULL;
-}
-
-static void crashing_child(void)
-{
- pthread_t thread;
- int i;
-
- for (i = 0; i < NUM_THREAD_SPAWN; ++i)
- pthread_create(&thread, NULL, do_nothing, NULL);
-
- /* crash on purpose */
- i = *(int *)NULL;
-}
-
-FIXTURE(coredump)
-{
- char original_core_pattern[256];
- pid_t pid_coredump_server;
- int fd_tmpfs_detached;
-};
-
-static int create_detached_tmpfs(void)
-{
- int fd_context, fd_tmpfs;
-
- fd_context = sys_fsopen("tmpfs", 0);
- if (fd_context < 0)
- return -1;
-
- if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
- return -1;
-
- fd_tmpfs = sys_fsmount(fd_context, 0, 0);
- close(fd_context);
- return fd_tmpfs;
-}
-
FIXTURE_SETUP(coredump)
{
FILE *file;
@@ -208,1620 +166,4 @@ TEST_F_TIMEOUT(coredump, stackdump, 120)
fclose(file);
}
-static int create_and_listen_unix_socket(const char *path)
-{
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- assert(strlen(path) < sizeof(addr.sun_path) - 1);
- strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
- size_t addr_len =
- offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1;
- int fd, ret;
-
- fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- if (fd < 0)
- goto out;
-
- ret = bind(fd, (const struct sockaddr *)&addr, addr_len);
- if (ret < 0)
- goto out;
-
- ret = listen(fd, 128);
- if (ret < 0)
- goto out;
-
- return fd;
-
-out:
- if (fd >= 0)
- close(fd);
- return -1;
-}
-
-static bool set_core_pattern(const char *pattern)
-{
- int fd;
- ssize_t ret;
-
- fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC);
- if (fd < 0)
- return false;
-
- ret = write(fd, pattern, strlen(pattern));
- close(fd);
- if (ret < 0)
- return false;
-
- fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern));
- return ret == strlen(pattern);
-}
-
-static int get_peer_pidfd(int fd)
-{
- int fd_peer_pidfd;
- socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd);
- int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd,
- &fd_peer_pidfd_len);
- if (ret < 0) {
- fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n");
- return -1;
- }
- return fd_peer_pidfd;
-}
-
-static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info)
-{
- memset(info, 0, sizeof(*info));
- info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0;
-}
-
-static void
-wait_and_check_coredump_server(pid_t pid_coredump_server,
- struct __test_metadata *const _metadata,
- FIXTURE_DATA(coredump)* self)
-{
- int status;
- waitpid(pid_coredump_server, &status, 0);
- self->pid_coredump_server = -ESRCH;
- ASSERT_TRUE(WIFEXITED(status));
- ASSERT_EQ(WEXITSTATUS(status), 0);
-}
-
-TEST_F(coredump, socket)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- fd_core_file = creat("/tmp/coredump.file", 0644);
- if (fd_core_file < 0)
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write)
- goto out;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
- ASSERT_GT(st.st_size, 0);
- system("file /tmp/coredump.file");
-}
-
-TEST_F(coredump, socket_detect_userspace_client)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (info.coredump_mask & PIDFD_COREDUMPED)
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0) {
- int fd_socket;
- ssize_t ret;
- const struct sockaddr_un coredump_sk = {
- .sun_family = AF_UNIX,
- .sun_path = "/tmp/coredump.socket",
- };
- size_t coredump_sk_len =
- offsetof(struct sockaddr_un, sun_path) +
- sizeof("/tmp/coredump.socket");
-
- fd_socket = socket(AF_UNIX, SOCK_STREAM, 0);
- if (fd_socket < 0)
- _exit(EXIT_FAILURE);
-
- ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
- if (ret < 0)
- _exit(EXIT_FAILURE);
-
- close(fd_socket);
- _exit(EXIT_SUCCESS);
- }
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFEXITED(status));
- ASSERT_EQ(WEXITSTATUS(status), 0);
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_NE(stat("/tmp/coredump.file", &st), 0);
- ASSERT_EQ(errno, ENOENT);
-}
-
-TEST_F(coredump, socket_enoent)
-{
- int pidfd, status;
- pid_t pid;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-}
-
-TEST_F(coredump, socket_no_listener)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- int ipc_sockets[2];
- char c;
- const struct sockaddr_un coredump_sk = {
- .sun_family = AF_UNIX,
- .sun_path = "/tmp/coredump.socket",
- };
- size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
- sizeof("/tmp/coredump.socket");
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- if (fd_server < 0)
- goto out;
-
- ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
- if (ret < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_server >= 0)
- close(fd_server);
- close(ipc_sockets[1]);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-static ssize_t recv_marker(int fd)
-{
- enum coredump_mark mark = COREDUMP_MARK_REQACK;
- ssize_t ret;
-
- ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL);
- if (ret != sizeof(mark))
- return -1;
-
- switch (mark) {
- case COREDUMP_MARK_REQACK:
- fprintf(stderr, "Received marker: ReqAck\n");
- return COREDUMP_MARK_REQACK;
- case COREDUMP_MARK_MINSIZE:
- fprintf(stderr, "Received marker: MinSize\n");
- return COREDUMP_MARK_MINSIZE;
- case COREDUMP_MARK_MAXSIZE:
- fprintf(stderr, "Received marker: MaxSize\n");
- return COREDUMP_MARK_MAXSIZE;
- case COREDUMP_MARK_UNSUPPORTED:
- fprintf(stderr, "Received marker: Unsupported\n");
- return COREDUMP_MARK_UNSUPPORTED;
- case COREDUMP_MARK_CONFLICTING:
- fprintf(stderr, "Received marker: Conflicting\n");
- return COREDUMP_MARK_CONFLICTING;
- default:
- fprintf(stderr, "Received unknown marker: %u\n", mark);
- break;
- }
- return -1;
-}
-
-static bool read_marker(int fd, enum coredump_mark mark)
-{
- ssize_t ret;
-
- ret = recv_marker(fd);
- if (ret < 0)
- return false;
- return ret == mark;
-}
-
-static bool read_coredump_req(int fd, struct coredump_req *req)
-{
- ssize_t ret;
- size_t field_size, user_size, ack_size, kernel_size, remaining_size;
-
- memset(req, 0, sizeof(*req));
- field_size = sizeof(req->size);
-
- /* Peek the size of the coredump request. */
- ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL);
- if (ret != field_size)
- return false;
- kernel_size = req->size;
-
- if (kernel_size < COREDUMP_ACK_SIZE_VER0)
- return false;
- if (kernel_size >= PAGE_SIZE)
- return false;
-
- /* Use the minimum of user and kernel size to read the full request. */
- user_size = sizeof(struct coredump_req);
- ack_size = user_size < kernel_size ? user_size : kernel_size;
- ret = recv(fd, req, ack_size, MSG_WAITALL);
- if (ret != ack_size)
- return false;
-
- fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n",
- req->size, (unsigned long long)req->mask);
-
- if (user_size > kernel_size)
- remaining_size = user_size - kernel_size;
- else
- remaining_size = kernel_size - user_size;
-
- if (PAGE_SIZE <= remaining_size)
- return false;
-
- /*
- * Discard any additional data if the kernel's request was larger than
- * what we knew about or cared about.
- */
- if (remaining_size) {
- char buffer[PAGE_SIZE];
-
- ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL);
- if (ret != remaining_size)
- return false;
- fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size);
- }
-
- return true;
-}
-
-static bool send_coredump_ack(int fd, const struct coredump_req *req,
- __u64 mask, size_t size_ack)
-{
- ssize_t ret;
- /*
- * Wrap struct coredump_ack in a larger struct so we can
- * simulate sending to much data to the kernel.
- */
- struct large_ack_for_size_testing {
- struct coredump_ack ack;
- char buffer[PAGE_SIZE];
- } large_ack = {};
-
- if (!size_ack)
- size_ack = sizeof(struct coredump_ack) < req->size_ack ?
- sizeof(struct coredump_ack) :
- req->size_ack;
- large_ack.ack.mask = mask;
- large_ack.ack.size = size_ack;
- ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL);
- if (ret != size_ack)
- return false;
-
- fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n",
- size_ack, (unsigned long long)mask);
- return true;
-}
-
-static bool check_coredump_req(const struct coredump_req *req, size_t min_size,
- __u64 required_mask)
-{
- if (req->size < min_size)
- return false;
- if ((req->mask & required_mask) != required_mask)
- return false;
- if (req->mask & ~required_mask)
- return false;
- return true;
-}
-
-TEST_F(coredump, socket_request_kernel)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- fd_core_file = creat("/tmp/coredump.file", 0644);
- if (fd_core_file < 0)
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write)
- goto out;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
- ASSERT_GT(st.st_size, 0);
- system("file /tmp/coredump.file");
-}
-
-TEST_F(coredump, socket_request_userspace)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_USERSPACE | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read > 0)
- goto out;
-
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_reject)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read > 0)
- goto out;
-
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_flag_combination)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_unknown_flag)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_size_small)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT,
- COREDUMP_ACK_SIZE_VER0 / 2))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_size_large)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT,
- COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-static int open_coredump_tmpfile(int fd_tmpfs_detached)
-{
- return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
-}
-
-#define NUM_CRASHING_COREDUMPS 5
-
-TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500)
-{
- int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
- pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- int exit_code = EXIT_FAILURE;
- struct coredump_req req = {};
-
- close(ipc_sockets[0]);
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0) {
- fprintf(stderr, "Failed to create and listen on unix socket\n");
- goto out;
- }
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
- fprintf(stderr, "Failed to notify parent via ipc socket\n");
- goto out;
- }
- close(ipc_sockets[1]);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0) {
- fprintf(stderr, "accept4 failed: %m\n");
- goto out;
- }
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0) {
- fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump);
- goto out;
- }
-
- if (!get_pidfd_info(fd_peer_pidfd, &info)) {
- fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd);
- goto out;
- }
-
- if (!(info.mask & PIDFD_INFO_COREDUMP)) {
- fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd);
- goto out;
- }
- if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
- fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd);
- goto out;
- }
-
- if (!read_coredump_req(fd_coredump, &req)) {
- fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT)) {
- fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
- fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
- fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
- if (fd_core_file < 0) {
- fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0) {
- fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump);
- goto out;
- }
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write) {
- fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file);
- goto out;
- }
- }
-
- close(fd_core_file);
- close(fd_peer_pidfd);
- close(fd_coredump);
- fd_peer_pidfd = -1;
- fd_coredump = -1;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- pid[i] = fork();
- ASSERT_GE(pid[i], 0);
- if (pid[i] == 0)
- crashing_child();
- pidfd[i] = sys_pidfd_open(pid[i], 0);
- ASSERT_GE(pidfd[i], 0);
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- waitpid(pid[i], &status[i], 0);
- ASSERT_TRUE(WIFSIGNALED(status[i]));
- ASSERT_TRUE(WCOREDUMP(status[i]));
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
- }
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-#define MAX_EVENTS 128
-
-static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file)
-{
- int epfd = -1;
- int exit_code = EXIT_FAILURE;
-
- epfd = epoll_create1(0);
- if (epfd < 0)
- goto out;
-
- struct epoll_event ev;
- ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
- ev.data.fd = fd_coredump;
- if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0)
- goto out;
-
- for (;;) {
- struct epoll_event events[1];
- int n = epoll_wait(epfd, events, 1, -1);
- if (n < 0)
- break;
-
- if (events[0].events & (EPOLLIN | EPOLLRDHUP)) {
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- break;
- goto out;
- }
- if (bytes_read == 0)
- goto done;
- ssize_t bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_write != bytes_read)
- goto out;
- }
- }
- }
-
-done:
- exit_code = EXIT_SUCCESS;
-out:
- if (epfd >= 0)
- close(epfd);
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- _exit(exit_code);
-}
-
-TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500)
-{
- int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
- pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS];
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
- ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0;
- fd_server = -1;
- exit_code = EXIT_FAILURE;
- n_conns = 0;
- close(ipc_sockets[0]);
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
- close(ipc_sockets[1]);
-
- while (n_conns < NUM_CRASHING_COREDUMPS) {
- int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- struct coredump_req req = {};
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- continue;
- goto out;
- }
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
- if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
- if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0))
- goto out;
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
- fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
- if (fd_core_file < 0)
- goto out;
- pid_t worker = fork();
- if (worker == 0) {
- close(fd_server);
- process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file);
- }
- worker_pids[n_conns] = worker;
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_core_file >= 0)
- close(fd_core_file);
- n_conns++;
- }
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_server >= 0)
- close(fd_server);
-
- // Reap all worker processes
- for (int i = 0; i < n_conns; i++) {
- int wstatus;
- if (waitpid(worker_pids[i], &wstatus, 0) < 0) {
- fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]);
- } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) {
- fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus));
- exit_code = EXIT_FAILURE;
- }
- }
-
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- pid[i] = fork();
- ASSERT_GE(pid[i], 0);
- if (pid[i] == 0)
- crashing_child();
- pidfd[i] = sys_pidfd_open(pid[i], 0);
- ASSERT_GE(pidfd[i], 0);
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
- ASSERT_TRUE(WIFSIGNALED(status[i]));
- ASSERT_TRUE(WCOREDUMP(status[i]));
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
- }
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_invalid_paths)
-{
- ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/.."));
- ASSERT_FALSE(set_core_pattern("@.."));
-
- ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/.."));
- ASSERT_FALSE(set_core_pattern("@@.."));
-
- ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket"));
-}
-
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
index b12f1f9babf8..b925756373ce 100644
--- a/tools/testing/selftests/dma/dma_map_benchmark.c
+++ b/tools/testing/selftests/dma/dma_map_benchmark.c
@@ -118,7 +118,7 @@ int main(int argc, char **argv)
}
printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n",
- threads, seconds, node, dir[directions], granule);
+ threads, seconds, node, directions[dir], granule);
printf("average map latency(us):%.1f standard deviation:%.1f\n",
map.avg_map_100ns/10.0, map.map_stddev/10.0);
printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index c43a69dffd83..a0c64f415a7f 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -487,7 +487,7 @@ int setup_userns(void)
uid_t uid = getuid();
gid_t gid = getgid();
- ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+ ret = unshare(CLONE_NEWNS|CLONE_NEWUSER);
if (ret) {
ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
strerror(errno));
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
new file mode 100644
index 000000000000..7daf7292209e
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
@@ -0,0 +1,107 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Basic tests on writing to trace_marker_raw
+# requires: trace_marker_raw
+# flags: instance
+
+is_little_endian() {
+ if lscpu | grep -q 'Little Endian'; then
+ echo 1;
+ else
+ echo 0;
+ fi
+}
+
+little=`is_little_endian`
+
+make_str() {
+ id=$1
+ cnt=$2
+
+ if [ $little -eq 1 ]; then
+ val=`printf "\\%03o\\%03o\\%03o\\%03o" \
+ $(($id & 0xff)) \
+ $((($id >> 8) & 0xff)) \
+ $((($id >> 16) & 0xff)) \
+ $((($id >> 24) & 0xff))`
+ else
+ val=`printf "\\%03o\\%03o\\%03o\\%03o" \
+ $((($id >> 24) & 0xff)) \
+ $((($id >> 16) & 0xff)) \
+ $((($id >> 8) & 0xff)) \
+ $(($id & 0xff))`
+ fi
+
+ data=`printf -- 'X%.0s' $(seq $cnt)`
+
+ printf "${val}${data}"
+}
+
+write_buffer() {
+ id=$1
+ size=$2
+
+ # write the string into the raw marker
+ make_str $id $size > trace_marker_raw
+}
+
+
+test_multiple_writes() {
+
+ # Write a bunch of data where the id is the count of
+ # data to write
+ for i in `seq 1 10` `seq 101 110` `seq 1001 1010`; do
+ write_buffer $i $i
+ done
+
+ # add one trailing entry after the 30 raw writes; the awk below exits once NR > 30
+ echo stop > trace_marker
+
+ # Check that the number of items printed equals the id rounded up to a multiple of 4
+ awk '/.*: # [0-9a-f]* / {
+ print;
+ cnt = -1;
+ for (i = 0; i < NF; i++) {
+ # The counter is after the "#" marker
+ if ( $i == "#" ) {
+ i++;
+ cnt = strtonum("0x" $i);
+ num = NF - (i + 1);
+ # The number of items is always rounded up to a multiple of 4
+ cnt2 = int((cnt + 3) / 4) * 4;
+ if (cnt2 != num) {
+ exit 1;
+ }
+ break;
+ }
+ }
+ }
+ // { if (NR > 30) { exit 0; } } ' trace_pipe;
+}
+
+
+get_buffer_data_size() {
+ sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+test_buffer() {
+
+ # The id must be four bytes, test that 3 bytes fails a write
+ if echo -n abc > ./trace_marker_raw ; then
+ echo "Too small of write expected to fail but did not"
+ exit_fail
+ fi
+
+ size=`get_buffer_data_size`
+ echo size = $size
+
+ # Now write the 4-byte id plus $size data bytes, slightly more than the reported data size
+
+ if write_buffer 0xdeadbeef $size ; then
+ echo "Too big of write expected to fail but did not"
+ exit_fail
+ fi
+}
+
+test_buffer
+test_multiple_writes
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc
index 2506f464811b..47067a5e3cb0 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc
@@ -28,25 +28,21 @@ test -d events/fprobes/myevent1
test -d events/fprobes/myevent2
echo 1 > events/fprobes/myevent1/enable
-# Make sure the event is attached and is the only one
+# Make sure the event is attached.
grep -q $PLACE enabled_functions
cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 1)) ]; then
+if [ $cnt -eq $ocnt ]; then
exit_fail
fi
echo 1 > events/fprobes/myevent2/enable
-# It should till be the only attached function
-cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 1)) ]; then
- exit_fail
-fi
+cnt2=`cat enabled_functions | wc -l`
echo 1 > events/fprobes/myevent3/enable
# If the function is different, the attached function should be increased
grep -q $PLACE2 enabled_functions
cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 2)) ]; then
+if [ $cnt -eq $cnt2 ]; then
exit_fail
fi
@@ -56,12 +52,6 @@ echo "-:myevent2" >> dynamic_events
grep -q myevent1 dynamic_events
! grep -q myevent2 dynamic_events
-# should still have 2 left
-cnt=`cat enabled_functions | wc -l`
-if [ $cnt -ne $((ocnt + 2)) ]; then
- exit_fail
-fi
-
echo 0 > events/fprobes/enable
echo > dynamic_events
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc
new file mode 100644
index 000000000000..c1f1cafa30f3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc
@@ -0,0 +1,40 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - enable/disable tracepoint probe events
+# requires: dynamic_events "t[:[<group>/][<event>]] <tracepoint> [<args>]":README
+
+echo 0 > events/enable
+echo > dynamic_events
+
+TRACEPOINT=sched_switch
+ENABLEFILE=events/tracepoints/myprobe/enable
+
+:;: "Add tracepoint event on $TRACEPOINT" ;:
+
+echo "t:myprobe ${TRACEPOINT}" >> dynamic_events
+
+:;: "Check enable/disable to ensure it works" ;:
+
+echo 1 > $ENABLEFILE
+
+grep -q $TRACEPOINT trace
+
+echo 0 > $ENABLEFILE
+
+echo > trace
+
+! grep -q $TRACEPOINT trace
+
+:;: "Repeat enable/disable to ensure it works" ;:
+
+echo 1 > $ENABLEFILE
+
+grep -q $TRACEPOINT trace
+
+echo 0 > $ENABLEFILE
+
+echo > trace
+
+! grep -q $TRACEPOINT trace
+
+exit 0
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
index 2c3c58e65a41..3a62039fa621 100644
--- a/tools/testing/selftests/kselftest/runner.sh
+++ b/tools/testing/selftests/kselftest/runner.sh
@@ -44,6 +44,12 @@ tap_timeout()
fi
}
+report_failure()
+{
+ echo "not ok $*"
+ echo "$*" >> "$kselftest_failures_file"
+}
+
run_one()
{
DIR="$1"
@@ -105,7 +111,7 @@ run_one()
echo "# $TEST_HDR_MSG"
if [ ! -e "$TEST" ]; then
echo "# Warning: file $TEST is missing!"
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
else
if [ -x /usr/bin/stdbuf ]; then
stdbuf="/usr/bin/stdbuf --output=L "
@@ -123,7 +129,7 @@ run_one()
interpreter=$(head -n 1 "$TEST" | cut -c 3-)
cmd="$stdbuf $interpreter ./$BASENAME_TEST"
else
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
return
fi
fi
@@ -137,9 +143,9 @@ run_one()
echo "ok $test_num $TEST_HDR_MSG # SKIP"
elif [ $rc -eq $timeout_rc ]; then \
echo "#"
- echo "not ok $test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
+ report_failure "$test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
else
- echo "not ok $test_num $TEST_HDR_MSG # exit=$rc"
+ report_failure "$test_num $TEST_HDR_MSG # exit=$rc"
fi)
cd - >/dev/null
fi
diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c
index 5e24f77868b5..c4815d365816 100644
--- a/tools/testing/selftests/kvm/arm64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c
@@ -268,7 +268,9 @@ static void guest_code(void)
/* Return a safe value to a given ftr_bits an ftr value */
uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
{
- uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
+ uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift;
+
+ TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features");
if (ftr_bits->sign == FTR_UNSIGNED) {
switch (ftr_bits->type) {
@@ -320,7 +322,9 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
/* Return an invalid value to a given ftr_bits an ftr value */
uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
{
- uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
+ uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift;
+
+ TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features");
if (ftr_bits->sign == FTR_UNSIGNED) {
switch (ftr_bits->type) {
@@ -672,7 +676,7 @@ static void test_clidr(struct kvm_vcpu *vcpu)
clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1));
/* find the first empty level in the cache hierarchy */
- for (level = 1; level < 7; level++) {
+ for (level = 1; level <= 7; level++) {
if (!CLIDR_CTYPE(clidr, level))
break;
}
diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
index 46991a029f7c..8ec0cb64ad94 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -10,7 +10,11 @@ SYSFS_KERNEL_DIR="/sys/kernel"
SYSFS_KLP_DIR="$SYSFS_KERNEL_DIR/livepatch"
SYSFS_DEBUG_DIR="$SYSFS_KERNEL_DIR/debug"
SYSFS_KPROBES_DIR="$SYSFS_DEBUG_DIR/kprobes"
-SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing"
+if [[ -e /sys/kernel/tracing/trace ]]; then
+ SYSFS_TRACING_DIR="$SYSFS_KERNEL_DIR/tracing"
+else
+ SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing"
+fi
# Kselftest framework requirement - SKIP code is 4
ksft_skip=4
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
index ccfb40837a73..0989e80da457 100644
--- a/tools/testing/selftests/namespaces/.gitignore
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -1,3 +1,12 @@
nsid_test
file_handle_test
init_ino_test
+ns_active_ref_test
+listns_test
+listns_permissions_test
+listns_efault_test
+siocgskns_test
+cred_change_test
+stress_test
+listns_pagination_bug
+regression_pidfd_setns_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
index 5fe4b3dc07d3..fbb821652c17 100644
--- a/tools/testing/selftests/namespaces/Makefile
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -1,7 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
-TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test
+TEST_GEN_PROGS := nsid_test \
+ file_handle_test \
+ init_ino_test \
+ ns_active_ref_test \
+ listns_test \
+ listns_permissions_test \
+ listns_efault_test \
+ siocgskns_test \
+ cred_change_test \
+ stress_test \
+ listns_pagination_bug \
+ regression_pidfd_setns_test
include ../lib.mk
+$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c
+$(OUTPUT)/listns_test: ../filesystems/utils.c
+$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c
+$(OUTPUT)/listns_efault_test: ../filesystems/utils.c
+$(OUTPUT)/siocgskns_test: ../filesystems/utils.c
+$(OUTPUT)/cred_change_test: ../filesystems/utils.c
+$(OUTPUT)/stress_test: ../filesystems/utils.c
+$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c
+$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c
+
diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c
new file mode 100644
index 000000000000..7b4f5ad3f725
--- /dev/null
+++ b/tools/testing/selftests/namespaces/cred_change_test.c
@@ -0,0 +1,814 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test credential changes and their impact on namespace active references.
+ */
+
+/*
+ * Test setuid() in a user namespace properly swaps active references.
+ * Create a user namespace with multiple UIDs mapped, then setuid() between them.
+ * Verify that the user namespace remains active throughout.
+ */
+TEST(setuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setuid_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with multiple UIDs mapped (0-9) */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send namespace ID to parent */
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /*
+ * Perform multiple setuid() calls.
+ * Each setuid() triggers commit_creds() which should properly
+ * swap active references via switch_cred_namespaces().
+ */
+ for (setuid_count = 0; setuid_count < 50; setuid_count++) {
+ uid_t target_uid = (setuid_count % 10);
+ if (setuid(target_uid) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id);
+
+ /* Verify namespace is active while child is running */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive after child exits */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setgid() in a user namespace properly handles active references.
+ */
+TEST(setgid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setgid_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with multiple GIDs mapped */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setgid() calls */
+ for (setgid_count = 0; setgid_count < 50; setgid_count++) {
+ gid_t target_gid = (setgid_count % 10);
+ if (setgid(target_gid) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setgid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setresuid() which changes real, effective, and saved UIDs.
+ * This should properly swap active references via commit_creds().
+ */
+TEST(setresuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setres_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /* Perform multiple setresuid() calls */
+ for (setres_count = 0; setres_count < 30; setres_count++) {
+ uid_t uid1 = (setres_count % 5);
+ uid_t uid2 = ((setres_count + 1) % 5);
+ uid_t uid3 = ((setres_count + 2) % 5);
+
+ if (setresuid(uid1, uid2, uid3) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setresuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test credential changes across multiple user namespaces.
+ * Create nested user namespaces and verify active reference tracking.
+ */
+TEST(cred_change_nested_userns)
+{
+ pid_t pid;
+ int status;
+ __u64 parent_userns_id, child_userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found_parent = false, found_child = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 parent_id, child_id;
+ uid_t orig_uid = getuid();
+
+ close(pipefd[0]);
+
+ /* Create first user namespace */
+ userns_fd = get_userns_fd(0, orig_uid, 1);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get first namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create nested user namespace */
+ userns_fd = get_userns_fd(0, 0, 1);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get nested namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send both IDs to parent */
+ write(pipefd[1], &parent_id, sizeof(parent_id));
+ write(pipefd[1], &child_id, sizeof(child_id));
+
+ /* Perform some credential changes in nested namespace */
+ setuid(0);
+ setgid(0);
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ /* Read both namespace IDs */
+ if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get parent namespace ID");
+ }
+
+ if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get child namespace ID");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Parent userns: %llu, Child userns: %llu",
+ (unsigned long long)parent_userns_id,
+ (unsigned long long)child_userns_id);
+
+ /* Verify both namespaces are active */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_userns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_userns_id)
+ found_child = true;
+ }
+
+ ASSERT_TRUE(found_parent);
+ ASSERT_TRUE(found_child);
+
+ /* Wait for child */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify both namespaces become inactive */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_parent = false;
+ found_child = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_userns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_userns_id)
+ found_child = true;
+ }
+
+ ASSERT_FALSE(found_parent);
+ ASSERT_FALSE(found_child);
+ TH_LOG("Nested user namespace credential changes preserved active refs (no leak)");
+}
+
+/*
+ * Hammer the credential-changing syscalls from inside a fresh user
+ * namespace and verify that the namespace is no longer listed once the
+ * child is gone, i.e. that no active reference was leaked.
+ * This stress-tests the switch_cred_namespaces() logic.
+ */
+TEST(rapid_cred_changes_no_leak)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	__u64 userns_id;
+	int pipefd[2];
+	bool found = false;
+	ssize_t ret;
+	pid_t pid;
+	int status;
+	int i;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: join a new user namespace, report its ID, churn creds */
+		__u64 child_userns_id;
+		uid_t orig_uid = getuid();
+		int nsfd, userns_fd;
+		int round;
+
+		close(pipefd[0]);
+
+		/* Create new user namespace with wider range of UIDs/GIDs */
+		userns_fd = get_userns_fd(0, orig_uid, 100);
+		if (userns_fd < 0)
+			goto child_fail;
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			goto child_fail;
+		}
+		close(userns_fd);
+
+		/* Look up the ID of the namespace we are now in */
+		nsfd = open("/proc/self/ns/user", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+
+		if (ioctl(nsfd, NS_GET_ID, &child_userns_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+		/*
+		 * 200 rounds cycling through setuid, setgid, setreuid,
+		 * setregid, setresuid and setresgid. The results are
+		 * deliberately not checked; only the churn matters.
+		 */
+		for (round = 0; round < 200; round++) {
+			int id0 = round % 50;
+			int id1 = (round + 1) % 50;
+			int id2 = (round + 2) % 50;
+
+			switch (round % 6) {
+			case 0:
+				setuid(id0);
+				break;
+			case 1:
+				setgid(id0);
+				break;
+			case 2:
+				setreuid(id0, id1);
+				break;
+			case 3:
+				setregid(id0, id1);
+				break;
+			case 4:
+				setresuid(id0, id1, id2);
+				break;
+			case 5:
+				setresgid(id0, id1, id2);
+				break;
+			}
+		}
+
+		close(pipefd[1]);
+		exit(0);
+
+child_fail:
+		close(pipefd[1]);
+		exit(1);
+	}
+
+	/* Parent: fetch the namespace ID reported by the child */
+	close(pipefd[1]);
+
+	ret = read(pipefd[0], &userns_id, sizeof(userns_id));
+	close(pipefd[0]);
+	if (ret != sizeof(userns_id)) {
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from child");
+	}
+
+	TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* With the child gone the namespace must not be listed any more */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	for (i = 0; i < ret && !found; i++)
+		found = (ns_ids[i] == userns_id);
+
+	ASSERT_FALSE(found);
+	TH_LOG("200 rapid credential changes completed with no active ref leak");
+}
+
+/*
+ * Same leak check as above, but driven through setfsuid()/setfsgid(),
+ * which change the filesystem UID/GID. These also trigger credential
+ * changes but may have different code paths.
+ */
+TEST(setfsuid_preserves_active_refs)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	__u64 userns_id;
+	int pipefd[2];
+	bool found = false;
+	ssize_t ret;
+	pid_t pid;
+	int status;
+	int i;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: join a new user namespace, report its ID, churn fs IDs */
+		__u64 child_userns_id;
+		uid_t orig_uid = getuid();
+		int nsfd, userns_fd;
+		int round;
+
+		close(pipefd[0]);
+
+		/* Create new user namespace */
+		userns_fd = get_userns_fd(0, orig_uid, 10);
+		if (userns_fd < 0)
+			goto child_fail;
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			goto child_fail;
+		}
+		close(userns_fd);
+
+		/* Look up the ID of the namespace we are now in */
+		nsfd = open("/proc/self/ns/user", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+
+		if (ioctl(nsfd, NS_GET_ID, &child_userns_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+		/* 50 rounds of setfsuid/setfsgid over the IDs 0..9 */
+		for (round = 0; round < 50; round++) {
+			int id = round % 10;
+
+			setfsuid(id);
+			setfsgid(id);
+		}
+
+		close(pipefd[1]);
+		exit(0);
+
+child_fail:
+		close(pipefd[1]);
+		exit(1);
+	}
+
+	/* Parent: fetch the namespace ID reported by the child */
+	close(pipefd[1]);
+
+	ret = read(pipefd[0], &userns_id, sizeof(userns_id));
+	close(pipefd[0]);
+	if (ret != sizeof(userns_id)) {
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from child");
+	}
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* With the child gone the namespace must not be listed any more */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	for (i = 0; i < ret && !found; i++)
+		found = (ns_ids[i] == userns_id);
+
+	ASSERT_FALSE(found);
+	TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644
index 000000000000..c7ed4023d7a8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * The output buffer is placed so that only its first entry lies in mapped
+ * memory; asking for two entries makes the kernel run into the unmapped
+ * page while copying results out. The result of those calls is not
+ * checked here (apart from ENOSYS): the point is that the kernel copes
+ * with the fault.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pids[5], iter_pid;
+	int pidfds[5];
+	int sv[5][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/*
+	 * Map two pages:
+	 * - First page: readable and writable
+	 * - Second page: will be unmapped to trigger the fault
+	 */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position the buffer pointer so there's room for exactly one u64
+	 * before the page boundary. The second u64 would fall into the
+	 * unmapped page.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+	/*
+	 * Create a separate process to run listns() in a loop concurrently
+	 * with namespace creation and destruction.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0, /* All types */
+			.spare2 = 0,
+			.user_ns_id = 0, /* Global listing */
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed. Each call asks for two
+		 * IDs although only the first slot is writable, while
+		 * namespaces are being destroyed concurrently. Exit early
+		 * with PIDFD_SKIP if the syscall does not exist.
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create several child processes, each in its own mount namespace.
+	 * These will be destroyed while the iterator is running listns().
+	 */
+	for (i = 0; i < 5; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+		if (pids[i] == -1) {
+			/* Don't leave the iterator spinning when we bail out */
+			sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+			waitpid(iter_pid, NULL, 0);
+		}
+		ASSERT_NE(pids[i], -1);
+
+		if (pids[i] == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create a couple of tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 5; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/*
+	 * Signal children to exit. This will destroy their mount namespaces
+	 * while listns() is iterating the namespace tree.
+	 * This tests that cleanup happens outside the RCU read lock.
+	 */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/*
+	 * Reap exactly the mount namespace children. Waiting on -1 could
+	 * pick up the iterator instead if it has already exited.
+	 */
+	for (i = 0; i < 5; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	/* Clean up */
+	munmap(map, page_size);
+
+	/* The iterator bails out on its own when listns() is missing */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Otherwise it should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Sanity check for basic invalid pointer detection: hand listns() an
+ * output buffer that is invalid in its entirety.
+ */
+TEST(listns_complete_fault)
+{
+	/* Every field other than .size is zero: all types, global listing */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+	};
+	/* Use a clearly invalid pointer */
+	__u64 *bad_buf = (__u64 *)0xdeadbeef;
+	ssize_t ret = sys_listns(&req, bad_buf, 10, 0);
+
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+
+	/* Should fail with EFAULT */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * A NULL output buffer combined with a non-zero entry count must be
+ * rejected by listns().
+ */
+TEST(listns_null_buffer)
+{
+	/* Every field other than .size is zero: all types, global listing */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+	};
+	ssize_t ret = sys_listns(&req, NULL, 10, 0);
+
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+
+	/* Should fail with EFAULT */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() with a buffer that becomes invalid mid-iteration
+ * (after several successful writes), combined with mount namespace
+ * destruction to test RCU cleanup logic.
+ *
+ * The result of the listns() calls is not checked (apart from ENOSYS).
+ */
+TEST(listns_late_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pids[10], iter_pid;
+	int pidfds[10];
+	int sv[10][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Map two pages */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position buffer so that five u64s fit before the page boundary;
+	 * anything beyond that falls into the unmapped page.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * Request 10 namespace IDs (only 5 slots are writable) while
+		 * namespaces are being destroyed. Exit early with PIDFD_SKIP
+		 * if the syscall does not exist.
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create more children with mount namespaces to increase the
+	 * likelihood that namespace cleanup happens during iteration.
+	 */
+	for (i = 0; i < 10; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+		if (pids[i] == -1) {
+			/* Don't leave the iterator spinning when we bail out */
+			sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+			waitpid(iter_pid, NULL, 0);
+		}
+		ASSERT_NE(pids[i], -1);
+
+		if (pids[i] == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 10; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Tell the first half of the children to exit */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Small delay to let some exit */
+	usleep(10000);
+
+	/* Tell the remaining children to exit */
+	for (i = 5; i < 10; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/*
+	 * Reap exactly the mount namespace children. Waiting on -1 could
+	 * pick up the iterator instead if it has already exited.
+	 */
+	for (i = 0; i < 10; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	/* Clean up */
+	munmap(map, page_size);
+
+	/* The iterator bails out on its own when listns() is missing */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Otherwise it should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Test specifically focused on mount namespace cleanup during a faulting
+ * listns() call. Filter for mount namespaces only.
+ *
+ * The result of the listns() calls is not checked (apart from ENOSYS).
+ */
+TEST(listns_mnt_ns_cleanup_on_fault)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pids[8], iter_pid;
+	int pidfds[8];
+	int sv[8][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Set up partial fault buffer: two pages, second one unmapped */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* Room for 3 u64s before the page boundary, then the unmapped page */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNS, /* Only mount namespaces */
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed, racing with namespace
+		 * destruction. Exit early with PIDFD_SKIP if the syscall
+		 * does not exist.
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/* Create children with mount namespaces */
+	for (i = 0; i < 8; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+		if (pids[i] == -1) {
+			/* Don't leave the iterator spinning when we bail out */
+			sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+			waitpid(iter_pid, NULL, 0);
+		}
+		ASSERT_NE(pids[i], -1);
+
+		if (pids[i] == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Do some mount operations to make cleanup more interesting */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 8; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Tell children to exit to trigger namespace destruction during iteration */
+	for (i = 0; i < 8; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/*
+	 * Reap exactly the mount namespace children. Waiting on -1 could
+	 * pick up the iterator instead if it has already exited.
+	 */
+	for (i = 0; i < 8; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	munmap(map, page_size);
+
+	/* The iterator bails out on its own when listns() is missing */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Otherwise it should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c
new file mode 100644
index 000000000000..da7d33f96397
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * SIGKILL the children pids[first_kill..n-1], then reap all n children.
+ * Children below first_kill are expected to exit on their own.
+ */
+static void kill_and_reap(const pid_t *pids, int first_kill, int n)
+{
+	int j;
+
+	for (j = first_kill; j < n; j++)
+		kill(pids[j], SIGKILL);
+	for (j = 0; j < n; j++)
+		waitpid(pids[j], NULL, 0);
+}
+
+/*
+ * Minimal test case to reproduce KASAN out-of-bounds in listns pagination.
+ *
+ * The bug occurs when:
+ * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER)
+ * 2. Using pagination (req.ns_id != 0)
+ * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of
+ *    the filtered type, causing it to search the unified tree and potentially
+ *    return a namespace of the wrong type.
+ *
+ * All children share one socketpair: each sends one byte when ready and
+ * consumes one byte as its signal to exit.
+ */
+TEST(pagination_with_type_filter)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER, /* Filter by user namespace */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	pid_t pids[10];
+	int num_children = 10;
+	int i;
+	int sv[2];
+	__u64 first_batch[3];
+	ssize_t ret;
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create children with user namespaces */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			/* Initialized: this byte is what gets sent as "ready" */
+			char c = 'R';
+
+			close(sv[0]);
+
+			if (setup_userns() < 0) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Signal parent we're ready */
+			if (write(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready */
+	for (i = 0; i < num_children; i++) {
+		char c;
+
+		if (read(sv[0], &c, 1) != 1) {
+			close(sv[0]);
+			kill_and_reap(pids, 0, num_children);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* First batch - this should work */
+	ret = sys_listns(&req, first_batch, 3, 0);
+	if (ret < 0) {
+		if (errno == ENOSYS) {
+			close(sv[0]);
+			kill_and_reap(pids, 0, num_children);
+			SKIP(return, "listns() not supported");
+		}
+		ASSERT_GE(ret, 0);
+	}
+
+	TH_LOG("First batch returned %zd entries", ret);
+
+	/* Only a full first batch leaves something to paginate from */
+	if (ret == 3) {
+		__u64 second_batch[3];
+
+		/* Second batch - pagination triggers the bug */
+		req.ns_id = first_batch[2]; /* Continue from last ID */
+		ret = sys_listns(&req, second_batch, 3, 0);
+
+		TH_LOG("Second batch returned %zd entries", ret);
+		ASSERT_GE(ret, 0);
+	}
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+
+		if (write(sv[0], &c, 1) != 1) {
+			close(sv[0]);
+			/* Children before i already got their exit byte */
+			kill_and_reap(pids, i, num_children);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Cleanup */
+	for (i = 0; i < num_children; i++)
+		waitpid(pids[i], NULL, 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c
new file mode 100644
index 000000000000..82d818751a5f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_permissions_test.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test that unprivileged users can only see namespaces they're currently in.
+ * The child enters a new user namespace, creates a network namespace there,
+ * lists network namespaces and reports whether its own one showed up and
+ * how many other IDs were returned.
+ */
+TEST(listns_unprivileged_current_only)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[100];
+	bool found_ours = false;
+	int unexpected_count = 0;
+	int pipefd[2];
+	ssize_t ret;
+	pid_t pid;
+	int status;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		__u64 our_netns_id;
+		bool seen_own = false;
+		int others = 0;
+		int nsfd;
+
+		close(pipefd[0]);
+
+		/* New user namespace first, then a network namespace inside it */
+		if (setup_userns() < 0 || unshare(CLONE_NEWNET) < 0)
+			goto child_fail;
+
+		/* Get our network namespace ID */
+		nsfd = open("/proc/self/ns/net", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+
+		if (ioctl(nsfd, NS_GET_ID, &our_netns_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		/* List all network namespaces from inside the new user namespace */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0)
+			goto child_fail;
+
+		/*
+		 * Anything that is not our own namespace is counted; it is
+		 * either init_net (which we can see) or unexpected.
+		 */
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == our_netns_id)
+				seen_own = true;
+			else
+				others++;
+		}
+
+		/* Send results to parent */
+		write(pipefd[1], &seen_own, sizeof(seen_own));
+		write(pipefd[1], &others, sizeof(others));
+		close(pipefd[1]);
+		exit(0);
+
+child_fail:
+		close(pipefd[1]);
+		exit(1);
+	}
+
+	/* Parent: the defaults above stay in place if the child sent nothing */
+	close(pipefd[1]);
+
+	read(pipefd[0], &found_ours, sizeof(found_ours));
+	read(pipefd[0], &unexpected_count, sizeof(unexpected_count));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Child should have seen its own namespace */
+	ASSERT_TRUE(found_ours);
+
+	TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)",
+	       unexpected_count);
+}
+
+/*
+ * Test that users with CAP_SYS_ADMIN in a user namespace can see
+ * all namespaces owned by that user namespace.
+ *
+ * The child creates a user namespace plus net, uts and ipc namespaces
+ * inside it, lists by that user namespace's ID and reports whether at
+ * least four entries came back.
+ */
+TEST(listns_cap_sys_admin_in_userns)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0, /* All types */
+		.spare2 = 0,
+		.user_ns_id = 0, /* Will be set to our created user namespace */
+	};
+	__u64 ns_ids[100];
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool success;
+	ssize_t count;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 userns_id;
+		ssize_t ret;
+		int min_expected;
+		bool enough;
+
+		close(pipefd[0]);
+
+		/* Create user namespace - we'll have CAP_SYS_ADMIN in it */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get the user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/*
+		 * Create several namespaces owned by this user namespace.
+		 * The count check below relies on all three existing, so a
+		 * failure here is reported as a setup failure rather than
+		 * surfacing later as a confusing count mismatch.
+		 */
+		if (unshare(CLONE_NEWNET) < 0 ||
+		    unshare(CLONE_NEWUTS) < 0 ||
+		    unshare(CLONE_NEWIPC) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List namespaces owned by our user namespace */
+		req.user_ns_id = userns_id;
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/*
+		 * We have CAP_SYS_ADMIN in this user namespace,
+		 * so we should see all namespaces owned by it.
+		 * That includes: net, uts, ipc, and the user namespace itself.
+		 */
+		min_expected = 4;
+		enough = (ret >= min_expected);
+
+		write(pipefd[1], &enough, sizeof(enough));
+		write(pipefd[1], &ret, sizeof(ret));
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent: defaults stay in place if the child sent nothing */
+	close(pipefd[1]);
+
+	success = false;
+	count = 0;
+	read(pipefd[0], &success, sizeof(success));
+	read(pipefd[0], &count, sizeof(count));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(success);
+	TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace",
+	       count);
+}
+
+/*
+ * Test that users cannot see namespaces from unrelated user namespaces.
+ * Create two sibling user namespaces, verify they can't see each other's
+ * owned namespaces.
+ *
+ * The first child (A) must stay alive, and with it its network namespace,
+ * until the second child (B) has done its listing. A therefore blocks on
+ * the read end of a dedicated pipe and exits once the parent closes the
+ * write end.
+ */
+TEST(listns_cannot_see_sibling_userns_namespaces)
+{
+	int pipefd[2];		/* child A -> parent: network namespace ID */
+	int keepalive[2];	/* parent -> child A: EOF means "exit now" */
+	int pipefd2[2];		/* child B -> parent: lookup result */
+	pid_t pid1, pid2;
+	int status;
+	__u64 netns_a_id;
+	bool found_sibling_netns;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(keepalive), 0);
+
+	/* Fork first child - creates user namespace A */
+	pid1 = fork();
+	ASSERT_GE(pid1, 0);
+
+	if (pid1 == 0) {
+		int fd;
+		__u64 child_netns_id;
+		char buf;
+
+		close(pipefd[0]);
+		close(keepalive[1]);
+
+		/* Create user namespace A */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create network namespace owned by user namespace A */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get network namespace ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_netns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Send namespace ID to parent */
+		write(pipefd[1], &child_netns_id, sizeof(child_netns_id));
+		close(pipefd[1]);
+
+		/*
+		 * Keep alive for sibling to check: this read returns (with
+		 * EOF) only once every write end of the pipe is closed.
+		 */
+		read(keepalive[0], &buf, 1);
+		close(keepalive[0]);
+		exit(0);
+	}
+
+	/* Parent reads namespace A ID */
+	close(pipefd[1]);
+	close(keepalive[0]);
+	netns_a_id = 0;
+	if (read(pipefd[0], &netns_a_id, sizeof(netns_a_id)) != sizeof(netns_a_id)) {
+		/* Child A failed its setup; without an ID there is nothing to test */
+		close(pipefd[0]);
+		close(keepalive[1]);
+		waitpid(pid1, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from first child");
+	}
+	close(pipefd[0]);
+
+	TH_LOG("User namespace A created network namespace with ID %llu",
+	       (unsigned long long)netns_a_id);
+
+	/* Fork second child - creates user namespace B */
+	ASSERT_EQ(pipe(pipefd2), 0);
+
+	pid2 = fork();
+	ASSERT_GE(pid2, 0);
+
+	if (pid2 == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNET,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		__u64 ns_ids[100];
+		ssize_t ret;
+		bool found = false;
+
+		/* Don't hold child A's keepalive pipe open from here */
+		close(keepalive[1]);
+		close(pipefd2[0]);
+
+		/* Create user namespace B (sibling to A) */
+		if (setup_userns() < 0) {
+			close(pipefd2[1]);
+			exit(1);
+		}
+
+		/*
+		 * Try to list all network namespaces.
+		 * NOTE(review): a failing listns() (including ENOSYS) is
+		 * reported as "not found", so the test passes in that case.
+		 */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == netns_a_id) {
+				found = true;
+				break;
+			}
+		}
+
+		/* We should NOT see the sibling's network namespace */
+		write(pipefd2[1], &found, sizeof(found));
+		close(pipefd2[1]);
+		exit(0);
+	}
+
+	/* Parent reads result from second child */
+	close(pipefd2[1]);
+	found_sibling_netns = false;
+	read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns));
+	close(pipefd2[0]);
+
+	/* Signal first child to exit */
+	close(keepalive[1]);
+
+	/* Wait for both children */
+	waitpid(pid2, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	waitpid(pid1, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	/* Second child should NOT have seen first child's namespace */
+	ASSERT_FALSE(found_sibling_netns);
+	TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace");
+}
+
+/*
+ * Test permission checking with LISTNS_CURRENT_USER.
+ * The child enters a new user namespace, creates net and uts namespaces
+ * in it and lists with user_ns_id = LISTNS_CURRENT_USER; at least three
+ * entries (user, net, uts) are expected back.
+ */
+TEST(listns_current_user_permissions)
+{
+	bool success = false;
+	ssize_t count = 0;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = LISTNS_CURRENT_USER,
+		};
+		__u64 ns_ids[100];
+		ssize_t nr;
+		bool enough;
+
+		close(pipefd[0]);
+
+		/*
+		 * Create the user namespace, then some namespaces owned by
+		 * it; any failure aborts the child.
+		 */
+		if (setup_userns() < 0 ||
+		    unshare(CLONE_NEWNET) < 0 ||
+		    unshare(CLONE_NEWUTS) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List with LISTNS_CURRENT_USER - should see our owned namespaces */
+		nr = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+		enough = (nr >= 3); /* At least user, net, uts */
+		write(pipefd[1], &enough, sizeof(enough));
+		write(pipefd[1], &nr, sizeof(nr));
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent: the defaults above stay in place if the child sent nothing */
+	close(pipefd[1]);
+
+	read(pipefd[0], &success, sizeof(success));
+	read(pipefd[0], &count, sizeof(count));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(success);
+	TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count);
+}
+
+/*
+ * Test that CAP_SYS_ADMIN in parent user namespace allows seeing
+ * child user namespace's owned namespaces.
+ *
+ * A forked child calls setup_userns() twice, recording the user namespace
+ * ID after each call (first = "parent", second = "child"), unshares a
+ * network namespace, then lists namespaces with user_ns_id set to the
+ * parent ID. It reports over a pipe whether the child user namespace ID
+ * appeared in the result, followed by the raw listns() return value.
+ */
+TEST(listns_parent_userns_cap_sys_admin)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_child_userns;
+ ssize_t count;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 parent_userns_id;
+ __u64 child_userns_id;
+ struct ns_id_req req;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ /* Shadows the parent's variable of the same name */
+ bool found_child_userns;
+
+ close(pipefd[0]);
+
+ /* Create parent user namespace - we have CAP_SYS_ADMIN in it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get parent user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get child user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create namespaces owned by child user namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* List namespaces owned by parent user namespace */
+ /* Every field is assigned explicitly because req has no initializer */
+ req.size = sizeof(req);
+ req.spare = 0;
+ req.ns_id = 0;
+ req.ns_type = 0;
+ req.spare2 = 0;
+ req.user_ns_id = parent_userns_id;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* Should see child user namespace in the list */
+ found_child_userns = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == child_userns_id) {
+ found_child_userns = true;
+ break;
+ }
+ }
+ }
+
+ /* Flag first, then the count; the parent reads them in this order */
+ write(pipefd[1], &found_child_userns, sizeof(found_child_userns));
+ write(pipefd[1], &ret, sizeof(ret));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ /* Defaults are kept if the child exits before writing anything */
+ found_child_userns = false;
+ count = 0;
+ read(pipefd[0], &found_child_userns, sizeof(found_child_userns));
+ read(pipefd[0], &count, sizeof(count));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(found_child_userns);
+ TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)",
+ count);
+}
+
+/*
+ * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of.
+ * This is different from seeing namespaces owned by a user namespace.
+ *
+ * A forked child calls setup_userns(), reads its user namespace ID via
+ * NS_GET_ID, and performs a global listing (user_ns_id = 0) filtered to
+ * CLONE_NEWUSER. It reports over a pipe whether its own ID was returned.
+ */
+TEST(listns_cap_sys_admin_inside_userns)
+{
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool found_ours;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ __u64 our_userns_id;
+ struct ns_id_req req;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ /* Shadows the parent's 'found_ours' */
+ bool found_ours;
+
+ close(pipefd[0]);
+
+ /* Create user namespace - we have CAP_SYS_ADMIN inside it */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get our user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* List all user namespaces globally */
+ req.size = sizeof(req);
+ req.spare = 0;
+ req.ns_id = 0;
+ req.ns_type = CLONE_NEWUSER;
+ req.spare2 = 0;
+ req.user_ns_id = 0;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* We should be able to see our own user namespace */
+ found_ours = false;
+ if (ret > 0) {
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == our_userns_id) {
+ found_ours = true;
+ break;
+ }
+ }
+ }
+
+ write(pipefd[1], &found_ours, sizeof(found_ours));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ /* Default is kept if the child exits before writing anything */
+ found_ours = false;
+ read(pipefd[0], &found_ours, sizeof(found_ours));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(found_ours);
+ TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of");
+}
+
+/*
+ * Test that dropping CAP_SYS_ADMIN restricts what we can see.
+ *
+ * The test is skipped unless the calling process has CAP_SYS_ADMIN in its
+ * effective set. A forked child then calls setup_userns(), counts network
+ * namespaces listed with LISTNS_CURRENT_USER, clears CAP_SYS_ADMIN from
+ * its effective and permitted sets, counts again, and reports whether the
+ * second count is less than or equal to the first.
+ */
+TEST(listns_drop_cap_sys_admin)
+{
+ cap_t caps;
+ cap_value_t cap_list[1] = { CAP_SYS_ADMIN };
+
+ /* This test needs to start with CAP_SYS_ADMIN */
+ caps = cap_get_proc();
+ if (!caps) {
+ SKIP(return, "Cannot get capabilities");
+ }
+
+ cap_flag_value_t cap_val;
+ if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) {
+ cap_free(caps);
+ SKIP(return, "Cannot check CAP_SYS_ADMIN");
+ }
+
+ if (cap_val != CAP_SET) {
+ cap_free(caps);
+ SKIP(return, "Test needs CAP_SYS_ADMIN to start");
+ }
+ cap_free(caps);
+
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ bool correct;
+ ssize_t count_before, count_after;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = LISTNS_CURRENT_USER,
+ };
+ /* These locals shadow the parent's count_before/count_after/correct */
+ __u64 ns_ids_before[100];
+ ssize_t count_before;
+ __u64 ns_ids_after[100];
+ ssize_t count_after;
+ bool correct;
+
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Count namespaces with CAP_SYS_ADMIN */
+ count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+
+ /* Drop CAP_SYS_ADMIN */
+ /* Reuses the outer 'caps' and 'cap_list' variables declared at function scope */
+ /*
+ * NOTE(review): the results of cap_set_flag()/cap_set_proc() are not
+ * checked; if the drop fails both counts will normally be equal and
+ * the "<=" comparison below still reports success.
+ */
+ caps = cap_get_proc();
+ if (caps) {
+ cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR);
+ cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR);
+ cap_set_proc(caps);
+ cap_free(caps);
+ }
+
+ /* Ensure we can't regain the capability */
+ prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+
+ /* Count namespaces without CAP_SYS_ADMIN */
+ count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+
+ /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */
+ /* NOTE(review): also true when both calls fail with the same negative value */
+ correct = (count_after <= count_before);
+
+ /* Order matters: flag, count_before, count_after; the parent reads in the same order */
+ write(pipefd[1], &correct, sizeof(correct));
+ write(pipefd[1], &count_before, sizeof(count_before));
+ write(pipefd[1], &count_after, sizeof(count_after));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+
+ /* Defaults are kept if the child exits before writing anything */
+ correct = false;
+ count_before = 0;
+ count_after = 0;
+ read(pipefd[0], &correct, sizeof(correct));
+ read(pipefd[0], &count_before, sizeof(count_before));
+ read(pipefd[0], &count_after, sizeof(count_after));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_TRUE(correct);
+ TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces",
+ count_before, count_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c
new file mode 100644
index 000000000000..8a95789d6a87
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_test.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Basic listns() smoke test against the unified namespace tree:
+ * request every active namespace of every type, globally, and check
+ * that at least one ID comes back and that no returned ID is zero.
+ */
+TEST(listns_basic_unified)
+{
+	/*
+	 * Only .size is non-zero; the remaining members are implicitly
+	 * zeroed, i.e. ns_type = 0 (all types) and user_ns_id = 0
+	 * (global listing).
+	 */
+	struct ns_id_req req = { .size = sizeof(req) };
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t i;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+	/* A kernel without the syscall is a skip, any other error a failure */
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+
+	/* The initial namespaces must always be present */
+	ASSERT_GT(ret, 0);
+	TH_LOG("Found %zd active namespaces", ret);
+
+	/* Walk the result: zero is never a valid namespace ID */
+	i = 0;
+	while (i < ret) {
+		ASSERT_NE(ns_ids[i], 0);
+		TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+		i++;
+	}
+}
+
+/*
+ * listns() with a type filter: ask for network namespaces only, then
+ * spot-check up to five of the returned IDs by opening each through an
+ * nsfs file handle and querying its type with NS_GET_NSTYPE.
+ */
+TEST(listns_filter_by_type)
+{
+	/* Members not named here are implicitly zero */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.ns_type = CLONE_NEWNET,	/* Only network namespaces */
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t i;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret, 0);
+
+	/* init_net must always be listed */
+	ASSERT_GT(ret, 0);
+	TH_LOG("Found %zd active network namespaces", ret);
+
+	/* Probe at most the first five entries */
+	for (i = 0; i < 5; i++) {
+		struct nsfs_file_handle nsfh;
+		struct file_handle *fh;
+		int ns_type;
+		int fd;
+
+		if (i >= ret)
+			break;
+
+		/* Build the handle payload for this namespace ID */
+		nsfh = (struct nsfs_file_handle){
+			.ns_id = ns_ids[i],
+			.ns_type = CLONE_NEWNET,
+			.ns_inum = 0,
+		};
+
+		fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh));
+		ASSERT_NE(fh, NULL);
+		fh->handle_bytes = sizeof(nsfh);
+		fh->handle_type = 0;
+		memcpy(fh->f_handle, &nsfh, sizeof(nsfh));
+
+		fd = open_by_handle_at(-10003, fh, O_RDONLY);
+		free(fh);
+
+		/* Failure to open is tolerated; only opened fds are verified */
+		if (fd < 0)
+			continue;
+
+		/* The ioctl reports the namespace type of the opened fd */
+		ns_type = ioctl(fd, NS_GET_NSTYPE);
+		if (ns_type >= 0) {
+			ASSERT_EQ(ns_type, CLONE_NEWNET);
+		}
+		close(fd);
+	}
+}
+
+/*
+ * listns() pagination: fetch namespaces two at a time and, when the first
+ * batch is full, resume after its last ID and check that the second batch
+ * starts at a strictly larger ID.
+ */
+TEST(listns_pagination)
+{
+	/* All members other than .size start out as zero */
+	struct ns_id_req req = { .size = sizeof(req) };
+	__u64 batch1[2], batch2[2];
+	ssize_t ret1, ret2;
+
+	/* First page */
+	ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0);
+	if (ret1 < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret1 < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret1, 0);
+
+	if (ret1 == 0)
+		SKIP(return, "No namespaces found");
+
+	TH_LOG("First batch: %zd namespaces", ret1);
+
+	/* A short first page means there is nothing left to paginate */
+	if (ret1 != ARRAY_SIZE(batch1)) {
+		TH_LOG("All namespaces fit in first batch");
+		return;
+	}
+
+	/* Second page: continue after the last ID of the first page */
+	req.ns_id = batch1[ret1 - 1];
+	ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0);
+	ASSERT_GE(ret2, 0);
+
+	TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)",
+	       ret2, (unsigned long long)req.ns_id);
+
+	/* An empty second page is acceptable */
+	if (ret2 <= 0)
+		return;
+
+	/* IDs must keep increasing across the page boundary */
+	ASSERT_GT(batch2[0], batch1[ret1 - 1]);
+	TH_LOG("Pagination working: %llu > %llu",
+	       (unsigned long long)batch2[0],
+	       (unsigned long long)batch1[ret1 - 1]);
+}
+
+/*
+ * listns() restricted to LISTNS_CURRENT_USER: list the namespaces owned by
+ * the caller's user namespace and log each returned ID. Only the success
+ * of the call is asserted; the number of entries is informational.
+ */
+TEST(listns_current_user)
+{
+	/* Members not named here are implicitly zero */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.user_ns_id = LISTNS_CURRENT_USER,
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t i = 0;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret, 0);
+
+	/* In init_user_ns this is expected to include the initial namespaces */
+	TH_LOG("Found %zd namespaces owned by current user namespace", ret);
+
+	/* Dump every returned ID */
+	while (i < ret) {
+		TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+		i++;
+	}
+}
+
+/*
+ * Test that listns() only returns active namespaces.
+ * Create a namespace, let it become inactive, verify it's not listed.
+ *
+ * A child unshares a network namespace, reports its ID over a pipe and
+ * lingers briefly. The parent lists network namespaces while the child is
+ * expected to still be alive, waits for the child, pauses, and then checks
+ * that the reported ID is no longer part of the listing.
+ */
+TEST(listns_only_active)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[100], ns_ids_after[100];
+	ssize_t ret_before, ret_after;
+	int pipefd[2];
+	pid_t pid;
+	__u64 new_ns_id = 0;
+	int status;
+
+	/* Get initial list */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret_before, 0);
+
+	TH_LOG("Before: %zd active network namespaces", ret_before);
+
+	/* Create a new namespace in a child process and get its ID */
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 ns_id;
+
+		close(pipefd[0]);
+
+		/* Create new network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get its ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Send ID to parent; a failed or short write is a child failure */
+		if (write(pipefd[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/* Keep namespace active briefly */
+		usleep(100000);
+		exit(0);
+	}
+
+	/* Parent reads the new namespace ID */
+	{
+		/* read() returns ssize_t; keep the full width and sign */
+		ssize_t bytes;
+
+		close(pipefd[1]);
+		bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id));
+		close(pipefd[0]);
+
+		if (bytes == (ssize_t)sizeof(new_ns_id)) {
+			__u64 ns_ids_during[100];
+			/* Same type as the sys_listns() results it is compared with */
+			ssize_t ret_during;
+
+			TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id);
+
+			/* List namespaces while child is still alive - should see new one */
+			ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+			ASSERT_GE(ret_during, 0);
+			TH_LOG("During: %zd active network namespaces", ret_during);
+
+			/*
+			 * Should have at least as many namespaces as before.
+			 * A strict increase is not required because the result
+			 * is capped by the size of the ID buffer.
+			 */
+			ASSERT_GE(ret_during, ret_before);
+		} else {
+			/* Nothing usable was received: do not search for a partial ID */
+			new_ns_id = 0;
+		}
+	}
+
+	/* Wait for child to exit */
+	waitpid(pid, &status, 0);
+
+	/* Give time for namespace to become inactive */
+	usleep(100000);
+
+	/* List namespaces after child exits - should not see new one */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+	TH_LOG("After: %zd active network namespaces", ret_after);
+
+	/* Verify the new namespace ID is not in the after list */
+	if (new_ns_id != 0) {
+		bool found = false;
+
+		for (ssize_t i = 0; i < ret_after; i++) {
+			if (ns_ids_after[i] == new_ns_id) {
+				found = true;
+				break;
+			}
+		}
+		ASSERT_FALSE(found);
+	}
+}
+
+/*
+ * Test listns() with specific user namespace ID.
+ * Create a user namespace and list namespaces it owns.
+ *
+ * A child calls setup_userns(), unshares a network and a UTS namespace and
+ * then reports its user namespace ID over a socketpair. It stays alive
+ * until the parent sends one byte, so the namespaces exist while the
+ * parent lists with user_ns_id set to the reported ID.
+ */
+TEST(listns_specific_userns)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0, /* Will be filled with created userns ID */
+	};
+	__u64 ns_ids[100];
+	int sv[2];
+	pid_t pid;
+	int status;
+	__u64 user_ns_id = 0;
+	ssize_t bytes;
+	ssize_t ret;
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 ns_id;
+		char buf;
+
+		close(sv[0]);
+
+		/* Create new user namespace */
+		if (setup_userns() < 0) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+			close(fd);
+			close(sv[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/*
+		 * Create some namespaces owned by this user namespace.
+		 * This happens before the ID is sent: the parent lists as
+		 * soon as it has the ID, so creating them afterwards would
+		 * race with that listing. Failures remain best-effort.
+		 */
+		unshare(CLONE_NEWNET);
+		unshare(CLONE_NEWUTS);
+
+		/* Send ID to parent */
+		if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		/* Wait for parent signal */
+		if (read(sv[1], &buf, 1) != 1) {
+			close(sv[1]);
+			exit(1);
+		}
+		close(sv[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(sv[1]);
+	bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id));
+
+	if (bytes != (ssize_t)sizeof(user_ns_id)) {
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get user namespace ID from child");
+	}
+
+	TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id);
+
+	/* List namespaces owned by this user namespace */
+	req.user_ns_id = user_ns_id;
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+	if (ret < 0) {
+		/*
+		 * Capture errno immediately: the logging and cleanup calls
+		 * below may overwrite it before the ENOSYS check.
+		 */
+		int saved_errno = errno;
+
+		TH_LOG("listns failed: %s (errno=%d)", strerror(saved_errno), saved_errno);
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		if (saved_errno == ENOSYS) {
+			SKIP(return, "listns() not supported");
+		}
+		ASSERT_GE(ret, 0);
+	}
+
+	TH_LOG("Found %zd namespaces owned by user namespace %llu", ret,
+	       (unsigned long long)user_ns_id);
+
+	/* Should find at least the network and UTS namespaces we created */
+	if (ret > 0) {
+		for (ssize_t i = 0; i < ret && i < 10; i++)
+			TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+	}
+
+	/* Signal child to exit */
+	if (write(sv[0], "X", 1) != 1) {
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		ASSERT_TRUE(false);
+	}
+	close(sv[0]);
+	waitpid(pid, &status, 0);
+}
+
+/*
+ * listns() with more than one namespace type in the filter: request
+ * network and UTS namespaces together and log every returned ID.
+ */
+TEST(listns_multiple_types)
+{
+	/* Members not named here are implicitly zero */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.ns_type = CLONE_NEWNET | CLONE_NEWUTS,	/* Network and UTS */
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t i = 0;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret, 0);
+
+	TH_LOG("Found %zd active network/UTS namespaces", ret);
+
+	/* Dump every returned ID */
+	while (i < ret) {
+		TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+		i++;
+	}
+}
+
+/*
+ * Test that hierarchical active reference propagation keeps parent
+ * user namespaces visible in listns().
+ *
+ * A child calls setup_userns() twice, recording the user namespace ID
+ * after each call, sends both IDs over a socketpair and then blocks until
+ * the parent writes one byte. While the child is blocked, the parent lists
+ * all user namespaces globally and asserts that both IDs are present.
+ */
+TEST(listns_hierarchical_visibility)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 parent_ns_id = 0, child_ns_id = 0;
+ int sv[2];
+ pid_t pid;
+ int status;
+ int bytes;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_parent, found_child;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ char buf;
+
+ close(sv[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Fills the child's own copy of parent_ns_id; the parent process gets the value via the socket */
+ if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send both IDs to parent */
+ if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+ if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal */
+ if (read(sv[1], &buf, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+ close(sv[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(sv[1]);
+
+ /* Read both namespace IDs */
+ /* The two read() results are summed and compared with the expected total below */
+ bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id));
+ bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id));
+
+ if (bytes != (int)(2 * sizeof(__u64))) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace IDs from child");
+ }
+
+ TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id);
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)child_ns_id);
+
+ /* List all user namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* errno is examined directly after the failing call, before any cleanup */
+ if (ret < 0 && errno == ENOSYS) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+
+ ASSERT_GE(ret, 0);
+ TH_LOG("Found %zd active user namespaces", ret);
+
+ /* Both parent and child should be visible (active due to child process) */
+ found_parent = false;
+ found_child = false;
+ /* No early exit: a single pass looks for both IDs */
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_ns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_ns_id)
+ found_child = true;
+ }
+
+ TH_LOG("Parent namespace %s, child namespace %s",
+ found_parent ? "found" : "NOT FOUND",
+ found_child ? "found" : "NOT FOUND");
+
+ ASSERT_TRUE(found_child);
+ /* With hierarchical propagation, parent should also be active */
+ ASSERT_TRUE(found_parent);
+
+ /* Signal child to exit */
+ if (write(sv[0], "X", 1) != 1) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ close(sv[0]);
+ waitpid(pid, &status, 0);
+}
+
+/*
+ * Test error cases for listns().
+ *
+ * Covers unknown flags, a NULL result buffer, a non-zero spare field and
+ * an oversized element count. The whole test is skipped when the first
+ * call reports ENOSYS.
+ */
+TEST(listns_error_cases)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[10];
+	ssize_t ret;
+
+	/* Test with invalid flags */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF);
+	/*
+	 * errno is only meaningful after a failed call. Testing it without
+	 * looking at the return value could act on a stale ENOSYS left by an
+	 * earlier call and silently bypass the assertions.
+	 */
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Test with NULL ns_ids array */
+	ret = sys_listns(&req, NULL, 10, 0);
+	ASSERT_LT(ret, 0);
+
+	/* Test with invalid spare field */
+	req.spare = 1;
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, EINVAL);
+	req.spare = 0;
+
+	/* Test with huge nr_ns_ids */
+	ret = sys_listns(&req, ns_ids, 2000000, 0);
+	ASSERT_LT(ret, 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c
new file mode 100644
index 000000000000..093268f0efaa
--- /dev/null
+++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c
@@ -0,0 +1,2672 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <pthread.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Fallback definitions, used only when the included headers do not
+ * already provide these constants.
+ */
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
+#endif
+
+/* NOTE(review): presumably the file-handle type used by nsfs handles - confirm against the UAPI headers */
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test that initial namespaces can be reopened via file handle.
+ * Initial namespaces should have active ref count of 1 from boot.
+ *
+ * The handle is taken from /proc/1/ns/net, the fd is closed, and the
+ * namespace is reopened through FD_NSFS_ROOT. The reopened fd must report
+ * the same inode number as a fresh open of /proc/1/ns/net.
+ */
+TEST(init_ns_always_active)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd1, fd2;
+ struct stat st1, st2;
+
+ /* Room for the header plus the largest possible handle payload */
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open initial network namespace */
+ fd1 = open("/proc/1/ns/net", O_RDONLY);
+ ASSERT_GE(fd1, 0);
+
+ /* Get file handle for initial namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd1, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ /* The first SKIP() argument releases resources before returning */
+ SKIP(free(handle); close(fd1);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+
+ /* Close the namespace fd */
+ close(fd1);
+
+ /* Try to reopen via file handle - should succeed since init ns is always active */
+ fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd2 < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ /* fd1 is already closed at this point, so only the handle is freed */
+ SKIP(free(handle);
+ return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd2, 0);
+
+ /* Verify we opened the same namespace */
+ fd1 = open("/proc/1/ns/net", O_RDONLY);
+ ASSERT_GE(fd1, 0);
+ ASSERT_EQ(fstat(fd1, &st1), 0);
+ ASSERT_EQ(fstat(fd2, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(fd1);
+ close(fd2);
+ free(handle);
+}
+
+/*
+ * Test namespace lifecycle: create a namespace in a child process,
+ * get a file handle while it's active, then try to reopen after
+ * the process exits (namespace becomes inactive).
+ *
+ * The handle bytes travel from child to parent through a pipe. The parent
+ * only attempts the reopen after waitpid() has returned for the child.
+ */
+TEST(ns_inactive_after_exit)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ /* Raw storage for a struct file_handle header plus a maximum-size payload */
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ /* Create pipe for passing file handle from child */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Open our new namespace */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Get file handle for the namespace */
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ /* Only the header plus the bytes reported in handle_bytes are sent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+
+ /* Exit - namespace should become inactive */
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ /* Read file handle from child */
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* ret still holds the byte count from the read() above */
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Try to reopen namespace - should fail with ENOENT since it's inactive */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ /* Should fail with ENOENT (namespace inactive) or ESTALE */
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a process is using it,
+ * even after the creating process exits.
+ *
+ * The first child creates the namespace and sends its file handle to the
+ * parent, then waits on a sync pipe. The second child reopens the
+ * namespace from that handle, joins it with setns() and sleeps. The parent
+ * releases the first child, waits for it, and checks that the handle can
+ * still be opened.
+ */
+TEST(ns_active_with_multiple_processes)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ int syncpipe[2];
+ pid_t pid1, pid2;
+ int status;
+ /* Raw storage for a struct file_handle header plus a maximum-size payload */
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+
+ /* Create pipes for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ pid1 = fork();
+ ASSERT_GE(pid1, 0);
+
+ if (pid1 == 0) {
+ /* First child - creates namespace */
+ close(pipefd[0]);
+ close(syncpipe[1]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Open and get handle */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+
+ /* Wait for signal before exiting */
+ /* The result is ignored, so the child exits on a byte, EOF or error */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ /* Parent reads handle */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+ ASSERT_GT(ret, 0);
+
+ handle = (struct file_handle *)buf;
+
+ /* Create second child that will keep namespace active */
+ pid2 = fork();
+ ASSERT_GE(pid2, 0);
+
+ if (pid2 == 0) {
+ /* Second child - reopens the namespace */
+ /* It takes no part in the sync handshake, so both ends are closed */
+ close(syncpipe[0]);
+ close(syncpipe[1]);
+
+ /* Open the namespace via handle */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0) {
+ exit(1);
+ }
+
+ /* Join the namespace */
+ ret = setns(fd, CLONE_NEWNET);
+ close(fd);
+ if (ret < 0) {
+ exit(1);
+ }
+
+ /* Sleep to keep namespace active */
+ sleep(1);
+ exit(0);
+ }
+
+ /* Let second child enter the namespace */
+ /* NOTE(review): ordering relies on these fixed delays, not on an explicit handshake with the second child */
+ usleep(100000); /* 100ms */
+
+ /* Signal first child to exit */
+ close(syncpipe[0]);
+ sync_byte = 'X';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ /* Wait for first child */
+ waitpid(pid1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+
+ /* Namespace should still be active because second child is using it */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_GE(fd, 0);
+ close(fd);
+
+ /* Wait for second child */
+ /* Only normal termination is asserted; the second child's exit code is not checked */
+ waitpid(pid2, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+}
+
+/*
+ * Test user namespace active ref tracking via credential lifecycle
+ *
+ * A child unshares a user namespace, attempts to write uid/gid mappings,
+ * obtains a file handle for /proc/self/ns/user, sends it to the parent and
+ * exits. After the child has been reaped, the parent expects reopening the
+ * handle to fail with ENOENT or ESTALE.
+ */
+TEST(userns_active_ref_lifecycle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ /* Raw storage for a struct file_handle header plus a maximum-size payload */
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Set up uid/gid mappings */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ /* Best effort: mappings are written only if all three files opened; the write results are not checked */
+ if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) {
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+ }
+
+ /* Get file handle */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Send handle to parent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ /* Reap the child first so it no longer exists when the reopen is attempted */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* ret still holds the byte count from the read() above */
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Namespace should be inactive after all tasks exit */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test PID namespace active ref tracking
+ *
+ * The child unshares a PID namespace and forks a grandchild. The
+ * grandchild obtains a file handle for /proc/self/ns/pid and writes it to
+ * the pipe shared with the test process. Once the child has been reaped,
+ * reopening the handle is expected to fail with ENOENT or ESTALE.
+ */
+TEST(pidns_active_ref_lifecycle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ /* Raw storage for a struct file_handle header plus a maximum-size payload */
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new PID namespace */
+ ret = unshare(CLONE_NEWPID);
+ if (ret < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Fork to actually enter the PID namespace */
+ pid_t child = fork();
+ if (child < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (child == 0) {
+ /* Grandchild - in new PID namespace */
+ fd = open("/proc/self/ns/pid", O_RDONLY);
+ if (fd < 0) {
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ exit(1);
+ }
+
+ /* Send handle to grandparent */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Wait for grandchild */
+ /*
+ * The grandchild's exit status is discarded. If it failed, nothing was
+ * written to the pipe and the parent's ASSERT_GT(ret, 0) catches it.
+ */
+ waitpid(child, NULL, 0);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* ret still holds the byte count from the read() above */
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /* Namespace should be inactive after all processes exit */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that an open file descriptor keeps a namespace active.
+ * Even after the creating process exits, the namespace should remain
+ * active as long as an fd is held open.
+ *
+ * Sequencing: the child sends a file handle for its new network namespace
+ * and then blocks on a second pipe until the parent has opened
+ * /proc/<pid>/ns/net. Only then does the child exit.
+ */
+TEST(ns_fd_keeps_active)
+{
+	struct file_handle *handle;
+	int mount_id;
+	int ret;
+	int nsfd;
+	int handle_len;
+	int pipe_child_ready[2];
+	int pipe_parent_ready[2];
+	pid_t pid;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+	char proc_path[64];
+
+	ASSERT_EQ(pipe(pipe_child_ready), 0);
+	ASSERT_EQ(pipe(pipe_parent_ready), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipe_child_ready[0]);
+		close(pipe_parent_ready[1]);
+
+		TH_LOG("Child: creating new network namespace");
+
+		/* Create new network namespace */
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+			close(pipe_child_ready[1]);
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		TH_LOG("Child: network namespace created successfully");
+
+		/* Get file handle for the namespace */
+		nsfd = open("/proc/self/ns/net", O_RDONLY);
+		if (nsfd < 0) {
+			TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno));
+			close(pipe_child_ready[1]);
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened namespace fd %d", nsfd);
+
+		handle = (struct file_handle *)buf;
+		handle->handle_bytes = MAX_HANDLE_SZ;
+		ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH);
+		close(nsfd);
+
+		if (ret < 0) {
+			TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno));
+			close(pipe_child_ready[1]);
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes);
+
+		/* Send file handle to parent */
+		handle_len = sizeof(*handle) + handle->handle_bytes;
+		ret = write(pipe_child_ready[1], buf, handle_len);
+		TH_LOG("Child: sent %d bytes of file handle to parent", ret);
+		close(pipe_child_ready[1]);
+		if (ret != handle_len) {
+			/* A failed or truncated handle is useless to the parent */
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		/* Wait for parent to open the fd */
+		TH_LOG("Child: waiting for parent to open fd");
+		ret = read(pipe_parent_ready[0], &sync_byte, 1);
+		close(pipe_parent_ready[0]);
+
+		TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret);
+		/* Exit - namespace should stay active because parent holds fd */
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipe_child_ready[1]);
+	close(pipe_parent_ready[0]);
+
+	TH_LOG("Parent: reading file handle from child");
+
+	/* Read file handle from child */
+	ret = read(pipe_child_ready[0], buf, sizeof(buf));
+	close(pipe_child_ready[0]);
+	ASSERT_GT(ret, 0);
+	handle = (struct file_handle *)buf;
+
+	TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes);
+
+	/* Open the child's namespace while it's still alive */
+	snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid);
+	TH_LOG("Parent: opening child's namespace at %s", proc_path);
+	nsfd = open(proc_path, O_RDONLY);
+	if (nsfd < 0) {
+		TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno));
+		close(pipe_parent_ready[1]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open child's namespace");
+	}
+
+	TH_LOG("Parent: opened child's namespace, got fd %d", nsfd);
+
+	/* Signal child that we have the fd; the handshake must not fail silently */
+	sync_byte = 'G';
+	ASSERT_EQ(write(pipe_parent_ready[1], &sync_byte, 1), 1);
+	close(pipe_parent_ready[1]);
+	TH_LOG("Parent: signaled child that we have the fd");
+
+	/* Wait for child to exit */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	TH_LOG("Child exited, parent holds fd %d to namespace", nsfd);
+
+	/*
+	 * Namespace should still be ACTIVE because we hold an fd.
+	 * We should be able to reopen it via file handle.
+	 */
+	TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)");
+	int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(fd2, 0);
+
+	TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2);
+
+	/* Verify it's the same namespace */
+	struct stat st1, st2;
+	ASSERT_EQ(fstat(nsfd, &st1), 0);
+	ASSERT_EQ(fstat(fd2, &st2), 0);
+	/* ino_t is not necessarily unsigned long; widen explicitly for printing */
+	TH_LOG("Namespace inodes: nsfd=%llu, fd2=%llu",
+	       (unsigned long long)st1.st_ino, (unsigned long long)st2.st_ino);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	close(fd2);
+
+	/* Now close the fd - namespace should become inactive */
+	TH_LOG("Closing fd %d - namespace should become inactive", nsfd);
+	close(nsfd);
+
+	/* Now reopening should fail - namespace is inactive */
+	TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)");
+	fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(fd2, 0);
+	/* Should fail with ENOENT (inactive) or ESTALE (gone) */
+	TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test hierarchical active reference propagation.
+ * When a child namespace is active, its owning user namespace should also
+ * be active automatically due to hierarchical active reference propagation.
+ * This ensures parents are always reachable when children are active.
+ *
+ * The child process creates two nested user namespaces via setup_userns()
+ * and reports their NS_GET_ID values. The parent process builds nsfs file
+ * handles from those IDs and probes them with open_by_handle_at().
+ */
+TEST(ns_parent_always_reachable)
+{
+	struct file_handle *parent_handle, *child_handle;
+	int ret;
+	int child_nsfd;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	__u64 parent_id, child_id;
+	char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+	char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipefd[0]);
+
+		TH_LOG("Child: creating parent user namespace and setting up mappings");
+
+		/* Create parent user namespace with mappings */
+		ret = setup_userns();
+		if (ret < 0) {
+			TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid());
+
+		/* Get namespace ID for parent user namespace */
+		int parent_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (parent_fd < 0) {
+			TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened parent userns fd %d", parent_fd);
+
+		if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) {
+			TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno));
+			close(parent_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(parent_fd);
+
+		TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id);
+
+		/* Create child user namespace within parent */
+		TH_LOG("Child: creating nested child user namespace");
+		ret = setup_userns();
+		if (ret < 0) {
+			TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid());
+
+		/* Get namespace ID for child user namespace */
+		int child_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (child_fd < 0) {
+			TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened child userns fd %d", child_fd);
+
+		if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) {
+			TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno));
+			close(child_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(child_fd);
+
+		TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id);
+
+		/* Send both namespace IDs to parent */
+		TH_LOG("Child: sending both namespace IDs to parent");
+		write(pipefd[1], &parent_id, sizeof(parent_id));
+		write(pipefd[1], &child_id, sizeof(child_id));
+		close(pipefd[1]);
+
+		TH_LOG("Child: exiting - parent userns should become inactive");
+		/* Exit - parent user namespace should become inactive */
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	TH_LOG("Parent: reading both namespace IDs from child");
+
+	/* Read both namespace IDs - fixed size, no parsing needed */
+	ret = read(pipefd[0], &parent_id, sizeof(parent_id));
+	if (ret != sizeof(parent_id)) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read parent namespace ID from child");
+	}
+
+	ret = read(pipefd[0], &child_id, sizeof(child_id));
+	close(pipefd[0]);
+	if (ret != sizeof(child_id)) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read child namespace ID from child");
+	}
+
+	TH_LOG("Parent: received parent_id=%llu, child_id=%llu",
+	       (unsigned long long)parent_id, (unsigned long long)child_id);
+
+	/* Construct file handles from namespace IDs */
+	parent_handle = (struct file_handle *)parent_buf;
+	parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	parent_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+	parent_fh->ns_id = parent_id;
+	parent_fh->ns_type = 0;
+	parent_fh->ns_inum = 0;
+
+	child_handle = (struct file_handle *)child_buf;
+	child_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	child_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle;
+	child_fh->ns_id = child_id;
+	child_fh->ns_type = 0;
+	child_fh->ns_inum = 0;
+
+	TH_LOG("Parent: opening child namespace BEFORE child exits");
+
+	/* Open child namespace while child is still alive to keep it active */
+	child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY);
+	if (child_nsfd < 0) {
+		TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open child namespace");
+	}
+
+	TH_LOG("Opened child namespace fd %d", child_nsfd);
+
+	/* Now wait for child to exit */
+	TH_LOG("Parent: waiting for child to exit");
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	TH_LOG("Child process exited, parent holds fd to child namespace");
+
+	/*
+	 * With hierarchical active reference propagation:
+	 * Since the child namespace is active (parent process holds fd),
+	 * the parent user namespace should ALSO be active automatically.
+	 * This is because when we took an active reference on the child,
+	 * it propagated up to the owning user namespace.
+	 */
+	TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)");
+	int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(parent_fd, 0);
+
+	TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd);
+
+	/* Verify we can also get parent via NS_GET_USERNS */
+	TH_LOG("Verifying NS_GET_USERNS also works");
+	int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS);
+	if (parent_fd2 < 0) {
+		/* Log before close(): close() may overwrite the ioctl's errno */
+		TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno);
+		close(parent_fd);
+		close(child_nsfd);
+		SKIP(return, "NS_GET_USERNS not supported or failed");
+	}
+
+	TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2);
+
+	/* Verify both methods give us the same namespace */
+	struct stat st1, st2;
+	ASSERT_EQ(fstat(parent_fd, &st1), 0);
+	ASSERT_EQ(fstat(parent_fd2, &st2), 0);
+	/* ino_t is not necessarily unsigned long; widen explicitly for printing */
+	TH_LOG("Parent namespace inodes: parent_fd=%llu, parent_fd2=%llu",
+	       (unsigned long long)st1.st_ino, (unsigned long long)st2.st_ino);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+	/*
+	 * Close child fd - parent should remain active because we still
+	 * hold direct references to it (parent_fd and parent_fd2).
+	 */
+	TH_LOG("Closing child fd - parent should remain active (direct refs held)");
+	close(child_nsfd);
+
+	/* Parent should still be openable */
+	TH_LOG("Verifying parent still active via file handle");
+	int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(parent_fd3, 0);
+	close(parent_fd3);
+
+	TH_LOG("Closing all fds to parent namespace");
+	close(parent_fd);
+	close(parent_fd2);
+
+	/* Both should now be inactive */
+	TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)");
+	parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_LT(parent_fd, 0);
+	TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that bind mounts keep namespaces in the tree even when inactive.
+ *
+ * The child creates a network namespace, bind-mounts /proc/self/ns/net onto
+ * a temporary file, and sends a file handle for the namespace to the parent.
+ * After the child exits, the parent expects open_by_handle_at() on that
+ * handle to fail. The errno value is only logged, not asserted.
+ *
+ * NOTE(review): the bind mount is made after unshare(CLONE_NEWNS) and
+ * MS_PRIVATE, so it presumably exists only in the child's mount namespace -
+ * confirm it outlives the child as this test intends.
+ */
+TEST(ns_bind_mount_keeps_in_tree)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int pipefd[2];
+ pid_t pid;
+ int status;
+ char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+ /* Template for mkstemp(); the XXXXXX suffix is replaced in place */
+ char tmpfile[] = "/tmp/ns-test-XXXXXX";
+ int tmpfd;
+
+ /* Create temporary file for bind mount */
+ tmpfd = mkstemp(tmpfile);
+ if (tmpfd < 0) {
+ SKIP(return, "Cannot create temporary file");
+ }
+ /* Only the path is needed as a mount target, not the descriptor */
+ close(tmpfd);
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process: every failure path removes tmpfile and exits 1 */
+ close(pipefd[0]);
+
+ /* Unshare mount namespace and make mounts private to avoid propagation */
+ ret = unshare(CLONE_NEWNS);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+ ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Bind mount the namespace */
+ ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL);
+ if (ret < 0) {
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Get file handle; from here on failure paths also undo the bind mount */
+ fd = open("/proc/self/ns/net", O_RDONLY);
+ if (fd < 0) {
+ umount(tmpfile);
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ handle = (struct file_handle *)buf;
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ close(fd);
+
+ if (ret < 0) {
+ umount(tmpfile);
+ close(pipefd[1]);
+ unlink(tmpfile);
+ exit(1);
+ }
+
+ /* Send handle to parent; the bind mount is left in place on success */
+ write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes);
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(pipefd[1]);
+ ret = read(pipefd[0], buf, sizeof(buf));
+ close(pipefd[0]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* ret still holds the byte count from the read() above */
+ ASSERT_GT(ret, 0);
+ handle = (struct file_handle *)buf;
+
+ /*
+ * Namespace should be inactive but still in tree due to bind mount.
+ * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree).
+ */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(fd, 0);
+ /*
+ * Should be ENOENT (inactive) since bind mount keeps it in tree.
+ * Both ENOENT and ESTALE are tolerated; any other errno is only logged
+ * and does not fail the test.
+ */
+ if (errno != ENOENT && errno != ESTALE) {
+ TH_LOG("Unexpected error: %d", errno);
+ }
+
+ /* Cleanup (return values intentionally ignored; skipped if an assert above aborts) */
+ umount(tmpfile);
+ unlink(tmpfile);
+}
+
+/*
+ * Three-level user namespace chain: grandparent -> parent -> child.
+ *
+ * A forked helper stacks three user namespaces with setup_userns() and
+ * reports each NS_GET_ID value over a pipe. While the test holds an fd on
+ * the innermost namespace, the two outer ones are expected to remain
+ * openable through file handles built from their IDs.
+ */
+TEST(ns_multilevel_hierarchy)
+{
+	char gp_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	char p_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	char c_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	struct file_handle *gp_handle = (struct file_handle *)gp_buf;
+	struct file_handle *p_handle = (struct file_handle *)p_buf;
+	struct file_handle *c_handle = (struct file_handle *)c_buf;
+	struct nsfs_file_handle *fh;
+	__u64 gp_id, p_id, c_id;
+	int leaf_fd, mid_fd, top_fd;
+	int pipefd[2];
+	int nread;
+	int status;
+	pid_t pid;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int nsfd;
+
+		close(pipefd[0]);
+
+		/* Level 1: grandparent user namespace */
+		if (setup_userns() < 0)
+			goto child_fail;
+		nsfd = open("/proc/self/ns/user", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+		if (ioctl(nsfd, NS_GET_ID, &gp_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		/* Level 2: parent user namespace */
+		if (setup_userns() < 0)
+			goto child_fail;
+		nsfd = open("/proc/self/ns/user", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+		if (ioctl(nsfd, NS_GET_ID, &p_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		/* Level 3: child user namespace */
+		if (setup_userns() < 0)
+			goto child_fail;
+		nsfd = open("/proc/self/ns/user", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+		if (ioctl(nsfd, NS_GET_ID, &c_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		/* Report the IDs outermost first, one fixed-size record each */
+		write(pipefd[1], &gp_id, sizeof(gp_id));
+		write(pipefd[1], &p_id, sizeof(p_id));
+		write(pipefd[1], &c_id, sizeof(c_id));
+		close(pipefd[1]);
+		exit(0);
+
+child_fail:
+		/* Shared failure exit for every step above */
+		close(pipefd[1]);
+		exit(1);
+	}
+
+	close(pipefd[1]);
+
+	/* Collect the three IDs; a short read means the helper gave up */
+	nread = read(pipefd[0], &gp_id, sizeof(gp_id));
+	if (nread != sizeof(gp_id)) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read grandparent namespace ID from child");
+	}
+
+	nread = read(pipefd[0], &p_id, sizeof(p_id));
+	if (nread != sizeof(p_id)) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read parent namespace ID from child");
+	}
+
+	nread = read(pipefd[0], &c_id, sizeof(c_id));
+	close(pipefd[0]);
+	if (nread != sizeof(c_id)) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read child namespace ID from child");
+	}
+
+	/* Build nsfs handles that identify each namespace by ID only */
+	gp_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	gp_handle->handle_type = FILEID_NSFS;
+	fh = (struct nsfs_file_handle *)gp_handle->f_handle;
+	fh->ns_id = gp_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	p_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	p_handle->handle_type = FILEID_NSFS;
+	fh = (struct nsfs_file_handle *)p_handle->f_handle;
+	fh->ns_id = p_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	c_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	c_handle->handle_type = FILEID_NSFS;
+	fh = (struct nsfs_file_handle *)c_handle->f_handle;
+	fh->ns_id = c_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	/* Grab the innermost namespace before reaping the helper */
+	leaf_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY);
+	if (leaf_fd < 0) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open child namespace");
+	}
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/*
+	 * Expected state with only leaf_fd held:
+	 *   child        - active, pinned by leaf_fd
+	 *   parent       - active, propagated from child
+	 *   grandparent  - active, propagated from parent
+	 */
+	TH_LOG("Testing parent active when child is active");
+	mid_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+	ASSERT_GE(mid_fd, 0);
+
+	TH_LOG("Testing grandparent active when child is active");
+	top_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY);
+	ASSERT_GE(top_fd, 0);
+
+	close(leaf_fd);
+	close(mid_fd);
+	close(top_fd);
+}
+
+/*
+ * Test multiple children sharing same parent.
+ * Parent should stay active as long as ANY child is active.
+ *
+ * NOTE(review): the second "child" is a network namespace created after the
+ * task has already entered the first child user namespace, so it is
+ * presumably owned by that first child rather than directly by the parent -
+ * confirm whether this really exercises two siblings.
+ */
+TEST(ns_multiple_children_same_parent)
+{
+ struct file_handle *p_handle, *c1_handle, *c2_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 p_id, c1_id, c2_id;
+ char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ];
+ char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ];
+ char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process: creates the namespaces and reports their IDs */
+ close(pipefd[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create first child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c1_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (c1_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) {
+ close(c1_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c1_fd);
+
+ /*
+ * The task cannot easily return to the parent user namespace to
+ * create a second user namespace there, so a network namespace is
+ * created instead and used as the second "child".
+ */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int c2_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (c2_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) {
+ close(c2_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c2_fd);
+
+ /* Send all namespace IDs (write results are not checked) */
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &c1_id, sizeof(c1_id));
+ write(pipefd[1], &c2_id, sizeof(c2_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID");
+ }
+
+ ret = read(pipefd[0], &c1_id, sizeof(c1_id));
+ if (ret != sizeof(c1_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read first child namespace ID");
+ }
+
+ ret = read(pipefd[0], &c2_id, sizeof(c2_id));
+ close(pipefd[0]);
+ if (ret != sizeof(c2_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read second child namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs; ns_type and ns_inum are left 0 */
+ p_handle = (struct file_handle *)p_buf;
+ p_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ p_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ c1_handle = (struct file_handle *)c1_buf;
+ c1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c1_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle;
+ c1_fh->ns_id = c1_id;
+ c1_fh->ns_type = 0;
+ c1_fh->ns_inum = 0;
+
+ c2_handle = (struct file_handle *)c2_buf;
+ c2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c2_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle;
+ c2_fh->ns_id = c2_id;
+ c2_fh->ns_type = 0;
+ c2_fh->ns_inum = 0;
+
+ /* Open both children before the child process is reaped */
+ int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY);
+ int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY);
+
+ if (c1_fd < 0 || c2_fd < 0) {
+ /* Release whichever open succeeded before skipping */
+ if (c1_fd >= 0) close(c1_fd);
+ if (c2_fd >= 0) close(c2_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespaces");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Parent should be active (both children active) */
+ TH_LOG("Both children active - parent should be active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ /* Closed right away so this probe does not itself hold the parent open */
+ close(p_fd);
+
+ /* Close first child - parent should STILL be active */
+ TH_LOG("Closing first child - parent should still be active");
+ close(c1_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close second child - NOW parent should become inactive (errno is not checked) */
+ TH_LOG("Closing second child - parent should become inactive");
+ close(c2_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Namespaces of different types created under one user namespace are all
+ * expected to contribute active references to that user namespace.
+ *
+ * A forked helper enters a new user namespace via setup_userns(), then
+ * unshares a network and a UTS namespace and reports all three NS_GET_ID
+ * values. The test pins net and uts by fd and checks that the user
+ * namespace stays openable until the last of the two fds is closed.
+ */
+TEST(ns_different_types_same_owner)
+{
+	char u_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	char n_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	char ut_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	struct file_handle *u_handle = (struct file_handle *)u_buf;
+	struct file_handle *n_handle = (struct file_handle *)n_buf;
+	struct file_handle *ut_handle = (struct file_handle *)ut_buf;
+	struct nsfs_file_handle *fh;
+	__u64 u_id, n_id, ut_id;
+	int net_fd, uts_fd, user_fd;
+	int pipefd[2];
+	int nread;
+	int status;
+	pid_t pid;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int nsfd;
+
+		close(pipefd[0]);
+
+		/* Owning user namespace */
+		if (setup_userns() < 0)
+			goto child_fail;
+		nsfd = open("/proc/self/ns/user", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+		if (ioctl(nsfd, NS_GET_ID, &u_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		/* Network namespace, created from inside the user namespace */
+		if (unshare(CLONE_NEWNET) < 0)
+			goto child_fail;
+		nsfd = open("/proc/self/ns/net", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+		if (ioctl(nsfd, NS_GET_ID, &n_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		/* UTS namespace, likewise created from inside the user namespace */
+		if (unshare(CLONE_NEWUTS) < 0)
+			goto child_fail;
+		nsfd = open("/proc/self/ns/uts", O_RDONLY);
+		if (nsfd < 0)
+			goto child_fail;
+		if (ioctl(nsfd, NS_GET_ID, &ut_id) < 0) {
+			close(nsfd);
+			goto child_fail;
+		}
+		close(nsfd);
+
+		/* Report the IDs as three fixed-size records: user, net, uts */
+		write(pipefd[1], &u_id, sizeof(u_id));
+		write(pipefd[1], &n_id, sizeof(n_id));
+		write(pipefd[1], &ut_id, sizeof(ut_id));
+		close(pipefd[1]);
+		exit(0);
+
+child_fail:
+		/* Shared failure exit for every step above */
+		close(pipefd[1]);
+		exit(1);
+	}
+
+	close(pipefd[1]);
+
+	/* Collect the three IDs; a short read means the helper gave up */
+	nread = read(pipefd[0], &u_id, sizeof(u_id));
+	if (nread != sizeof(u_id)) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read user namespace ID");
+	}
+
+	nread = read(pipefd[0], &n_id, sizeof(n_id));
+	if (nread != sizeof(n_id)) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read network namespace ID");
+	}
+
+	nread = read(pipefd[0], &ut_id, sizeof(ut_id));
+	close(pipefd[0]);
+	if (nread != sizeof(ut_id)) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read UTS namespace ID");
+	}
+
+	/* Build nsfs handles that identify each namespace by ID only */
+	u_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	u_handle->handle_type = FILEID_NSFS;
+	fh = (struct nsfs_file_handle *)u_handle->f_handle;
+	fh->ns_id = u_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	n_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	n_handle->handle_type = FILEID_NSFS;
+	fh = (struct nsfs_file_handle *)n_handle->f_handle;
+	fh->ns_id = n_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	ut_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	ut_handle->handle_type = FILEID_NSFS;
+	fh = (struct nsfs_file_handle *)ut_handle->f_handle;
+	fh->ns_id = ut_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	/* Pin net and uts before reaping the helper */
+	net_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY);
+	uts_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY);
+
+	if (net_fd < 0 || uts_fd < 0) {
+		/* Release whichever open succeeded before skipping */
+		if (net_fd >= 0)
+			close(net_fd);
+		if (uts_fd >= 0)
+			close(uts_fd);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open namespaces");
+	}
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Step 1: net and uts both pinned - user namespace expected active */
+	TH_LOG("Both net and uts active - user namespace should be active");
+	user_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+	ASSERT_GE(user_fd, 0);
+	close(user_fd);
+
+	/* Step 2: drop net, uts still pinned - user namespace expected active */
+	TH_LOG("Closing network ns - user ns should still be active (uts still active)");
+	close(net_fd);
+	user_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+	ASSERT_GE(user_fd, 0);
+	close(user_fd);
+
+	/* Step 3: drop uts as well - user namespace expected inactive */
+	TH_LOG("Closing uts ns - user ns should become inactive");
+	close(uts_fd);
+	user_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+	ASSERT_LT(user_fd, 0);
+}
+
+/*
+ * Test hierarchical propagation with deep namespace hierarchy.
+ * Create: init_user_ns -> user_A -> user_B -> net_ns
+ * When net_ns is active, both user_A and user_B should be active.
+ * This verifies the conditional recursion in __ns_ref_active_put() works.
+ */
+TEST(ns_deep_hierarchy_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle, *net_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id, net_id;
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+ char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(pipefd[0]);
+
+ /* Create user_A -> user_B -> net hierarchy */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (net_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+ close(net_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(net_fd);
+
+ /* Send all three namespace IDs */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ write(pipefd[1], &net_id, sizeof(net_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ if (ret != sizeof(ub_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ ret = read(pipefd[0], &net_id, sizeof(net_id));
+ close(pipefd[0]);
+ if (ret != sizeof(net_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)net_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ net_fh->ns_id = net_id;
+ net_fh->ns_type = 0;
+ net_fh->ns_inum = 0;
+
+ /* Open net_ns before child exits to keep it active */
+ int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ if (net_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open network namespace");
+ }
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With net_ns active, both user_A and user_B should be active */
+ TH_LOG("Testing user_B active (net_ns active causes propagation)");
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd, 0);
+
+ TH_LOG("Testing user_A active (propagated through user_B)");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close net_ns - user_B should stay active (we hold direct ref) */
+ TH_LOG("Closing net_ns, user_B should remain active (direct ref held)");
+ close(net_fd);
+ int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd2, 0);
+ close(ub_fd2);
+
+ /* Close user_B - user_A should stay active (we hold direct ref) */
+ TH_LOG("Closing user_B, user_A should remain active (direct ref held)");
+ close(ub_fd);
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - everything should become inactive */
+ TH_LOG("Closing user_A, all should become inactive");
+ close(ua_fd);
+
+ /* All should now be inactive */
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test that parent stays active as long as ANY child is active.
+ * Create parent user namespace with two child net namespaces.
+ * Parent should remain active until BOTH children are inactive.
+ *
+ * The forked child reports the three namespace IDs over pipefd and then
+ * blocks on syncpipe until the parent has opened both network namespaces
+ * by file handle. Every parent path that gives up early drops its write end
+ * of syncpipe before reaping the child, so the child always sees a byte or
+ * EOF and cannot stay blocked.
+ */
+TEST(ns_parent_multiple_children_refcount)
+{
+	struct file_handle *parent_handle, *net1_handle, *net2_handle;
+	int ret, pipefd[2], syncpipe[2];
+	pid_t pid;
+	int status;
+	__u64 p_id, n1_id, n2_id;
+	char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+	char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ];
+	char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		close(pipefd[0]);
+		close(syncpipe[1]);
+
+		/* Create parent user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		int p_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (p_fd < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+			close(p_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		close(p_fd);
+
+		/* Create first network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		int n1_fd = open("/proc/self/ns/net", O_RDONLY);
+		if (n1_fd < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) {
+			close(n1_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		/* Keep n1_fd open so first namespace stays active */
+
+		/* Create second network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(n1_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		int n2_fd = open("/proc/self/ns/net", O_RDONLY);
+		if (n2_fd < 0) {
+			close(n1_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) {
+			close(n1_fd);
+			close(n2_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		/* Keep both n1_fd and n2_fd open */
+
+		/*
+		 * Send all namespace IDs. A failed or short write is reported
+		 * through the exit status instead of being silently ignored;
+		 * the parent then sees a short read and skips.
+		 */
+		if (write(pipefd[1], &p_id, sizeof(p_id)) != sizeof(p_id) ||
+		    write(pipefd[1], &n1_id, sizeof(n1_id)) != sizeof(n1_id) ||
+		    write(pipefd[1], &n2_id, sizeof(n2_id)) != sizeof(n2_id)) {
+			close(n1_fd);
+			close(n2_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/* Wait for parent to signal before exiting */
+		read(syncpipe[0], &sync_byte, 1);
+		close(syncpipe[0]);
+		exit(0);
+	}
+
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	/* Read all three namespace IDs - fixed size, no parsing needed */
+	ret = read(pipefd[0], &p_id, sizeof(p_id));
+	if (ret != sizeof(p_id)) {
+		close(pipefd[0]);
+		/* Drop the write end so a child blocked on syncpipe sees EOF */
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read parent namespace ID");
+	}
+
+	ret = read(pipefd[0], &n1_id, sizeof(n1_id));
+	if (ret != sizeof(n1_id)) {
+		close(pipefd[0]);
+		/* Drop the write end so a child blocked on syncpipe sees EOF */
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read first network namespace ID");
+	}
+
+	ret = read(pipefd[0], &n2_id, sizeof(n2_id));
+	close(pipefd[0]);
+	if (ret != sizeof(n2_id)) {
+		/* Drop the write end so a child blocked on syncpipe sees EOF */
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read second network namespace ID");
+	}
+
+	/* Construct file handles from namespace IDs */
+	parent_handle = (struct file_handle *)p_buf;
+	parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	parent_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+	p_fh->ns_id = p_id;
+	p_fh->ns_type = 0;
+	p_fh->ns_inum = 0;
+
+	net1_handle = (struct file_handle *)n1_buf;
+	net1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	net1_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle;
+	n1_fh->ns_id = n1_id;
+	n1_fh->ns_type = 0;
+	n1_fh->ns_inum = 0;
+
+	net2_handle = (struct file_handle *)n2_buf;
+	net2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	net2_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle;
+	n2_fh->ns_id = n2_id;
+	n2_fh->ns_type = 0;
+	n2_fh->ns_inum = 0;
+
+	/* Open both net namespaces while child is still alive */
+	int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY);
+	int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY);
+	if (n1_fd < 0 || n2_fd < 0) {
+		if (n1_fd >= 0)
+			close(n1_fd);
+		if (n2_fd >= 0)
+			close(n2_fd);
+		sync_byte = 'G';
+		write(syncpipe[1], &sync_byte, 1);
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open net namespaces");
+	}
+
+	/* Signal child that we have opened the namespaces */
+	sync_byte = 'G';
+	write(syncpipe[1], &sync_byte, 1);
+	close(syncpipe[1]);
+
+	/* Wait for child to exit */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Parent should be active (has 2 active children) */
+	TH_LOG("Both net namespaces active - parent should be active");
+	int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(p_fd, 0);
+	close(p_fd);
+
+	/* Close first net namespace - parent should STILL be active */
+	TH_LOG("Closing first net ns - parent should still be active");
+	close(n1_fd);
+	p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(p_fd, 0);
+	close(p_fd);
+
+	/* Close second net namespace - parent should become inactive */
+	TH_LOG("Closing second net ns - parent should become inactive");
+	close(n2_fd);
+	p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Test that user namespace as a child also propagates correctly.
+ * Create user_A -> user_B, verify when user_B is active that user_A
+ * is also active. This is different from non-user namespace children.
+ *
+ * The forked child calls setup_userns() twice, records the NS_GET_ID value
+ * of /proc/self/ns/user after each call, and sends both IDs to the parent
+ * over a pipe. The parent rebuilds FILEID_NSFS file handles from those IDs
+ * and probes each namespace with open_by_handle_at().
+ */
+TEST(ns_userns_child_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id;
+ /* Backing storage for the two file handles built below */
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: only writes to the pipe; any failure exits with status 1 */
+ close(pipefd[0]);
+
+ /* Create user_A */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ /* Create user_B (child of user_A) */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ /* Send both namespace IDs */
+ /* NOTE(review): the write() results are not checked here */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read both namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ub_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ /* Only ns_id is filled in; ns_type and ns_inum are left as zero */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ /* Open user_B before child exits */
+ /*
+  * NOTE(review): nothing synchronizes with the child here; it exits right
+  * after writing the IDs, so this open presumably races with child exit and
+  * the test is skipped when the open fails - confirm this is intended.
+  */
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ if (ub_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open user_B");
+ }
+
+ /* Reap the child and require a clean exit before probing user_A */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With user_B active, user_A should also be active */
+ TH_LOG("Testing user_A active when child user_B is active");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close user_B */
+ TH_LOG("Closing user_B");
+ close(ub_fd);
+
+ /* user_A should remain active (we hold direct ref) */
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - should become inactive */
+ TH_LOG("Closing user_A - should become inactive");
+ close(ua_fd);
+
+ /* With no fd left, reopening user_A by handle is expected to fail */
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test different namespace types (net and uts) both contributing
+ * active references to the same owning user namespace.
+ *
+ * The forked child calls setup_userns(), then unshares a network and a UTS
+ * namespace, and sends the NS_GET_ID value of each of the three namespaces
+ * to the parent over a pipe. The parent rebuilds FILEID_NSFS file handles
+ * from those IDs and probes them with open_by_handle_at().
+ */
+TEST(ns_mixed_types_same_owner)
+{
+ struct file_handle *user_handle, *net_handle, *uts_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 u_id, n_id, ut_id;
+ /* Backing storage for the three file handles built below */
+ char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ];
+ char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+ char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: only writes to the pipe; any failure exits with status 1 */
+ close(pipefd[0]);
+
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int u_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (u_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) {
+ close(u_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(u_fd);
+
+ /* New network namespace, created after the setup_userns() call above */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int n_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) {
+ close(n_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(n_fd);
+
+ /* New UTS namespace, created the same way */
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ut_fd = open("/proc/self/ns/uts", O_RDONLY);
+ if (ut_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) {
+ close(ut_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ut_fd);
+
+ /* Send all namespace IDs */
+ /* NOTE(review): the write() results are not checked here */
+ write(pipefd[1], &u_id, sizeof(u_id));
+ write(pipefd[1], &n_id, sizeof(n_id));
+ write(pipefd[1], &ut_id, sizeof(ut_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &u_id, sizeof(u_id));
+ if (ret != sizeof(u_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID");
+ }
+
+ ret = read(pipefd[0], &n_id, sizeof(n_id));
+ if (ret != sizeof(n_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ ret = read(pipefd[0], &ut_id, sizeof(ut_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ut_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read UTS namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ /* Only ns_id is filled in; ns_type and ns_inum are left as zero */
+ user_handle = (struct file_handle *)u_buf;
+ user_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ user_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle;
+ u_fh->ns_id = u_id;
+ u_fh->ns_type = 0;
+ u_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)n_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ n_fh->ns_id = n_id;
+ n_fh->ns_type = 0;
+ n_fh->ns_inum = 0;
+
+ uts_handle = (struct file_handle *)ut_buf;
+ uts_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ uts_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle;
+ ut_fh->ns_id = ut_id;
+ ut_fh->ns_type = 0;
+ ut_fh->ns_inum = 0;
+
+ /* Open both non-user namespaces */
+ /*
+  * NOTE(review): nothing synchronizes with the child here; it exits right
+  * after writing the IDs, so these opens presumably race with child exit and
+  * the test is skipped when either fails - confirm this is intended.
+  */
+ int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY);
+ if (n_fd < 0 || ut_fd < 0) {
+ if (n_fd >= 0) close(n_fd);
+ if (ut_fd >= 0) close(ut_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open namespaces");
+ }
+
+ /* Reap the child and require a clean exit before probing the user ns */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* User namespace should be active (2 active children) */
+ TH_LOG("Both net and uts active - user ns should be active");
+ int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close net - user ns should STILL be active (uts still active) */
+ TH_LOG("Closing net - user ns should still be active");
+ close(n_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close uts - user ns should become inactive */
+ TH_LOG("Closing uts - user ns should become inactive");
+ close(ut_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_LT(u_fd, 0);
+}
+
+/* Thread test helpers and structures */
+/*
+ * State shared between a test and the thread running
+ * thread_create_namespace(). The test fills in the descriptors before
+ * starting the thread and reads exit_code after joining it.
+ */
+struct thread_ns_info {
+ __u64 ns_id; /* filled in by the thread via the NS_GET_ID ioctl */
+ int pipefd; /* write end: the thread sends ns_id to the test here */
+ int syncfd_read; /* the thread reads one byte from here before returning */
+ int syncfd_write; /* not used by thread_create_namespace(); tests set it to -1 */
+ int exit_code; /* 0 on success, 1-5 identifies the step that failed */
+};
+
+/*
+ * Thread entry point: unshare a new network namespace, report its ID to the
+ * main thread and stay alive until told to exit.
+ *
+ * @arg: pointer to a struct thread_ns_info owned by the caller.
+ *
+ * On success the namespace ID is stored in info->ns_id and written in full
+ * to info->pipefd, then the thread blocks until one byte can be read from
+ * info->syncfd_read. info->exit_code is 0 on success, or 1-5 naming the step
+ * that failed (unshare, open, ioctl, write, sync read).
+ *
+ * When a step before the ID write fails, a single byte is written to
+ * info->pipefd instead. The write end of that pipe is shared with the reader
+ * in the same process, so it never sees EOF when this thread returns; the
+ * deliberately short message makes the reader's fixed-size read() return
+ * short so it can bail out rather than block forever.
+ *
+ * Always returns NULL; the outcome is reported through info->exit_code.
+ */
+static void *thread_create_namespace(void *arg)
+{
+	struct thread_ns_info *info = arg;
+	char sync_byte;
+	int fd, ret;
+
+	/* Create new network namespace */
+	if (unshare(CLONE_NEWNET) < 0) {
+		info->exit_code = 1;
+		goto wake_reader;
+	}
+
+	/* Get namespace ID */
+	fd = open("/proc/thread-self/ns/net", O_RDONLY);
+	if (fd < 0) {
+		info->exit_code = 2;
+		goto wake_reader;
+	}
+
+	ret = ioctl(fd, NS_GET_ID, &info->ns_id);
+	close(fd);
+	if (ret < 0) {
+		info->exit_code = 3;
+		goto wake_reader;
+	}
+
+	/* Send namespace ID to main thread */
+	if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) {
+		info->exit_code = 4;
+		return NULL;
+	}
+
+	/* Wait for signal to exit */
+	if (read(info->syncfd_read, &sync_byte, 1) != 1) {
+		info->exit_code = 5;
+		return NULL;
+	}
+
+	info->exit_code = 0;
+	return NULL;
+
+wake_reader:
+	/* Short (1 byte) message: unblocks the reader, which treats it as failure */
+	sync_byte = 0;
+	(void)write(info->pipefd, &sync_byte, 1);
+	return NULL;
+}
+
+/*
+ * Test that namespace becomes inactive after thread exits.
+ * This verifies active reference counting works with threads, not just processes.
+ *
+ * A helper thread unshares a network namespace and reports its ID over
+ * pipefd. The main thread opens the namespace by handle while the thread is
+ * alive, releases the thread through syncpipe, joins it and then expects the
+ * same handle to be rejected.
+ */
+TEST(thread_ns_inactive_after_exit)
+{
+	pthread_t thread;
+	struct thread_ns_info info;
+	struct file_handle *handle;
+	int pipefd[2];
+	int syncpipe[2];
+	int ret;
+	int open_errno;
+	char sync_byte;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	info.pipefd = pipefd[1];
+	info.syncfd_read = syncpipe[0];
+	info.syncfd_write = -1;
+	info.exit_code = -1;
+
+	/* Create thread that will create a namespace */
+	ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+	ASSERT_EQ(ret, 0);
+
+	/* Read namespace ID from thread */
+	__u64 ns_id;
+	ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+	if (ret != sizeof(ns_id)) {
+		/* Release the thread in case it is waiting, then clean up */
+		sync_byte = 'X';
+		write(syncpipe[1], &sync_byte, 1);
+		pthread_join(thread, NULL);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		close(syncpipe[0]);
+		close(syncpipe[1]);
+		SKIP(return, "Failed to read namespace ID from thread");
+	}
+
+	TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+	/* Construct file handle */
+	handle = (struct file_handle *)buf;
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+	fh->ns_id = ns_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	/* Namespace should be active while thread is alive */
+	TH_LOG("Attempting to open namespace while thread is alive (should succeed)");
+	int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(nsfd, 0);
+	close(nsfd);
+
+	/* Signal thread to exit */
+	TH_LOG("Signaling thread to exit");
+	sync_byte = 'X';
+	ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1);
+	close(syncpipe[1]);
+
+	/* Wait for thread to exit */
+	ASSERT_EQ(pthread_join(thread, NULL), 0);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	if (info.exit_code != 0)
+		SKIP(return, "Thread failed to create namespace");
+
+	TH_LOG("Thread exited, namespace should be inactive");
+
+	/* Namespace should now be inactive */
+	nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	/* Save errno right away: the logging below may overwrite it */
+	open_errno = errno;
+	ASSERT_LT(nsfd, 0);
+	/* Should fail with ENOENT (inactive) or ESTALE (gone) */
+	TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(open_errno), open_errno);
+	ASSERT_TRUE(open_errno == ENOENT || open_errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a thread holds an fd to it.
+ * Even after the thread exits, the namespace should remain active as long as
+ * another thread holds a file descriptor to it.
+ *
+ * A helper thread unshares a network namespace and reports its ID over
+ * pipefd. The main thread opens the namespace by handle and keeps that fd
+ * across the helper's exit; a second open must still succeed and refer to
+ * the same inode. Once the fd is closed the handle must be rejected.
+ */
+TEST(thread_ns_fd_keeps_active)
+{
+	pthread_t thread;
+	struct thread_ns_info info;
+	struct file_handle *handle;
+	int pipefd[2];
+	int syncpipe[2];
+	int ret;
+	int open_errno;
+	char sync_byte;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	info.pipefd = pipefd[1];
+	info.syncfd_read = syncpipe[0];
+	info.syncfd_write = -1;
+	info.exit_code = -1;
+
+	/* Create thread that will create a namespace */
+	ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+	ASSERT_EQ(ret, 0);
+
+	/* Read namespace ID from thread */
+	__u64 ns_id;
+	ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+	if (ret != sizeof(ns_id)) {
+		/* Release the thread in case it is waiting, then clean up */
+		sync_byte = 'X';
+		write(syncpipe[1], &sync_byte, 1);
+		pthread_join(thread, NULL);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		close(syncpipe[0]);
+		close(syncpipe[1]);
+		SKIP(return, "Failed to read namespace ID from thread");
+	}
+
+	TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+	/* Construct file handle */
+	handle = (struct file_handle *)buf;
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+	fh->ns_id = ns_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	/* Open namespace while thread is alive */
+	TH_LOG("Opening namespace while thread is alive");
+	int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(nsfd, 0);
+
+	/* Signal thread to exit; checked the same way as in the sibling test */
+	TH_LOG("Signaling thread to exit");
+	sync_byte = 'X';
+	ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1);
+	close(syncpipe[1]);
+
+	/* Wait for thread to exit */
+	ASSERT_EQ(pthread_join(thread, NULL), 0);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	if (info.exit_code != 0) {
+		close(nsfd);
+		SKIP(return, "Thread failed to create namespace");
+	}
+
+	TH_LOG("Thread exited, but main thread holds fd - namespace should remain active");
+
+	/* Namespace should still be active because we hold an fd */
+	int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(nsfd2, 0);
+
+	/* Verify it's the same namespace */
+	struct stat st1, st2;
+	ASSERT_EQ(fstat(nsfd, &st1), 0);
+	ASSERT_EQ(fstat(nsfd2, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	close(nsfd2);
+
+	TH_LOG("Closing fd - namespace should become inactive");
+	close(nsfd);
+
+	/* Now namespace should be inactive */
+	nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	/* Save errno right away: the logging below may overwrite it */
+	open_errno = errno;
+	ASSERT_LT(nsfd, 0);
+	/* Should fail with ENOENT (inactive) or ESTALE (gone) */
+	TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(open_errno), open_errno);
+	ASSERT_TRUE(open_errno == ENOENT || open_errno == ESTALE);
+}
+
+/* Structure for thread data in subprocess */
+/* Argument passed to thread_sleep_and_wait(). */
+struct thread_sleep_data {
+ int syncfd_read; /* descriptor the thread blocks on with a one-byte read() */
+};
+
+/*
+ * Thread entry point that parks the thread on a descriptor.
+ *
+ * @arg: pointer to a struct thread_sleep_data supplied by the creator.
+ *
+ * Performs a single one-byte read() on syncfd_read and returns NULL once that
+ * call returns; the result of the read is deliberately ignored.
+ */
+static void *thread_sleep_and_wait(void *arg)
+{
+	struct thread_sleep_data *sleep_data = arg;
+	char discard;
+
+	/* Block here until read() returns, whatever its outcome */
+	(void)read(sleep_data->syncfd_read, &discard, 1);
+
+	return NULL;
+}
+
+/*
+ * Test that namespaces become inactive after subprocess with multiple threads exits.
+ * Create a subprocess that unshares user and network namespaces, then creates two
+ * threads that share those namespaces. Verify that after all threads and subprocess
+ * exit, the namespaces are no longer listed by listns() and cannot be opened by
+ * open_by_handle_at().
+ *
+ * Channels: pipefd carries the two namespace IDs from child to parent; the
+ * socketpair sv is used by the parent to release the child's two threads
+ * (one byte per thread).
+ */
+TEST(thread_subprocess_ns_inactive_after_all_exit)
+{
+ int pipefd[2];
+ int sv[2];
+ pid_t pid;
+ int status;
+ __u64 user_id, net_id;
+ struct file_handle *user_handle, *net_handle;
+ /* Backing storage for the two file handles built below */
+ char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ];
+ char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+ char sync_byte;
+ int ret;
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ /* Keeps pipefd[1] and sv[1]; progress is logged to stderr */
+ close(pipefd[0]);
+ close(sv[0]);
+
+ /* Create user namespace with mappings */
+ if (setup_userns() < 0) {
+ fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: setup_userns() succeeded\n");
+
+ /* Get user namespace ID */
+ int user_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (user_fd < 0) {
+ fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) {
+ fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno));
+ close(user_fd);
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ close(user_fd);
+ fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id);
+
+ /* Unshare network namespace */
+ if (unshare(CLONE_NEWNET) < 0) {
+ fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n");
+
+ /* Get network namespace ID */
+ int net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (net_fd < 0) {
+ fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno));
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+ fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno));
+ close(net_fd);
+ close(pipefd[1]);
+ close(sv[1]);
+ exit(1);
+ }
+ close(net_fd);
+ fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id);
+
+ /* Send namespace IDs to parent */
+ if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) {
+ fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno));
+ exit(1);
+ }
+ if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) {
+ fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno));
+ exit(1);
+ }
+ close(pipefd[1]);
+ fprintf(stderr, "Child: sent namespace IDs to parent\n");
+
+ /* Create two threads that share the namespaces */
+ /* Both threads get the same data and so read from the same socket, sv[1] */
+ pthread_t thread1, thread2;
+ struct thread_sleep_data data;
+ data.syncfd_read = sv[1];
+
+ int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data);
+ if (ret_thread != 0) {
+ fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread));
+ close(sv[1]);
+ exit(1);
+ }
+ fprintf(stderr, "Child: created thread1\n");
+
+ ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data);
+ if (ret_thread != 0) {
+ fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread));
+ close(sv[1]);
+ pthread_cancel(thread1);
+ exit(1);
+ }
+ fprintf(stderr, "Child: created thread2\n");
+
+ /* Wait for threads to complete - they will unblock when parent writes */
+ fprintf(stderr, "Child: waiting for threads to exit\n");
+ pthread_join(thread1, NULL);
+ fprintf(stderr, "Child: thread1 exited\n");
+ pthread_join(thread2, NULL);
+ fprintf(stderr, "Child: thread2 exited\n");
+
+ close(sv[1]);
+
+ /* Exit - namespaces should become inactive */
+ fprintf(stderr, "Child: all threads joined, exiting with success\n");
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ close(sv[1]);
+
+ TH_LOG("Parent: waiting to read namespace IDs from child");
+
+ /* Read namespace IDs from child */
+ /* On a short read: release the child's threads, reap the child, skip */
+ ret = read(pipefd[0], &user_id, sizeof(user_id));
+ if (ret != sizeof(user_id)) {
+ TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno));
+ close(pipefd[0]);
+ sync_byte = 'X';
+ (void)write(sv[0], &sync_byte, 1);
+ close(sv[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID from child");
+ }
+
+ ret = read(pipefd[0], &net_id, sizeof(net_id));
+ close(pipefd[0]);
+ if (ret != sizeof(net_id)) {
+ TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno));
+ sync_byte = 'X';
+ (void)write(sv[0], &sync_byte, 1);
+ close(sv[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID from child");
+ }
+
+ TH_LOG("Child created user ns %llu and net ns %llu with 2 threads",
+ (unsigned long long)user_id, (unsigned long long)net_id);
+
+ /* Construct file handles */
+ /* Only ns_id is filled in; ns_type and ns_inum are left as zero */
+ user_handle = (struct file_handle *)user_buf;
+ user_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ user_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle;
+ user_fh->ns_id = user_id;
+ user_fh->ns_type = 0;
+ user_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)net_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ net_fh->ns_id = net_id;
+ net_fh->ns_type = 0;
+ net_fh->ns_inum = 0;
+
+ /* Verify namespaces are active while subprocess and threads are alive */
+ TH_LOG("Verifying namespaces are active while subprocess with threads is running");
+ int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(user_fd, 0);
+
+ int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ ASSERT_GE(net_fd, 0);
+
+ /* Drop both fds again so the parent holds no reference of its own */
+ close(user_fd);
+ close(net_fd);
+
+ /* Also verify they appear in listns() */
+ TH_LOG("Verifying namespaces appear in listns() while active");
+ struct ns_id_req req = {
+ .size = sizeof(struct ns_id_req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ /* NOTE(review): only the first 256 IDs are examined - confirm that is enough */
+ __u64 ns_ids[256];
+ int nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids < 0) {
+ TH_LOG("listns() not available, skipping listns verification");
+ } else {
+ /* Check if user_id is in the list */
+ int found_user = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == user_id) {
+ found_user = 1;
+ break;
+ }
+ }
+ ASSERT_TRUE(found_user);
+ TH_LOG("User namespace found in listns() as expected");
+
+ /* Check network namespace */
+ req.ns_type = CLONE_NEWNET;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_net = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == net_id) {
+ found_net = 1;
+ break;
+ }
+ }
+ ASSERT_TRUE(found_net);
+ TH_LOG("Network namespace found in listns() as expected");
+ }
+ }
+
+ /* Signal threads to exit */
+ TH_LOG("Signaling threads to exit");
+ sync_byte = 'X';
+ /* Write two bytes - one for each thread */
+ ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+ ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+ close(sv[0]);
+
+ /* Wait for child process to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ if (WEXITSTATUS(status) != 0) {
+ TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status));
+ SKIP(return, "Child process failed");
+ }
+
+ TH_LOG("Subprocess and all threads have exited successfully");
+
+ /* Verify namespaces are now inactive - open_by_handle_at should fail */
+ /*
+  * NOTE(review): in both checks below errno is read again after TH_LOG();
+  * if logging can modify errno the assertion may see a different value -
+  * consider saving errno right after open_by_handle_at().
+  */
+ TH_LOG("Verifying namespaces are inactive after subprocess and threads exit");
+ user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_LT(user_fd, 0);
+ TH_LOG("User namespace inactive as expected: %s (errno=%d)",
+ strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+ net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ ASSERT_LT(net_fd, 0);
+ TH_LOG("Network namespace inactive as expected: %s (errno=%d)",
+ strerror(errno), errno);
+ ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+ /* Verify namespaces do NOT appear in listns() */
+ /* A negative return from sys_listns() silently skips this verification */
+ TH_LOG("Verifying namespaces do NOT appear in listns() when inactive");
+ memset(&req, 0, sizeof(req));
+ req.size = sizeof(struct ns_id_req);
+ req.ns_type = CLONE_NEWUSER;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_user = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == user_id) {
+ found_user = 1;
+ break;
+ }
+ }
+ ASSERT_FALSE(found_user);
+ TH_LOG("User namespace correctly not listed in listns()");
+
+ /* Check network namespace */
+ req.ns_type = CLONE_NEWNET;
+ nr_ids = sys_listns(&req, ns_ids, 256, 0);
+ if (nr_ids >= 0) {
+ int found_net = 0;
+ for (int i = 0; i < nr_ids; i++) {
+ if (ns_ids[i] == net_id) {
+ found_net = 1;
+ break;
+ }
+ }
+ ASSERT_FALSE(found_net);
+ TH_LOG("Network namespace correctly not listed in listns()");
+ }
+ }
+}
+
+/* NOTE(review): presumably supplies the test runner entry point from kselftest_harness.h - confirm */
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c
index e28accd74a57..527ade0a8673 100644
--- a/tools/testing/selftests/namespaces/nsid_test.c
+++ b/tools/testing/selftests/namespaces/nsid_test.c
@@ -6,6 +6,7 @@
#include <libgen.h>
#include <limits.h>
#include <pthread.h>
+#include <signal.h>
#include <string.h>
#include <sys/mount.h>
#include <poll.h>
@@ -14,12 +15,30 @@
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
+#include <sys/wait.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/limits.h>
#include <linux/nsfs.h>
#include "../kselftest_harness.h"
+/* Fixture for tests that create child processes; it carries the child's pid so FIXTURE_TEARDOWN(nsid) can kill and reap it */
+FIXTURE(nsid) {
+ pid_t child_pid; /* 0 means no child is tracked; tests assign the child's pid here after creating it */
+};
+
+FIXTURE_SETUP(nsid) {
+ self->child_pid = 0; /* start with no child tracked; teardown only acts on values > 0 */
+}
+
+FIXTURE_TEARDOWN(nsid) {
+ /* Clean up any child process that may still be running (only when a test stored a pid > 0) */
+ if (self->child_pid > 0) {
+ kill(self->child_pid, SIGKILL); /* return value is ignored */
+ waitpid(self->child_pid, NULL, 0); /* reap the child; its exit status is discarded */
+ }
+}
+
TEST(nsid_mntns_basic)
{
__u64 mnt_ns_id = 0;
@@ -44,7 +63,7 @@ TEST(nsid_mntns_basic)
close(fd_mntns);
}
-TEST(nsid_mntns_separate)
+TEST_F(nsid, mntns_separate)
{
__u64 parent_mnt_ns_id = 0;
__u64 child_mnt_ns_id = 0;
@@ -90,6 +109,9 @@ TEST(nsid_mntns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -99,8 +121,6 @@ TEST(nsid_mntns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_mntns);
SKIP(return, "No permission to create mount namespace");
}
@@ -123,10 +143,6 @@ TEST(nsid_mntns_separate)
close(fd_parent_mntns);
close(fd_child_mntns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_cgroupns_basic)
@@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic)
close(fd_cgroupns);
}
-TEST(nsid_cgroupns_separate)
+TEST_F(nsid, cgroupns_separate)
{
__u64 parent_cgroup_ns_id = 0;
__u64 child_cgroup_ns_id = 0;
@@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_cgroupns);
SKIP(return, "No permission to create cgroup namespace");
}
@@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate)
close(fd_parent_cgroupns);
close(fd_child_cgroupns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_ipcns_basic)
@@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic)
close(fd_ipcns);
}
-TEST(nsid_ipcns_separate)
+TEST_F(nsid, ipcns_separate)
{
__u64 parent_ipc_ns_id = 0;
__u64 child_ipc_ns_id = 0;
@@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_ipcns);
SKIP(return, "No permission to create IPC namespace");
}
@@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate)
close(fd_parent_ipcns);
close(fd_child_ipcns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_utsns_basic)
@@ -371,7 +381,7 @@ TEST(nsid_utsns_basic)
close(fd_utsns);
}
-TEST(nsid_utsns_separate)
+TEST_F(nsid, utsns_separate)
{
__u64 parent_uts_ns_id = 0;
__u64 child_uts_ns_id = 0;
@@ -417,6 +427,9 @@ TEST(nsid_utsns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -426,8 +439,6 @@ TEST(nsid_utsns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_utsns);
SKIP(return, "No permission to create UTS namespace");
}
@@ -450,10 +461,6 @@ TEST(nsid_utsns_separate)
close(fd_parent_utsns);
close(fd_child_utsns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_userns_basic)
@@ -480,7 +487,7 @@ TEST(nsid_userns_basic)
close(fd_userns);
}
-TEST(nsid_userns_separate)
+TEST_F(nsid, userns_separate)
{
__u64 parent_user_ns_id = 0;
__u64 child_user_ns_id = 0;
@@ -526,6 +533,9 @@ TEST(nsid_userns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -535,8 +545,6 @@ TEST(nsid_userns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_userns);
SKIP(return, "No permission to create user namespace");
}
@@ -559,10 +567,6 @@ TEST(nsid_userns_separate)
close(fd_parent_userns);
close(fd_child_userns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_timens_basic)
@@ -591,7 +595,7 @@ TEST(nsid_timens_basic)
close(fd_timens);
}
-TEST(nsid_timens_separate)
+TEST_F(nsid, timens_separate)
{
__u64 parent_time_ns_id = 0;
__u64 child_time_ns_id = 0;
@@ -652,6 +656,9 @@ TEST(nsid_timens_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -660,8 +667,6 @@ TEST(nsid_timens_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_timens);
close(pipefd[0]);
SKIP(return, "Cannot create time namespace");
@@ -689,10 +694,6 @@ TEST(nsid_timens_separate)
close(fd_parent_timens);
close(fd_child_timens);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_pidns_basic)
@@ -719,7 +720,7 @@ TEST(nsid_pidns_basic)
close(fd_pidns);
}
-TEST(nsid_pidns_separate)
+TEST_F(nsid, pidns_separate)
{
__u64 parent_pid_ns_id = 0;
__u64 child_pid_ns_id = 0;
@@ -776,6 +777,9 @@ TEST(nsid_pidns_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -784,8 +788,6 @@ TEST(nsid_pidns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_pidns);
close(pipefd[0]);
SKIP(return, "No permission to create PID namespace");
@@ -813,10 +815,6 @@ TEST(nsid_pidns_separate)
close(fd_parent_pidns);
close(fd_child_pidns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_netns_basic)
@@ -860,7 +858,7 @@ TEST(nsid_netns_basic)
close(fd_netns);
}
-TEST(nsid_netns_separate)
+TEST_F(nsid, netns_separate)
{
__u64 parent_net_ns_id = 0;
__u64 parent_netns_cookie = 0;
@@ -920,6 +918,9 @@ TEST(nsid_netns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -929,8 +930,6 @@ TEST(nsid_netns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_netns);
close(parent_sock);
SKIP(return, "No permission to create network namespace");
@@ -977,10 +976,6 @@ TEST(nsid_netns_separate)
close(fd_parent_netns);
close(fd_child_netns);
close(parent_sock);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
new file mode 100644
index 000000000000..753fd29dffd8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "../pidfd/pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Regression tests for the setns(pidfd) active reference counting bug.
+ *
+ * These tests are based on the reproducers that triggered the race condition
+ * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly").
+ *
+ * The bug: When using setns() with a pidfd, if the target task exits between
+ * prepare_nsset() and commit_nsset(), the namespaces would become inactive.
+ * Then ns_ref_active_get() would increment from 0 without properly resurrecting
+ * the owner chain, causing active reference count underflows.
+ */
+
+/*
+ * Simple pidfd setns test using create_child()+unshare().
+ *
+ * Without the fix, this would trigger active refcount warnings when the
+ * parent exits after doing setns(pidfd) on a child that has already exited.
+ */
+TEST(simple_pidfd_setns)
+{
+ pid_t child_pid;
+ int pidfd = -1; /* filled in by create_child(); asserted >= 0 in the parent below */
+ int ret;
+ int sv[2]; /* socketpair: child writes on sv[1], parent reads on sv[0] */
+ char c;
+
+ /* Ignore SIGCHLD for autoreap (the parent never waits for the child in this test) */
+ ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create a child process without namespaces initially (flags argument is 0) */
+ child_pid = create_child(&pidfd, 0);
+ ASSERT_GE(child_pid, 0);
+
+ if (child_pid == 0) {
+ close(sv[0]);
+
+ if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) {
+ close(sv[1]);
+ _exit(1); /* exits without writing, so the parent never gets the ready byte */
+ }
+
+ /* Signal parent that namespaces are ready */
+ if (write_nointr(sv[1], "1", 1) < 0) {
+ close(sv[1]);
+ _exit(1);
+ }
+
+ close(sv[1]);
+ _exit(0); /* child exits right after signalling; it does not wait for the parent's setns() */
+ }
+ ASSERT_GE(pidfd, 0);
+ EXPECT_EQ(close(sv[1]), 0);
+
+ ret = read_nointr(sv[0], &c, 1);
+ ASSERT_EQ(ret, 1); /* NOTE(review): a child that failed unshare() makes this assertion fail; there is no SKIP path */
+ EXPECT_EQ(close(sv[0]), 0);
+
+ /* Set to child's namespaces via pidfd */
+ ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+ TH_LOG("setns() returned %d", ret); /* the result is only logged, never asserted */
+ close(pidfd);
+}
+
+/*
+ * Simple pidfd setns test using create_child().
+ *
+ * This variation uses create_child() with namespace flags directly.
+ * Namespaces are created immediately at clone time.
+ */
+TEST(simple_pidfd_setns_clone)
+{
+ pid_t child_pid;
+ int pidfd = -1; /* filled in by create_child(); asserted >= 0 in the parent below */
+ int ret;
+
+ /* Ignore SIGCHLD for autoreap (the parent never waits for the child in this test) */
+ ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+ /* Create a child process with new namespaces using create_child() */
+ child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET);
+ ASSERT_GE(child_pid, 0);
+
+ if (child_pid == 0) {
+ /* Child: sleep for a while so parent can setns to us */
+ sleep(2); /* timing-based only; there is no explicit synchronisation with the parent */
+ _exit(0);
+ }
+
+ /* Parent: pidfd was already created by create_child() */
+ ASSERT_GE(pidfd, 0);
+
+ /* Set to child's namespaces via pidfd */
+ ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+ close(pidfd);
+ TH_LOG("setns() returned %d", ret); /* the result is only logged, never asserted */
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c
new file mode 100644
index 000000000000..ba689a22d82f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/siocgskns_test.c
@@ -0,0 +1,1824 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/if.h>
+#include <linux/sockios.h>
+#include <linux/nsfs.h>
+#include <arpa/inet.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+#ifndef SIOCGSKNS
+#define SIOCGSKNS 0x894C
+#endif
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003
+#endif
+
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test basic SIOCGSKNS functionality.
+ * Create a socket and verify the fd returned by SIOCGSKNS has the same inode as /proc/self/ns/net.
+ */
+TEST(siocgskns_basic)
+{
+ int sock_fd, netns_fd, current_netns_fd;
+ struct stat st1, st2; /* st1: fd from SIOCGSKNS, st2: /proc/self/ns/net */
+
+ /* Create a TCP socket */
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Use SIOCGSKNS to get network namespace */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd); /* NOTE(review): errno from the ioctl is tested below, after this close() */
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0); /* any other errno is reported as a test failure */
+ }
+
+ /* Get current network namespace */
+ current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(current_netns_fd, 0);
+
+ /* Verify they match (compared by inode number only) */
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(sock_fd);
+ close(netns_fd);
+ close(current_netns_fd);
+}
+
+/*
+ * Test that socket file descriptors keep network namespaces active.
+ * A forked child unshares a netns, creates a socket there, sends it to the parent (SCM_RIGHTS) and exits.
+ * The namespace should remain active while the parent holds the socket FD.
+ */
+TEST(siocgskns_keeps_netns_active)
+{
+ int sock_fd, netns_fd, test_fd;
+ int ipc_sockets[2]; /* [0] is used by the parent, [1] by the child */
+ pid_t pid;
+ int status;
+ struct stat st;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Create a socket in the new network namespace */
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ TH_LOG("socket() failed: %s", strerror(errno));
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'}; /* one payload byte accompanies the control message */
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1); /* exactly the one payload byte is expected */
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit (the waitpid() return value itself is not checked) */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd); /* NOTE(review): errno from the ioctl is tested below, after this close() */
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st), 0); /* only the success of fstat() is checked; st is not used afterwards */
+
+ /*
+ * Namespace should still be active because socket FD keeps it alive.
+ * Try to access it via /proc/self/fd/<fd>.
+ */
+ char path[64];
+ snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd);
+ test_fd = open(path, O_RDONLY);
+ ASSERT_GE(test_fd, 0);
+ close(test_fd);
+ close(netns_fd);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+
+ /* Try SIOCGSKNS again - should fail since socket is closed (the ioctl targets an already-closed descriptor number) */
+ ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0);
+}
+
+/*
+ * Test SIOCGSKNS with different socket types (TCP, UDP, RAW); all must report the same netns inode.
+ */
+TEST(siocgskns_socket_types)
+{
+ int sock_tcp, sock_udp, sock_raw;
+ int netns_tcp, netns_udp, netns_raw; /* netns_raw is only assigned and used when sock_raw >= 0 */
+ struct stat st_tcp, st_udp, st_raw;
+
+ /* TCP socket */
+ sock_tcp = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_tcp, 0);
+
+ /* UDP socket */
+ sock_udp = socket(AF_INET, SOCK_DGRAM, 0);
+ ASSERT_GE(sock_udp, 0);
+
+ /* RAW socket (may require privileges) */
+ sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+ if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) {
+ sock_raw = -1; /* Skip raw socket test */
+ }
+
+ /* Test SIOCGSKNS on TCP (this first call doubles as the feature probe for the SKIP below) */
+ netns_tcp = ioctl(sock_tcp, SIOCGSKNS);
+ if (netns_tcp < 0) {
+ close(sock_tcp); /* NOTE(review): errno from the ioctl is tested below, after these close() calls */
+ close(sock_udp);
+ if (sock_raw >= 0) close(sock_raw);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_tcp, 0);
+ }
+
+ /* Test SIOCGSKNS on UDP */
+ netns_udp = ioctl(sock_udp, SIOCGSKNS);
+ ASSERT_GE(netns_udp, 0);
+
+ /* Test SIOCGSKNS on RAW (if available) */
+ if (sock_raw >= 0) {
+ netns_raw = ioctl(sock_raw, SIOCGSKNS);
+ ASSERT_GE(netns_raw, 0);
+ }
+
+ /* Verify all return the same network namespace (compared by inode number) */
+ ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0);
+ ASSERT_EQ(fstat(netns_udp, &st_udp), 0);
+ ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino);
+
+ if (sock_raw >= 0) {
+ ASSERT_EQ(fstat(netns_raw, &st_raw), 0);
+ ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino);
+ close(netns_raw);
+ close(sock_raw);
+ }
+
+ close(netns_tcp);
+ close(netns_udp);
+ close(sock_tcp);
+ close(sock_udp);
+}
+
+/*
+ * Test SIOCGSKNS across a namespace switch (done with unshare(); setns() only restores the original).
+ * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still
+ * returns netns A.
+ */
+TEST(siocgskns_across_setns)
+{
+ int sock_fd, netns_a_fd, netns_b_fd, result_fd;
+ struct stat st_a;
+
+ /* Get current netns (A) */
+ netns_a_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(netns_a_fd, 0);
+ ASSERT_EQ(fstat(netns_a_fd, &st_a), 0);
+
+ /* Create socket in netns A */
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Create new netns (B); a failing unshare() is a test failure here, not a SKIP */
+ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+ netns_b_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(netns_b_fd, 0);
+
+ /* Get netns from socket created in A */
+ result_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (result_fd < 0) {
+ close(sock_fd); /* NOTE(review): errno from the ioctl is tested below, after close()/setns() */
+ setns(netns_a_fd, CLONE_NEWNET); /* best-effort restore; the result is ignored */
+ close(netns_a_fd);
+ close(netns_b_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(result_fd, 0);
+ }
+
+ /* Verify it still points to netns A (compared by inode number) */
+ struct stat st_result_stat;
+ ASSERT_EQ(fstat(result_fd, &st_result_stat), 0);
+ ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino);
+
+ close(result_fd);
+ close(sock_fd);
+ close(netns_b_fd); /* netns_b_fd is only opened and closed; nothing is compared against it */
+
+ /* Restore original netns */
+ ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0);
+ close(netns_a_fd);
+}
+
+/*
+ * Test SIOCGSKNS fails on non-socket file descriptors (/dev/null and a pipe) with ENOTTY or EINVAL.
+ */
+TEST(siocgskns_non_socket)
+{
+ int fd;
+ int pipefd[2];
+
+ /* Test on a non-socket file: /dev/null */
+ fd = open("/dev/null", O_RDONLY);
+ ASSERT_GE(fd, 0);
+
+ ASSERT_LT(ioctl(fd, SIOCGSKNS), 0);
+ ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+ close(fd);
+
+ /* Test on pipe (only the read end is probed) */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0);
+ ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+
+ close(pipefd[0]);
+ close(pipefd[1]);
+}
+
+/*
+ * Test multiple sockets keep the same network namespace active.
+ * Create multiple sockets, verify closing some doesn't affect others.
+ */
+TEST(siocgskns_multiple_sockets)
+{
+ int socks[5];
+ int netns_fds[5]; /* netns_fds[i] is the SIOCGSKNS result for socks[i] */
+ int i;
+ struct stat st;
+ ino_t netns_ino;
+
+ /* Create new network namespace; a failing unshare() is a test failure here, not a SKIP */
+ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+ /* Create multiple sockets */
+ for (i = 0; i < 5; i++) {
+ socks[i] = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(socks[i], 0);
+ }
+
+ /* Get netns from all sockets */
+ for (i = 0; i < 5; i++) {
+ netns_fds[i] = ioctl(socks[i], SIOCGSKNS);
+ if (netns_fds[i] < 0) {
+ int j;
+ for (j = 0; j <= i; j++) { /* NOTE(review): socks[i+1..4] are not closed on this path */
+ close(socks[j]);
+ if (j < i && netns_fds[j] >= 0)
+ close(netns_fds[j]);
+ }
+ if (errno == ENOTTY || errno == EINVAL) /* NOTE(review): errno is read after the close() calls above */
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fds[i], 0);
+ }
+ }
+
+ /* Verify all point to same netns (inode of netns_fds[0] is the reference) */
+ ASSERT_EQ(fstat(netns_fds[0], &st), 0);
+ netns_ino = st.st_ino;
+
+ for (i = 1; i < 5; i++) {
+ ASSERT_EQ(fstat(netns_fds[i], &st), 0);
+ ASSERT_EQ(st.st_ino, netns_ino);
+ }
+
+ /* Close some sockets (the first three) */
+ for (i = 0; i < 3; i++) {
+ close(socks[i]);
+ }
+
+ /* Remaining netns FDs should still be valid (checked for indices 3 and 4 by reopening via /proc/self/fd) */
+ for (i = 3; i < 5; i++) {
+ char path[64];
+ snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]);
+ int test_fd = open(path, O_RDONLY);
+ ASSERT_GE(test_fd, 0);
+ close(test_fd);
+ }
+
+ /* Cleanup */
+ for (i = 0; i < 5; i++) {
+ if (i >= 3) /* socks[0..2] were already closed above */
+ close(socks[i]);
+ close(netns_fds[i]);
+ }
+}
+
+/*
+ * Test socket keeps netns active after creating process exits.
+ * Verify that as long as the socket FD exists, the namespace remains active.
+ */
+TEST(siocgskns_netns_lifecycle)
+{
+ int sock_fd, netns_fd;
+ int ipc_sockets[2]; /* [0] is used by the parent, [1] by the child */
+ int syncpipe[2]; /* parent writes on [1] to release the child, which blocks reading [0] */
+ pid_t pid;
+ int status;
+ char sync_byte;
+ struct stat st;
+ ino_t netns_ino;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ ASSERT_EQ(pipe(syncpipe), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child */
+ close(ipc_sockets[0]);
+ close(syncpipe[1]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ /* Send socket to parent (SCM_RIGHTS control message plus one payload byte) */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+
+ /* Wait for parent signal (the read() result is ignored; the child exits either way) */
+ read(syncpipe[0], &sync_byte, 1);
+ close(syncpipe[0]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(ipc_sockets[1]);
+ close(syncpipe[0]);
+
+ /* Receive socket FD */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL); /* unlike siocgskns_keeps_netns_active, cmsg_type is not checked here */
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Get netns from socket while child is alive */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ sync_byte = 'G'; /* release and reap the child before skipping or failing */
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+ close(sock_fd);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOTTY || errno == EINVAL) /* NOTE(review): errno is read after write()/close()/waitpid() */
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+ ASSERT_EQ(fstat(netns_fd, &st), 0);
+ netns_ino = st.st_ino; /* reference inode, compared again after the child has exited */
+
+ /* Signal child to exit (the write() result is ignored) */
+ sync_byte = 'G';
+ write(syncpipe[1], &sync_byte, 1);
+ close(syncpipe[1]);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status)); /* the exit code itself is not checked */
+
+ /*
+ * Socket FD should still keep namespace active even after
+ * the creating process exited.
+ */
+ int test_fd = ioctl(sock_fd, SIOCGSKNS);
+ ASSERT_GE(test_fd, 0);
+
+ struct stat st_test;
+ ASSERT_EQ(fstat(test_fd, &st_test), 0);
+ ASSERT_EQ(st_test.st_ino, netns_ino);
+
+ close(test_fd);
+ close(netns_fd);
+
+ /* Close socket - namespace should become inactive (not verified by this test) */
+ close(sock_fd);
+}
+
+/*
+ * Test IPv6 sockets also work with SIOCGSKNS (same inode check as siocgskns_basic, using AF_INET6).
+ */
+TEST(siocgskns_ipv6)
+{
+ int sock_fd, netns_fd, current_netns_fd;
+ struct stat st1, st2; /* st1: fd from SIOCGSKNS, st2: /proc/self/ns/net */
+
+ /* Create an IPv6 TCP socket; a socket() failure is a test failure here, not a SKIP */
+ sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
+ ASSERT_GE(sock_fd, 0);
+
+ /* Use SIOCGSKNS */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd); /* NOTE(review): errno from the ioctl is tested below, after this close() */
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Verify it matches current namespace (compared by inode number) */
+ current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(current_netns_fd, 0);
+
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+ close(sock_fd);
+ close(netns_fd);
+ close(current_netns_fd);
+}
+
+/*
+ * Test that socket-kept netns appears in listns() output.
+ * Verify that a network namespace kept alive by a socket FD appears in
+ * listns() output even after the creating process exits, and that it
+ * disappears when the socket (and the netns FD obtained from it) is closed.
+ */
+TEST(siocgskns_listns_visibility)
+{
+ int sock_fd, netns_fd, owner_fd;
+ int ipc_sockets[2]; /* [0] is used by the parent, [1] by the child */
+ pid_t pid;
+ int status;
+ __u64 netns_id, owner_id; /* IDs obtained via NS_GET_ID on netns_fd and owner_fd */
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256]; /* output buffer for sys_listns(); only the first `ret` entries are scanned */
+ int ret, i;
+ bool found_netns = false;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'}; /* one payload byte accompanies the control message */
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit (so only the parent's received socket FD remains from this test) */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ close(sock_fd); /* NOTE(review): errno from the ioctl is tested below, after this close() */
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace */
+ owner_fd = ioctl(netns_fd, NS_GET_USERNS);
+ if (owner_fd < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(owner_fd, 0);
+ }
+
+ /* Get owner namespace ID (no SKIP path here: any failure fails the test) */
+ ret = ioctl(owner_fd, NS_GET_ID, &owner_id);
+ if (ret < 0) {
+ close(owner_fd);
+ close(sock_fd);
+ close(netns_fd);
+ ASSERT_EQ(ret, 0);
+ }
+ close(owner_fd);
+
+ /* Namespace should appear in listns() output (first pass: req.user_ns_id is 0) */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ TH_LOG("listns failed: %s", strerror(errno));
+ ASSERT_GE(ret, 0);
+ }
+
+ /* Search for our network namespace in the list (ret is used as the number of valid entries) */
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_TRUE(found_netns);
+ TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id);
+
+ /* Now verify with owner filtering (second pass: req.user_ns_id set to the owner's ID) */
+ req.user_ns_id = owner_id;
+ found_netns = false;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_TRUE(found_netns);
+ TH_LOG("Found netns %llu owned by userns %llu", netns_id, owner_id);
+
+ /* Close socket and netns FD - namespace should become inactive and disappear from listns() */
+ close(sock_fd);
+ close(netns_fd);
+
+ /* Verify it's no longer in listns() output (third pass: owner filter reset to 0) */
+ req.user_ns_id = 0;
+ found_netns = false;
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_id) {
+ found_netns = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found_netns);
+ TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id);
+}
+
+/*
+ * Test that socket-kept netns can be reopened via file handle.
+ * Verify that a network namespace kept alive by a socket FD can be
+ * reopened using file handles even after the creating process exits.
+ */
+TEST(siocgskns_file_handle)
+{
+ int sock_fd, netns_fd, reopened_fd;
+ int ipc_sockets[2];
+ pid_t pid;
+ int status;
+ struct stat st1, st2;
+ ino_t netns_ino;
+ __u64 netns_id;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+ int ret;
+
+ /* Allocate file_handle structure for nsfs */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: create new netns and socket */
+ close(ipc_sockets[0]);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to parent via SCM_RIGHTS */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent: receive socket FD */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+ ASSERT_EQ(n, 1);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, NULL);
+ memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for child to exit */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ ASSERT_EQ(fstat(netns_fd, &st1), 0);
+ netns_ino = st1.st_ino;
+
+ /* Get namespace ID */
+ ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_fd);
+ close(netns_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Construct file handle from namespace ID */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_id;
+ nsfs_fh->ns_type = 0; /* Type field not needed for reopening */
+ nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */
+
+ TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id);
+
+ /* Reopen namespace using file handle (while socket still keeps it alive) */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ /* Verify it's the same namespace */
+ ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+ close(reopened_fd);
+
+ /* Close the netns FD */
+ close(netns_fd);
+
+ /* Try to reopen via file handle - should fail since namespace is now inactive */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ ASSERT_LT(reopened_fd, 0);
+ TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+ /* Get network namespace from socket */
+ netns_fd = ioctl(sock_fd, SIOCGSKNS);
+ if (netns_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_fd, 0);
+ }
+
+ /* Reopen namespace using file handle (while socket still keeps it alive) */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ /* Verify it's the same namespace */
+ ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+ /* Close socket - namespace should become inactive */
+ close(sock_fd);
+ free(handle);
+}
+
+/*
+ * Test combined listns() and file handle operations with socket-kept netns.
+ *
+ * A child creates a user namespace and a network namespace, opens a socket
+ * inside it and passes that socket to the parent before exiting. The parent:
+ *   1. obtains a netns FD from the socket via SIOCGSKNS,
+ *   2. checks that the netns and its owning userns are reported by listns(),
+ *   3. reopens the netns through a file handle built from its namespace ID,
+ *   4. closes the netns FD and checks that the file handle stops working,
+ *   5. calls SIOCGSKNS again and checks both namespaces are listed again,
+ *   6. closes the netns FD once more and checks both vanish from listns().
+ */
+TEST(siocgskns_listns_and_file_handle)
+{
+	int sock_fd, netns_fd, userns_fd, reopened_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	int err;
+	struct stat st;
+	ino_t netns_ino;
+	__u64 netns_id, userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	int ret, i;
+	bool found_netns = false, found_userns = false;
+	struct file_handle *handle;
+	struct nsfs_file_handle *nsfs_fh;
+
+	/* Allocate file_handle structure for nsfs */
+	handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+	ASSERT_NE(handle, NULL);
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+
+	/* Both descriptors are used unconditionally below, so failure is fatal */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new userns and netns with socket */
+		close(ipc_sockets[0]);
+
+		if (setup_userns() < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	/* Only trust the payload if it really is a passed descriptor */
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+	ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit; status is only valid if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		err = errno; /* free()/close() below may overwrite errno */
+		free(handle);
+		close(sock_fd);
+		if (err == ENOTTY || err == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	ASSERT_EQ(fstat(netns_fd, &st), 0);
+	netns_ino = st.st_ino;
+
+	/* Get namespace ID */
+	ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+	if (ret < 0) {
+		err = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (err == ENOTTY || err == EINVAL)
+			SKIP(return, "NS_GET_ID not supported");
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* Get owner user namespace */
+	userns_fd = ioctl(netns_fd, NS_GET_USERNS);
+	if (userns_fd < 0) {
+		err = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (err == ENOTTY || err == EINVAL)
+			SKIP(return, "NS_GET_USERNS not supported");
+		ASSERT_GE(userns_fd, 0);
+	}
+
+	/* Get owner namespace ID */
+	ret = ioctl(userns_fd, NS_GET_ID, &userns_id);
+	if (ret < 0) {
+		close(userns_fd);
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		ASSERT_EQ(ret, 0);
+	}
+	close(userns_fd);
+
+	TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu",
+	       (unsigned long)netns_ino, (unsigned long long)netns_id,
+	       (unsigned long long)userns_id);
+
+	/* Verify namespace appears in listns() */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		err = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (err == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(err));
+		ASSERT_GE(ret, 0);
+	}
+
+	found_netns = false;
+	found_userns = false;
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id)
+			found_netns = true;
+		if (ns_ids[i] == userns_id)
+			found_userns = true;
+	}
+	ASSERT_TRUE(found_netns);
+	ASSERT_TRUE(found_userns);
+	TH_LOG("Found netns %llu in listns() output", (unsigned long long)netns_id);
+
+	/* Construct file handle from namespace ID */
+	nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+	nsfs_fh->ns_id = netns_id;
+	nsfs_fh->ns_type = 0;
+	nsfs_fh->ns_inum = 0;
+
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		err = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd); /* still open here; do not leak it */
+		if (err == EOPNOTSUPP || err == ENOSYS || err == EBADF)
+			SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		TH_LOG("open_by_handle_at failed: %s", strerror(err));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	/* The reopened FD must refer to the very same nsfs inode */
+	struct stat reopened_st;
+	ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0);
+	ASSERT_EQ(reopened_st.st_ino, netns_ino);
+	ASSERT_EQ(reopened_st.st_dev, st.st_dev);
+
+	TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)",
+	       (unsigned long)netns_ino);
+
+	close(reopened_fd);
+	close(netns_fd);
+
+	/* Try to reopen via file handle - should fail since namespace is now inactive */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(reopened_fd, 0);
+	TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+	/* Get network namespace from socket again */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		err = errno;
+		free(handle);
+		close(sock_fd);
+		if (err == ENOTTY || err == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Verify namespace appears in listns() again */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		err = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (err == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(err));
+		ASSERT_GE(ret, 0);
+	}
+
+	found_netns = false;
+	found_userns = false;
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id)
+			found_netns = true;
+		if (ns_ids[i] == userns_id)
+			found_userns = true;
+	}
+	ASSERT_TRUE(found_netns);
+	ASSERT_TRUE(found_userns);
+	TH_LOG("Found netns %llu in listns() output", (unsigned long long)netns_id);
+
+	close(netns_fd);
+
+	/* Verify namespace no longer appears in listns() once the netns FD is gone */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		err = errno;
+		free(handle);
+		close(sock_fd);
+		/* netns_fd was already closed above; closing it again would be a double close */
+		if (err == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(err));
+		ASSERT_GE(ret, 0);
+	}
+
+	found_netns = false;
+	found_userns = false;
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id)
+			found_netns = true;
+		if (ns_ids[i] == userns_id)
+			found_userns = true;
+	}
+	ASSERT_FALSE(found_netns);
+	ASSERT_FALSE(found_userns);
+	TH_LOG("Netns %llu correctly disappeared from listns() after netns FD closed",
+	       (unsigned long long)netns_id);
+
+	close(sock_fd);
+	free(handle);
+}
+
+/*
+ * Test multi-level namespace resurrection across three user namespace levels.
+ *
+ * This test creates a complex namespace hierarchy with three levels of user
+ * namespaces and a network namespace at the deepest level. It verifies that
+ * the resurrection semantics work correctly when SIOCGSKNS is called on a
+ * socket from an inactive namespace tree, and that listns() and
+ * open_by_handle_at() correctly respect visibility rules.
+ *
+ * Hierarchy after child processes exit (all with 0 active refcount):
+ *
+ * net_L3A (0) <- Level 3 network namespace
+ * |
+ * +
+ * userns_L3 (0) <- Level 3 user namespace
+ * |
+ * +
+ * userns_L2 (0) <- Level 2 user namespace
+ * |
+ * +
+ * userns_L1 (0) <- Level 1 user namespace
+ * |
+ * x
+ * init_user_ns
+ *
+ * The test verifies:
+ * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain
+ * 2. After resurrection, all namespaces are visible in listns()
+ * 3. Resurrected namespaces can be reopened via file handles
+ * 4. Closing the netns FD cascades down: the entire ownership chain
+ * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again
+ * 5. Inactive namespaces disappear from listns() and cannot be reopened
+ * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again
+ * 7. After second resurrection, namespaces are visible and can be reopened
+ */
+TEST(siocgskns_multilevel_resurrection)
+{
+ int ipc_sockets[2];
+ pid_t pid_l1, pid_l2, pid_l3;
+ int status;
+
+ /* Namespace file descriptors to be received from child */
+ int sock_L3A_fd = -1;
+ int netns_L3A_fd = -1;
+ __u64 netns_L3A_id;
+ __u64 userns_L1_id, userns_L2_id, userns_L3_id;
+
+ /* For listns() and file handle testing */
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ int ret, i;
+ struct file_handle *handle;
+ struct nsfs_file_handle *nsfs_fh;
+ int reopened_fd;
+
+ /* Allocate file handle for testing */
+ handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+ ASSERT_NE(handle, NULL);
+ handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ handle->handle_type = FILEID_NSFS;
+
+ EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+ /*
+ * Fork level 1 child that creates userns_L1
+ */
+ pid_l1 = fork();
+ ASSERT_GE(pid_l1, 0);
+
+ if (pid_l1 == 0) {
+ /* Level 1 child */
+ int ipc_L2[2];
+ close(ipc_sockets[0]);
+
+ /* Create userns_L1 */
+ if (setup_userns() < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /* Create socketpair for communicating with L2 child */
+ if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) {
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ /*
+ * Fork level 2 child that creates userns_L2
+ */
+ pid_l2 = fork();
+ if (pid_l2 < 0) {
+ close(ipc_sockets[1]);
+ close(ipc_L2[0]);
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ if (pid_l2 == 0) {
+ /* Level 2 child */
+ int ipc_L3[2];
+ close(ipc_L2[0]);
+
+ /* Create userns_L2 (nested inside userns_L1) */
+ if (setup_userns() < 0) {
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ /* Create socketpair for communicating with L3 child */
+ if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) {
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ /*
+ * Fork level 3 child that creates userns_L3 and network namespaces
+ */
+ pid_l3 = fork();
+ if (pid_l3 < 0) {
+ close(ipc_L2[1]);
+ close(ipc_L3[0]);
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ if (pid_l3 == 0) {
+ /* Level 3 child - the deepest level */
+ int sock_fd;
+ close(ipc_L3[0]);
+
+ /* Create userns_L3 (nested inside userns_L2) */
+ if (setup_userns() < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Create network namespace at level 3 */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Create socket in net_L3A */
+ sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock_fd < 0) {
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ /* Send socket FD to L2 parent */
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1] = {'X'};
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+ if (sendmsg(ipc_L3[1], &msg, 0) < 0) {
+ close(sock_fd);
+ close(ipc_L3[1]);
+ exit(1);
+ }
+
+ close(sock_fd);
+ close(ipc_L3[1]);
+ exit(0);
+ }
+
+ /* Level 2 child - receive from L3 and forward to L1 */
+ close(ipc_L3[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+ int received_fd;
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_L3[0], &msg, 0);
+ close(ipc_L3[0]);
+
+ if (n != 1) {
+ close(ipc_L2[1]);
+ waitpid(pid_l3, NULL, 0);
+ exit(1);
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ close(ipc_L2[1]);
+ waitpid(pid_l3, NULL, 0);
+ exit(1);
+ }
+ memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L3 child */
+ waitpid(pid_l3, NULL, 0);
+
+ /* Forward the socket FD to L1 parent */
+ memset(&msg, 0, sizeof(msg));
+ buf[0] = 'Y';
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int));
+
+ if (sendmsg(ipc_L2[1], &msg, 0) < 0) {
+ close(received_fd);
+ close(ipc_L2[1]);
+ exit(1);
+ }
+
+ close(received_fd);
+ close(ipc_L2[1]);
+ exit(0);
+ }
+
+ /* Level 1 child - receive from L2 and forward to parent */
+ close(ipc_L2[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+ int received_fd;
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_L2[0], &msg, 0);
+ close(ipc_L2[0]);
+
+ if (n != 1) {
+ close(ipc_sockets[1]);
+ waitpid(pid_l2, NULL, 0);
+ exit(1);
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ close(ipc_sockets[1]);
+ waitpid(pid_l2, NULL, 0);
+ exit(1);
+ }
+ memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L2 child */
+ waitpid(pid_l2, NULL, 0);
+
+ /* Forward the socket FD to parent */
+ memset(&msg, 0, sizeof(msg));
+ buf[0] = 'Z';
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int));
+
+ if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+ close(received_fd);
+ close(ipc_sockets[1]);
+ exit(1);
+ }
+
+ close(received_fd);
+ close(ipc_sockets[1]);
+ exit(0);
+ }
+
+ /* Parent - receive the socket from the deepest level */
+ close(ipc_sockets[1]);
+
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ char buf[1];
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+ iov.iov_base = buf;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+ close(ipc_sockets[0]);
+
+ if (n != 1) {
+ free(handle);
+ waitpid(pid_l1, NULL, 0);
+ SKIP(return, "Failed to receive socket from child");
+ }
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg) {
+ free(handle);
+ waitpid(pid_l1, NULL, 0);
+ SKIP(return, "Failed to receive socket from child");
+ }
+ memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int));
+
+ /* Wait for L1 child */
+ waitpid(pid_l1, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * At this point, all child processes have exited. The socket itself
+ * doesn't keep the namespace active - we need to call SIOCGSKNS which
+ * will resurrect the entire namespace tree by taking active references.
+ */
+
+ /* Get network namespace from socket - this resurrects the tree */
+ netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+ if (netns_L3A_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "SIOCGSKNS not supported");
+ ASSERT_GE(netns_L3A_fd, 0);
+ }
+
+ /* Get namespace ID for net_L3A */
+ ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id);
+ if (ret < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_ID not supported");
+ ASSERT_EQ(ret, 0);
+ }
+
+ /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */
+ int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS);
+ if (userns_L3_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOTTY || errno == EINVAL)
+ SKIP(return, "NS_GET_USERNS not supported");
+ ASSERT_GE(userns_L3_fd, 0);
+ }
+
+ ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id);
+ ASSERT_EQ(ret, 0);
+
+ int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS);
+ ASSERT_GE(userns_L2_fd, 0);
+ ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id);
+ ASSERT_EQ(ret, 0);
+
+ int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS);
+ ASSERT_GE(userns_L1_fd, 0);
+ ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id);
+ ASSERT_EQ(ret, 0);
+
+ close(userns_L1_fd);
+ close(userns_L2_fd);
+ close(userns_L3_fd);
+
+ TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)",
+ netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id);
+
+ /*
+ * Test 1: Verify net_L3A is visible in listns() after resurrection.
+ * The entire ownership chain should be resurrected and visible.
+ */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ bool found_netns_L3A = false;
+ bool found_userns_L1 = false;
+ bool found_userns_L2 = false;
+ bool found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_TRUE(found_netns_L3A);
+ ASSERT_TRUE(found_userns_L1);
+ ASSERT_TRUE(found_userns_L2);
+ ASSERT_TRUE(found_userns_L3);
+ TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()");
+
+ /*
+ * Test 2: Verify net_L3A can be reopened via file handle.
+ */
+ nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+ nsfs_fh->ns_id = netns_L3A_id;
+ nsfs_fh->ns_type = 0;
+ nsfs_fh->ns_inum = 0;
+
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF)
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+ TH_LOG("open_by_handle_at failed: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ close(reopened_fd);
+ TH_LOG("File handle test passed: net_L3A can be reopened");
+
+ /*
+ * Test 3: Verify that when we close the netns FD (dropping the last
+ * active reference), the entire tree becomes inactive and disappears
+ * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops ->
+ * userns_L2 drops -> userns_L1 drops.
+ */
+ close(netns_L3A_fd);
+
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_netns_L3A = false;
+ found_userns_L1 = false;
+ found_userns_L2 = false;
+ found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_FALSE(found_netns_L3A);
+ ASSERT_FALSE(found_userns_L1);
+ ASSERT_FALSE(found_userns_L2);
+ ASSERT_FALSE(found_userns_L3);
+ TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed");
+
+ /*
+ * Test 4: Verify file handle no longer works for inactive namespace.
+ */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd >= 0) {
+ close(reopened_fd);
+ free(handle);
+ ASSERT_TRUE(false); /* Should have failed */
+ }
+ TH_LOG("Inactive namespace correctly cannot be reopened via file handle");
+
+ /*
+ * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again.
+ * The socket is still valid, so we can call SIOCGSKNS on it to resurrect
+ * the namespace tree once more.
+ */
+ netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+ ASSERT_GE(netns_L3A_fd, 0);
+
+ TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree");
+
+ /* Verify the namespace tree is resurrected and visible in listns() */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found_netns_L3A = false;
+ found_userns_L1 = false;
+ found_userns_L2 = false;
+ found_userns_L3 = false;
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == netns_L3A_id)
+ found_netns_L3A = true;
+ if (ns_ids[i] == userns_L1_id)
+ found_userns_L1 = true;
+ if (ns_ids[i] == userns_L2_id)
+ found_userns_L2 = true;
+ if (ns_ids[i] == userns_L3_id)
+ found_userns_L3 = true;
+ }
+
+ ASSERT_TRUE(found_netns_L3A);
+ ASSERT_TRUE(found_userns_L1);
+ ASSERT_TRUE(found_userns_L2);
+ ASSERT_TRUE(found_userns_L3);
+ TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again");
+
+ /* Verify we can reopen via file handle again */
+ reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (reopened_fd < 0) {
+ free(handle);
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno));
+ ASSERT_GE(reopened_fd, 0);
+ }
+
+ close(reopened_fd);
+ TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection");
+
+ /* Final cleanup */
+ close(sock_L3A_fd);
+ close(netns_L3A_fd);
+ free(handle);
+}
+
+TEST_HARNESS_MAIN /* NOTE(review): presumably expands to the harness-provided main() that runs every TEST() above - confirm in kselftest_harness.h */
diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c
new file mode 100644
index 000000000000..dd7df7d6cb27
--- /dev/null
+++ b/tools/testing/selftests/namespaces/stress_test.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Stress tests for namespace active reference counting.
+ *
+ * These tests validate that the active reference counting system can handle
+ * high load scenarios including rapid namespace creation/destruction, large
+ * numbers of concurrent namespaces, and various edge cases under stress.
+ */
+
+/*
+ * Rapid user namespace churn.
+ *
+ * Fork 100 short-lived children one after another; each one sets up a user
+ * namespace and exits straight away. The number of user namespaces that
+ * listns() reports afterwards must equal the number it reported before the
+ * first fork.
+ */
+TEST(rapid_namespace_creation_destruction)
+{
+	__u64 baseline_ids[256], final_ids[256];
+	ssize_t ret_before, ret_after;
+	int remaining;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+
+	/* Snapshot how many user namespaces are active before we start */
+	ret_before = sys_listns(&req, baseline_ids, ARRAY_SIZE(baseline_ids), 0);
+	if (ret_before < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	ASSERT_GE(ret_before, 0);
+
+	TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+	/* One create/destroy cycle per iteration, strictly sequential */
+	for (remaining = 100; remaining > 0; remaining--) {
+		int status;
+		pid_t pid = fork();
+
+		ASSERT_GE(pid, 0);
+
+		/* Child: enter a fresh user namespace, then leave immediately */
+		if (!pid)
+			exit(setup_userns() < 0 ? 1 : 0);
+
+		/* Parent: the child must have terminated normally with status 0 */
+		waitpid(pid, &status, 0);
+		ASSERT_TRUE(WIFEXITED(status));
+		ASSERT_EQ(WEXITSTATUS(status), 0);
+	}
+
+	/* Any difference from the snapshot means a namespace was leaked */
+	ret_after = sys_listns(&req, final_ids, ARRAY_SIZE(final_ids), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test creating many concurrent namespaces.
+ * Verify that listns() correctly tracks all of them and that they all
+ * become inactive after processes exit.
+ *
+ * All children share one socketpair with the parent. Each child reports a
+ * single status byte ('R' = user namespace ready, 'F' = setup failed) and
+ * then blocks until the parent writes one byte per child to release them.
+ */
+TEST(many_concurrent_namespaces)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512];
+	ssize_t ret_before, ret_during, ret_after;
+	pid_t pids[50];
+	int num_children = ARRAY_SIZE(pids); /* keep the count tied to the array */
+	int i;
+	int sv[2];
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create many children, each with their own user namespace */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			/* Child: create user namespace and wait for parent signal */
+			char c = 'R';
+
+			close(sv[0]);
+
+			if (setup_userns() < 0)
+				c = 'F';
+
+			/*
+			 * Always report a status byte: a child that died
+			 * silently would leave the parent blocked in read()
+			 * while its siblings keep sv[1] open.
+			 */
+			if (write(sv[1], &c, 1) != 1 || c != 'R') {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready */
+	for (i = 0; i < num_children; i++) {
+		char c;
+		if (read(sv[0], &c, 1) != 1 || c != 'R') {
+			/* A child failed or vanished: kill all children and fail */
+			close(sv[0]);
+			for (int j = 0; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* List namespaces while all children are running */
+	ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+	ASSERT_GE(ret_during, 0);
+
+	TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during);
+
+	/* Should have at least num_children more namespaces than baseline */
+	ASSERT_GE(ret_during, ret_before + num_children);
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+		if (write(sv[0], &c, 1) != 1) {
+			/* If we fail to write, kill remaining children */
+			close(sv[0]);
+			for (int j = i; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Wait for all children; each one must have completed the handshake */
+	for (i = 0; i < num_children; i++) {
+		int status;
+		waitpid(pids[i], &status, 0);
+		ASSERT_TRUE(WIFEXITED(status));
+		ASSERT_EQ(WEXITSTATUS(status), 0);
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After all children exit: %zd active user namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Rapid churn across several namespace types.
+ *
+ * Fifty children are forked one at a time. Each sets up a user namespace and
+ * then unshares a network, a UTS and an IPC namespace before exiting. With
+ * ns_type == 0 in the request, the listns() count taken afterwards must match
+ * the count taken before the first fork.
+ */
+TEST(rapid_mixed_namespace_creation)
+{
+	__u64 baseline_ids[512], final_ids[512];
+	ssize_t ret_before, ret_after;
+	int round;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0, /* All types */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+
+	/* Record the starting population */
+	ret_before = sys_listns(&req, baseline_ids, ARRAY_SIZE(baseline_ids), 0);
+	if (ret_before < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	ASSERT_GE(ret_before, 0);
+
+	TH_LOG("Baseline: %zd active namespaces (all types)", ret_before);
+
+	for (round = 0; round != 50; round++) {
+		int status;
+		pid_t pid = fork();
+
+		ASSERT_GE(pid, 0);
+
+		if (!pid) {
+			/* Namespace types unshared on top of the user namespace, in order */
+			static const int extra_types[] = {
+				CLONE_NEWNET,
+				CLONE_NEWUTS,
+				CLONE_NEWIPC,
+			};
+			size_t k;
+
+			if (setup_userns() < 0)
+				exit(1);
+
+			for (k = 0; k < sizeof(extra_types) / sizeof(extra_types[0]); k++) {
+				if (unshare(extra_types[k]) < 0)
+					exit(1);
+			}
+
+			exit(0);
+		}
+
+		/* Only normal termination is required here; the exit code is not checked */
+		waitpid(pid, &status, 0);
+		ASSERT_TRUE(WIFEXITED(status));
+	}
+
+	/* The population must be unchanged once every child is gone */
+	ret_after = sys_listns(&req, final_ids, ARRAY_SIZE(final_ids), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Nested user namespace stress.
+ *
+ * Twenty children are forked one at a time. Each one enters five user
+ * namespaces in a row, every one obtained from get_userns_fd() and joined
+ * with setns(), and then exits. The listns() count of user namespaces taken
+ * afterwards must match the count taken before the first fork.
+ */
+TEST(nested_namespace_stress)
+{
+	__u64 baseline_ids[512], final_ids[512];
+	ssize_t ret_before, ret_after;
+	int proc;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+
+	/* Record the starting population */
+	ret_before = sys_listns(&req, baseline_ids, ARRAY_SIZE(baseline_ids), 0);
+	if (ret_before < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	ASSERT_GE(ret_before, 0);
+
+	TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+	for (proc = 0; proc != 20; proc++) {
+		int status;
+		pid_t pid = fork();
+
+		ASSERT_GE(pid, 0);
+
+		if (!pid) {
+			/* Sampled once, before the first setns() changes our user namespace */
+			const uid_t outer_uid = getuid();
+			int level = 0;
+
+			while (level < 5) {
+				/* Only the outermost level passes the caller's original uid */
+				uid_t second_arg = level ? 0 : outer_uid;
+				int fd = get_userns_fd(0, second_arg, 1);
+
+				if (fd < 0)
+					exit(1);
+
+				if (setns(fd, CLONE_NEWUSER) < 0) {
+					close(fd);
+					exit(1);
+				}
+
+				close(fd);
+				level++;
+			}
+
+			exit(0);
+		}
+
+		/* Only normal termination is required here; the exit code is not checked */
+		waitpid(pid, &status, 0);
+		ASSERT_TRUE(WIFEXITED(status));
+	}
+
+	/* The population must be unchanged once every child is gone */
+	ret_after = sys_listns(&req, final_ids, ARRAY_SIZE(final_ids), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/* Forcibly terminate and then reap every child in pids[0..nr). */
+static void pagination_stress_reap(const pid_t *pids, int nr)
+{
+	int k;
+
+	for (k = 0; k < nr; k++)
+		kill(pids[k], SIGKILL);
+	for (k = 0; k < nr; k++)
+		waitpid(pids[k], NULL, 0);
+}
+
+/*
+ * Test listns() pagination under stress.
+ *
+ * Forks 30 children that each call setup_userns() and then block on a shared
+ * socketpair.  While they are alive, the parent walks the user namespace list
+ * in batches of five IDs, feeding the last ID of every full batch back in as
+ * the cursor, and checks that no ID was returned twice.  The children are
+ * then released and reaped.
+ */
+TEST(listns_pagination_stress)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	pid_t pids[30];
+	int num_children = 30;
+	int i;
+	int sv[2];
+	__u64 all_ns_ids[512];
+	int total_found = 0;
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create many children with user namespaces */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			/* Initialised so a defined byte goes over the socket. */
+			char c = 'R';
+
+			close(sv[0]);
+
+			if (setup_userns() < 0) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Signal parent we're ready */
+			if (write(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready */
+	for (i = 0; i < num_children; i++) {
+		char c;
+
+		if (read(sv[0], &c, 1) != 1) {
+			/* If we fail to read, kill all children and exit */
+			close(sv[0]);
+			pagination_stress_reap(pids, num_children);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* Paginate through all namespaces using small batch sizes */
+	req.ns_id = 0;
+	while (1) {
+		__u64 batch[5]; /* Small batch size to force pagination */
+		ssize_t ret;
+
+		ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0);
+		if (ret < 0) {
+			/* Saved first: the cleanup calls below may change errno. */
+			int saved_errno = errno;
+
+			/* Whatever the error, do not leave the children behind. */
+			close(sv[0]);
+			pagination_stress_reap(pids, num_children);
+			if (saved_errno == ENOSYS)
+				SKIP(return, "listns() not supported");
+			ASSERT_GE(ret, 0);
+		}
+
+		if (ret == 0)
+			break;
+
+		/* Store results, silently dropping anything beyond the array. */
+		for (i = 0; i < ret && total_found < (int)ARRAY_SIZE(all_ns_ids); i++)
+			all_ns_ids[total_found++] = batch[i];
+
+		/* A short batch means the end of the list was reached. */
+		if ((size_t)ret != ARRAY_SIZE(batch))
+			break;
+
+		/* Update cursor for next batch */
+		req.ns_id = batch[ret - 1];
+	}
+
+	TH_LOG("Paginated through %d user namespaces", total_found);
+
+	/* Verify no duplicates in pagination */
+	for (i = 0; i < total_found; i++) {
+		for (int j = i + 1; j < total_found; j++) {
+			if (all_ns_ids[i] == all_ns_ids[j]) {
+				TH_LOG("Found duplicate ns_id: %llu at positions %d and %d",
+				       (unsigned long long)all_ns_ids[i], i, j);
+				ASSERT_TRUE(false);
+			}
+		}
+	}
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+
+		if (write(sv[0], &c, 1) != 1) {
+			/*
+			 * All children read from the same socket, so there is
+			 * no telling which of them consumed the bytes already
+			 * sent: kill and reap every one of them.
+			 */
+			close(sv[0]);
+			pagination_stress_reap(pids, num_children);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Wait for all children */
+	for (i = 0; i < num_children; i++)
+		waitpid(pids[i], NULL, 0);
+}
+
+/*
+ * Test concurrent namespace operations.
+ *
+ * Starts 20 worker processes that run at the same time.  Each worker makes
+ * ten attempts to obtain a user namespace fd from get_userns_fd(), lists
+ * namespaces while holding it, closes it and sleeps for a millisecond.  After
+ * all workers have exited successfully, listns() must report the same number
+ * of namespaces (all types) as before they were started.
+ */
+TEST(concurrent_namespace_operations)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	pid_t pids[20];
+	int num_workers = 20;
+	int i;
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+	/* Create worker processes that do concurrent operations */
+	for (i = 0; i < num_workers; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			/* Each worker: create namespaces, list them, repeat */
+			int iterations;
+
+			for (iterations = 0; iterations < 10; iterations++) {
+				int userns_fd;
+				__u64 temp_ns_ids[100];
+				ssize_t ret;
+
+				/* Create a user namespace */
+				userns_fd = get_userns_fd(0, getuid(), 1);
+				if (userns_fd < 0)
+					continue;
+
+				/*
+				 * List namespaces.  The call is made only to
+				 * race against the other workers, so its
+				 * result is deliberately discarded.
+				 */
+				ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0);
+				(void)ret;
+
+				close(userns_fd);
+
+				/* Small delay */
+				usleep(1000);
+			}
+
+			exit(0);
+		}
+	}
+
+	/* Wait for all workers */
+	for (i = 0; i < num_workers; i++) {
+		int status = 0;
+
+		/* The wait must succeed before status may be interpreted. */
+		ASSERT_EQ(waitpid(pids[i], &status, 0), pids[i]);
+		ASSERT_TRUE(WIFEXITED(status));
+		ASSERT_EQ(WEXITSTATUS(status), 0);
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After concurrent operations: %zd active namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test namespace churn - continuous creation and destruction.
+ * Simulates high-churn scenarios like container orchestration.
+ *
+ * Runs ten cycles; in each one ten children are forked together, every child
+ * calls setup_userns() and unshares a network and a UTS namespace, keeps them
+ * for 10ms and exits.  After the last cycle the listns() count for the
+ * user/net/uts namespace types must match the count taken at the start.
+ */
+TEST(namespace_churn)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	int cycle;
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+	/* Simulate churn: batches of namespaces created and destroyed */
+	for (cycle = 0; cycle < 10; cycle++) {
+		pid_t batch_pids[10];
+		int i;
+
+		/* Create batch */
+		for (i = 0; i < 10; i++) {
+			batch_pids[i] = fork();
+			ASSERT_GE(batch_pids[i], 0);
+
+			if (batch_pids[i] == 0) {
+				/* Create multiple namespace types */
+				if (setup_userns() < 0)
+					exit(1);
+				if (unshare(CLONE_NEWNET) < 0)
+					exit(1);
+				if (unshare(CLONE_NEWUTS) < 0)
+					exit(1);
+
+				/* Keep namespaces alive briefly */
+				usleep(10000);
+				exit(0);
+			}
+		}
+
+		/*
+		 * Wait for batch to complete.  Every child must really have
+		 * been reaped and must have terminated through exit(); the
+		 * exit code itself is not checked, as in the sibling tests
+		 * that tolerate namespace setup failing in the child.
+		 */
+		for (i = 0; i < 10; i++) {
+			int status = 0;
+
+			ASSERT_EQ(waitpid(batch_pids[i], &status, 0), batch_pids[i]);
+			ASSERT_TRUE(WIFEXITED(status));
+		}
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h
new file mode 100644
index 000000000000..9741a64a5b1d
--- /dev/null
+++ b/tools/testing/selftests/namespaces/wrappers.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/nsfs.h>
+#include <linux/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__
+#define __SELFTESTS_NAMESPACES_WRAPPERS_H__
+
+#ifndef __NR_listns
+ #if defined __alpha__
+ #define __NR_listns 580
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_listns 4470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_listns 6470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_listns 5470
+ #endif
+ #else
+ #define __NR_listns 470
+ #endif
+#endif
+
+/*
+ * Issue the listns() system call directly through syscall(2), using
+ * __NR_listns as the syscall number.  All four arguments are handed over
+ * unchanged; the syscall() result is returned converted to int.
+ */
+static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids,
+			     size_t nr_ns_ids, unsigned int flags)
+{
+	long rc = syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags);
+
+	return rc;
+}
+
+#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */
diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc
index 330e000baeb1..f9d43cbdc894 100644
--- a/tools/testing/selftests/nolibc/Makefile.nolibc
+++ b/tools/testing/selftests/nolibc/Makefile.nolibc
@@ -87,7 +87,6 @@ IMAGE_riscv = arch/riscv/boot/Image
IMAGE_riscv32 = arch/riscv/boot/Image
IMAGE_riscv64 = arch/riscv/boot/Image
IMAGE_s390x = arch/s390/boot/bzImage
-IMAGE_s390 = arch/s390/boot/bzImage
IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi
IMAGE_sparc32 = arch/sparc/boot/image
IMAGE_sparc64 = arch/sparc/boot/image
@@ -117,7 +116,6 @@ DEFCONFIG_riscv = defconfig
DEFCONFIG_riscv32 = rv32_defconfig
DEFCONFIG_riscv64 = defconfig
DEFCONFIG_s390x = defconfig
-DEFCONFIG_s390 = defconfig compat.config
DEFCONFIG_loongarch = defconfig
DEFCONFIG_sparc32 = sparc32_defconfig
DEFCONFIG_sparc64 = sparc64_defconfig
@@ -156,7 +154,6 @@ QEMU_ARCH_riscv = riscv64
QEMU_ARCH_riscv32 = riscv32
QEMU_ARCH_riscv64 = riscv64
QEMU_ARCH_s390x = s390x
-QEMU_ARCH_s390 = s390x
QEMU_ARCH_loongarch = loongarch64
QEMU_ARCH_sparc32 = sparc
QEMU_ARCH_sparc64 = sparc64
@@ -197,7 +194,6 @@ QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_T
QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
-QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_sparc32 = -M SS-5 -m 256M -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_sparc64 = -M sun4u -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
@@ -223,13 +219,13 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2)
CFLAGS_s390x = -m64
-CFLAGS_s390 = -m31
CFLAGS_mips32le = -EL -mabi=32 -fPIC
CFLAGS_mips32be = -EB -mabi=32
CFLAGS_mipsn32le = -EL -mabi=n32 -fPIC -march=mips64r2
CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6
CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6
CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2
+CFLAGS_loongarch = $(if $(LLVM),-fuse-ld=lld)
CFLAGS_sparc32 = $(call cc-option,-m32)
CFLAGS_sh4 = -ml -m4
ifeq ($(origin XARCH),command line)
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 29de21595fc9..3c5a226dad3a 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -25,6 +25,7 @@
#include <sys/sysmacros.h>
#include <sys/time.h>
#include <sys/timerfd.h>
+#include <sys/uio.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <dirent.h>
@@ -1282,6 +1283,10 @@ int run_syscall(int min, int max)
int proc;
int test;
int tmp;
+ struct iovec iov_one = {
+ .iov_base = &tmp,
+ .iov_len = 1,
+ };
int ret = 0;
void *p1, *p2;
int has_gettid = 1;
@@ -1343,6 +1348,8 @@ int run_syscall(int min, int max)
CASE_TEST(dup3_0); tmp = dup3(0, 100, 0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break;
CASE_TEST(dup3_m1); tmp = dup3(-1, 100, 0); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break;
CASE_TEST(execve_root); EXPECT_SYSER(1, execve("/", (char*[]){ [0] = "/", [1] = NULL }, NULL), -1, EACCES); break;
+ CASE_TEST(fchdir_stdin); EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break;
+ CASE_TEST(fchdir_badfd); EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break;
CASE_TEST(file_stream); EXPECT_SYSZR(1, test_file_stream()); break;
CASE_TEST(fork); EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break;
CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break;
@@ -1395,6 +1402,10 @@ int run_syscall(int min, int max)
CASE_TEST(waitpid_child); EXPECT_SYSER(1, waitpid(getpid(), &tmp, WNOHANG), -1, ECHILD); break;
CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break;
CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break;
+ CASE_TEST(readv_badf); EXPECT_SYSER(1, readv(-1, &iov_one, 1), -1, EBADF); break;
+ CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break;
+ CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break;
+ CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break;
CASE_TEST(syscall_noargs); EXPECT_SYSEQ(1, syscall(__NR_getpid), getpid()); break;
CASE_TEST(syscall_args); EXPECT_SYSER(1, syscall(__NR_statx, 0, NULL, 0, 0, NULL), -1, EFAULT); break;
CASE_TEST(namespace); EXPECT_SYSZR(euid0 && proc, test_namespace()); break;
@@ -1540,6 +1551,8 @@ int run_stdlib(int min, int max)
CASE_TEST(abs); EXPECT_EQ(1, abs(-10), 10); break;
CASE_TEST(abs_noop); EXPECT_EQ(1, abs(10), 10); break;
CASE_TEST(difftime); EXPECT_ZR(1, test_difftime()); break;
+ CASE_TEST(memchr_foobar6_o); EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break;
+ CASE_TEST(memchr_foobar3_b); EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break;
case __LINE__:
return ret; /* must be last */
diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh
index e8af1fb505cf..3917cfb8fdc4 100755
--- a/tools/testing/selftests/nolibc/run-tests.sh
+++ b/tools/testing/selftests/nolibc/run-tests.sh
@@ -23,7 +23,7 @@ all_archs=(
mips32le mips32be mipsn32le mipsn32be mips64le mips64be
ppc ppc64 ppc64le
riscv32 riscv64
- s390x s390
+ s390x
loongarch
sparc32 sparc64
m68k
@@ -169,7 +169,7 @@ test_arch() {
cross_compile=$(realpath "${download_location}gcc-${crosstool_version}-nolibc/${ct_arch}-${ct_abi}/bin/${ct_arch}-${ct_abi}-")
build_dir="${build_location}/${arch}"
if [ "$werror" -ne 0 ]; then
- CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror"
+ CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror -Wl,--fatal-warnings"
fi
MAKE=(make -f Makefile.nolibc -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}")
@@ -185,10 +185,6 @@ test_arch() {
exit 1
esac
printf '%-15s' "$arch:"
- if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then
- echo "Unsupported configuration"
- return
- fi
if [ "$arch" = "m68k" -o "$arch" = "sh4" ] && [ "$llvm" = "1" ]; then
echo "Unsupported configuration"
return
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index f87993def738..d60f10a873bb 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -148,6 +148,14 @@
#define PIDFD_INFO_COREDUMP (1UL << 4)
#endif
+#ifndef PIDFD_INFO_SUPPORTED_MASK
+#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5)
+#endif
+
+#ifndef PIDFD_INFO_COREDUMP_SIGNAL
+#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6)
+#endif
+
#ifndef PIDFD_COREDUMPED
#define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */
#endif
@@ -183,8 +191,11 @@ struct pidfd_info {
__u32 fsuid;
__u32 fsgid;
__s32 exit_code;
- __u32 coredump_mask;
- __u32 __spare1;
+ struct {
+ __u32 coredump_mask;
+ __u32 coredump_signal;
+ };
+ __u64 supported_mask;
};
/*
diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c
index a0eb6e81eaa2..cb5430a2fd75 100644
--- a/tools/testing/selftests/pidfd/pidfd_info_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_info_test.c
@@ -690,4 +690,77 @@ TEST_F(pidfd_info, thread_group_exec_thread)
EXPECT_EQ(close(pidfd_thread), 0);
}
+/*
+ * Test: PIDFD_INFO_SUPPORTED_MASK field
+ *
+ * Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel
+ * returns the supported_mask field indicating which flags the kernel supports.
+ * The query is made against a child that does nothing but pause() and that
+ * is killed and reaped at the end of the test.
+ */
+TEST(supported_mask_field)
+{
+	struct pidfd_info info = {
+		.mask = PIDFD_INFO_SUPPORTED_MASK,
+	};
+	int pidfd;
+	pid_t pid;
+
+	pid = create_child(&pidfd, 0);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/*
+		 * Child: sleep until killed.  Should pause() ever return,
+		 * leave at once rather than fall through into the parent's
+		 * half of the test.
+		 */
+		pause();
+		_exit(0);
+	}
+
+	/* Request supported_mask field */
+	ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+
+	/* Verify PIDFD_INFO_SUPPORTED_MASK is set in the reply */
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK));
+
+	/* Verify supported_mask contains expected flags */
+	ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_PID));
+	ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CREDS));
+	ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CGROUPID));
+	ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_EXIT));
+	ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP));
+	ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK));
+	ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL));
+
+	/* Clean up; report (without aborting) if the child cannot be reaped */
+	EXPECT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
+	EXPECT_EQ(sys_waitid(P_PIDFD, pidfd, NULL, WEXITED), 0);
+	EXPECT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Test: PIDFD_INFO_SUPPORTED_MASK always available
+ *
+ * Verify that supported_mask is returned even when other fields are requested:
+ * PIDFD_INFO_CGROUPID is asked for in the same call, and both bits must be
+ * set in the reply together with a non-zero supported_mask.
+ */
+TEST(supported_mask_with_other_fields)
+{
+	struct pidfd_info info = {
+		.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_SUPPORTED_MASK,
+	};
+	int pidfd;
+	pid_t pid;
+
+	pid = create_child(&pidfd, 0);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/*
+		 * Child: sleep until killed.  Should pause() ever return,
+		 * leave at once rather than fall through into the parent's
+		 * half of the test.
+		 */
+		pause();
+		_exit(0);
+	}
+
+	ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+
+	/* Both fields should be present */
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CGROUPID));
+	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK));
+	ASSERT_NE(info.supported_mask, 0);
+
+	/* Clean up; report (without aborting) if the child cannot be reaped */
+	EXPECT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
+	EXPECT_EQ(sys_waitid(P_PIDFD, pidfd, NULL, WEXITED), 0);
+	EXPECT_EQ(close(pidfd), 0);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh
index 88ca4e368489..b5239b52cb5d 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh
@@ -31,7 +31,7 @@ fi
if ! cp "$oldrun/scenarios" $T/scenarios.oldrun
then
# Later on, can reconstitute this from console.log files.
- echo Prior run batches file does not exist: $oldrun/batches
+ echo Prior run scenarios file does not exist: $oldrun/scenarios
exit 1
fi
@@ -68,7 +68,7 @@ usage () {
echo " --datestamp string"
echo " --dryrun"
echo " --duration minutes | <seconds>s | <hours>h | <days>d"
- echo " --link hard|soft|copy"
+ echo " --link hard|soft|copy|inplace|inplace-force"
echo " --remote"
echo " --rundir /new/res/path"
echo "Command line: $scriptname $args"
@@ -121,7 +121,7 @@ do
shift
;;
--link)
- checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--'
+ checkarg --link "hard|soft|copy|inplace|inplace-force" "$#" "$2" 'hard\|soft\|copy\|inplace\|inplace-force' '^--'
case "$2" in
copy)
arg_link="cp -R"
@@ -132,6 +132,14 @@ do
soft)
arg_link="cp -Rs"
;;
+ inplace)
+ arg_link="inplace"
+ rundir="$oldrun"
+ ;;
+ inplace-force)
+ arg_link="inplace-force"
+ rundir="$oldrun"
+ ;;
esac
shift
;;
@@ -172,21 +180,37 @@ fi
echo ---- Re-run results directory: $rundir
-# Copy old run directory tree over and adjust.
-mkdir -p "`dirname "$rundir"`"
-if ! $arg_link "$oldrun" "$rundir"
-then
- echo "Cannot copy from $oldrun to $rundir."
- usage
-fi
-rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log
-touch "$rundir/log"
-echo $scriptname $args | tee -a "$rundir/log"
-echo $oldrun > "$rundir/re-run"
-if ! test -d "$rundir/../../bin"
+if test "$oldrun" != "$rundir"
then
- $arg_link "$oldrun/../../bin" "$rundir/../.."
+ # Copy old run directory tree over and adjust.
+ mkdir -p "`dirname "$rundir"`"
+ if ! $arg_link "$oldrun" "$rundir"
+ then
+ echo "Cannot copy from $oldrun to $rundir."
+ usage
+ fi
+ rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log
+ touch "$rundir/log"
+ echo $scriptname $args | tee -a "$rundir/log"
+ echo $oldrun > "$rundir/re-run"
+ if ! test -d "$rundir/../../bin"
+ then
+ $arg_link "$oldrun/../../bin" "$rundir/../.."
+ fi
+else
+ # Check for a run having already happened.
+ find "$rundir" -name console.log -print > $T/oldrun-console.log
+ if test -s $T/oldrun-console.log
+ then
+ echo Run already took place in $rundir
+ if test "$arg_link" = inplace
+ then
+ usage
+ fi
+ fi
fi
+
+# Find runs to be done based on their qemu-cmd files.
for i in $rundir/*/qemu-cmd
do
cp "$i" $T
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
new file mode 100755
index 000000000000..2ff905a1853b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Usage: kvm-series.sh config-list commit-id-list [ kvm.sh parameters ]
+#
+# Takes a list of unadorned configs ("TREE01 SRCU-P" but not "CFLIST" or
+# "3*TRACE01") and an indication of a set of commits to test, then uses
+# kvm.sh to run each of those configs against each of those commits.
+# The runs are grouped into a -series/config/commit directory tree.
+# Each run defaults to a duration of one minute.
+#
+# Run in top-level Linux source directory.  Please note that this is in
+# no way a replacement for "git bisect"!!!
+#
+# This script is intended to replace kvm-check-branches.sh by providing
+# ease of use and faster execution.
+#
+# The exit status is that of the first kvm.sh run that failed, or zero if
+# all of them succeeded.
+
+T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"
+trap 'rm -rf "$T"' 0
+
+scriptname=$0
+args="$*"
+
+config_list="${1}"
+if test -z "${config_list}"
+then
+	echo "$0: Need a quoted list of --config arguments for first argument."
+	exit 1
+fi
+# An empty list was rejected just above, so only repetition is left to check.
+if echo "${config_list}" | grep -q '\*'
+then
+	echo "$0: Repetition ('*') not allowed in config list."
+	exit 1
+fi
+
+commit_list="${2}"
+if test -z "${commit_list}"
+then
+	echo "$0: Need a list of commits (e.g., HEAD^^^..) for second argument."
+	exit 2
+fi
+git log --pretty=format:"%h" "${commit_list}" > $T/commits
+ret=$?
+if test "${ret}" -ne 0
+then
+	echo "$0: Invalid commit list ('${commit_list}')."
+	exit 2
+fi
+sha1_list=`cat $T/commits`
+
+# Whatever remains in "$@" is handed to kvm.sh.
+shift
+shift
+
+RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE
+PATH=${RCUTORTURE}/bin:$PATH; export PATH
+. functions.sh
+
+ret=0
+nfail=0
+nsuccess=0
+faillist=
+successlist=
+# Remember where to return to afterwards: the branch name when on a branch,
+# otherwise (detached HEAD) the commit ID, because the name "HEAD" would by
+# then simply refer to whichever commit was checked out last.
+cursha1="`git symbolic-ref --short -q HEAD || git rev-parse HEAD`"
+ds="`date +%Y.%m.%d-%H.%M.%S`-series"
+startdate="`date`"
+starttime="`get_starttime`"
+
+echo " --- " $scriptname $args | tee -a $T/log
+echo " --- Results directory: " $ds | tee -a $T/log
+
+for config in ${config_list}
+do
+	sha_n=0
+	for sha in ${sha1_list}
+	do
+		sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order.
+		echo Starting ${config}/${sha1} at `date` | tee -a $T/log
+		git checkout "${sha}"
+		time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@"
+		curret=$?
+		if test "${curret}" -ne 0
+		then
+			nfail=$((nfail+1))
+			faillist="$faillist ${config}/${sha1}(${curret})"
+		else
+			nsuccess=$((nsuccess+1))
+			successlist="$successlist ${config}/${sha1}"
+			# Successful run, so remove large files.
+			# NOTE(review): the final log below is copied to
+			# .../rcutorture/res/${ds}, but this path has no
+			# "res/" component -- confirm that it matches where
+			# kvm.sh actually leaves these files.
+			rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers}
+		fi
+		# Only the first failure determines the exit status.
+		if test "${ret}" -eq 0
+		then
+			ret=${curret}
+		fi
+		sha_n=$((sha_n+1))
+	done
+done
+git checkout "${cursha1}"
+
+echo ${nsuccess} SUCCESSES: | tee -a $T/log
+echo ${successlist} | fmt | tee -a $T/log
+echo | tee -a $T/log
+echo ${nfail} FAILURES: | tee -a $T/log
+echo ${faillist} | fmt | tee -a $T/log
+if test -n "${faillist}"
+then
+	echo | tee -a $T/log
+	echo Failures across commits: | tee -a $T/log
+	# Strip the leading config name and the trailing "(status)" so that
+	# identical commits can be counted across configs.
+	echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' |
+		sort | uniq -c | sort -k2n | tee -a $T/log
+fi
+echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log
+echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log
+cp $T/log tools/testing/selftests/rcutorture/res/${ds}
+
+exit "${ret}"
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 617cba339d28..fff15821c44c 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -199,7 +199,7 @@ do
fi
;;
--kconfig|--kconfigs)
- checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( \+\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)* *$' '^error$'
+ checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|-\?[0-9]\+\|"[^"]*"\)\( \+\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|-\?[0-9]\+\|"[^"]*"\)\)* *$' '^error$'
TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
shift
;;
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index dc4985064b3a..67caf4276bb0 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -16,3 +16,4 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
CONFIG_RCU_EQS_DEBUG=y
CONFIG_RCU_LAZY=y
+CONFIG_RCU_DYNTICKS_TORTURE=y
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index 33baaa9f9997..e7b858cd3736 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -28,8 +28,6 @@ do { \
RSEQ_WRITE_ONCE(*(p), v); \
} while (0)
-#ifdef __s390x__
-
#define LONG_L "lg"
#define LONG_S "stg"
#define LONG_LT_R "ltgr"
@@ -63,43 +61,6 @@ do { \
".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \
".popsection\n\t"
-#elif __s390__
-
-#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
- start_ip, post_commit_offset, abort_ip) \
- ".pushsection __rseq_cs, \"aw\"\n\t" \
- ".balign 32\n\t" \
- __rseq_str(label) ":\n\t" \
- ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
- ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
- ".popsection\n\t" \
- ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
- ".long 0x0, " __rseq_str(label) "b\n\t" \
- ".popsection\n\t"
-
-/*
- * Exit points of a rseq critical section consist of all instructions outside
- * of the critical section where a critical section can either branch to or
- * reach through the normal course of its execution. The abort IP and the
- * post-commit IP are already part of the __rseq_cs section and should not be
- * explicitly defined as additional exit points. Knowing all exit points is
- * useful to assist debuggers stepping over the critical section.
- */
-#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
- ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
- ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \
- ".popsection\n\t"
-
-#define LONG_L "l"
-#define LONG_S "st"
-#define LONG_LT_R "ltr"
-#define LONG_CMP "c"
-#define LONG_CMP_R "cr"
-#define LONG_ADDI "ahi"
-#define LONG_ADD_R "ar"
-
-#endif
-
#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
__RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
(post_commit_ip - start_ip), abort_ip)
diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh
index 0443beacf362..d4be97498b32 100755
--- a/tools/testing/selftests/run_kselftest.sh
+++ b/tools/testing/selftests/run_kselftest.sh
@@ -33,6 +33,7 @@ Usage: $0 [OPTIONS]
-c | --collection COLLECTION Run all tests from COLLECTION
-l | --list List the available collection:test entries
-d | --dry-run Don't actually run any tests
+ -f | --no-error-on-fail Don't exit with an error just because tests failed
-n | --netns Run each test in namespace
-h | --help Show this usage info
-o | --override-timeout Number of seconds after which we timeout
@@ -44,6 +45,7 @@ COLLECTIONS=""
TESTS=""
dryrun=""
kselftest_override_timeout=""
+ERROR_ON_FAIL=true
while true; do
case "$1" in
-s | --summary)
@@ -65,6 +67,9 @@ while true; do
-d | --dry-run)
dryrun="echo"
shift ;;
+ -f | --no-error-on-fail)
+ ERROR_ON_FAIL=false
+ shift ;;
-n | --netns)
RUN_IN_NETNS=1
shift ;;
@@ -105,9 +110,18 @@ if [ -n "$TESTS" ]; then
available="$(echo "$valid" | sed -e 's/ /\n/g')"
fi
+kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)"
+export kselftest_failures_file
+
collections=$(echo "$available" | cut -d: -f1 | sort | uniq)
for collection in $collections ; do
[ -w /dev/kmsg ] && echo "kselftest: Running tests in $collection" >> /dev/kmsg
tests=$(echo "$available" | grep "^$collection:" | cut -d: -f2)
($dryrun cd "$collection" && $dryrun run_many $tests)
done
+
+failures="$(cat "$kselftest_failures_file")"
+rm "$kselftest_failures_file"
+if "$ERROR_ON_FAIL" && [ "$failures" ]; then
+ exit 1
+fi
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 9d9d6b4c38b0..5fe45f9c5f8f 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -174,6 +174,7 @@ auto-test-targets := \
minimal \
numa \
allowed_cpus \
+ peek_dsq \
prog_run \
reload_loop \
select_cpu_dfl \
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
new file mode 100644
index 000000000000..a3faf5bb49d6
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A BPF program for testing DSQ operations and peek in particular.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+
+#include <scx/common.bpf.h>
+#include <scx/compat.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei); /* Error handling */
+
+#define MAX_SAMPLES 100
+#define MAX_CPUS 512
+#define DSQ_POOL_SIZE 8
+int max_samples = MAX_SAMPLES;
+int max_cpus = MAX_CPUS;
+int dsq_pool_size = DSQ_POOL_SIZE;
+
+/* Global variables to store test results */
+int dsq_peek_result1 = -1;
+long dsq_inserted_pid = -1;
+int insert_test_cpu = -1; /* Set to the cpu that performs the test */
+long dsq_peek_result2 = -1;
+long dsq_peek_result2_pid = -1;
+long dsq_peek_result2_expected = -1;
+int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */
+int real_dsq_id = 1235; /* DSQ for normal operation */
+int enqueue_count = -1;
+int dispatch_count = -1;
+bool debug_ksym_exists;
+
+/* DSQ pool for stress testing */
+int dsq_pool_base_id = 2000;
+int phase1_complete = -1;
+long total_peek_attempts = -1;
+long successful_peeks = -1;
+
+/* BPF map for sharing peek results with userspace */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, MAX_SAMPLES);
+ __type(key, u32);
+ __type(value, long);
+} peek_results SEC(".maps");
+
+static int get_random_dsq_id(void)
+{
+ u64 time = bpf_ktime_get_ns();
+
+ return dsq_pool_base_id + (time % DSQ_POOL_SIZE);
+}
+
+static void record_peek_result(long pid)
+{
+ u32 slot_key;
+ long *slot_pid_ptr;
+ int ix;
+
+ if (pid <= 0)
+ return;
+
+ /* Find an empty slot or one with the same PID */
+ bpf_for(ix, 0, 10) {
+ slot_key = (pid + ix) % MAX_SAMPLES;
+ slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key);
+ if (!slot_pid_ptr)
+ continue;
+
+ if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) {
+ *slot_pid_ptr = pid;
+ break;
+ }
+ }
+}
+
+/* Peek each pool DSQ; return 1 once a task was moved to the local DSQ, else 0 */
+static int scan_dsq_pool(void)
+{
+	struct task_struct *task;
+	int moved = 0;
+	int i;
+
+	bpf_for(i, 0, DSQ_POOL_SIZE) {
+		int dsq_id = dsq_pool_base_id + i;
+
+		total_peek_attempts++;
+
+		task = __COMPAT_scx_bpf_dsq_peek(dsq_id);
+		if (task) {
+			successful_peeks++;
+			record_peek_result(task->pid);
+
+			/* The kfunc returns true when a task was moved; stop at the first success */
+			if (scx_bpf_dsq_move_to_local(dsq_id)) {
+				moved = 1;
+				break;
+			}
+		}
+	}
+	return moved;
+}
+
+/* Enqueue: first caller runs the phase 1 insert/peek test; others use the real or pool DSQs */
+void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_struct *peek_result;
+	int last_insert_test_cpu, cpu;
+
+	enqueue_count++;
+	cpu = bpf_get_smp_processor_id();
+	last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu);
+
+	/* Phase 1: Simple insert-then-peek test (only the caller that wins the cmpxchg) */
+	if (last_insert_test_cpu == -1) {
+		bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu);
+
+		/* Test 1: Peek empty DSQ - should return NULL */
+		peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+		dsq_peek_result1 = peek_result ? 1 : 0; /* 0 means NULL; an int cannot hold the pointer */
+
+		/* Test 2: Insert task into test DSQ for testing in dispatch callback */
+		dsq_inserted_pid = p->pid;
+		scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags);
+		dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */
+	} else if (!phase1_complete) {
+		/* Still in phase 1, use real DSQ */
+		scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags);
+	} else {
+		/* Phase 2: Random DSQ insertion for stress testing */
+		int random_dsq_id = get_random_dsq_id();
+
+		scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags);
+	}
+}
+
+void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
+{
+ dispatch_count++;
+
+ /* Phase 1: Complete the simple peek test if we inserted a task but
+ * haven't tested peek yet
+ */
+ if (insert_test_cpu == cpu && dsq_peek_result2 == -1) {
+ struct task_struct *peek_result;
+
+ bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu);
+
+ /* Test 3: Peek DSQ after insert - should return the task we inserted */
+ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+ /* Store the PID of the peeked task for comparison */
+ dsq_peek_result2 = (long)peek_result;
+ dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;
+
+ /* Now consume the task since we've peeked at it */
+ scx_bpf_dsq_move_to_local(test_dsq_id);
+
+ /* Mark phase 1 as complete */
+ phase1_complete = 1;
+ bpf_printk("Phase 1 complete, starting phase 2 stress testing");
+ } else if (!phase1_complete) {
+ /* Still in phase 1, use real DSQ */
+ scx_bpf_dsq_move_to_local(real_dsq_id);
+ } else {
+ /* Phase 2: Scan all DSQs in the pool and try to move a task */
+ if (!scan_dsq_pool()) {
+ /* No tasks found in DSQ pool, fall back to real DSQ */
+ scx_bpf_dsq_move_to_local(real_dsq_id);
+ }
+ }
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
+{
+	s32 err;
+	int i;
+
+	/* Record whether the scx_bpf_dsq_peek ksym exists so userspace can report it */
+	debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0;
+
+	/* Reset counters and phase state before any callback can observe them */
+	insert_test_cpu = -1;
+	enqueue_count = 0;
+	dispatch_count = 0;
+	phase1_complete = 0;
+	total_peek_attempts = 0;
+	successful_peeks = 0;
+
+	/* Create the test and real DSQs; any failure aborts init with that error */
+	err = scx_bpf_create_dsq(test_dsq_id, -1);
+	if (err) {
+		scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
+		return err;
+	}
+	err = scx_bpf_create_dsq(real_dsq_id, -1);
+	if (err) {
+		scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err);
+		return err;
+	}
+
+	/* Create the DSQ pool for stress testing */
+	bpf_for(i, 0, DSQ_POOL_SIZE) {
+		int dsq_id = dsq_pool_base_id + i;
+
+		err = scx_bpf_create_dsq(dsq_id, -1);
+		if (err) {
+			scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err);
+			return err;
+		}
+	}
+
+	/* Mark every peek_results slot as empty (-1) */
+	bpf_for(i, 0, MAX_SAMPLES) {
+		u32 key = i;
+		long pid = -1;
+
+		bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY);
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei)
+{
+ int i;
+
+ /* Destroy the primary DSQs */
+ scx_bpf_destroy_dsq(test_dsq_id);
+ scx_bpf_destroy_dsq(real_dsq_id);
+
+ /* Destroy the DSQ pool */
+ bpf_for(i, 0, DSQ_POOL_SIZE) {
+ int dsq_id = dsq_pool_base_id + i;
+
+ scx_bpf_destroy_dsq(dsq_id);
+ }
+
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops peek_dsq_ops = {
+ .enqueue = (void *)peek_dsq_enqueue,
+ .dispatch = (void *)peek_dsq_dispatch,
+ .init = (void *)peek_dsq_init,
+ .exit = (void *)peek_dsq_exit,
+ .name = "peek_dsq",
+};
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c
new file mode 100644
index 000000000000..a717384a3224
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for DSQ operations including create, destroy, and peek operations.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+#include <sched.h>
+#include "peek_dsq.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_WORKERS 4
+
+static bool workload_running = true;
+static pthread_t workload_threads[NUM_WORKERS];
+
+/**
+ * Background workload thread that sleeps and wakes rapidly to exercise
+ * the scheduler's enqueue operations and ensure DSQ operations get tested.
+ */
+static void *workload_thread_fn(void *arg)
+{
+ while (workload_running) {
+ /* Sleep for a very short time to trigger scheduler activity */
+ usleep(1000); /* 1ms sleep */
+ /* Yield to ensure we go through the scheduler */
+ sched_yield();
+ }
+ return NULL;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct peek_dsq *skel;
+
+ skel = peek_dsq__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
+{
+ long count = 0;
+
+ printf("Observed %s DSQ peek pids:\n", dsq_name);
+ for (int i = 0; i < max_samples; i++) {
+ long pid;
+ int err;
+
+ err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);
+ if (err == 0) {
+ if (pid == 0) {
+ printf(" Sample %d: NULL peek\n", i);
+ } else if (pid > 0) {
+ printf(" Sample %d: pid %ld\n", i, pid);
+ count++;
+ }
+ } else {
+ printf(" Sample %d: error reading pid (err=%d)\n", i, err);
+ }
+ }
+ printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name);
+ return count;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct peek_dsq *skel = ctx;
+ bool failed = false;
+ int seconds = 3;
+ int err;
+
+ /* Enable the scheduler to test DSQ operations */
+ printf("Enabling scheduler to test DSQ insert operations...\n");
+
+ struct bpf_link *link =
+ bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);
+
+ if (!link) {
+ SCX_ERR("Failed to attach struct_ops");
+ return SCX_TEST_FAIL;
+ }
+
+ printf("Starting %d background workload threads...\n", NUM_WORKERS);
+ workload_running = true;
+ for (int i = 0; i < NUM_WORKERS; i++) {
+ err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
+ if (err) {
+ SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
+ /* Stop already created threads */
+ workload_running = false;
+ for (int j = 0; j < i; j++)
+ pthread_join(workload_threads[j], NULL);
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ printf("Waiting for enqueue events.\n");
+ sleep(seconds);
+ while (skel->data->enqueue_count <= 0) {
+ printf(".");
+ fflush(stdout);
+ sleep(1);
+ seconds++;
+ if (seconds >= 30) {
+ printf("\n\u2717 Timeout waiting for enqueue events\n");
+ /* Stop workload threads and cleanup */
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++)
+ pthread_join(workload_threads[i], NULL);
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++) {
+ err = pthread_join(workload_threads[i], NULL);
+ if (err) {
+ SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+ printf("Background workload threads stopped.\n");
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+
+ /* Detach the scheduler */
+ bpf_link__destroy(link);
+
+ printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
+ skel->data->enqueue_count, skel->data->dispatch_count);
+ printf("Debug: ksym_exists=%d\n",
+ skel->bss->debug_ksym_exists);
+
+ /* Check DSQ insert result */
+ printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
+ if (skel->data->insert_test_cpu != -1)
+ printf("\u2713 DSQ insert succeeded !\n");
+ else {
+ printf("\u2717 DSQ insert failed or not attempted\n");
+ failed = true;
+ }
+
+ /* Check DSQ peek results */
+ printf(" DSQ peek result 1 (before insert): %d\n",
+ skel->data->dsq_peek_result1);
+ if (skel->data->dsq_peek_result1 == 0)
+ printf("\u2713 DSQ peek verification success: peek returned NULL!\n");
+ else {
+ printf("\u2717 DSQ peek verification failed\n");
+ failed = true;
+ }
+
+ printf(" DSQ peek result 2 (after insert): %ld\n",
+ skel->data->dsq_peek_result2);
+ printf(" DSQ peek result 2, expected: %ld\n",
+ skel->data->dsq_peek_result2_expected);
+ if (skel->data->dsq_peek_result2 ==
+ skel->data->dsq_peek_result2_expected)
+ printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n");
+ else {
+ printf("\u2717 DSQ peek verification failed\n");
+ failed = true;
+ }
+
+ printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
+ printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);
+
+ int pid_count;
+
+ pid_count = print_observed_pids(skel->maps.peek_results,
+ skel->data->max_samples, "DSQ pool");
+ printf("Total non-null peek observations: %ld out of %ld\n",
+ skel->data->successful_peeks, skel->data->total_peek_attempts);
+
+ if (skel->bss->debug_ksym_exists && pid_count == 0) {
+ printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n");
+ failed = true;
+ }
+ if (skel->bss->debug_ksym_exists && pid_count > 0)
+ printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n");
+
+ if (failed)
+ return SCX_TEST_FAIL;
+ else
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct peek_dsq *skel = ctx;
+
+ if (workload_running) {
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++)
+ pthread_join(workload_threads[i], NULL);
+ }
+
+ peek_dsq__destroy(skel);
+}
+
+struct scx_test peek_dsq = {
+ .name = "peek_dsq",
+ .description =
+ "Test DSQ create/destroy operations and future peek functionality",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&peek_dsq)
diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c
index 252c6308c569..10badae13ebe 100644
--- a/tools/testing/selftests/timers/nanosleep.c
+++ b/tools/testing/selftests/timers/nanosleep.c
@@ -116,6 +116,56 @@ int nanosleep_test(int clockid, long long ns)
return 0;
}
+static void dummy_event_handler(int val)
+{
+ /* No action needed */
+}
+
+static int nanosleep_test_remaining(int clockid)
+{
+ struct timespec rqtp = {}, rmtp = {};
+ struct itimerspec itimer = {};
+ struct sigaction sa = {};
+ timer_t timer;
+ int ret;
+
+ sa.sa_handler = dummy_event_handler;
+ ret = sigaction(SIGALRM, &sa, NULL);
+ if (ret)
+ return -1;
+
+ ret = timer_create(clockid, NULL, &timer);
+ if (ret)
+ return -1;
+
+ itimer.it_value.tv_nsec = NSEC_PER_SEC / 4;
+ ret = timer_settime(timer, 0, &itimer, NULL);
+ if (ret)
+ return -1;
+
+ rqtp.tv_nsec = NSEC_PER_SEC / 2;
+ ret = clock_nanosleep(clockid, 0, &rqtp, &rmtp);
+ if (ret != EINTR)
+ return -1;
+
+ ret = timer_delete(timer);
+ if (ret)
+ return -1;
+
+ sa.sa_handler = SIG_DFL;
+ ret = sigaction(SIGALRM, &sa, NULL);
+ if (ret)
+ return -1;
+
+ if (!in_order((struct timespec) {}, rmtp))
+ return -1;
+
+ if (!in_order(rmtp, rqtp))
+ return -1;
+
+ return 0;
+}
+
int main(int argc, char **argv)
{
long long length;
@@ -150,6 +200,11 @@ int main(int argc, char **argv)
}
length *= 100;
}
+ ret = nanosleep_test_remaining(clockid);
+ if (ret < 0) {
+ ksft_test_result_fail("%-31s\n", clockstring(clockid));
+ ksft_exit_fail();
+ }
ksft_test_result_pass("%-31s\n", clockstring(clockid));
next:
ret = 0;
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index f0eceb0faf34..a563c438ac79 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -18,6 +18,7 @@
#include <time.h>
#include <include/vdso/time64.h>
#include <pthread.h>
+#include <stdbool.h>
#include "../kselftest.h"
@@ -670,8 +671,14 @@ static void check_timer_create_exact(void)
int main(int argc, char **argv)
{
+ bool run_sig_ign_tests = ksft_min_kernel_version(6, 13);
+
ksft_print_header();
- ksft_set_plan(19);
+ if (run_sig_ign_tests) {
+ ksft_set_plan(19);
+ } else {
+ ksft_set_plan(10);
+ }
ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n");
ksft_print_msg("based timers if other threads run on the CPU...\n");
@@ -695,15 +702,20 @@ int main(int argc, char **argv)
check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_timer_distribution();
- check_sig_ign(0);
- check_sig_ign(1);
- check_rearm();
- check_delete();
- check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
- check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
- check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
- check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
- check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");
+ if (run_sig_ign_tests) {
+ check_sig_ign(0);
+ check_sig_ign(1);
+ check_rearm();
+ check_delete();
+ check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
+ check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
+ check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
+ check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
+ check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");
+ } else {
+ ksft_print_msg("Skipping SIG_IGN tests on kernel < 6.13\n");
+ }
+
check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 6b8123c12a7a..f8fa102a627f 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -836,56 +836,70 @@ static int ublk_process_io(struct ublk_thread *t)
return reapped;
}
-static void ublk_thread_set_sched_affinity(const struct ublk_thread *t,
- cpu_set_t *cpuset)
-{
- if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
- ublk_err("ublk dev %u thread %u set affinity failed",
- t->dev->dev_info.dev_id, t->idx);
-}
-
struct ublk_thread_info {
struct ublk_dev *dev;
+ pthread_t thread;
unsigned idx;
sem_t *ready;
cpu_set_t *affinity;
unsigned long long extra_flags;
};
-static void *ublk_io_handler_fn(void *data)
+static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
{
- struct ublk_thread_info *info = data;
- struct ublk_thread *t = &info->dev->threads[info->idx];
+	/* pthread_setaffinity_np() returns 0 or a positive error number, never a negative value */
+	if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity))
+		ublk_err("ublk dev %u thread %u set affinity failed", info->dev->dev_info.dev_id, info->idx);
+}
+
+static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
+{
+ struct ublk_thread t = {
+ .dev = info->dev,
+ .idx = info->idx,
+ };
int dev_id = info->dev->dev_info.dev_id;
int ret;
- t->dev = info->dev;
- t->idx = info->idx;
-
- ret = ublk_thread_init(t, info->extra_flags);
+ ret = ublk_thread_init(&t, info->extra_flags);
if (ret) {
ublk_err("ublk dev %d thread %u init failed\n",
- dev_id, t->idx);
- return NULL;
+ dev_id, t.idx);
+ return ret;
}
- /* IO perf is sensitive with queue pthread affinity on NUMA machine*/
- if (info->affinity)
- ublk_thread_set_sched_affinity(t, info->affinity);
sem_post(info->ready);
ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
- gettid(), dev_id, t->idx);
+ gettid(), dev_id, t.idx);
/* submit all io commands to ublk driver */
- ublk_submit_fetch_commands(t);
+ ublk_submit_fetch_commands(&t);
do {
- if (ublk_process_io(t) < 0)
+ if (ublk_process_io(&t) < 0)
break;
} while (1);
ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
- gettid(), dev_id, t->idx);
- ublk_thread_deinit(t);
+ gettid(), dev_id, t.idx);
+ ublk_thread_deinit(&t);
+ return 0;
+}
+
+static void *ublk_io_handler_fn(void *data)
+{
+ struct ublk_thread_info *info = data;
+
+ /*
+ * IO perf is sensitive with queue pthread affinity on NUMA machine
+ *
+ * Set sched_affinity at beginning, so following allocated memory/pages
+ * could be CPU/NUMA aware.
+ */
+ if (info->affinity)
+ ublk_thread_set_sched_affinity(info);
+
+ __ublk_io_handler_fn(info);
+
return NULL;
}
@@ -983,14 +997,13 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
*/
if (dev->nthreads == dinfo->nr_hw_queues)
tinfo[i].affinity = &affinity_buf[i];
- pthread_create(&dev->threads[i].thread, NULL,
+ pthread_create(&tinfo[i].thread, NULL,
ublk_io_handler_fn,
&tinfo[i]);
}
for (i = 0; i < dev->nthreads; i++)
sem_wait(&ready);
- free(tinfo);
free(affinity_buf);
/* everything is fine now, start us */
@@ -1013,7 +1026,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
/* wait until we are terminated */
for (i = 0; i < dev->nthreads; i++)
- pthread_join(dev->threads[i].thread, &thread_ret);
+ pthread_join(tinfo[i].thread, &thread_ret);
+ free(tinfo);
fail:
for (i = 0; i < dinfo->nr_hw_queues; i++)
ublk_queue_deinit(&dev->q[i]);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 5e55484fb0aa..fe42705c6d42 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -175,23 +175,20 @@ struct ublk_queue {
struct ublk_thread {
struct ublk_dev *dev;
- struct io_uring ring;
- unsigned int cmd_inflight;
- unsigned int io_inflight;
-
- pthread_t thread;
unsigned idx;
#define UBLKS_T_STOPPING (1U << 0)
#define UBLKS_T_IDLE (1U << 1)
unsigned state;
+ unsigned int cmd_inflight;
+ unsigned int io_inflight;
+ struct io_uring ring;
};
struct ublk_dev {
struct ublk_tgt tgt;
struct ublksrv_ctrl_dev_info dev_info;
struct ublk_queue q[UBLK_MAX_QUEUES];
- struct ublk_thread threads[UBLK_MAX_THREADS];
unsigned nthreads;
unsigned per_io_tasks;
diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h
index 5fdd0f362337..50c261005111 100644
--- a/tools/testing/selftests/vDSO/vdso_config.h
+++ b/tools/testing/selftests/vDSO/vdso_config.h
@@ -25,10 +25,6 @@
#define VDSO_VERSION 1
#define VDSO_NAMES 0
#define VDSO_32BIT 1
-#elif defined (__s390__) && !defined(__s390x__)
-#define VDSO_VERSION 2
-#define VDSO_NAMES 0
-#define VDSO_32BIT 1
#elif defined (__s390x__)
#define VDSO_VERSION 2
#define VDSO_NAMES 0
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
index 05e1e6774fba..918eaec8bfbe 100644
--- a/tools/testing/selftests/x86/test_vsyscall.c
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -308,12 +308,13 @@ static void test_getcpu(int cpu)
#ifdef __x86_64__
static jmp_buf jmpbuf;
-static volatile unsigned long segv_err;
+static volatile unsigned long segv_err, segv_trapno;
static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{
ucontext_t *ctx = (ucontext_t *)ctx_void;
+ segv_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
segv_err = ctx->uc_mcontext.gregs[REG_ERR];
siglongjmp(jmpbuf, 1);
}
@@ -336,7 +337,8 @@ static void test_vsys_r(void)
else if (can_read)
ksft_test_result_pass("We have read access\n");
else
- ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err);
+ ksft_test_result_pass("We do not have read access (trap=%ld, error=0x%lx)\n",
+ segv_trapno, segv_err);
}
static void test_vsys_x(void)
@@ -347,7 +349,7 @@ static void test_vsys_x(void)
return;
}
- ksft_print_msg("Make sure that vsyscalls really page fault\n");
+ ksft_print_msg("Make sure that vsyscalls really cause a fault\n");
bool can_exec;
if (sigsetjmp(jmpbuf, 1) == 0) {
@@ -358,13 +360,14 @@ static void test_vsys_x(void)
}
if (can_exec)
- ksft_test_result_fail("Executing the vsyscall did not page fault\n");
- else if (segv_err & (1 << 4)) /* INSTR */
- ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n",
- segv_err);
+ ksft_test_result_fail("Executing the vsyscall did not fault\n");
+ /* #GP or #PF (with X86_PF_INSTR) */
+ else if ((segv_trapno == 13) || ((segv_trapno == 14) && (segv_err & (1 << 4))))
+ ksft_test_result_pass("Executing the vsyscall page failed (trap=%ld, error=0x%lx)\n",
+ segv_trapno, segv_err);
else
- ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n",
- segv_err);
+ ksft_test_result_fail("Execution failed with the wrong error (trap=%ld, error=0x%lx)\n",
+ segv_trapno, segv_err);
}
/*