diff options
Diffstat (limited to 'tools/testing/selftests/namespaces')
| -rw-r--r-- | tools/testing/selftests/namespaces/.gitignore | 9 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/Makefile | 24 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/cred_change_test.c | 814 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/listns_efault_test.c | 530 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/listns_pagination_bug.c | 138 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/listns_permissions_test.c | 759 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/listns_test.c | 679 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/ns_active_ref_test.c | 2672 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/nsid_test.c | 107 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/regression_pidfd_setns_test.c | 113 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/siocgskns_test.c | 1824 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/stress_test.c | 626 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/wrappers.h | 35 |
13 files changed, 8273 insertions, 57 deletions
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index ccfb40837a73..0989e80da457 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1,3 +1,12 @@ nsid_test file_handle_test init_ino_test +ns_active_ref_test +listns_test +listns_permissions_test +listns_efault_test +siocgskns_test +cred_change_test +stress_test +listns_pagination_bug +regression_pidfd_setns_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 5fe4b3dc07d3..fbb821652c17 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS += -lcap -TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test +TEST_GEN_PROGS := nsid_test \ + file_handle_test \ + init_ino_test \ + ns_active_ref_test \ + listns_test \ + listns_permissions_test \ + listns_efault_test \ + siocgskns_test \ + cred_change_test \ + stress_test \ + listns_pagination_bug \ + regression_pidfd_setns_test include ../lib.mk +$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c +$(OUTPUT)/listns_test: ../filesystems/utils.c +$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c +$(OUTPUT)/listns_efault_test: ../filesystems/utils.c +$(OUTPUT)/siocgskns_test: ../filesystems/utils.c +$(OUTPUT)/cred_change_test: ../filesystems/utils.c +$(OUTPUT)/stress_test: ../filesystems/utils.c +$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c +$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c + diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c new file mode 100644 index 000000000000..7b4f5ad3f725 --- /dev/null +++ b/tools/testing/selftests/namespaces/cred_change_test.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: 
GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test credential changes and their impact on namespace active references. + */ + +/* + * Test setuid() in a user namespace properly swaps active references. + * Create a user namespace with multiple UIDs mapped, then setuid() between them. + * Verify that the user namespace remains active throughout. + */ +TEST(setuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setuid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple UIDs mapped (0-9) */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform multiple setuid() 
calls. + * Each setuid() triggers commit_creds() which should properly + * swap active references via switch_cred_namespaces(). + */ + for (setuid_count = 0; setuid_count < 50; setuid_count++) { + uid_t target_uid = (setuid_count % 10); + if (setuid(target_uid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id); + + /* Verify namespace is active while child is running */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + ASSERT_TRUE(found); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive after child exits */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setuid() correctly preserved active references (no leak)"); +} + +/* + * Test setgid() in a user namespace properly handles active references. 
+ */ +TEST(setgid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setgid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple GIDs mapped */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setgid() calls */ + for (setgid_count = 0; setgid_count < 50; setgid_count++) { + gid_t target_gid = (setgid_count % 10); + if (setgid(target_gid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 
0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setgid() correctly preserved active references (no leak)"); +} + +/* + * Test setresuid() which changes real, effective, and saved UIDs. + * This should properly swap active references via commit_creds(). + */ +TEST(setresuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setres_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setresuid() calls */ + for (setres_count = 0; setres_count < 30; setres_count++) { + uid_t uid1 = (setres_count % 5); + uid_t uid2 = ((setres_count + 1) % 5); + uid_t uid3 = ((setres_count + 2) % 5); + + if (setresuid(uid1, uid2, uid3) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, 
SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setresuid() correctly preserved active references (no leak)"); +} + +/* + * Test credential changes across multiple user namespaces. + * Create nested user namespaces and verify active reference tracking. + */ +TEST(cred_change_nested_userns) +{ + pid_t pid; + int status; + __u64 parent_userns_id, child_userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found_parent = false, found_child = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 parent_id, child_id; + uid_t orig_uid = getuid(); + + close(pipefd[0]); + + /* Create first user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get first namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create nested user namespace */ + userns_fd = get_userns_fd(0, 0, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if 
(setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get nested namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + + /* Perform some credential changes in nested namespace */ + setuid(0); + setgid(0); + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read both namespace IDs */ + if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get parent namespace ID"); + } + + if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get child namespace ID"); + } + close(pipefd[0]); + + TH_LOG("Parent userns: %llu, Child userns: %llu", + (unsigned long long)parent_userns_id, + (unsigned long long)child_userns_id); + + /* Verify both namespaces are active */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_TRUE(found_parent); + ASSERT_TRUE(found_child); + + /* Wait for child */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify both namespaces become inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + 
ASSERT_GE(ret, 0); + + found_parent = false; + found_child = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_FALSE(found_parent); + ASSERT_FALSE(found_child); + TH_LOG("Nested user namespace credential changes preserved active refs (no leak)"); +} + +/* + * Test rapid credential changes don't cause refcount imbalances. + * This stress-tests the switch_cred_namespaces() logic. + */ +TEST(rapid_cred_changes_no_leak) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace with wider range of UIDs/GIDs */ + userns_fd = get_userns_fd(0, orig_uid, 100); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform many rapid credential changes. + * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid. 
+ */ + for (change_count = 0; change_count < 200; change_count++) { + switch (change_count % 6) { + case 0: + setuid(change_count % 50); + break; + case 1: + setgid(change_count % 50); + break; + case 2: + setreuid(change_count % 50, (change_count + 1) % 50); + break; + case 3: + setregid(change_count % 50, (change_count + 1) % 50); + break; + case 4: + setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + case 5: + setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive (no leaked active refs) */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("200 rapid credential changes completed with no active ref leak"); +} + +/* + * Test setfsuid/setfsgid which change filesystem UID/GID. + * These also trigger credential changes but may have different code paths. 
+ */ +TEST(setfsuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setfsuid/setfsgid calls */ + for (change_count = 0; change_count < 50; change_count++) { + setfsuid(change_count % 10); + setfsgid(change_count % 10); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + 
break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)"); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c new file mode 100644 index 000000000000..c7ed4023d7a8 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "../pidfd/pidfd.h" +#include "wrappers.h" + +/* + * Test listns() error handling with invalid buffer addresses. + * + * When the buffer pointer is invalid (e.g., crossing page boundaries + * into unmapped memory), listns() fails with EFAULT. + * + * This test also creates mount namespaces that get destroyed during + * iteration, testing that namespace cleanup happens outside the RCU + * read lock. 
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pid, iter_pid;
+	pid_t pids[5];
+	int pidfds[5];
+	int sv[5][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/*
+	 * Map two pages:
+	 * - First page: readable and writable
+	 * - Second page: will be unmapped to trigger EFAULT
+	 */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position the buffer pointer so there's room for exactly one u64
+	 * before the page boundary. The second u64 would fall into the
+	 * unmapped page.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+	/*
+	 * Create a separate process to run listns() in a loop concurrently
+	 * with namespace creation and destruction.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0, /* All types */
+			.spare2 = 0,
+			.user_ns_id = 0, /* Global listing */
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * The kernel should:
+		 * 1. Successfully write the first namespace ID (within valid page)
+		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+		 * 3. Handle concurrent namespace destruction without deadlock
+		 *
+		 * The only voluntary exit is PIDFD_SKIP when listns() is
+		 * not implemented; the parent turns that into a test skip.
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create several child processes, each in its own mount namespace.
+	 * These will be destroyed while the iterator is running listns().
+	 */
+	for (i = 0; i < 5; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pid = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pid, -1);
+
+		if (pid == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create a couple of tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		/* Remember the pid so exactly this child can be reaped later */
+		pids[i] = pid;
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 5; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/*
+	 * Signal children to exit. This will destroy their mount namespaces
+	 * while listns() is iterating the namespace tree.
+	 * This tests that cleanup happens outside the RCU read lock.
+	 *
+	 * A child that never receives the byte stays blocked in its read,
+	 * so a failed write must fail the test instead of hanging in the
+	 * waitpid() below.
+	 */
+	for (i = 0; i < 5; i++)
+		ASSERT_EQ(write_nointr(sv[i][0], "X", 1), 1);
+
+	/*
+	 * Wait for all mount namespace children to exit and cleanup.
+	 * Reap them by pid: waitpid(-1) could pick up the iterator if it
+	 * already exited with PIDFD_SKIP and leave a mount child unreaped.
+	 */
+	for (i = 0; i < 5; i++) {
+		ASSERT_EQ(waitpid(pids[i], NULL, 0), pids[i]);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	/* Clean up: only the first page is still mapped */
+	munmap(map, page_size);
+
+	/* The iterator exits on its own only when listns() is unavailable */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Test listns() error handling when the entire buffer is invalid.
+ * This is a sanity check that basic invalid pointer detection works. + */ +TEST(listns_complete_fault) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 *ns_ids; + ssize_t ret; + + /* Use a clearly invalid pointer */ + ns_ids = (__u64 *)0xdeadbeef; + + ret = sys_listns(&req, ns_ids, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() error handling when the buffer is NULL. + */ +TEST(listns_null_buffer) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + ssize_t ret; + + /* NULL buffer with non-zero count should fail */ + ret = sys_listns(&req, NULL, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() with a buffer that becomes invalid mid-iteration + * (after several successful writes), combined with mount namespace + * destruction to test RCU cleanup logic. + */ +TEST(listns_late_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[10]; + int sv[10][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Map two pages */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position buffer so we can write several u64s successfully + * before hitting the page boundary. 
+ */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Request 10 namespace IDs while namespaces are being destroyed. + * This tests: + * 1. EFAULT handling when buffer becomes invalid + * 2. Namespace cleanup outside RCU read lock during iteration + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create more children with mount namespaces to increase the + * likelihood that namespace cleanup happens during iteration. + */ + for (i = 0; i < 10; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 10; i++) 
{ + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill half the children */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Small delay to let some exit */ + usleep(10000); + + /* Kill remaining children */ + for (i = 5; i < 10; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all children and cleanup */ + for (i = 0; i < 10; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test specifically focused on mount namespace cleanup during EFAULT. + * Filter for mount namespaces only. + */ +TEST(listns_mnt_ns_cleanup_on_fault) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[8]; + int sv[8][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Set up partial fault buffer */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* Position for 3 successful writes, then fault */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. 
+ * Call listns() to race with namespace destruction. + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* Create children with mount namespaces */ + for (i = 0; i < 8; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Do some mount operations to make cleanup more interesting */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 8; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill children to trigger namespace destruction during iteration */ + for (i = 0; i < 8; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for children and cleanup */ + for (i = 0; i < 8; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + 
ASSERT_EQ(WTERMSIG(status), SIGKILL); + + munmap(map, page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c new file mode 100644 index 000000000000..da7d33f96397 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Minimal test case to reproduce KASAN out-of-bounds in listns pagination. + * + * The bug occurs when: + * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER) + * 2. Using pagination (req.ns_id != 0) + * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of + * the filtered type, causing it to search the unified tree and potentially + * return a namespace of the wrong type. 
+ */
+TEST(pagination_with_type_filter)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER, /* Filter by user namespace */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	pid_t pids[10];
+	int num_children = 10;
+	int i;
+	int sv[2];
+	__u64 first_batch[3];
+	ssize_t ret;
+
+	/* One socketpair is shared by all children; each exchanges one byte. */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create children with user namespaces */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			/* Ready byte; must be initialized before it is sent. */
+			char c = 'R';
+
+			close(sv[0]);
+
+			if (setup_userns() < 0) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Signal parent we're ready */
+			if (write(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready */
+	for (i = 0; i < num_children; i++) {
+		char c = 0;
+
+		if (read(sv[0], &c, 1) != 1 || c != 'R') {
+			close(sv[0]);
+			for (int j = 0; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* First batch - this should work */
+	ret = sys_listns(&req, first_batch, 3, 0);
+	if (ret < 0) {
+		if (errno == ENOSYS) {
+			close(sv[0]);
+			for (i = 0; i < num_children; i++)
+				kill(pids[i], SIGKILL);
+			for (i = 0; i < num_children; i++)
+				waitpid(pids[i], NULL, 0);
+			SKIP(return, "listns() not supported");
+		}
+		ASSERT_GE(ret, 0);
+	}
+
+	TH_LOG("First batch returned %zd entries", ret);
+
+	/* Only a full first batch leaves something to paginate over. */
+	if (ret == 3) {
+		__u64 second_batch[3];
+
+		/* Second batch - pagination triggers the bug */
+		req.ns_id = first_batch[2]; /* Continue from last ID */
+		ret = sys_listns(&req, second_batch, 3, 0);
+
+		TH_LOG("Second batch returned %zd entries", ret);
+		ASSERT_GE(ret, 0);
+	}
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+
+		if (write(sv[0], &c, 1) != 1) {
+			close(sv[0]);
+			/* Children before i were already told to exit. */
+			for (int j = i; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Cleanup */
+	for (i = 0; i < num_children; i++) {
+		int status;
+		waitpid(pids[i], &status, 0);
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c
new file mode 100644
index 000000000000..82d818751a5f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_permissions_test.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test that unprivileged users can only see namespaces they're currently in.
+ * Create a namespace, drop privileges, verify we can only see our own namespaces.
+ */ +TEST(listns_unprivileged_current_only) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + int unexpected_count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_netns_id; + bool found_ours; + int unexpected_count; + + close(pipefd[0]); + + /* Create user namespace to be unprivileged */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create a network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our network namespace ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Now we're unprivileged - list all network namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* We should only see our own network namespace */ + found_ours = false; + unexpected_count = 0; + + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_netns_id) { + found_ours = true; + } else { + /* This is either init_net (which we can see) or unexpected */ + unexpected_count++; + } + } + + /* Send results to parent */ + write(pipefd[1], &found_ours, sizeof(found_ours)); + write(pipefd[1], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + unexpected_count = 0; + read(pipefd[0], &found_ours, sizeof(found_ours)); + read(pipefd[0], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Child should have seen its own 
namespace */ + ASSERT_TRUE(found_ours); + + TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)", + unexpected_count); +} + +/* + * Test that users with CAP_SYS_ADMIN in a user namespace can see + * all namespaces owned by that user namespace. + */ +TEST(listns_cap_sys_admin_in_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Will be set to our created user namespace */ + }; + __u64 ns_ids[100]; + int pipefd[2]; + pid_t pid; + int status; + bool success; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 userns_id; + ssize_t ret; + int min_expected; + bool success; + + close(pipefd[0]); + + /* Create user namespace - we'll have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get the user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create several namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + unshare(CLONE_NEWIPC); + + /* List namespaces owned by our user namespace */ + req.user_ns_id = userns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* + * We have CAP_SYS_ADMIN in this user namespace, + * so we should see all namespaces owned by it. + * That includes: net, uts, ipc, and the user namespace itself. 
+		 */
+		min_expected = 4;
+		success = (ret >= min_expected);
+
+		write(pipefd[1], &success, sizeof(success));
+		write(pipefd[1], &ret, sizeof(ret));
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	success = false;
+	count = 0;
+	read(pipefd[0], &success, sizeof(success));
+	read(pipefd[0], &count, sizeof(count));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(success);
+	TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace",
+	       count);
+}
+
+/*
+ * Test that users cannot see namespaces from unrelated user namespaces.
+ * Create two sibling user namespaces, verify they can't see each other's
+ * owned namespaces.
+ *
+ * Child A reports its network namespace ID and then blocks on a dedicated
+ * release pipe until the parent closes the write end, so namespace A stays
+ * alive for the whole time child B is listing namespaces.
+ */
+TEST(listns_cannot_see_sibling_userns_namespaces)
+{
+	int pipefd[2];		/* child A -> parent: netns A ID */
+	int release_pipe[2];	/* parent -> child A: EOF means "exit now" */
+	int pipefd2[2];		/* child B -> parent: lookup result */
+	pid_t pid1, pid2;
+	pid_t reaped1, reaped2;
+	int status1 = 0, status2 = 0;
+	__u64 netns_a_id;
+	bool found_sibling_netns;
+	ssize_t nread;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(release_pipe), 0);
+
+	/* Fork first child - creates user namespace A */
+	pid1 = fork();
+	ASSERT_GE(pid1, 0);
+
+	if (pid1 == 0) {
+		int fd;
+		__u64 ns_id;
+		char buf;
+
+		close(pipefd[0]);
+		close(release_pipe[1]);
+
+		/* Create user namespace A */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create network namespace owned by user namespace A */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get network namespace ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Send namespace ID to parent */
+		if (write(pipefd[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/*
+		 * Keep alive for sibling to check: read() on the read end
+		 * returns 0 (EOF) only once every write end is closed.
+		 */
+		if (read(release_pipe[0], &buf, 1) < 0)
+			exit(1);
+		close(release_pipe[0]);
+		exit(0);
+	}
+
+	/* Parent reads namespace A ID */
+	close(pipefd[1]);
+	close(release_pipe[0]);
+	netns_a_id = 0;
+	nread = read(pipefd[0], &netns_a_id, sizeof(netns_a_id));
+	close(pipefd[0]);
+	if (nread != (ssize_t)sizeof(netns_a_id)) {
+		/* Child A failed before reporting its ID: reap it, then fail. */
+		close(release_pipe[1]);
+		waitpid(pid1, NULL, 0);
+		ASSERT_EQ(nread, (ssize_t)sizeof(netns_a_id));
+	}
+
+	TH_LOG("User namespace A created network namespace with ID %llu",
+	       (unsigned long long)netns_a_id);
+
+	/* Fork second child - creates user namespace B */
+	ASSERT_EQ(pipe(pipefd2), 0);
+
+	pid2 = fork();
+	ASSERT_GE(pid2, 0);
+
+	if (pid2 == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNET,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		__u64 ns_ids[100];
+		ssize_t ret;
+		bool found = false;
+
+		/* Do not hold child A's release pipe open from here. */
+		close(release_pipe[1]);
+		close(pipefd2[0]);
+
+		/* Create user namespace B (sibling to A) */
+		if (setup_userns() < 0) {
+			close(pipefd2[1]);
+			exit(1);
+		}
+
+		/* Try to list all network namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0) {
+			int code = (errno == ENOSYS) ? KSFT_SKIP : 1;
+
+			close(pipefd2[1]);
+			exit(code);
+		}
+
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == netns_a_id) {
+				found = true;
+				break;
+			}
+		}
+
+		/* We should NOT see the sibling's network namespace */
+		if (write(pipefd2[1], &found, sizeof(found)) != sizeof(found)) {
+			close(pipefd2[1]);
+			exit(1);
+		}
+		close(pipefd2[1]);
+		exit(0);
+	}
+
+	/* Parent reads result from second child */
+	close(pipefd2[1]);
+	found_sibling_netns = false;
+	nread = read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns));
+	close(pipefd2[0]);
+
+	/* Signal first child to exit */
+	close(release_pipe[1]);
+
+	/* Wait for both children before asserting, so neither is left behind */
+	reaped2 = waitpid(pid2, &status2, 0);
+	reaped1 = waitpid(pid1, &status1, 0);
+	ASSERT_EQ(reaped2, pid2);
+	ASSERT_EQ(reaped1, pid1);
+
+	ASSERT_TRUE(WIFEXITED(status2));
+	if (WEXITSTATUS(status2) == KSFT_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_EQ(WEXITSTATUS(status2), 0);
+	ASSERT_EQ(nread, (ssize_t)sizeof(found_sibling_netns));
+
+	ASSERT_TRUE(WIFEXITED(status1));
+	ASSERT_EQ(WEXITSTATUS(status1), 0);
+
+	/* Second child should NOT have seen first child's namespace */
+	ASSERT_FALSE(found_sibling_netns);
+	TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace");
+}
+
+/*
+ * Test permission checking with LISTNS_CURRENT_USER.
+ * Verify that listing with LISTNS_CURRENT_USER respects permissions.
+ */
+TEST(listns_current_user_permissions)
+{
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool success;
+	ssize_t count;
+	ssize_t got_success, got_count;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = LISTNS_CURRENT_USER,
+		};
+		__u64 ns_ids[100];
+		ssize_t ret;
+		bool child_ok;
+
+		close(pipefd[0]);
+
+		/* Create user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create some namespaces owned by this user namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (unshare(CLONE_NEWUTS) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List with LISTNS_CURRENT_USER - should see our owned namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0 && errno == ENOSYS) {
+			/* Tell the parent to skip rather than fail. */
+			close(pipefd[1]);
+			exit(KSFT_SKIP);
+		}
+
+		child_ok = (ret >= 3); /* At least user, net, uts */
+		if (write(pipefd[1], &child_ok, sizeof(child_ok)) != sizeof(child_ok) ||
+		    write(pipefd[1], &ret, sizeof(ret)) != sizeof(ret)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	success = false;
+	count = 0;
+	got_success = read(pipefd[0], &success, sizeof(success));
+	got_count = read(pipefd[0], &count, sizeof(count));
+	close(pipefd[0]);
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	if (WEXITSTATUS(status) == KSFT_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* A clean child exit must come with both values fully transferred. */
+	ASSERT_EQ(got_success, (ssize_t)sizeof(success));
+	ASSERT_EQ(got_count, (ssize_t)sizeof(count));
+
+	ASSERT_TRUE(success);
+	TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count);
+}
+
+/*
+ * Test that CAP_SYS_ADMIN in parent user namespace allows seeing
+ * child user namespace's owned namespaces.
+ */ +TEST(listns_parent_userns_cap_sys_admin) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_child_userns; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 parent_userns_id; + __u64 child_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_child_userns; + + close(pipefd[0]); + + /* Create parent user namespace - we have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get parent user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get child user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create namespaces owned by child user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List namespaces owned by parent user namespace */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = 0; + req.spare2 = 0; + req.user_ns_id = parent_userns_id; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* Should see child user namespace in the list */ + found_child_userns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == child_userns_id) { + found_child_userns = true; + break; + } + } + } + + write(pipefd[1], &found_child_userns, sizeof(found_child_userns)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_child_userns = false; + count = 0; + read(pipefd[0], 
&found_child_userns, sizeof(found_child_userns)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(found_child_userns); + TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)", + count); +} + +/* + * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of. + * This is different from seeing namespaces owned by a user namespace. + */ +TEST(listns_cap_sys_admin_inside_userns) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_ours; + + close(pipefd[0]); + + /* Create user namespace - we have CAP_SYS_ADMIN inside it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* List all user namespaces globally */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = CLONE_NEWUSER; + req.spare2 = 0; + req.user_ns_id = 0; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* We should be able to see our own user namespace */ + found_ours = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_userns_id) { + found_ours = true; + break; + } + } + } + + write(pipefd[1], &found_ours, sizeof(found_ours)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + read(pipefd[0], &found_ours, sizeof(found_ours)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); 
+ + ASSERT_TRUE(found_ours); + TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of"); +} + +/* + * Test that dropping CAP_SYS_ADMIN restricts what we can see. + */ +TEST(listns_drop_cap_sys_admin) +{ + cap_t caps; + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; + + /* This test needs to start with CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (!caps) { + SKIP(return, "Cannot get capabilities"); + } + + cap_flag_value_t cap_val; + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) { + cap_free(caps); + SKIP(return, "Cannot check CAP_SYS_ADMIN"); + } + + if (cap_val != CAP_SET) { + cap_free(caps); + SKIP(return, "Test needs CAP_SYS_ADMIN to start"); + } + cap_free(caps); + + int pipefd[2]; + pid_t pid; + int status; + bool correct; + ssize_t count_before, count_after; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids_before[100]; + ssize_t count_before; + __u64 ns_ids_after[100]; + ssize_t count_after; + bool correct; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Count namespaces with CAP_SYS_ADMIN */ + count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + + /* Drop CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (caps) { + cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR); + cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR); + cap_set_proc(caps); + cap_free(caps); + } + + /* Ensure we can't regain the capability */ + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* Count namespaces without CAP_SYS_ADMIN */ + count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + + /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */ + correct = (count_after <= count_before); + + 
write(pipefd[1], &correct, sizeof(correct)); + write(pipefd[1], &count_before, sizeof(count_before)); + write(pipefd[1], &count_after, sizeof(count_after)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + correct = false; + count_before = 0; + count_after = 0; + read(pipefd[0], &correct, sizeof(correct)); + read(pipefd[0], &count_before, sizeof(count_before)); + read(pipefd[0], &count_after, sizeof(count_after)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(correct); + TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces", + count_before, count_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c new file mode 100644 index 000000000000..8a95789d6a87 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_test.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test basic listns() functionality with the unified namespace tree. + * List all active namespaces globally. 
+ */ +TEST(listns_basic_unified) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + + /* Should find at least the initial namespaces */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active namespaces", ret); + + /* Verify all returned IDs are non-zero */ + for (ssize_t i = 0; i < ret; i++) { + ASSERT_NE(ns_ids[i], 0); + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } +} + +/* + * Test listns() with type filtering. + * List only network namespaces. + */ +TEST(listns_filter_by_type) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, /* Only network namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least init_net */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active network namespaces", ret); + + /* Verify we can open each namespace and it's actually a network namespace */ + for (ssize_t i = 0; i < ret && i < 5; i++) { + struct nsfs_file_handle nsfh = { + .ns_id = ns_ids[i], + .ns_type = CLONE_NEWNET, + .ns_inum = 0, + }; + struct file_handle *fh; + int fd; + + fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh)); + ASSERT_NE(fh, NULL); + fh->handle_bytes = sizeof(nsfh); + fh->handle_type = 0; + memcpy(fh->f_handle, &nsfh, sizeof(nsfh)); + + fd = open_by_handle_at(-10003, fh, O_RDONLY); + 
free(fh); + + if (fd >= 0) { + int ns_type; + /* Verify it's a network namespace via ioctl */ + ns_type = ioctl(fd, NS_GET_NSTYPE); + if (ns_type >= 0) { + ASSERT_EQ(ns_type, CLONE_NEWNET); + } + close(fd); + } + } +} + +/* + * Test listns() pagination. + * List namespaces in batches. + */ +TEST(listns_pagination) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 batch1[2], batch2[2]; + ssize_t ret1, ret2; + + /* Get first batch */ + ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0); + if (ret1 < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret1, 0); + + if (ret1 == 0) + SKIP(return, "No namespaces found"); + + TH_LOG("First batch: %zd namespaces", ret1); + + /* Get second batch using last ID from first batch */ + if (ret1 == ARRAY_SIZE(batch1)) { + req.ns_id = batch1[ret1 - 1]; + ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0); + ASSERT_GE(ret2, 0); + + TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)", + ret2, (unsigned long long)req.ns_id); + + /* If we got more results, verify IDs are monotonically increasing */ + if (ret2 > 0) { + ASSERT_GT(batch2[0], batch1[ret1 - 1]); + TH_LOG("Pagination working: %llu > %llu", + (unsigned long long)batch2[0], + (unsigned long long)batch1[ret1 - 1]); + } + } else { + TH_LOG("All namespaces fit in first batch"); + } +} + +/* + * Test listns() with LISTNS_CURRENT_USER. + * List namespaces owned by current user namespace. 
+ */ +TEST(listns_current_user) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least the initial namespaces if we're in init_user_ns */ + TH_LOG("Found %zd namespaces owned by current user namespace", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that listns() only returns active namespaces. + * Create a namespace, let it become inactive, verify it's not listed. + */ +TEST(listns_only_active) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[100], ns_ids_after[100]; + ssize_t ret_before, ret_after; + int pipefd[2]; + pid_t pid; + __u64 new_ns_id = 0; + int status; + + /* Get initial list */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret_before, 0); + + TH_LOG("Before: %zd active network namespaces", ret_before); + + /* Create a new namespace in a child process and get its ID */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + + close(pipefd[0]); + + /* Create new network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get its ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + 
+ if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + write(pipefd[1], &ns_id, sizeof(ns_id)); + close(pipefd[1]); + + /* Keep namespace active briefly */ + usleep(100000); + exit(0); + } + + /* Parent reads the new namespace ID */ + { + int bytes; + + close(pipefd[1]); + bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id)); + close(pipefd[0]); + + if (bytes == sizeof(new_ns_id)) { + __u64 ns_ids_during[100]; + int ret_during; + + TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id); + + /* List namespaces while child is still alive - should see new one */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + ASSERT_GE(ret_during, 0); + TH_LOG("During: %d active network namespaces", ret_during); + + /* Should have more namespaces than before */ + ASSERT_GE(ret_during, ret_before); + } + } + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + + /* Give time for namespace to become inactive */ + usleep(100000); + + /* List namespaces after child exits - should not see new one */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + TH_LOG("After: %zd active network namespaces", ret_after); + + /* Verify the new namespace ID is not in the after list */ + if (new_ns_id != 0) { + bool found = false; + + for (ssize_t i = 0; i < ret_after; i++) { + if (ns_ids_after[i] == new_ns_id) { + found = true; + break; + } + } + ASSERT_FALSE(found); + } +} + +/* + * Test listns() with specific user namespace ID. + * Create a user namespace and list namespaces it owns. 
+ */
+TEST(listns_specific_userns)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0, /* Will be filled with created userns ID */
+	};
+	__u64 ns_ids[100];
+	int sv[2];
+	pid_t pid;
+	int status;
+	__u64 user_ns_id = 0;
+	ssize_t bytes;
+	ssize_t ret;
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 ns_id;
+		char buf;
+
+		close(sv[0]);
+
+		/* Create new user namespace */
+		if (setup_userns() < 0) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+			close(fd);
+			close(sv[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/*
+		 * Create some namespaces owned by this user namespace before
+		 * reporting the ID, so they already exist when the parent
+		 * lists. Best effort: the parent does not assert on them.
+		 */
+		(void)unshare(CLONE_NEWNET);
+		(void)unshare(CLONE_NEWUTS);
+
+		/* Send ID to parent */
+		if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		/* Wait for parent signal */
+		if (read(sv[1], &buf, 1) != 1) {
+			close(sv[1]);
+			exit(1);
+		}
+		close(sv[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(sv[1]);
+	bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id));
+
+	if (bytes != (ssize_t)sizeof(user_ns_id)) {
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get user namespace ID from child");
+	}
+
+	TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id);
+
+	/* List namespaces owned by this user namespace */
+	req.user_ns_id = user_ns_id;
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+	if (ret < 0) {
+		/* Logging and cleanup below may overwrite errno; keep a copy. */
+		int saved_errno = errno;
+
+		TH_LOG("listns failed: %s (errno=%d)", strerror(saved_errno), saved_errno);
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		if (saved_errno == ENOSYS) {
+			SKIP(return, "listns() not supported");
+		}
+		ASSERT_GE(ret, 0);
+	}
+
+	TH_LOG("Found %zd namespaces owned by user namespace %llu", ret,
+	       (unsigned long long)user_ns_id);
+
+	/* Should find at least the network and UTS namespaces we created */
+	if (ret > 0) {
+		for (ssize_t i = 0; i < ret && i < 10; i++)
+			TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+	}
+
+	/* Signal child to exit */
+	if (write(sv[0], "X", 1) != 1) {
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		ASSERT_TRUE(false);
+	}
+	close(sv[0]);
+
+	/* The child exits 0 once it has received the byte above. */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+/*
+ * Test listns() with multiple namespace types filter.
+ */
+TEST(listns_multiple_types)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret, 0);
+
+	TH_LOG("Found %zd active network/UTS namespaces", ret);
+
+	for (ssize_t i = 0; i < ret; i++)
+		TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+}
+
+/*
+ * Test that hierarchical active reference propagation keeps parent
+ * user namespaces visible in listns().
+ */ +TEST(listns_hierarchical_visibility) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 parent_ns_id = 0, child_ns_id = 0; + int sv[2]; + pid_t pid; + int status; + int bytes; + __u64 ns_ids[100]; + ssize_t ret; + bool found_parent, found_child; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + char buf; + + close(sv[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) { + close(sv[1]); + exit(1); + } + if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + + /* Read both namespace IDs */ + bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id)); + bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id)); + + if (bytes != (int)(2 * sizeof(__u64))) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace IDs from child"); + } + + TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id); + TH_LOG("Child user namespace ID: %llu", (unsigned long 
long)child_ns_id); + + /* List all user namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0 && errno == ENOSYS) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "listns() not supported"); + } + + ASSERT_GE(ret, 0); + TH_LOG("Found %zd active user namespaces", ret); + + /* Both parent and child should be visible (active due to child process) */ + found_parent = false; + found_child = false; + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == parent_ns_id) + found_parent = true; + if (ns_ids[i] == child_ns_id) + found_child = true; + } + + TH_LOG("Parent namespace %s, child namespace %s", + found_parent ? "found" : "NOT FOUND", + found_child ? "found" : "NOT FOUND"); + + ASSERT_TRUE(found_child); + /* With hierarchical propagation, parent should also be active */ + ASSERT_TRUE(found_parent); + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test error cases for listns(). 
+ */ +TEST(listns_error_cases) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[10]; + int ret; + + /* Test with invalid flags */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + + /* Test with NULL ns_ids array */ + ret = sys_listns(&req, NULL, 10, 0); + ASSERT_LT(ret, 0); + + /* Test with invalid spare field */ + req.spare = 1; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + req.spare = 0; + + /* Test with huge nr_ns_ids */ + ret = sys_listns(&req, ns_ids, 2000000, 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c new file mode 100644 index 000000000000..093268f0efaa --- /dev/null +++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c @@ -0,0 +1,2672 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <pthread.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test that initial namespaces can be 
reopened via file handle.
 * Initial namespaces should have active ref count of 1 from boot.
 *
 * Flow: encode a handle for /proc/1/ns/net, close the fd, decode the
 * handle again through FD_NSFS_ROOT and compare inode numbers with a
 * freshly opened /proc/1/ns/net. Skips when either handle call reports
 * that the feature is unavailable.
 */
TEST(init_ns_always_active)
{
	struct file_handle *fh;
	struct stat direct_st, reopened_st;
	int direct_fd, reopened_fd;
	int mnt_id;
	int rc;

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);

	/* Network namespace of PID 1 */
	direct_fd = open("/proc/1/ns/net", O_RDONLY);
	ASSERT_GE(direct_fd, 0);

	/* Encode it as a file handle */
	fh->handle_bytes = MAX_HANDLE_SZ;
	rc = name_to_handle_at(direct_fd, "", fh, &mnt_id, AT_EMPTY_PATH);
	if (rc < 0 && errno == EOPNOTSUPP) {
		SKIP(free(fh); close(direct_fd);
		     return, "nsfs doesn't support file handles");
	}
	ASSERT_EQ(rc, 0);

	/* Drop the fd so only the handle refers to the namespace */
	close(direct_fd);

	/* Decoding the handle is expected to work for the initial namespace */
	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, fh, O_RDONLY);
	if (reopened_fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
		SKIP(free(fh);
		     return, "open_by_handle_at with FD_NSFS_ROOT not supported");
	}
	ASSERT_GE(reopened_fd, 0);

	/* Same inode number means same namespace */
	direct_fd = open("/proc/1/ns/net", O_RDONLY);
	ASSERT_GE(direct_fd, 0);
	ASSERT_EQ(fstat(direct_fd, &direct_st), 0);
	ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0);
	ASSERT_EQ(direct_st.st_ino, reopened_st.st_ino);

	close(direct_fd);
	close(reopened_fd);
	free(fh);
}

/*
 * Test namespace lifecycle: create a namespace in a child process,
 * get a file handle while it's active, then try to reopen after
 * the process exits (namespace becomes inactive).
+ */ +TEST(ns_inactive_after_exit) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + /* Create pipe for passing file handle from child */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Open our new namespace */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get file handle for the namespace */ + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Exit - namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read file handle from child */ + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Try to reopen namespace - should fail with ENOENT since it's inactive */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should fail with ENOENT (namespace inactive) or ESTALE */ + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a process is using it, + * even after the creating process exits. 
+ */ +TEST(ns_active_with_multiple_processes) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + int syncpipe[2]; + pid_t pid1, pid2; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + + /* Create pipes for communication */ + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + /* First child - creates namespace */ + close(pipefd[0]); + close(syncpipe[1]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Open and get handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Wait for signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent reads handle */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + ASSERT_GT(ret, 0); + + handle = (struct file_handle *)buf; + + /* Create second child that will keep namespace active */ + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + /* Second child - reopens the namespace */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Open the namespace via handle */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0) { + exit(1); + } + + /* Join the namespace */ + ret = setns(fd, CLONE_NEWNET); + close(fd); + if (ret < 0) { + exit(1); + } + + /* Sleep to keep namespace active */ + sleep(1); + exit(0); + } + + /* Let second child enter the namespace */ + 
usleep(100000); /* 100ms */ + + /* Signal first child to exit */ + close(syncpipe[0]); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for first child */ + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Namespace should still be active because second child is using it */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd, 0); + close(fd); + + /* Wait for second child */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); +} + +/* + * Test user namespace active ref tracking via credential lifecycle + */ +TEST(userns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Set up uid/gid mappings */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) { + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + } + + /* Get file handle */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + 
} + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Namespace should be inactive after all tasks exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test PID namespace active ref tracking + */ +TEST(pidns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Fork to actually enter the PID namespace */ + pid_t child = fork(); + if (child < 0) { + close(pipefd[1]); + exit(1); + } + + if (child == 0) { + /* Grandchild - in new PID namespace */ + fd = open("/proc/self/ns/pid", O_RDONLY); + if (fd < 0) { + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + exit(1); + } + + /* Send handle to grandparent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child, NULL, 0); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = 
(struct file_handle *)buf; + + /* Namespace should be inactive after all processes exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that an open file descriptor keeps a namespace active. + * Even after the creating process exits, the namespace should remain + * active as long as an fd is held open. + */ +TEST(ns_fd_keeps_active) +{ + struct file_handle *handle; + int mount_id; + int ret; + int nsfd; + int pipe_child_ready[2]; + int pipe_parent_ready[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + char proc_path[64]; + + ASSERT_EQ(pipe(pipe_child_ready), 0); + ASSERT_EQ(pipe(pipe_parent_ready), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipe_child_ready[0]); + close(pipe_parent_ready[1]); + + TH_LOG("Child: creating new network namespace"); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: network namespace created successfully"); + + /* Get file handle for the namespace */ + nsfd = open("/proc/self/ns/net", O_RDONLY); + if (nsfd < 0) { + TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: opened namespace fd %d", nsfd); + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH); + close(nsfd); + + if (ret < 0) { + TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes); + + /* Send file handle to parent */ + ret = 
write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes); + TH_LOG("Child: sent %d bytes of file handle to parent", ret); + close(pipe_child_ready[1]); + + /* Wait for parent to open the fd */ + TH_LOG("Child: waiting for parent to open fd"); + ret = read(pipe_parent_ready[0], &sync_byte, 1); + close(pipe_parent_ready[0]); + + TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret); + /* Exit - namespace should stay active because parent holds fd */ + exit(0); + } + + /* Parent process */ + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + + TH_LOG("Parent: reading file handle from child"); + + /* Read file handle from child */ + ret = read(pipe_child_ready[0], buf, sizeof(buf)); + close(pipe_child_ready[0]); + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes); + + /* Open the child's namespace while it's still alive */ + snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid); + TH_LOG("Parent: opening child's namespace at %s", proc_path); + nsfd = open(proc_path, O_RDONLY); + if (nsfd < 0) { + TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno)); + close(pipe_parent_ready[1]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child's namespace"); + } + + TH_LOG("Parent: opened child's namespace, got fd %d", nsfd); + + /* Signal child that we have the fd */ + sync_byte = 'G'; + write(pipe_parent_ready[1], &sync_byte, 1); + close(pipe_parent_ready[1]); + TH_LOG("Parent: signaled child that we have the fd"); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child exited, parent holds fd %d to namespace", nsfd); + + /* + * Namespace should still be ACTIVE because we hold an fd. + * We should be able to reopen it via file handle. 
+ */ + TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)"); + int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd2, 0); + + TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(fd2, &st2), 0); + TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(fd2); + + /* Now close the fd - namespace should become inactive */ + TH_LOG("Closing fd %d - namespace should become inactive", nsfd); + close(nsfd); + + /* Now reopening should fail - namespace is inactive */ + TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)"); + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd2, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test hierarchical active reference propagation. + * When a child namespace is active, its owning user namespace should also + * be active automatically due to hierarchical active reference propagation. + * This ensures parents are always reachable when children are active. 
+ */ +TEST(ns_parent_always_reachable) +{ + struct file_handle *parent_handle, *child_handle; + int ret; + int child_nsfd; + int pipefd[2]; + pid_t pid; + int status; + __u64 parent_id, child_id; + char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + TH_LOG("Child: creating parent user namespace and setting up mappings"); + + /* Create parent user namespace with mappings */ + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for parent user namespace */ + int parent_fd = open("/proc/self/ns/user", O_RDONLY); + if (parent_fd < 0) { + TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: opened parent userns fd %d", parent_fd); + + if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) { + TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno)); + close(parent_fd); + close(pipefd[1]); + exit(1); + } + close(parent_fd); + + TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id); + + /* Create child user namespace within parent */ + TH_LOG("Child: creating nested child user namespace"); + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for child user namespace */ + int child_fd = open("/proc/self/ns/user", O_RDONLY); + if (child_fd < 0) { + TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + 
} + + TH_LOG("Child: opened child userns fd %d", child_fd); + + if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) { + TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno)); + close(child_fd); + close(pipefd[1]); + exit(1); + } + close(child_fd); + + TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id); + + /* Send both namespace IDs to parent */ + TH_LOG("Child: sending both namespace IDs to parent"); + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + close(pipefd[1]); + + TH_LOG("Child: exiting - parent userns should become inactive"); + /* Exit - parent user namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + TH_LOG("Parent: reading both namespace IDs from child"); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &parent_id, sizeof(parent_id)); + if (ret != sizeof(parent_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &child_id, sizeof(child_id)); + close(pipefd[0]); + if (ret != sizeof(child_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + TH_LOG("Parent: received parent_id=%llu, child_id=%llu", + (unsigned long long)parent_id, (unsigned long long)child_id); + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)parent_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + parent_fh->ns_id = parent_id; + parent_fh->ns_type = 0; + parent_fh->ns_inum = 0; + + child_handle = (struct file_handle *)child_buf; + child_handle->handle_bytes = sizeof(struct nsfs_file_handle); + child_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *child_fh = (struct 
nsfs_file_handle *)child_handle->f_handle; + child_fh->ns_id = child_id; + child_fh->ns_type = 0; + child_fh->ns_inum = 0; + + TH_LOG("Parent: opening child namespace BEFORE child exits"); + + /* Open child namespace while child is still alive to keep it active */ + child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY); + if (child_nsfd < 0) { + TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + TH_LOG("Opened child namespace fd %d", child_nsfd); + + /* Now wait for child to exit */ + TH_LOG("Parent: waiting for child to exit"); + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child process exited, parent holds fd to child namespace"); + + /* + * With hierarchical active reference propagation: + * Since the child namespace is active (parent process holds fd), + * the parent user namespace should ALSO be active automatically. + * This is because when we took an active reference on the child, + * it propagated up to the owning user namespace. 
+ */ + TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)"); + int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd, 0); + + TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd); + + /* Verify we can also get parent via NS_GET_USERNS */ + TH_LOG("Verifying NS_GET_USERNS also works"); + int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS); + if (parent_fd2 < 0) { + close(parent_fd); + close(child_nsfd); + TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno); + SKIP(return, "NS_GET_USERNS not supported or failed"); + } + + TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2); + + /* Verify both methods give us the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(parent_fd, &st1), 0); + ASSERT_EQ(fstat(parent_fd2, &st2), 0); + TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + /* + * Close child fd - parent should remain active because we still + * hold direct references to it (parent_fd and parent_fd2). 
+ */ + TH_LOG("Closing child fd - parent should remain active (direct refs held)"); + close(child_nsfd); + + /* Parent should still be openable */ + TH_LOG("Verifying parent still active via file handle"); + int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd3, 0); + close(parent_fd3); + + TH_LOG("Closing all fds to parent namespace"); + close(parent_fd); + close(parent_fd2); + + /* Both should now be inactive */ + TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)"); + parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(parent_fd, 0); + TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that bind mounts keep namespaces in the tree even when inactive + */ +TEST(ns_bind_mount_keeps_in_tree) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char tmpfile[] = "/tmp/ns-test-XXXXXX"; + int tmpfd; + + /* Create temporary file for bind mount */ + tmpfd = mkstemp(tmpfile); + if (tmpfd < 0) { + SKIP(return, "Cannot create temporary file"); + } + close(tmpfd); + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Unshare mount namespace and make mounts private to avoid propagation */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Bind mount the namespace */ + ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL); + if (ret < 0) { + close(pipefd[1]); + 
unlink(tmpfile); + exit(1); + } + + /* Get file handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* + * Namespace should be inactive but still in tree due to bind mount. + * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree). + */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should be ENOENT (inactive) since bind mount keeps it in tree */ + if (errno != ENOENT && errno != ESTALE) { + TH_LOG("Unexpected error: %d", errno); + } + + /* Cleanup */ + umount(tmpfile); + unlink(tmpfile); +} + +/* + * Test multi-level hierarchy (3+ levels deep). + * Grandparent → Parent → Child + * When child is active, both parent AND grandparent should be active. 
+ */ +TEST(ns_multilevel_hierarchy) +{ + struct file_handle *gp_handle, *p_handle, *c_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 gp_id, p_id, c_id; + char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ]; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create grandparent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int gp_fd = open("/proc/self/ns/user", O_RDONLY); + if (gp_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) { + close(gp_fd); + close(pipefd[1]); + exit(1); + } + close(gp_fd); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c_fd = open("/proc/self/ns/user", O_RDONLY); + if (c_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) { + close(c_fd); + close(pipefd[1]); + exit(1); + } + close(c_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &gp_id, sizeof(gp_id)); + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c_id, sizeof(c_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &gp_id, sizeof(gp_id)); + if (ret != sizeof(gp_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read grandparent namespace ID from child"); + } + + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + 
SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &c_id, sizeof(c_id)); + close(pipefd[0]); + if (ret != sizeof(c_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + /* Construct file handles from namespace IDs */ + gp_handle = (struct file_handle *)gp_buf; + gp_handle->handle_bytes = sizeof(struct nsfs_file_handle); + gp_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle; + gp_fh->ns_id = gp_id; + gp_fh->ns_type = 0; + gp_fh->ns_inum = 0; + + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c_handle = (struct file_handle *)c_buf; + c_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle; + c_fh->ns_id = c_id; + c_fh->ns_type = 0; + c_fh->ns_inum = 0; + + /* Open child before process exits */ + int c_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY); + if (c_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * With 3-level hierarchy and child active: + * - Child is active (we hold fd) + * - Parent should be active (propagated from child) + * - Grandparent should be active (propagated from parent) + */ + TH_LOG("Testing parent active when child is active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + + TH_LOG("Testing grandparent active when child is active"); + int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY); + ASSERT_GE(gp_fd, 0); + + 
close(c_fd); + close(p_fd); + close(gp_fd); +} + +/* + * Test multiple children sharing same parent. + * Parent should stay active as long as ANY child is active. + */ +TEST(ns_multiple_children_same_parent) +{ + struct file_handle *p_handle, *c1_handle, *c2_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 p_id, c1_id, c2_id; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ]; + char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c1_fd = open("/proc/self/ns/user", O_RDONLY); + if (c1_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) { + close(c1_fd); + close(pipefd[1]); + exit(1); + } + close(c1_fd); + + /* Return to parent user namespace and create second child */ + /* We can't actually do this easily, so let's create a sibling namespace + * by creating a network namespace instead */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int c2_fd = open("/proc/self/ns/net", O_RDONLY); + if (c2_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) { + close(c2_fd); + close(pipefd[1]); + exit(1); + } + close(c2_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c1_id, sizeof(c1_id)); + write(pipefd[1], &c2_id, sizeof(c2_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no 
parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &c1_id, sizeof(c1_id)); + if (ret != sizeof(c1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first child namespace ID"); + } + + ret = read(pipefd[0], &c2_id, sizeof(c2_id)); + close(pipefd[0]); + if (ret != sizeof(c2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second child namespace ID"); + } + + /* Construct file handles from namespace IDs */ + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c1_handle = (struct file_handle *)c1_buf; + c1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle; + c1_fh->ns_id = c1_id; + c1_fh->ns_type = 0; + c1_fh->ns_inum = 0; + + c2_handle = (struct file_handle *)c2_buf; + c2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle; + c2_fh->ns_id = c2_id; + c2_fh->ns_type = 0; + c2_fh->ns_inum = 0; + + /* Open both children before process exits */ + int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY); + int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY); + + if (c1_fd < 0 || c2_fd < 0) { + if (c1_fd >= 0) close(c1_fd); + if (c2_fd >= 0) close(c2_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be 
active (both children active) */ + TH_LOG("Both children active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first child - parent should STILL be active */ + TH_LOG("Closing first child - parent should still be active"); + close(c1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second child - NOW parent should become inactive */ + TH_LOG("Closing second child - parent should become inactive"); + close(c2_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that different namespace types with same owner all contribute + * active references to the owning user namespace. + */ +TEST(ns_different_types_same_owner) +{ + struct file_handle *u_handle, *n_handle, *ut_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + /* Create network namespace (owned by user namespace) */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + /* Create UTS namespace (also owned by user namespace) */ + if (unshare(CLONE_NEWUTS) < 0) { + 
close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + u_handle = (struct file_handle *)u_buf; + u_handle->handle_bytes = sizeof(struct nsfs_file_handle); + u_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + n_handle = (struct file_handle *)n_buf; + n_handle->handle_bytes = sizeof(struct nsfs_file_handle); + n_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + ut_handle = (struct file_handle *)ut_buf; + ut_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ut_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; 
+ + /* Open both non-user namespaces before process exits */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY); + + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * Both network and UTS namespaces are active. + * User namespace should be active (gets 2 active refs). + */ + TH_LOG("Both net and uts active - user namespace should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close network namespace - user namespace should STILL be active */ + TH_LOG("Closing network ns - user ns should still be active (uts still active)"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close UTS namespace - user namespace should become inactive */ + TH_LOG("Closing uts ns - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* + * Test hierarchical propagation with deep namespace hierarchy. + * Create: init_user_ns -> user_A -> user_B -> net_ns + * When net_ns is active, both user_A and user_B should be active. + * This verifies the conditional recursion in __ns_ref_active_put() works. 
+ */ +TEST(ns_deep_hierarchy_propagation) +{ + struct file_handle *ua_handle, *ub_handle, *net_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id, net_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A -> user_B -> net hierarchy */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + close(net_fd); + close(pipefd[1]); + exit(1); + } + close(net_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + write(pipefd[1], &net_id, sizeof(net_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + if (ret != sizeof(ub_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read 
user_B namespace ID"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle *)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Open net_ns before child exits to keep it active */ + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + if (net_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open network namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With net_ns active, both user_A and user_B should be active */ + TH_LOG("Testing user_B active (net_ns active causes propagation)"); + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd, 0); + + TH_LOG("Testing user_A active (propagated through user_B)"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close net_ns - user_B should stay active (we hold direct ref) */ + TH_LOG("Closing net_ns, user_B should 
remain active (direct ref held)"); + close(net_fd); + int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd2, 0); + close(ub_fd2); + + /* Close user_B - user_A should stay active (we hold direct ref) */ + TH_LOG("Closing user_B, user_A should remain active (direct ref held)"); + close(ub_fd); + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - everything should become inactive */ + TH_LOG("Closing user_A, all should become inactive"); + close(ua_fd); + + /* All should now be inactive */ + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test that parent stays active as long as ANY child is active. + * Create parent user namespace with two child net namespaces. + * Parent should remain active until BOTH children are inactive. + */ +TEST(ns_parent_multiple_children_refcount) +{ + struct file_handle *parent_handle, *net1_handle, *net2_handle; + int ret, pipefd[2], syncpipe[2]; + pid_t pid; + int status; + __u64 p_id, n1_id, n2_id; + char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ]; + char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ]; + char sync_byte; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + close(syncpipe[1]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n1_fd = open("/proc/self/ns/net", O_RDONLY); + if (n1_fd < 0) { + close(pipefd[1]); + 
close(syncpipe[0]); + exit(1); + } + if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep n1_fd open so first namespace stays active */ + + /* Create second network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n2_fd = open("/proc/self/ns/net", O_RDONLY); + if (n2_fd < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) { + close(n1_fd); + close(n2_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep both n1_fd and n2_fd open */ + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &n1_id, sizeof(n1_id)); + write(pipefd[1], &n2_id, sizeof(n2_id)); + close(pipefd[1]); + + /* Wait for parent to signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + close(pipefd[1]); + close(syncpipe[0]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &n1_id, sizeof(n1_id)); + if (ret != sizeof(n1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first network namespace ID"); + } + + ret = read(pipefd[0], &n2_id, sizeof(n2_id)); + close(pipefd[0]); + if (ret != sizeof(n2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)p_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + 
p_fh->ns_inum = 0; + + net1_handle = (struct file_handle *)n1_buf; + net1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle; + n1_fh->ns_id = n1_id; + n1_fh->ns_type = 0; + n1_fh->ns_inum = 0; + + net2_handle = (struct file_handle *)n2_buf; + net2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle; + n2_fh->ns_id = n2_id; + n2_fh->ns_type = 0; + n2_fh->ns_inum = 0; + + /* Open both net namespaces while child is still alive */ + int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY); + int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY); + if (n1_fd < 0 || n2_fd < 0) { + if (n1_fd >= 0) close(n1_fd); + if (n2_fd >= 0) close(n2_fd); + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open net namespaces"); + } + + /* Signal child that we have opened the namespaces */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be active (has 2 active children) */ + TH_LOG("Both net namespaces active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first net namespace - parent should STILL be active */ + TH_LOG("Closing first net ns - parent should still be active"); + close(n1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second net namespace - parent should become inactive */ + TH_LOG("Closing second net ns - parent should become inactive"); + close(n2_fd); + p_fd = 
open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that user namespace as a child also propagates correctly. + * Create user_A -> user_B, verify when user_B is active that user_A + * is also active. This is different from non-user namespace children. + */ +TEST(ns_userns_child_propagation) +{ + struct file_handle *ua_handle, *ub_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + /* Create user_B (child of user_A) */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + /* Send both namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + close(pipefd[0]); + if (ret != sizeof(ub_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_B namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle 
*)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + /* Open user_B before child exits */ + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + if (ub_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open user_B"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With user_B active, user_A should also be active */ + TH_LOG("Testing user_A active when child user_B is active"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close user_B */ + TH_LOG("Closing user_B"); + close(ub_fd); + + /* user_A should remain active (we hold direct ref) */ + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - should become inactive */ + TH_LOG("Closing user_A - should become inactive"); + close(ua_fd); + + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test different namespace types (net, uts, ipc) all contributing + * active references to the same owning user namespace. 
+ */ +TEST(ns_mixed_types_same_owner) +{ + struct file_handle *user_handle, *net_handle, *uts_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + 
close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + user_handle = (struct file_handle *)u_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + net_handle = (struct file_handle *)n_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + uts_handle = (struct file_handle *)ut_buf; + uts_handle->handle_bytes = sizeof(struct nsfs_file_handle); + uts_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; + + /* Open both non-user namespaces */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY); + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* User namespace should be active (2 active children) */ + TH_LOG("Both net and uts active - user ns should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close net - user ns should STILL be active (uts still active) */ + TH_LOG("Closing net - user ns should still be active"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); 
+ + /* Close uts - user ns should become inactive */ + TH_LOG("Closing uts - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* Thread test helpers and structures */ +struct thread_ns_info { + __u64 ns_id; + int pipefd; + int syncfd_read; + int syncfd_write; + int exit_code; +}; + +static void *thread_create_namespace(void *arg) +{ + struct thread_ns_info *info = (struct thread_ns_info *)arg; + int ret; + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + info->exit_code = 1; + return NULL; + } + + /* Get namespace ID */ + int fd = open("/proc/thread-self/ns/net", O_RDONLY); + if (fd < 0) { + info->exit_code = 2; + return NULL; + } + + ret = ioctl(fd, NS_GET_ID, &info->ns_id); + close(fd); + if (ret < 0) { + info->exit_code = 3; + return NULL; + } + + /* Send namespace ID to main thread */ + if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) { + info->exit_code = 4; + return NULL; + } + + /* Wait for signal to exit */ + char sync_byte; + if (read(info->syncfd_read, &sync_byte, 1) != 1) { + info->exit_code = 5; + return NULL; + } + + info->exit_code = 0; + return NULL; +} + +/* + * Test that namespace becomes inactive after thread exits. + * This verifies active reference counting works with threads, not just processes. 
+ */ +TEST(thread_ns_inactive_after_exit) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Namespace should be active while thread is alive */ + TH_LOG("Attempting to open namespace while thread is alive (should succeed)"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd, 0); + close(nsfd); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + ASSERT_EQ(pthread_join(thread, NULL), 0); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) + SKIP(return, "Thread failed to create namespace"); + + TH_LOG("Thread exited, namespace should be inactive"); + + /* 
Namespace should now be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a thread holds an fd to it. + * Even after the thread exits, the namespace should remain active as long as + * another thread holds a file descriptor to it. + */ +TEST(thread_ns_fd_keeps_active) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Open namespace while thread is alive */ + TH_LOG("Opening namespace while thread is alive"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + 
ASSERT_GE(nsfd, 0); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) { + close(nsfd); + SKIP(return, "Thread failed to create namespace"); + } + + TH_LOG("Thread exited, but main thread holds fd - namespace should remain active"); + + /* Namespace should still be active because we hold an fd */ + int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd2, 0); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(nsfd2, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(nsfd2); + + TH_LOG("Closing fd - namespace should become inactive"); + close(nsfd); + + /* Now namespace should be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* Structure for thread data in subprocess */ +struct thread_sleep_data { + int syncfd_read; +}; + +static void *thread_sleep_and_wait(void *arg) +{ + struct thread_sleep_data *data = (struct thread_sleep_data *)arg; + char sync_byte; + + /* Wait for signal to exit - read will unblock when pipe is closed */ + (void)read(data->syncfd_read, &sync_byte, 1); + return NULL; +} + +/* + * Test that namespaces become inactive after subprocess with multiple threads exits. + * Create a subprocess that unshares user and network namespaces, then creates two + * threads that share those namespaces. Verify that after all threads and subprocess + * exit, the namespaces are no longer listed by listns() and cannot be opened by + * open_by_handle_at(). 
+ */ +TEST(thread_subprocess_ns_inactive_after_all_exit) +{ + int pipefd[2]; + int sv[2]; + pid_t pid; + int status; + __u64 user_id, net_id; + struct file_handle *user_handle, *net_handle; + char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char sync_byte; + int ret; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + close(sv[0]); + + /* Create user namespace with mappings */ + if (setup_userns() < 0) { + fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: setup_userns() succeeded\n"); + + /* Get user namespace ID */ + int user_fd = open("/proc/self/ns/user", O_RDONLY); + if (user_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno)); + close(user_fd); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(user_fd); + fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id); + + /* Unshare network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n"); + + /* Get network namespace ID */ + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno)); + close(net_fd); + 
close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(net_fd); + fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id); + + /* Send namespace IDs to parent */ + if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) { + fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno)); + exit(1); + } + if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) { + fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno)); + exit(1); + } + close(pipefd[1]); + fprintf(stderr, "Child: sent namespace IDs to parent\n"); + + /* Create two threads that share the namespaces */ + pthread_t thread1, thread2; + struct thread_sleep_data data; + data.syncfd_read = sv[1]; + + int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: created thread1\n"); + + ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + pthread_cancel(thread1); + exit(1); + } + fprintf(stderr, "Child: created thread2\n"); + + /* Wait for threads to complete - they will unblock when parent writes */ + fprintf(stderr, "Child: waiting for threads to exit\n"); + pthread_join(thread1, NULL); + fprintf(stderr, "Child: thread1 exited\n"); + pthread_join(thread2, NULL); + fprintf(stderr, "Child: thread2 exited\n"); + + close(sv[1]); + + /* Exit - namespaces should become inactive */ + fprintf(stderr, "Child: all threads joined, exiting with success\n"); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + close(sv[1]); + + TH_LOG("Parent: waiting to read namespace IDs from child"); + + /* Read namespace IDs from child */ + ret = read(pipefd[0], &user_id, sizeof(user_id)); + if (ret != sizeof(user_id)) { + 
TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno)); + close(pipefd[0]); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID from child"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno)); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID from child"); + } + + TH_LOG("Child created user ns %llu and net ns %llu with 2 threads", + (unsigned long long)user_id, (unsigned long long)net_id); + + /* Construct file handles */ + user_handle = (struct file_handle *)user_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle; + user_fh->ns_id = user_id; + user_fh->ns_type = 0; + user_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Verify namespaces are active while subprocess and threads are alive */ + TH_LOG("Verifying namespaces are active while subprocess with threads is running"); + int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(user_fd, 0); + + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_GE(net_fd, 0); + + close(user_fd); + close(net_fd); + + /* Also verify they appear in listns() */ + TH_LOG("Verifying namespaces appear in listns() while active"); + struct ns_id_req req = { + .size = sizeof(struct ns_id_req), + .spare = 0, + .ns_id 
= 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids < 0) { + TH_LOG("listns() not available, skipping listns verification"); + } else { + /* Check if user_id is in the list */ + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_TRUE(found_user); + TH_LOG("User namespace found in listns() as expected"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_TRUE(found_net); + TH_LOG("Network namespace found in listns() as expected"); + } + } + + /* Signal threads to exit */ + TH_LOG("Signaling threads to exit"); + sync_byte = 'X'; + /* Write two bytes - one for each thread */ + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + close(sv[0]); + + /* Wait for child process to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + if (WEXITSTATUS(status) != 0) { + TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status)); + SKIP(return, "Child process failed"); + } + + TH_LOG("Subprocess and all threads have exited successfully"); + + /* Verify namespaces are now inactive - open_by_handle_at should fail */ + TH_LOG("Verifying namespaces are inactive after subprocess and threads exit"); + user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(user_fd, 0); + TH_LOG("User namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); + + net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_LT(net_fd, 0); + TH_LOG("Network namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == 
ENOENT || errno == ESTALE); + + /* Verify namespaces do NOT appear in listns() */ + TH_LOG("Verifying namespaces do NOT appear in listns() when inactive"); + memset(&req, 0, sizeof(req)); + req.size = sizeof(struct ns_id_req); + req.ns_type = CLONE_NEWUSER; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_FALSE(found_user); + TH_LOG("User namespace correctly not listed in listns()"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_FALSE(found_net); + TH_LOG("Network namespace correctly not listed in listns()"); + } + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c index e28accd74a57..527ade0a8673 100644 --- a/tools/testing/selftests/namespaces/nsid_test.c +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -6,6 +6,7 @@ #include <libgen.h> #include <limits.h> #include <pthread.h> +#include <signal.h> #include <string.h> #include <sys/mount.h> #include <poll.h> @@ -14,12 +15,30 @@ #include <sys/stat.h> #include <sys/socket.h> #include <sys/un.h> +#include <sys/wait.h> #include <unistd.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/nsfs.h> #include "../kselftest_harness.h" +/* Fixture for tests that create child processes */ +FIXTURE(nsid) { + pid_t child_pid; +}; + +FIXTURE_SETUP(nsid) { + self->child_pid = 0; +} + +FIXTURE_TEARDOWN(nsid) { + /* Clean up any child process that may still be running */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } +} + TEST(nsid_mntns_basic) { __u64 mnt_ns_id = 0; @@ -44,7 +63,7 @@ TEST(nsid_mntns_basic) 
close(fd_mntns); } -TEST(nsid_mntns_separate) +TEST_F(nsid, mntns_separate) { __u64 parent_mnt_ns_id = 0; __u64 child_mnt_ns_id = 0; @@ -90,6 +109,9 @@ TEST(nsid_mntns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -99,8 +121,6 @@ TEST(nsid_mntns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_mntns); SKIP(return, "No permission to create mount namespace"); } @@ -123,10 +143,6 @@ TEST(nsid_mntns_separate) close(fd_parent_mntns); close(fd_child_mntns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_cgroupns_basic) @@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic) close(fd_cgroupns); } -TEST(nsid_cgroupns_separate) +TEST_F(nsid, cgroupns_separate) { __u64 parent_cgroup_ns_id = 0; __u64 child_cgroup_ns_id = 0; @@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_cgroupns); SKIP(return, "No permission to create cgroup namespace"); } @@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate) close(fd_parent_cgroupns); close(fd_child_cgroupns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_ipcns_basic) @@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic) close(fd_ipcns); } -TEST(nsid_ipcns_separate) +TEST_F(nsid, ipcns_separate) { __u64 parent_ipc_ns_id = 0; __u64 child_ipc_ns_id = 0; @@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, 
SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_ipcns); SKIP(return, "No permission to create IPC namespace"); } @@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate) close(fd_parent_ipcns); close(fd_child_ipcns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_utsns_basic) @@ -371,7 +381,7 @@ TEST(nsid_utsns_basic) close(fd_utsns); } -TEST(nsid_utsns_separate) +TEST_F(nsid, utsns_separate) { __u64 parent_uts_ns_id = 0; __u64 child_uts_ns_id = 0; @@ -417,6 +427,9 @@ TEST(nsid_utsns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -426,8 +439,6 @@ TEST(nsid_utsns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_utsns); SKIP(return, "No permission to create UTS namespace"); } @@ -450,10 +461,6 @@ TEST(nsid_utsns_separate) close(fd_parent_utsns); close(fd_child_utsns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_userns_basic) @@ -480,7 +487,7 @@ TEST(nsid_userns_basic) close(fd_userns); } -TEST(nsid_userns_separate) +TEST_F(nsid, userns_separate) { __u64 parent_user_ns_id = 0; __u64 child_user_ns_id = 0; @@ -526,6 +533,9 @@ TEST(nsid_userns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -535,8 +545,6 @@ TEST(nsid_userns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_userns); SKIP(return, "No permission to create user namespace"); } @@ -559,10 +567,6 @@ TEST(nsid_userns_separate) close(fd_parent_userns); close(fd_child_userns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_timens_basic) @@ -591,7 +595,7 @@ TEST(nsid_timens_basic) close(fd_timens); } -TEST(nsid_timens_separate) +TEST_F(nsid, timens_separate) { 
__u64 parent_time_ns_id = 0; __u64 child_time_ns_id = 0; @@ -652,6 +656,9 @@ TEST(nsid_timens_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -660,8 +667,6 @@ TEST(nsid_timens_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_timens); close(pipefd[0]); SKIP(return, "Cannot create time namespace"); @@ -689,10 +694,6 @@ TEST(nsid_timens_separate) close(fd_parent_timens); close(fd_child_timens); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_pidns_basic) @@ -719,7 +720,7 @@ TEST(nsid_pidns_basic) close(fd_pidns); } -TEST(nsid_pidns_separate) +TEST_F(nsid, pidns_separate) { __u64 parent_pid_ns_id = 0; __u64 child_pid_ns_id = 0; @@ -776,6 +777,9 @@ TEST(nsid_pidns_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -784,8 +788,6 @@ TEST(nsid_pidns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_pidns); close(pipefd[0]); SKIP(return, "No permission to create PID namespace"); @@ -813,10 +815,6 @@ TEST(nsid_pidns_separate) close(fd_parent_pidns); close(fd_child_pidns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_netns_basic) @@ -860,7 +858,7 @@ TEST(nsid_netns_basic) close(fd_netns); } -TEST(nsid_netns_separate) +TEST_F(nsid, netns_separate) { __u64 parent_net_ns_id = 0; __u64 parent_netns_cookie = 0; @@ -920,6 +918,9 @@ TEST(nsid_netns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -929,8 +930,6 @@ TEST(nsid_netns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_netns); close(parent_sock); SKIP(return, "No 
permission to create network namespace"); @@ -977,10 +976,6 @@ TEST(nsid_netns_separate) close(fd_parent_netns); close(fd_child_netns); close(parent_sock); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c new file mode 100644 index 000000000000..753fd29dffd8 --- /dev/null +++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "../pidfd/pidfd.h" +#include "../kselftest_harness.h" + +/* + * Regression tests for the setns(pidfd) active reference counting bug. + * + * These tests are based on the reproducers that triggered the race condition + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). + * + * The bug: When using setns() with a pidfd, if the target task exits between + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. + * Then ns_ref_active_get() would increment from 0 without properly resurrecting + * the owner chain, causing active reference count underflows. + */ + +/* + * Simple pidfd setns test using create_child()+unshare(). + * + * Without the fix, this would trigger active refcount warnings when the + * parent exits after doing setns(pidfd) on a child that has already exited. 
+ */ +TEST(simple_pidfd_setns) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + int sv[2]; + char c; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create a child process without namespaces initially */ + child_pid = create_child(&pidfd, 0); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + close(sv[0]); + + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { + close(sv[1]); + _exit(1); + } + + /* Signal parent that namespaces are ready */ + if (write_nointr(sv[1], "1", 1) < 0) { + close(sv[1]); + _exit(1); + } + + close(sv[1]); + _exit(0); + } + ASSERT_GE(pidfd, 0); + EXPECT_EQ(close(sv[1]), 0); + + ret = read_nointr(sv[0], &c, 1); + ASSERT_EQ(ret, 1); + EXPECT_EQ(close(sv[0]), 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + TH_LOG("setns() returned %d", ret); + close(pidfd); +} + +/* + * Simple pidfd setns test using create_child(). + * + * This variation uses create_child() with namespace flags directly. + * Namespaces are created immediately at clone time. 
+ */ +TEST(simple_pidfd_setns_clone) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + /* Create a child process with new namespaces using create_child() */ + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child: sleep for a while so parent can setns to us */ + sleep(2); + _exit(0); + } + + /* Parent: pidfd was already created by create_child() */ + ASSERT_GE(pidfd, 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + close(pidfd); + TH_LOG("setns() returned %d", ret); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c new file mode 100644 index 000000000000..ba689a22d82f --- /dev/null +++ b/tools/testing/selftests/namespaces/siocgskns_test.c @@ -0,0 +1,1824 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/if.h> +#include <linux/sockios.h> +#include <linux/nsfs.h> +#include <arpa/inet.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef SIOCGSKNS +#define SIOCGSKNS 0x894C +#endif + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test basic SIOCGSKNS functionality. + * Create a socket and verify SIOCGSKNS returns the correct network namespace. 
+ */ +TEST(siocgskns_basic) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create a TCP socket */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS to get network namespace */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get current network namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + /* Verify they match */ + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket file descriptors keep network namespaces active. + * Create a network namespace, create a socket in it, then exit the namespace. + * The namespace should remain active while the socket FD is held. 
+ */ +TEST(siocgskns_keeps_netns_active) +{ + int sock_fd, netns_fd, test_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Create a socket in the new network namespace */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + TH_LOG("socket() failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + 
/* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + + /* + * Namespace should still be active because socket FD keeps it alive. + * Try to access it via /proc/self/fd/<fd>. + */ + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd); + test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + + /* Try SIOCGSKNS again - should fail since socket is closed */ + ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0); +} + +/* + * Test SIOCGSKNS with different socket types (TCP, UDP, RAW). + */ +TEST(siocgskns_socket_types) +{ + int sock_tcp, sock_udp, sock_raw; + int netns_tcp, netns_udp, netns_raw; + struct stat st_tcp, st_udp, st_raw; + + /* TCP socket */ + sock_tcp = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_tcp, 0); + + /* UDP socket */ + sock_udp = socket(AF_INET, SOCK_DGRAM, 0); + ASSERT_GE(sock_udp, 0); + + /* RAW socket (may require privileges) */ + sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); + if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) { + sock_raw = -1; /* Skip raw socket test */ + } + + /* Test SIOCGSKNS on TCP */ + netns_tcp = ioctl(sock_tcp, SIOCGSKNS); + if (netns_tcp < 0) { + close(sock_tcp); + close(sock_udp); + if (sock_raw >= 0) close(sock_raw); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_tcp, 0); + } + + /* Test SIOCGSKNS on UDP */ + netns_udp = ioctl(sock_udp, SIOCGSKNS); + ASSERT_GE(netns_udp, 0); + + /* Test SIOCGSKNS on RAW (if available) */ + if (sock_raw >= 0) { + netns_raw = 
ioctl(sock_raw, SIOCGSKNS); + ASSERT_GE(netns_raw, 0); + } + + /* Verify all return the same network namespace */ + ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0); + ASSERT_EQ(fstat(netns_udp, &st_udp), 0); + ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino); + + if (sock_raw >= 0) { + ASSERT_EQ(fstat(netns_raw, &st_raw), 0); + ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino); + close(netns_raw); + close(sock_raw); + } + + close(netns_tcp); + close(netns_udp); + close(sock_tcp); + close(sock_udp); +} + +/* + * Test SIOCGSKNS across setns. + * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still + * returns netns A. + */ +TEST(siocgskns_across_setns) +{ + int sock_fd, netns_a_fd, netns_b_fd, result_fd; + struct stat st_a; + + /* Get current netns (A) */ + netns_a_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_a_fd, 0); + ASSERT_EQ(fstat(netns_a_fd, &st_a), 0); + + /* Create socket in netns A */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Create new netns (B) */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + netns_b_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_b_fd, 0); + + /* Get netns from socket created in A */ + result_fd = ioctl(sock_fd, SIOCGSKNS); + if (result_fd < 0) { + close(sock_fd); + setns(netns_a_fd, CLONE_NEWNET); + close(netns_a_fd); + close(netns_b_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(result_fd, 0); + } + + /* Verify it still points to netns A */ + struct stat st_result_stat; + ASSERT_EQ(fstat(result_fd, &st_result_stat), 0); + ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino); + + close(result_fd); + close(sock_fd); + close(netns_b_fd); + + /* Restore original netns */ + ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0); + close(netns_a_fd); +} + +/* + * Test SIOCGSKNS fails on non-socket file descriptors. 
+ */ +TEST(siocgskns_non_socket) +{ + int fd; + int pipefd[2]; + + /* Test on regular file */ + fd = open("/dev/null", O_RDONLY); + ASSERT_GE(fd, 0); + + ASSERT_LT(ioctl(fd, SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + close(fd); + + /* Test on pipe */ + ASSERT_EQ(pipe(pipefd), 0); + + ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + + close(pipefd[0]); + close(pipefd[1]); +} + +/* + * Test multiple sockets keep the same network namespace active. + * Create multiple sockets, verify closing some doesn't affect others. + */ +TEST(siocgskns_multiple_sockets) +{ + int socks[5]; + int netns_fds[5]; + int i; + struct stat st; + ino_t netns_ino; + + /* Create new network namespace */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + /* Create multiple sockets */ + for (i = 0; i < 5; i++) { + socks[i] = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(socks[i], 0); + } + + /* Get netns from all sockets */ + for (i = 0; i < 5; i++) { + netns_fds[i] = ioctl(socks[i], SIOCGSKNS); + if (netns_fds[i] < 0) { + int j; + for (j = 0; j <= i; j++) { + close(socks[j]); + if (j < i && netns_fds[j] >= 0) + close(netns_fds[j]); + } + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fds[i], 0); + } + } + + /* Verify all point to same netns */ + ASSERT_EQ(fstat(netns_fds[0], &st), 0); + netns_ino = st.st_ino; + + for (i = 1; i < 5; i++) { + ASSERT_EQ(fstat(netns_fds[i], &st), 0); + ASSERT_EQ(st.st_ino, netns_ino); + } + + /* Close some sockets */ + for (i = 0; i < 3; i++) { + close(socks[i]); + } + + /* Remaining netns FDs should still be valid */ + for (i = 3; i < 5; i++) { + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]); + int test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + } + + /* Cleanup */ + for (i = 0; i < 5; i++) { + if (i >= 3) + close(socks[i]); + close(netns_fds[i]); + } +} + +/* + * Test socket keeps 
netns active after creating process exits. + * Verify that as long as the socket FD exists, the namespace remains active. + */ +TEST(siocgskns_netns_lifecycle) +{ + int sock_fd, netns_fd; + int ipc_sockets[2]; + int syncpipe[2]; + pid_t pid; + int status; + char sync_byte; + struct stat st; + ino_t netns_ino; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + ASSERT_EQ(pipe(syncpipe), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child */ + close(ipc_sockets[0]); + close(syncpipe[1]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send socket to parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + + /* Wait for parent signal */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent */ + close(ipc_sockets[1]); + close(syncpipe[0]); + + /* Receive socket FD */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + 
close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Get netns from socket while child is alive */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + close(sock_fd); + waitpid(pid, NULL, 0); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Signal child to exit */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* + * Socket FD should still keep namespace active even after + * the creating process exited. + */ + int test_fd = ioctl(sock_fd, SIOCGSKNS); + ASSERT_GE(test_fd, 0); + + struct stat st_test; + ASSERT_EQ(fstat(test_fd, &st_test), 0); + ASSERT_EQ(st_test.st_ino, netns_ino); + + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); +} + +/* + * Test IPv6 sockets also work with SIOCGSKNS. 
+ */ +TEST(siocgskns_ipv6) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create an IPv6 TCP socket */ + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify it matches current namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket-kept netns appears in listns() output. + * Verify that a network namespace kept alive by a socket FD appears in + * listns() output even after the creating process exits, and that it + * disappears when the socket is closed. 
+ */ +TEST(siocgskns_listns_visibility) +{ + int sock_fd, netns_fd, owner_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + __u64 netns_id, owner_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), 
sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + owner_fd = ioctl(netns_fd, NS_GET_USERNS); + if (owner_fd < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(owner_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(owner_fd, NS_GET_ID, &owner_id); + if (ret < 0) { + close(owner_fd); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(owner_fd); + + /* Namespace should appear in listns() output */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + /* Search for our network namespace in the list */ + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id); + + /* Now verify with owner filtering */ + req.user_ns_id = owner_id; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu owned by userns 
%llu", netns_id, owner_id); + + /* Close socket - namespace should become inactive and disappear from listns() */ + close(sock_fd); + close(netns_fd); + + /* Verify it's no longer in listns() output */ + req.user_ns_id = 0; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_FALSE(found_netns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); +} + +/* + * Test that socket-kept netns can be reopened via file handle. + * Verify that a network namespace kept alive by a socket FD can be + * reopened using file handles even after the creating process exits. + */ +TEST(siocgskns_file_handle) +{ + int sock_fd, netns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st1, st2; + ino_t netns_ino; + __u64 netns_id; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int ret; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); 
+ + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + netns_ino = st1.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; /* Type field not needed for reopening */ + nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */ + + TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id); + + /* Reopen 
namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + close(reopened_fd); + + /* Close the netns FD */ + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Reopen namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + free(handle); +} + +/* + 
* Test combined listns() and file handle operations with socket-kept netns. + * Create a netns, keep it alive with a socket, verify it appears in listns(), + * then reopen it via file handle obtained from listns() entry. + */ +TEST(siocgskns_listns_and_file_handle) +{ + int sock_fd, netns_fd, userns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + ino_t netns_ino; + __u64 netns_id, userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false, found_userns = false; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new userns and netns with socket */ + close(ipc_sockets[0]); + + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, 
sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + userns_fd = ioctl(netns_fd, NS_GET_USERNS); + if (userns_fd < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(userns_fd, NS_GET_ID, &userns_id); + if (ret < 0) { + close(userns_fd); + free(handle); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(userns_fd); + + TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", 
netns_ino, netns_id, userns_id); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + struct stat reopened_st; + ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0); + ASSERT_EQ(reopened_st.st_ino, netns_ino); + + TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino); + + close(reopened_fd); + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify namespace appears in listns() */ 
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + close(netns_fd); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_FALSE(found_netns); + ASSERT_FALSE(found_userns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); + + close(sock_fd); + free(handle); +} + +/* + * Test multi-level namespace resurrection across three user namespace levels. + * + * This test creates a complex namespace hierarchy with three levels of user + * namespaces and a network namespace at the deepest level. It verifies that + * the resurrection semantics work correctly when SIOCGSKNS is called on a + * socket from an inactive namespace tree, and that listns() and + * open_by_handle_at() correctly respect visibility rules. 
+ * + * Hierarchy after child processes exit (all with 0 active refcount): + * + * net_L3A (0) <- Level 3 network namespace + * | + * + + * userns_L3 (0) <- Level 3 user namespace + * | + * + + * userns_L2 (0) <- Level 2 user namespace + * | + * + + * userns_L1 (0) <- Level 1 user namespace + * | + * x + * init_user_ns + * + * The test verifies: + * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain + * 2. After resurrection, all namespaces are visible in listns() + * 3. Resurrected namespaces can be reopened via file handles + * 4. Closing the netns FD cascades down: the entire ownership chain + * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again + * 5. Inactive namespaces disappear from listns() and cannot be reopened + * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again + * 7. After second resurrection, namespaces are visible and can be reopened + */ +TEST(siocgskns_multilevel_resurrection) +{ + int ipc_sockets[2]; + pid_t pid_l1, pid_l2, pid_l3; + int status; + + /* Namespace file descriptors to be received from child */ + int sock_L3A_fd = -1; + int netns_L3A_fd = -1; + __u64 netns_L3A_id; + __u64 userns_L1_id, userns_L2_id, userns_L3_id; + + /* For listns() and file handle testing */ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int reopened_fd; + + /* Allocate file handle for testing */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + /* + * Fork level 1 child that creates userns_L1 + */ + pid_l1 = fork(); + ASSERT_GE(pid_l1, 0); + + if (pid_l1 == 0) { 
+ /* Level 1 child */ + int ipc_L2[2]; + close(ipc_sockets[0]); + + /* Create userns_L1 */ + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Create socketpair for communicating with L2 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* + * Fork level 2 child that creates userns_L2 + */ + pid_l2 = fork(); + if (pid_l2 < 0) { + close(ipc_sockets[1]); + close(ipc_L2[0]); + close(ipc_L2[1]); + exit(1); + } + + if (pid_l2 == 0) { + /* Level 2 child */ + int ipc_L3[2]; + close(ipc_L2[0]); + + /* Create userns_L2 (nested inside userns_L1) */ + if (setup_userns() < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* Create socketpair for communicating with L3 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* + * Fork level 3 child that creates userns_L3 and network namespaces + */ + pid_l3 = fork(); + if (pid_l3 < 0) { + close(ipc_L2[1]); + close(ipc_L3[0]); + close(ipc_L3[1]); + exit(1); + } + + if (pid_l3 == 0) { + /* Level 3 child - the deepest level */ + int sock_fd; + close(ipc_L3[0]); + + /* Create userns_L3 (nested inside userns_L2) */ + if (setup_userns() < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create network namespace at level 3 */ + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create socket in net_L3A */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Send socket FD to L2 parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = 
CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_L3[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_L3[1]); + exit(1); + } + + close(sock_fd); + close(ipc_L3[1]); + exit(0); + } + + /* Level 2 child - receive from L3 and forward to L1 */ + close(ipc_L3[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L3[0], &msg, 0); + close(ipc_L3[0]); + + if (n != 1) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L3 child */ + waitpid(pid_l3, NULL, 0); + + /* Forward the socket FD to L1 parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Y'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_L2[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_L2[1]); + exit(1); + } + + close(received_fd); + close(ipc_L2[1]); + exit(0); + } + + /* Level 1 child - receive from L2 and forward to parent */ + close(ipc_L2[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L2[0], &msg, 0); + 
close(ipc_L2[0]); + + if (n != 1) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L2 child */ + waitpid(pid_l2, NULL, 0); + + /* Forward the socket FD to parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Z'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(received_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent - receive the socket from the deepest level */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + + if (n != 1) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L1 child */ + waitpid(pid_l1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * At this point, all child processes have exited. 
The socket itself + * doesn't keep the namespace active - we need to call SIOCGSKNS which + * will resurrect the entire namespace tree by taking active references. + */ + + /* Get network namespace from socket - this resurrects the tree */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + if (netns_L3A_fd < 0) { + free(handle); + close(sock_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_L3A_fd, 0); + } + + /* Get namespace ID for net_L3A */ + ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */ + int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS); + if (userns_L3_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_L3_fd, 0); + } + + ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id); + ASSERT_EQ(ret, 0); + + int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS); + ASSERT_GE(userns_L2_fd, 0); + ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id); + ASSERT_EQ(ret, 0); + + int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS); + ASSERT_GE(userns_L1_fd, 0); + ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id); + ASSERT_EQ(ret, 0); + + close(userns_L1_fd); + close(userns_L2_fd); + close(userns_L3_fd); + + TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)", + netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id); + + /* + * Test 1: Verify net_L3A is visible in listns() after resurrection. + * The entire ownership chain should be resurrected and visible. 
+ */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + bool found_netns_L3A = false; + bool found_userns_L1 = false; + bool found_userns_L2 = false; + bool found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()"); + + /* + * Test 2: Verify net_L3A can be reopened via file handle. + */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_L3A_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened"); + + /* + * Test 3: Verify that when we close the netns FD (dropping the last + * active reference), the entire tree becomes inactive and disappears + * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops -> + * userns_L2 drops -> userns_L1 drops. 
+ */ + close(netns_L3A_fd); + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_FALSE(found_netns_L3A); + ASSERT_FALSE(found_userns_L1); + ASSERT_FALSE(found_userns_L2); + ASSERT_FALSE(found_userns_L3); + TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed"); + + /* + * Test 4: Verify file handle no longer works for inactive namespace. + */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd >= 0) { + close(reopened_fd); + free(handle); + ASSERT_TRUE(false); /* Should have failed */ + } + TH_LOG("Inactive namespace correctly cannot be reopened via file handle"); + + /* + * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again. + * The socket is still valid, so we can call SIOCGSKNS on it to resurrect + * the namespace tree once more. 
+ */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + ASSERT_GE(netns_L3A_fd, 0); + + TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree"); + + /* Verify the namespace tree is resurrected and visible in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again"); + + /* Verify we can reopen via file handle again */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection"); + + /* Final cleanup */ + close(sock_L3A_fd); + close(netns_L3A_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c new file mode 100644 index 000000000000..dd7df7d6cb27 --- /dev/null +++ b/tools/testing/selftests/namespaces/stress_test.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include 
<sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Stress tests for namespace active reference counting. + * + * These tests validate that the active reference counting system can handle + * high load scenarios including rapid namespace creation/destruction, large + * numbers of concurrent namespaces, and various edge cases under stress. + */ + +/* + * Test rapid creation and destruction of user namespaces. + * Create and destroy namespaces in quick succession to stress the + * active reference tracking and ensure no leaks occur. + */ +TEST(rapid_namespace_creation_destruction) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[256], ns_ids_after[256]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count of active user namespaces */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Rapidly create and destroy 100 user namespaces */ + for (i = 0; i < 100; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create user namespace and immediately exit */ + if (setup_userns() < 0) + exit(1); + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline (no leaked namespaces) */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, 
ret_after); +} + +/* + * Test creating many concurrent namespaces. + * Verify that listns() correctly tracks all of them and that they all + * become inactive after processes exit. + */ +TEST(many_concurrent_namespaces) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512]; + ssize_t ret_before, ret_during, ret_after; + pid_t pids[50]; + int num_children = 50; + int i; + int sv[2]; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children, each with their own user namespace */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Child: create user namespace and wait for parent signal */ + char c; + + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* List namespaces while all children are running */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + 
ASSERT_GE(ret_during, 0); + + TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during); + + /* Should have at least num_children more namespaces than baseline */ + ASSERT_GE(ret_during, ret_before + num_children); + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + /* If we fail to write, kill remaining children */ + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After all children exit: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test rapid namespace creation with different namespace types. + * Create multiple types of namespaces rapidly to stress the tracking system. 
+ */
+TEST(rapid_mixed_namespace_creation)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0, /* All types */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	int i;
+
+	/* Get baseline count */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active namespaces (all types)", ret_before);
+
+	/* Rapidly create and destroy namespaces with multiple types */
+	for (i = 0; i < 50; i++) {
+		int status = 0;
+		pid_t pid = fork();
+		ASSERT_GE(pid, 0);
+
+		if (pid == 0) {
+			/* Child: create multiple namespace types */
+			if (setup_userns() < 0)
+				exit(1);
+
+			/* Create additional namespace types */
+			if (unshare(CLONE_NEWNET) < 0)
+				exit(1);
+			if (unshare(CLONE_NEWUTS) < 0)
+				exit(1);
+			if (unshare(CLONE_NEWIPC) < 0)
+				exit(1);
+
+			exit(0);
+		}
+
+		/*
+		 * Parent: wait for child. Check the waitpid() result so that
+		 * status is only inspected when it was actually filled in.
+		 * Only normal termination is asserted here; the child's exit
+		 * code is not checked.
+		 */
+		ASSERT_EQ(waitpid(pid, &status, 0), pid);
+		ASSERT_TRUE(WIFEXITED(status));
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test nested namespace creation under stress.
+ * Create deeply nested namespace hierarchies and verify proper cleanup.
+ */
+TEST(nested_namespace_stress)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	int i;
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+	/* Create 20 processes, each with nested user namespaces */
+	for (i = 0; i < 20; i++) {
+		int status = 0;
+		pid_t pid = fork();
+		ASSERT_GE(pid, 0);
+
+		if (pid == 0) {
+			int userns_fd;
+			uid_t orig_uid = getuid();
+			int depth;
+
+			/*
+			 * Create nested user namespaces (up to 5 levels).
+			 * The first level is requested with the original uid,
+			 * every deeper level with 0.
+			 */
+			for (depth = 0; depth < 5; depth++) {
+				userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1);
+				if (userns_fd < 0)
+					exit(1);
+
+				if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+					close(userns_fd);
+					exit(1);
+				}
+				close(userns_fd);
+			}
+
+			exit(0);
+		}
+
+		/*
+		 * Parent: wait for child. Check the waitpid() result so that
+		 * status is only inspected when it was actually filled in.
+		 * Only normal termination is asserted here; the child's exit
+		 * code is not checked.
+		 */
+		ASSERT_EQ(waitpid(pid, &status, 0), pid);
+		ASSERT_TRUE(WIFEXITED(status));
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test listns() pagination under stress.
+ * Create many namespaces and verify pagination works correctly.
+ */ +TEST(listns_pagination_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[30]; + int num_children = 30; + int i; + int sv[2]; + __u64 all_ns_ids[512]; + int total_found = 0; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* Paginate through all namespaces using small batch sizes */ + req.ns_id = 0; + while (1) { + __u64 batch[5]; /* Small batch size to force pagination */ + ssize_t ret; + + ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + if (ret == 0) + break; + + /* Store results */ + for (i = 0; i < ret && total_found < 512; i++) { + all_ns_ids[total_found++] = batch[i]; + } + + /* Update cursor for next batch */ + if (ret == ARRAY_SIZE(batch)) + req.ns_id = batch[ret - 1]; + else + break; + } + + 
TH_LOG("Paginated through %d user namespaces", total_found); + + /* Verify no duplicates in pagination */ + for (i = 0; i < total_found; i++) { + for (int j = i + 1; j < total_found; j++) { + if (all_ns_ids[i] == all_ns_ids[j]) { + TH_LOG("Found duplicate ns_id: %llu at positions %d and %d", + (unsigned long long)all_ns_ids[i], i, j); + ASSERT_TRUE(false); + } + } + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +/* + * Test concurrent namespace operations. + * Multiple processes creating, querying, and destroying namespaces concurrently. + */ +TEST(concurrent_namespace_operations) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + pid_t pids[20]; + int num_workers = 20; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Create worker processes that do concurrent operations */ + for (i = 0; i < num_workers; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Each worker: create namespaces, list them, repeat */ + int iterations; + + for (iterations = 0; iterations < 10; iterations++) { + int userns_fd; + __u64 temp_ns_ids[100]; + ssize_t ret; + + /* Create a user namespace */ + userns_fd = get_userns_fd(0, getuid(), 1); + if 
(userns_fd < 0) + continue; + + /* List namespaces */ + ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0); + (void)ret; + + close(userns_fd); + + /* Small delay */ + usleep(1000); + } + + exit(0); + } + } + + /* Wait for all workers */ + for (i = 0; i < num_workers; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After concurrent operations: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test namespace churn - continuous creation and destruction. + * Simulates high-churn scenarios like container orchestration. + */ +TEST(namespace_churn) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int cycle; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Simulate churn: batches of namespaces created and destroyed */ + for (cycle = 0; cycle < 10; cycle++) { + pid_t batch_pids[10]; + int i; + + /* Create batch */ + for (i = 0; i < 10; i++) { + batch_pids[i] = fork(); + ASSERT_GE(batch_pids[i], 0); + + if (batch_pids[i] == 0) { + /* Create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + + /* Keep namespaces alive briefly */ + usleep(10000); + exit(0); + } + } + + /* Wait for batch to complete */ + for (i = 0; i < 10; i++) { + int status; + 
waitpid(batch_pids[i], &status, 0); + } + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h new file mode 100644 index 000000000000..9741a64a5b1d --- /dev/null +++ b/tools/testing/selftests/namespaces/wrappers.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/nsfs.h> +#include <linux/types.h> +#include <sys/syscall.h> +#include <unistd.h> + +#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__ +#define __SELFTESTS_NAMESPACES_WRAPPERS_H__ + +#ifndef __NR_listns + #if defined __alpha__ + #define __NR_listns 580 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_listns 4470 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_listns 6470 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_listns 5470 + #endif + #else + #define __NR_listns 470 + #endif +#endif + +static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids, + size_t nr_ns_ids, unsigned int flags) +{ + return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags); +} + +#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */ |
