11 files changed, 819 insertions, 69 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6ae5f48cf64e..b758a9dd7526 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -250,12 +250,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
-	.ns.__ns_ref	= REFCOUNT_INIT(2),
+	.ns		= NS_COMMON_INIT(init_cgroup_ns, 2),
 	.user_ns	= &init_user_ns,
-	.ns.ops		= &cgroupns_operations,
-	.ns.inum	= ns_init_inum(&init_cgroup_ns),
 	.root_cset	= &init_css_set,
-	.ns.ns_type	= ns_common_type(&init_cgroup_ns),
 };
 
 static struct file_system_type cgroup2_fs_type;
@@ -1522,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
 	} else {
 		/*
 		 * NOTE: This function may be called from bpf_cgroup_from_id()
-		 * on a task which has already passed exit_task_namespaces() and
-		 * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
-		 * cgroups visible for lookups.
+		 * on a task which has already passed exit_nsproxy_namespaces()
+		 * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+		 * make all cgroups visible for lookups.
 		 */
 		return &cgrp_dfl_root.cgrp;
 	}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index fdbe57578e68..db9617556dd7 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
 	ret = ns_common_init(new_ns);
 	if (ret)
 		return ERR_PTR(ret);
-	ns_tree_add(new_ns);
 	return no_free_ptr(new_ns);
 }
 
@@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags,
 	new_ns->ucounts = ucounts;
 	new_ns->root_cset = cset;
 
+	ns_tree_add(new_ns);
 	return new_ns;
 }
 
diff --git a/kernel/cred.c b/kernel/cred.c
index dbf6b687dc5c..a6e7f580df14 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -306,6 +306,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
 		kdebug("share_creds(%p{%ld})",
 		       p->cred, atomic_long_read(&p->cred->usage));
 		inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+		get_cred_namespaces(p);
 		return 0;
 	}
 
@@ -343,6 +344,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
 
 	p->cred = p->real_cred = get_cred(new);
 	inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+	get_cred_namespaces(p);
+
 	return 0;
 
 error_put:
@@ -435,10 +438,13 @@ int commit_creds(struct cred *new)
 	 */
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+	if (new->user_ns != old->user_ns)
+		switch_cred_namespaces(old, new);
 
 	/* send notifications */
 	if (!uid_eq(new->uid,   old->uid)  ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..988e16efd66b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -291,6 +291,7 @@ repeat:
 	write_unlock_irq(&tasklist_lock);
 	/* @thread_pid can't go away until free_pids() below */
 	proc_flush_pid(thread_pid);
+	exit_cred_namespaces(p);
 	add_device_randomness(&p->se.sum_exec_runtime,
 			      sizeof(p->se.sum_exec_runtime));
 	free_pids(post.pids);
@@ -962,7 +963,7 @@ void __noreturn do_exit(long code)
 	exit_fs(tsk);
 	if (group_dead)
 		disassociate_ctty(1);
-	exit_task_namespaces(tsk);
+	exit_nsproxy_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..f1857672426e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2453,7 +2453,7 @@ bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
 bad_fork_cleanup_namespaces:
-	exit_task_namespaces(p);
+	exit_nsproxy_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm) {
 		mm_clear_owner(p->mm, p);
@@ -2487,6 +2487,7 @@ bad_fork_cleanup_delayacct:
 	delayacct_tsk_free(p);
 bad_fork_cleanup_count:
 	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+	exit_cred_namespaces(p);
 	exit_creds(p);
 bad_fork_free:
 	WRITE_ONCE(p->__state, TASK_DEAD);
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index c1fb2bad6d72..6fe1c747fa46 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -1,7 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
 
 #include <linux/ns_common.h>
 #include <linux/proc_ns.h>
+#include <linux/user_namespace.h>
 #include <linux/vfsdebug.h>
 
 #ifdef CONFIG_DEBUG_VFS
@@ -52,13 +54,21 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
 
 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
 {
+	int ret;
+
 	refcount_set(&ns->__ns_ref, 1);
 	ns->stashed = NULL;
 	ns->ops = ops;
 	ns->ns_id = 0;
 	ns->ns_type = ns_type;
 	RB_CLEAR_NODE(&ns->ns_tree_node);
+	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
+	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
 	INIT_LIST_HEAD(&ns->ns_list_node);
+	INIT_LIST_HEAD(&ns->ns_unified_list_node);
+	ns->ns_owner_tree = RB_ROOT;
+	INIT_LIST_HEAD(&ns->ns_owner);
+	INIT_LIST_HEAD(&ns->ns_owner_entry);
 
 #ifdef CONFIG_DEBUG_VFS
 	ns_debug(ns, ops);
@@ -68,10 +78,219 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 		ns->inum = inum;
 		return 0;
 	}
-	return proc_alloc_inum(&ns->inum);
+	ret = proc_alloc_inum(&ns->inum);
+	if (ret)
+		return ret;
+	/*
+	 * Tree ref starts at 0. It's incremented when namespace enters
+	 * active use (installed in nsproxy) and decremented when all
+	 * active uses are gone. Initial namespaces are always active.
+	 */
+	if (is_initial_namespace(ns))
+		atomic_set(&ns->__ns_ref_active, 1);
+	else
+		atomic_set(&ns->__ns_ref_active, 0);
+	return 0;
 }
 
 void __ns_common_free(struct ns_common *ns)
 {
 	proc_free_inum(ns->inum);
 }
+
+struct ns_common *__must_check ns_owner(struct ns_common *ns)
+{
+	struct user_namespace *owner;
+
+	if (unlikely(!ns->ops))
+		return NULL;
+	VFS_WARN_ON_ONCE(!ns->ops->owner);
+	owner = ns->ops->owner(ns);
+	VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
+	if (!owner)
+		return NULL;
+	/* Skip init_user_ns as it's always active */
+	if (owner == &init_user_ns)
+		return NULL;
+	return to_ns_common(owner);
+}
+
+void __ns_ref_active_get_owner(struct ns_common *ns)
+{
+	ns = ns_owner(ns);
+	if (ns)
+		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down.
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        +      +
+ *                        user_ns1 (2)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   +   +
+ *                        user_ns2 (3)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        -      -
+ *                        user_ns1 (0)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   -   +
+ *                        user_ns2 (2)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * The iteration stops once we reach a namespace that still has active
+ * references.
+ */
+void __ns_ref_active_put_owner(struct ns_common *ns)
+{
+	for (;;) {
+		ns = ns_owner(ns);
+		if (!ns)
+			return;
+		if (!atomic_dec_and_test(&ns->__ns_ref_active))
+			return;
+	}
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down. This makes it possible to efficiently
+ * resurrect a namespace tree:
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        +      +
+ *                        user_ns1 (2)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   +   +
+ *                        user_ns2 (3)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        -      -
+ *                        user_ns1 (0)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        +   -   +
+ *                        user_ns2 (2)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * Assume the whole tree is dead but all namespaces are still active:
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        -      -
+ *                        user_ns1 (0)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        -   -   -
+ *                        user_ns2 (0)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
+ *
+ *                 net_ns          pid_ns
+ *                       \        /
+ *                        +      -
+ *                        user_ns1 (1)
+ *                            |
+ *                 ipc_ns     |     uts_ns
+ *                       \    |    /
+ *                        -   +   -
+ *                        user_ns2 (1)
+ *                            |
+ *            cgroup_ns       |       mnt_ns
+ *                     \      |      /
+ *                      x     x     x
+ *                      init_user_ns (1)
+ *
+ * If net_ns had a zero reference count and we bumped it we also need to
+ * take another reference on its owning user namespace. Similarly, if
+ * pid_ns had a zero reference count it also needs to take another
+ * reference on its owning user namespace. So both net_ns and pid_ns
+ * will each have their own reference on the owning user namespace.
+ *
+ * If the owning user namespace user_ns1 had a zero reference count then
+ * it also needs to take another reference on its owning user namespace
+ * and so on.
+ */
+void __ns_ref_active_resurrect(struct ns_common *ns)
+{
+	/* If we didn't resurrect the namespace we're done. */
+	if (atomic_fetch_add(1, &ns->__ns_ref_active))
+		return;
+
+	/*
+	 * We did resurrect it. Walk the ownership hierarchy upwards
+	 * until we found an owning user namespace that is active.
+	 */
+	for (;;) {
+		ns = ns_owner(ns);
+		if (!ns)
+			return;
+
+		if (atomic_fetch_add(1, &ns->__ns_ref_active))
+			return;
+	}
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 19aa64ab08c8..94c2cfe0afa1 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
 #include <linux/syscalls.h>
 #include <linux/cgroup.h>
 #include <linux/perf_event.h>
+#include <linux/nstree.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -179,12 +180,15 @@ int copy_namespaces(u64 flags, struct task_struct *tsk)
 	if ((flags & CLONE_VM) == 0)
 		timens_on_fork(new_ns, tsk);
 
+	nsproxy_ns_active_get(new_ns);
 	tsk->nsproxy = new_ns;
 	return 0;
 }
 
 void free_nsproxy(struct nsproxy *ns)
 {
+	nsproxy_ns_active_put(ns);
+
 	put_mnt_ns(ns->mnt_ns);
 	put_uts_ns(ns->uts_ns);
 	put_ipc_ns(ns->ipc_ns);
@@ -232,6 +236,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 
 	might_sleep();
 
+	if (new)
+		nsproxy_ns_active_get(new);
+
 	task_lock(p);
 	ns = p->nsproxy;
 	p->nsproxy = new;
@@ -241,11 +248,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 		put_nsproxy(ns);
 }
 
-void exit_task_namespaces(struct task_struct *p)
+void exit_nsproxy_namespaces(struct task_struct *p)
 {
 	switch_task_namespaces(p, NULL);
 }
 
+void switch_cred_namespaces(const struct cred *old, const struct cred *new)
+{
+	ns_ref_active_get(new->user_ns);
+	ns_ref_active_put(old->user_ns);
+}
+
+void get_cred_namespaces(struct task_struct *tsk)
+{
+	ns_ref_active_get(tsk->real_cred->user_ns);
+}
+
+void exit_cred_namespaces(struct task_struct *tsk)
+{
+	ns_ref_active_put(tsk->real_cred->user_ns);
+}
+
 int exec_task_namespaces(void)
 {
 	struct task_struct *tsk = current;
diff --git a/kernel/nstree.c b/kernel/nstree.c
index b24a320a11a6..4a8838683b6b 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -1,34 +1,38 @@
 // SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
 
 #include <linux/nstree.h>
 #include <linux/proc_ns.h>
+#include <linux/rculist.h>
 #include <linux/vfsdebug.h>
+#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
+
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
+static struct rb_root ns_unified_tree = RB_ROOT; /* protected by ns_tree_lock */
+static LIST_HEAD(ns_unified_list); /* protected by ns_tree_lock */
 
 /**
  * struct ns_tree - Namespace tree
  * @ns_tree: Rbtree of namespaces of a particular type
  * @ns_list: Sequentially walkable list of all namespaces of this type
- * @ns_tree_lock: Seqlock to protect the tree and list
  * @type: type of namespaces in this tree
  */
 struct ns_tree {
-       struct rb_root ns_tree;
-       struct list_head ns_list;
-       seqlock_t ns_tree_lock;
-       int type;
+	struct rb_root ns_tree;
+	struct list_head ns_list;
+	int type;
 };
 
 struct ns_tree mnt_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWNS,
 };
 
 struct ns_tree net_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWNET,
 };
 EXPORT_SYMBOL_GPL(net_ns_tree);
@@ -36,47 +40,39 @@ EXPORT_SYMBOL_GPL(net_ns_tree);
 struct ns_tree uts_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWUTS,
 };
 
 struct ns_tree user_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWUSER,
 };
 
 struct ns_tree ipc_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWIPC,
 };
 
 struct ns_tree pid_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWPID,
 };
 
 struct ns_tree cgroup_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWCGROUP,
 };
 
 struct ns_tree time_ns_tree = {
 	.ns_tree = RB_ROOT,
 	.ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
-	.ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
 	.type = CLONE_NEWTIME,
 };
 
-DEFINE_COOKIE(namespace_cookie);
-
 static inline struct ns_common *node_to_ns(const struct rb_node *node)
 {
 	if (!node)
@@ -84,30 +80,54 @@ static inline struct ns_common *node_to_ns(const struct rb_node *node)
 	return rb_entry(node, struct ns_common, ns_tree_node);
 }
 
-static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
 {
-	struct ns_common *ns_a = node_to_ns(a);
-	struct ns_common *ns_b = node_to_ns(b);
-	u64 ns_id_a = ns_a->ns_id;
-	u64 ns_id_b = ns_b->ns_id;
+	if (!node)
+		return NULL;
+	return rb_entry(node, struct ns_common, ns_unified_tree_node);
+}
 
-	if (ns_id_a < ns_id_b)
+static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
+{
+	if (!node)
+		return NULL;
+	return rb_entry(node, struct ns_common, ns_owner_tree_node);
+}
+
+static int ns_id_cmp(u64 id_a, u64 id_b)
+{
+	if (id_a < id_b)
 		return -1;
-	if (ns_id_a > ns_id_b)
+	if (id_a > id_b)
 		return 1;
 	return 0;
 }
 
+static int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+	return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id);
+}
+
+static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
+{
+	return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id);
+}
+
+static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
+{
+	return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id);
+}
+
 void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 {
 	struct rb_node *node, *prev;
+	const struct proc_ns_operations *ops = ns->ops;
 
 	VFS_WARN_ON_ONCE(!ns->ns_id);
-
-	write_seqlock(&ns_tree->ns_tree_lock);
-
 	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
 
+	write_seqlock(&ns_tree_lock);
+
 	node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
 	/*
 	 * If there's no previous entry simply add it after the
@@ -119,22 +139,83 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 	else
 		list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
 
-	write_sequnlock(&ns_tree->ns_tree_lock);
+	/* Add to unified tree and list */
+	rb_find_add_rcu(&ns->ns_unified_tree_node, &ns_unified_tree, ns_cmp_unified);
+	prev = rb_prev(&ns->ns_unified_tree_node);
+	if (!prev)
+		list_add_rcu(&ns->ns_unified_list_node, &ns_unified_list);
+	else
+		list_add_rcu(&ns->ns_unified_list_node, &node_to_ns_unified(prev)->ns_unified_list_node);
+
+	if (ops) {
+		struct user_namespace *user_ns;
+
+		VFS_WARN_ON_ONCE(!ops->owner);
+		user_ns = ops->owner(ns);
+		if (user_ns) {
+			struct ns_common *owner = &user_ns->ns;
+			VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+			/* Insert into owner's rbtree */
+			rb_find_add_rcu(&ns->ns_owner_tree_node, &owner->ns_owner_tree, ns_cmp_owner);
+
+			/* Insert into owner's list in sorted order */
+			prev = rb_prev(&ns->ns_owner_tree_node);
+			if (!prev)
+				list_add_rcu(&ns->ns_owner_entry, &owner->ns_owner);
+			else
+				list_add_rcu(&ns->ns_owner_entry, &node_to_ns_owner(prev)->ns_owner_entry);
+		} else {
+			/* Only the initial user namespace doesn't have an owner. */
+			VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
+		}
+	}
+	write_sequnlock(&ns_tree_lock);
 
 	VFS_WARN_ON_ONCE(node);
+
+	/*
+	 * Take an active reference on the owner namespace. This ensures
+	 * that the owner remains visible while any of its child namespaces
+	 * are active. For init namespaces this is a no-op as ns_owner()
+	 * returns NULL for namespaces owned by init_user_ns.
+	 */
+	__ns_ref_active_get_owner(ns);
 }
 
 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
 {
+	const struct proc_ns_operations *ops = ns->ops;
+	struct user_namespace *user_ns;
+
 	VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
 	VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
 	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
 
-	write_seqlock(&ns_tree->ns_tree_lock);
+	write_seqlock(&ns_tree_lock);
 	rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
-	list_bidir_del_rcu(&ns->ns_list_node);
 	RB_CLEAR_NODE(&ns->ns_tree_node);
-	write_sequnlock(&ns_tree->ns_tree_lock);
+
+	list_bidir_del_rcu(&ns->ns_list_node);
+
+	rb_erase(&ns->ns_unified_tree_node, &ns_unified_tree);
+	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
+
+	list_bidir_del_rcu(&ns->ns_unified_list_node);
+
+	/* Remove from owner's rbtree if this namespace has an owner */
+	if (ops) {
+		user_ns = ops->owner(ns);
+		if (user_ns) {
+			struct ns_common *owner = &user_ns->ns;
+			rb_erase(&ns->ns_owner_tree_node, &owner->ns_owner_tree);
+			RB_CLEAR_NODE(&ns->ns_owner_tree_node);
+		}
+
+		list_bidir_del_rcu(&ns->ns_owner_entry);
+	}
+
+	write_sequnlock(&ns_tree_lock);
 }
 EXPORT_SYMBOL_GPL(__ns_tree_remove);
 
@@ -150,6 +231,17 @@ static int ns_find(const void *key, const struct rb_node *node)
 	return 0;
 }
 
+/*
+ * rb_find_rcu() comparator for the unified tree: order the looked-up
+ * namespace id in @key against the id of the namespace embedding @node.
+ */
+static int ns_find_unified(const void *key, const struct rb_node *node)
+{
+	const u64 ns_id = *(const u64 *)key;
+	const struct ns_common *ns = node_to_ns_unified(node);
+
+	return ns_id_cmp(ns_id, ns->ns_id);
+}
 
 static struct ns_tree *ns_tree_from_type(int ns_type)
 {
@@ -175,31 +267,49 @@ static struct ns_tree *ns_tree_from_type(int ns_type)
 	return NULL;
 }
 
-struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
 {
-	struct ns_tree *ns_tree;
 	struct rb_node *node;
 	unsigned int seq;
 
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+	do {
+		seq = read_seqbegin(&ns_tree_lock);
+		node = rb_find_rcu(&ns_id, &ns_unified_tree, ns_find_unified);
+		if (node)
+			break;
+	} while (read_seqretry(&ns_tree_lock, seq));
+
+	return node_to_ns_unified(node);
+}
+
+static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+	struct ns_tree *ns_tree;
+	struct rb_node *node;
+	unsigned int seq;
 
 	ns_tree = ns_tree_from_type(ns_type);
 	if (!ns_tree)
 		return NULL;
 
 	do {
-		seq = read_seqbegin(&ns_tree->ns_tree_lock);
+		seq = read_seqbegin(&ns_tree_lock);
 		node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
 		if (node)
 			break;
-	} while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+	} while (read_seqretry(&ns_tree_lock, seq));
 
-	if (!node)
-		return NULL;
+	return node_to_ns(node);
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
 
-	VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+	if (ns_type)
+		return __ns_tree_lookup_rcu(ns_id, ns_type);
 
-	return node_to_ns(node);
+	return __ns_unified_tree_lookup_rcu(ns_id);
 }
 
 /**
@@ -233,15 +343,416 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
 /**
  * ns_tree_gen_id - generate a new namespace id
  * @ns: namespace to generate id for
+ * @id: if non-zero, this is the initial namespace and this is a fixed id
  *
  * Generates a new namespace id and assigns it to the namespace. All
  * namespaces types share the same id space and thus can be compared
  * directly. IOW, when two ids of two namespace are equal, they are
  * identical.
  */
-u64 ns_tree_gen_id(struct ns_common *ns)
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
 {
-	guard(preempt)();
-	ns->ns_id = gen_cookie_next(&namespace_cookie);
+	static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1);
+
+	if (id)
+		ns->ns_id = id;
+	else
+		ns->ns_id = atomic64_inc_return(&namespace_cookie);
 	return ns->ns_id;
 }
+
+struct klistns {
+	u64 __user *uns_ids;
+	u32 nr_ns_ids;
+	u64 last_ns_id;
+	u64 user_ns_id;
+	u32 ns_type;
+	struct user_namespace *user_ns;
+	bool userns_capable;
+	struct ns_common *first_ns;
+};
+
+static void __free_klistns_free(const struct klistns *kls)
+{
+	if (kls->user_ns_id != LISTNS_CURRENT_USER)
+		put_user_ns(kls->user_ns);
+	if (kls->first_ns && kls->first_ns->ops)
+		kls->first_ns->ops->put(kls->first_ns);
+}
+
+#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS)
+
+static int copy_ns_id_req(const struct ns_id_req __user *req,
+			  struct ns_id_req *kreq)
+{
+	int ret;
+	size_t usize;
+
+	BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0);
+
+	ret = get_user(usize, &req->size);
+	if (ret)
+		return -EFAULT;
+	if (unlikely(usize > PAGE_SIZE))
+		return -E2BIG;
+	if (unlikely(usize < NS_ID_REQ_SIZE_VER0))
+		return -EINVAL;
+	memset(kreq, 0, sizeof(*kreq));
+	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+	if (ret)
+		return ret;
+	if (kreq->spare != 0)
+		return -EINVAL;
+	if (kreq->ns_type & ~NS_ALL)
+		return -EOPNOTSUPP;
+	return 0;
+}
+
+static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq,
+				  u64 __user *ns_ids, size_t nr_ns_ids)
+{
+	kls->last_ns_id = kreq->ns_id;
+	kls->user_ns_id = kreq->user_ns_id;
+	kls->nr_ns_ids	= nr_ns_ids;
+	kls->ns_type	= kreq->ns_type;
+	kls->uns_ids	= ns_ids;
+	return 0;
+}
+
+/*
+ * Lookup a namespace owned by owner with id >= ns_id.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
+{
+	struct ns_common *ret = NULL;
+	struct rb_node *node;
+
+	VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+	read_seqlock_excl(&ns_tree_lock);
+	node = owner->ns_owner_tree.rb_node;
+
+	while (node) {
+		struct ns_common *ns;
+
+		ns = node_to_ns_owner(node);
+		if (ns_id <= ns->ns_id) {
+			ret = ns;
+			if (ns_id == ns->ns_id)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+
+	if (ret)
+		ret = ns_get_unless_inactive(ret);
+	read_sequnlock_excl(&ns_tree_lock);
+	return ret;
+}
+
+static struct ns_common *lookup_ns_id(u64 ns_id, int ns_type)
+{
+	struct ns_common *ns;
+
+	/* The tree lookup and the reference grab share one RCU section. */
+	guard(rcu)();
+	ns = ns_tree_lookup_rcu(ns_id, ns_type);
+	if (!ns)
+		return NULL;
+	/* Report "not found" if ns_get_unless_inactive() refuses @ns. */
+	if (!ns_get_unless_inactive(ns))
+		return NULL;
+	return ns;
+}
+
+static inline bool __must_check ns_requested(const struct klistns *kls,
+					     const struct ns_common *ns)
+{
+	return !kls->ns_type || (kls->ns_type & ns->ns_type);
+}
+
+static inline bool __must_check may_list_ns(const struct klistns *kls,
+					    struct ns_common *ns)
+{
+	if (kls->user_ns) {
+		if (kls->userns_capable)
+			return true;
+	} else {
+		struct ns_common *owner;
+		struct user_namespace *user_ns;
+
+		owner = ns_owner(ns);
+		if (owner)
+			user_ns = to_user_ns(owner);
+		else
+			user_ns = &init_user_ns;
+		if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
+			return true;
+	}
+
+	if (is_current_namespace(ns))
+		return true;
+
+	if (ns->ns_type != CLONE_NEWUSER)
+		return false;
+
+	if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN))
+		return true;
+
+	return false;
+}
+
+static void __ns_put(struct ns_common *ns)
+{
+	if (ns->ops)
+		ns->ops->put(ns);
+}
+
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T))
+
+static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
+							   struct ns_common *candidate)
+{
+	struct ns_common *ns __free(ns_put) = NULL;
+
+	if (!ns_requested(kls, candidate))
+		return NULL;
+
+	ns = ns_get_unless_inactive(candidate);
+	if (!ns)
+		return NULL;
+
+	if (!may_list_ns(kls, ns))
+		return NULL;
+
+	return no_free_ptr(ns);
+}
+
+static ssize_t do_listns_userns(struct klistns *kls)
+{
+	u64 __user *ns_ids = kls->uns_ids;
+	size_t nr_ns_ids = kls->nr_ns_ids;
+	struct ns_common *ns = NULL, *first_ns = NULL;
+	const struct list_head *head;
+	ssize_t ret;
+	/* Returns the number of ids stored in kls->uns_ids, or -errno. */
+	VFS_WARN_ON_ONCE(!kls->user_ns_id);
+	/* Resolve the user namespace whose owned namespaces get listed. */
+	if (kls->user_ns_id == LISTNS_CURRENT_USER)
+		ns = to_ns_common(current_user_ns());
+	else if (kls->user_ns_id)
+		ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER);
+	if (!ns)
+		return -EINVAL;
+	kls->user_ns = to_user_ns(ns);
+
+	/*
+	 * Use the rbtree to find the first namespace we care about and
+	 * then use its list entry to iterate from there.
+	 */
+	if (kls->last_ns_id) {
+		kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns);
+		if (!kls->first_ns)
+			return -ENOENT;
+		first_ns = kls->first_ns;
+	}
+
+	ret = 0;
+	head = &to_ns_common(kls->user_ns)->ns_owner;
+	kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
+
+	rcu_read_lock();
+
+	if (!first_ns)
+		first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry);
+	for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids;
+	     ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) {
+		struct ns_common *valid __free(ns_put);
+
+		valid = legitimize_ns(kls, ns);
+		if (!valid)
+			continue;
+		/* Write to userspace outside RCU; @valid holds a reference. */
+		rcu_read_unlock();
+
+		if (put_user(valid->ns_id, ns_ids + ret))
+			return -EFAULT;
+		nr_ns_ids--;
+		ret++;
+
+		rcu_read_lock();
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
+{
+	struct ns_common *ret = NULL;
+	struct ns_tree *ns_tree = NULL;
+	struct rb_node *node;
+
+	if (ns_type) {
+		ns_tree = ns_tree_from_type(ns_type);
+		if (!ns_tree)
+			return NULL;
+	}
+
+	read_seqlock_excl(&ns_tree_lock);
+	if (ns_tree)
+		node = ns_tree->ns_tree.rb_node;
+	else
+		node = ns_unified_tree.rb_node;
+
+	while (node) {
+		struct ns_common *ns;
+
+		if (ns_type)
+			ns = node_to_ns(node);
+		else
+			ns = node_to_ns_unified(node);
+
+		if (ns_id <= ns->ns_id) {
+			/*
+			 * Candidate found; a smaller match can only be left.
+			 */
+			ret = ns;
+			if (ns_id == ns->ns_id)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+
+	if (ret)
+		ret = ns_get_unless_inactive(ret);
+	read_sequnlock_excl(&ns_tree_lock);
+	return ret;
+}
+
+static inline struct ns_common *first_ns_common(const struct list_head *head,
+						struct ns_tree *ns_tree)
+{
+	if (ns_tree)
+		return list_entry_rcu(head->next, struct ns_common, ns_list_node);
+	return list_entry_rcu(head->next, struct ns_common, ns_unified_list_node);
+}
+
+static inline struct ns_common *next_ns_common(struct ns_common *ns,
+					       struct ns_tree *ns_tree)
+{
+	if (ns_tree)
+		return list_entry_rcu(ns->ns_list_node.next, struct ns_common, ns_list_node);
+	return list_entry_rcu(ns->ns_unified_list_node.next, struct ns_common, ns_unified_list_node);
+}
+
+static inline bool ns_common_is_head(struct ns_common *ns,
+				     const struct list_head *head,
+				     struct ns_tree *ns_tree)
+{
+	if (ns_tree)
+		return &ns->ns_list_node == head;
+	return &ns->ns_unified_list_node == head;
+}
+
+static ssize_t do_listns(struct klistns *kls)
+{
+	u64 __user *ns_ids = kls->uns_ids;
+	size_t nr_ns_ids = kls->nr_ns_ids;
+	struct ns_common *ns, *first_ns = NULL;
+	struct ns_tree *ns_tree = NULL;
+	const struct list_head *head;
+	u32 ns_type;
+	ssize_t ret;
+	/* Exactly one type bit: walk that type's tree, else the unified one. */
+	if (hweight32(kls->ns_type) == 1)
+		ns_type = kls->ns_type;
+	else
+		ns_type = 0;
+
+	if (ns_type) {
+		ns_tree = ns_tree_from_type(ns_type);
+		if (!ns_tree)
+			return -EINVAL;
+	}
+	/* Start at the first id above kls->last_ns_id, if one was given. */
+	if (kls->last_ns_id) {
+		kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type);
+		if (!kls->first_ns)
+			return -ENOENT;
+		first_ns = kls->first_ns;
+	}
+
+	ret = 0;
+	if (ns_tree)
+		head = &ns_tree->ns_list;
+	else
+		head = &ns_unified_list;
+
+	rcu_read_lock();
+
+	if (!first_ns)
+		first_ns = first_ns_common(head, ns_tree);
+
+	for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
+	     ns = next_ns_common(ns, ns_tree)) {
+		struct ns_common *valid __free(ns_put);
+
+		valid = legitimize_ns(kls, ns);
+		if (!valid)
+			continue;
+		/* Write to userspace outside RCU; @valid holds a reference. */
+		rcu_read_unlock();
+
+		if (put_user(valid->ns_id, ns_ids + ret))
+			return -EFAULT;
+
+		nr_ns_ids--;
+		ret++;
+
+		rcu_read_lock();
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req,
+		u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags)
+{
+	struct klistns klns __free(klistns_free) = {};
+	const size_t maxcount = 1000000;
+	struct ns_id_req kreq;
+	ssize_t ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (unlikely(nr_ns_ids > maxcount))
+		return -EOVERFLOW;
+
+	if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids)))
+		return -EFAULT;
+
+	ret = copy_ns_id_req(req, &kreq);
+	if (ret)
+		return ret;
+
+	ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids);
+	if (ret)
+		return ret;
+
+	if (kreq.user_ns_id)
+		return do_listns_userns(&klns);
+
+	return do_listns(&klns);
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fffec767a63..a5a63dc0a491 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT;
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
-	.ns.__ns_ref = REFCOUNT_INIT(2),
+	.ns = NS_COMMON_INIT(init_pid_ns, 2),
 	.idr = IDR_INIT(init_pid_ns.idr),
 	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
-	.ns.inum = ns_init_inum(&init_pid_ns),
-#ifdef CONFIG_PID_NS
-	.ns.ops = &pidns_operations,
-#endif
 	.pid_max = PID_MAX_DEFAULT,
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
 #endif
-	.ns.ns_type = ns_common_type(&init_pid_ns),
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
@@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp)
 void free_pid(struct pid *pid)
 {
 	int i;
+	struct pid_namespace *active_ns;
 
 	lockdep_assert_not_held(&tasklist_lock);
 
+	active_ns = pid->numbers[pid->level].ns;
+	ns_ref_active_put(active_ns);
+
 	spin_lock(&pidmap_lock);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
@@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 	}
 	spin_unlock(&pidmap_lock);
 	idr_preload_end();
+	ns_ref_active_get(ns);
 
 	return pid;
 
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5b6997f4dc3d..19911f88e2b8 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = {
 };
 
 struct time_namespace init_time_ns = {
-	.ns.ns_type	= ns_common_type(&init_time_ns),
-	.ns.__ns_ref	= REFCOUNT_INIT(3),
+	.ns		= NS_COMMON_INIT(init_time_ns, 3),
 	.user_ns	= &init_user_ns,
-	.ns.inum	= ns_init_inum(&init_time_ns),
-	.ns.ops		= &timens_operations,
 	.frozen_offsets	= true,
 };
 
diff --git a/kernel/user.c b/kernel/user.c
index 0163665914c9..4b3132e786d9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
  * and 1 for... ?
  */
 struct user_namespace init_user_ns = {
+	.ns = NS_COMMON_INIT(init_user_ns, 3),
 	.uid_map = {
 		{
 			.extent[0] = {
@@ -65,14 +66,8 @@ struct user_namespace init_user_ns = {
 			.nr_extents = 1,
 		},
 	},
-	.ns.ns_type = ns_common_type(&init_user_ns),
-	.ns.__ns_ref = REFCOUNT_INIT(3),
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
-	.ns.inum = ns_init_inum(&init_user_ns),
-#ifdef CONFIG_USER_NS
-	.ns.ops = &userns_operations,
-#endif
 	.flags = USERNS_INIT_FLAGS,
 #ifdef CONFIG_KEYS
 	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),