diff options
Diffstat (limited to 'kernel/nscommon.c')
| -rw-r--r-- | kernel/nscommon.c | 246 |
1 files changed, 240 insertions, 6 deletions
diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c1fb2bad6d72..bdc3c86231d3 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/ns_common.h> +#include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/user_namespace.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS @@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { + int ret = 0; + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); + ns_tree_node_init(&ns->ns_tree_node); + ns_tree_node_init(&ns->ns_unified_node); + ns_tree_node_init(&ns->ns_owner_node); + ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - return proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * The active reference count (__ns_ref_active) starts at 0. It's + * incremented when the namespace enters active use (installed in + * nsproxy) and decremented when all active uses are gone. Initial + * namespaces are always active. 
+ */ + if (is_ns_init_inum(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } + +struct ns_common *__must_check ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. + * + * A regular namespace tree might look as follows: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put(struct ns_common *ns) +{ + /* Initial namespaces are always active. 
*/ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follows: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + 
* | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. + */ +void __ns_ref_active_get(struct ns_common *ns) +{ + int prev; + + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + /* If we didn't resurrect the namespace we're done. */ + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we find an owning user namespace that is active. + */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + } +} |
