summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 11:20:29 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 11:20:29 -0700
commit18b19abc3709b109676ffd1f48dcd332c2e477d4 (patch)
treed0d51d0a3d6f99e6082b42cf6d1ca710a46b6b49 /kernel
parent5484a4ea7a1f208b886b58dd55cc55f418930f8a (diff)
parent6e65f4e8fc5b02f7a60ebb5b1b83772df0b86663 (diff)
Merge tag 'namespace-6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull namespace updates from Christian Brauner: "This contains a larger set of changes around the generic namespace infrastructure of the kernel. Each specific namespace type (net, cgroup, mnt, ...) embedds a struct ns_common which carries the reference count of the namespace and so on. We open-coded and cargo-culted so many quirks for each namespace type that it just wasn't scalable anymore. So given there's a bunch of new changes coming in that area I've started cleaning all of this up. The core change is to make it possible to correctly initialize every namespace uniformly and derive the correct initialization settings from the type of the namespace such as namespace operations, namespace type and so on. This leaves the new ns_common_init() function with a single parameter which is the specific namespace type which derives the correct parameters statically. This also means the compiler will yell as soon as someone does something remotely fishy. The ns_common_init() addition also allows us to remove ns_alloc_inum() and drops any special-casing of the initial network namespace in the network namespace initialization code that Linus complained about. Another part is reworking the reference counting. The reference counting was open-coded and copy-pasted for each namespace type even though they all followed the same rules. This also removes all open accesses to the reference count and makes it private and only uses a very small set of dedicated helpers to manipulate them just like we do for e.g., files. In addition this generalizes the mount namespace iteration infrastructure introduced a few cycles ago. As reminder, the vfs makes it possible to iterate sequentially and bidirectionally through all mount namespaces on the system or all mount namespaces that the caller holds privilege over. This allow userspace to iterate over all mounts in all mount namespaces using the listmount() and statmount() system call. Each mount namespace has a unique identifier for the lifetime of the systems that is exposed to userspace. The network namespace also has a unique identifier working exactly the same way. This extends the concept to all other namespace types. The new nstree type makes it possible to lookup namespaces purely by their identifier and to walk the namespace list sequentially and bidirectionally for all namespace types, allowing userspace to iterate through all namespaces. Looking up namespaces in the namespace tree works completely locklessly. This also means we can move the mount namespace onto the generic infrastructure and remove a bunch of code and members from struct mnt_namespace itself. There's a bunch of stuff coming on top of this in the future but for now this uses the generic namespace tree to extend a concept introduced first for pidfs a few cycles ago. For a while now we have supported pidfs file handles for pidfds. This has proven to be very useful. This extends the concept to cover namespaces as well. It is possible to encode and decode namespace file handles using the common name_to_handle_at() and open_by_handle_at() apis. As with pidfs file handles, namespace file handles are exhaustive, meaning it is not required to actually hold a reference to nsfs in able to decode aka open_by_handle_at() a namespace file handle. Instead the FD_NSFS_ROOT constant can be passed which will let the kernel grab a reference to the root of nsfs internally and thus decode the file handle. Namespaces file descriptors can already be derived from pidfds which means they aren't subject to overmount protection bugs. IOW, it's irrelevant if the caller would not have access to an appropriate /proc/<pid>/ns/ directory as they could always just derive the namespace based on a pidfd already. It has the same advantage as pidfds. It's possible to reliably and for the lifetime of the system refer to a namespace without pinning any resources and to compare them trivially. Permission checking is kept simple. If the caller is located in the namespace the file handle refers to they are able to open it otherwise they must hold privilege over the owning namespace of the relevant namespace. The namespace file handle layout is exposed as uapi and has a stable and extensible format. For now it simply contains the namespace identifier, the namespace type, and the inode number. The stable format means that userspace may construct its own namespace file handles without going through name_to_handle_at() as they are already allowed for pidfs and cgroup file handles" * tag 'namespace-6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (65 commits) ns: drop assert ns: move ns type into struct ns_common nstree: make struct ns_tree private ns: add ns_debug() ns: simplify ns_common_init() further cgroup: add missing ns_common include ns: use inode initializer for initial namespaces selftests/namespaces: verify initial namespace inode numbers ns: rename to __ns_ref nsfs: port to ns_ref_*() helpers net: port to ns_ref_*() helpers uts: port to ns_ref_*() helpers ipv4: use check_net() net: use check_net() net-sysfs: use check_net() user: port to ns_ref_*() helpers time: port to ns_ref_*() helpers pid: port to ns_ref_*() helpers ipc: port to ns_ref_*() helpers cgroup: port to ns_ref_*() helpers ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cgroup/cgroup.c7
-rw-r--r--kernel/cgroup/namespace.c27
-rw-r--r--kernel/nscommon.c77
-rw-r--r--kernel/nsproxy.c4
-rw-r--r--kernel/nstree.c247
-rw-r--r--kernel/pid.c5
-rw-r--r--kernel/pid_namespace.c23
-rw-r--r--kernel/time/namespace.c32
-rw-r--r--kernel/user.c5
-rw-r--r--kernel/user_namespace.c24
-rw-r--r--kernel/utsname.c31
12 files changed, 393 insertions, 91 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index c60623448235..1f48f7cd2d7b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
- kthread.o sys_ni.o nsproxy.o \
+ kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 77d02f87f3f1..a0d5d62f1483 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -59,6 +59,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -241,11 +242,12 @@ static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_D
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
- .ns.inum = PROC_CGROUP_INIT_INO,
+ .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
+ .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -6336,6 +6338,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index dedadb525880..fdbe57578e68 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
/* cgroup namespaces */
@@ -21,29 +21,28 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
- struct cgroup_namespace *new_ns;
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
+ ret = ns_common_init(new_ns);
+ if (ret)
return ERR_PTR(ret);
- }
- refcount_set(&new_ns->ns.count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
+ ns_tree_add(new_ns);
+ return no_free_ptr(new_ns);
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
+ ns_tree_remove(ns);
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
EXPORT_SYMBOL(free_cgroup_ns);
@@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags,
return new_ns;
}
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns)
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
- .type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
new file mode 100644
index 000000000000..c1fb2bad6d72
--- /dev/null
+++ b/kernel/nscommon.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ns_common.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+#ifdef CONFIG_DEBUG_VFS
+static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ VFS_WARN_ON_ONCE(ops != &ipcns_operations);
+ break;
+#endif
+ case CLONE_NEWNS:
+ VFS_WARN_ON_ONCE(ops != &mntns_operations);
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ VFS_WARN_ON_ONCE(ops != &netns_operations);
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ VFS_WARN_ON_ONCE(ops != &pidns_operations);
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ VFS_WARN_ON_ONCE(ops != &timens_operations);
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ VFS_WARN_ON_ONCE(ops != &userns_operations);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ VFS_WARN_ON_ONCE(ops != &utsns_operations);
+ break;
+#endif
+ }
+}
+#endif
+
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
+{
+ refcount_set(&ns->__ns_ref, 1);
+ ns->stashed = NULL;
+ ns->ops = ops;
+ ns->ns_id = 0;
+ ns->ns_type = ns_type;
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ INIT_LIST_HEAD(&ns->ns_list_node);
+
+#ifdef CONFIG_DEBUG_VFS
+ ns_debug(ns, ops);
+#endif
+
+ if (inum) {
+ ns->inum = inum;
+ return 0;
+ }
+ return proc_alloc_inum(&ns->inum);
+}
+
+void __ns_common_free(struct ns_common *ns)
+{
+ proc_free_inum(ns->inum);
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8af3b9ec3aa8..19aa64ab08c8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (proc_ns_file(fd_file(f))) {
ns = get_proc_ns(file_inode(fd_file(f)));
- if (flags && (ns->ops->type != flags))
+ if (flags && (ns->ns_type != flags))
err = -EINVAL;
- flags = ns->ops->type;
+ flags = ns->ns_type;
} else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
err = check_setns_flags(flags);
} else {
diff --git a/kernel/nstree.c b/kernel/nstree.c
new file mode 100644
index 000000000000..b24a320a11a6
--- /dev/null
+++ b/kernel/nstree.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+/**
+ * struct ns_tree - Namespace tree
+ * @ns_tree: Rbtree of namespaces of a particular type
+ * @ns_list: Sequentially walkable list of all namespaces of this type
+ * @ns_tree_lock: Seqlock to protect the tree and list
+ * @type: type of namespaces in this tree
+ */
+struct ns_tree {
+ struct rb_root ns_tree;
+ struct list_head ns_list;
+ seqlock_t ns_tree_lock;
+ int type;
+};
+
+struct ns_tree mnt_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNS,
+};
+
+struct ns_tree net_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNET,
+};
+EXPORT_SYMBOL_GPL(net_ns_tree);
+
+struct ns_tree uts_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUTS,
+};
+
+struct ns_tree user_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUSER,
+};
+
+struct ns_tree ipc_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWIPC,
+};
+
+struct ns_tree pid_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWPID,
+};
+
+struct ns_tree cgroup_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWCGROUP,
+};
+
+struct ns_tree time_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWTIME,
+};
+
+DEFINE_COOKIE(namespace_cookie);
+
+static inline struct ns_common *node_to_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_tree_node);
+}
+
+static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct ns_common *ns_a = node_to_ns(a);
+ struct ns_common *ns_b = node_to_ns(b);
+ u64 ns_id_a = ns_a->ns_id;
+ u64 ns_id_b = ns_b->ns_id;
+
+ if (ns_id_a < ns_id_b)
+ return -1;
+ if (ns_id_a > ns_id_b)
+ return 1;
+ return 0;
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ struct rb_node *node, *prev;
+
+ VFS_WARN_ON_ONCE(!ns->ns_id);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
+ /*
+ * If there's no previous entry simply add it after the
+ * head and if there is add it after the previous entry.
+ */
+ prev = rb_prev(&ns->ns_tree_node);
+ if (!prev)
+ list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
+ else
+ list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+
+ write_sequnlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+ rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
+ list_bidir_del_rcu(&ns->ns_list_node);
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ write_sequnlock(&ns_tree->ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+
+static struct ns_tree *ns_tree_from_type(int ns_type)
+{
+ switch (ns_type) {
+ case CLONE_NEWCGROUP:
+ return &cgroup_ns_tree;
+ case CLONE_NEWIPC:
+ return &ipc_ns_tree;
+ case CLONE_NEWNS:
+ return &mnt_ns_tree;
+ case CLONE_NEWNET:
+ return &net_ns_tree;
+ case CLONE_NEWPID:
+ return &pid_ns_tree;
+ case CLONE_NEWUSER:
+ return &user_ns_tree;
+ case CLONE_NEWUTS:
+ return &uts_ns_tree;
+ case CLONE_NEWTIME:
+ return &time_ns_tree;
+ }
+
+ return NULL;
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+
+ do {
+ seq = read_seqbegin(&ns_tree->ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+
+ if (!node)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+
+ return node_to_ns(node);
+}
+
+/**
+ * ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * tree
+ * @ns: namespace to start from
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree *ns_tree, bool previous)
+{
+ struct list_head *list;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+ if (previous)
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ else
+ list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
+ if (list_is_head(list, &ns_tree->ns_list))
+ return ERR_PTR(-ENOENT);
+
+ VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
+
+ return list_entry_rcu(list, struct ns_common, ns_list_node);
+}
+
+/**
+ * ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespaces types share the same id space and thus can be compared
+ * directly. IOW, when two ids of two namespace are equal, they are
+ * identical.
+ */
+u64 ns_tree_gen_id(struct ns_common *ns)
+{
+ guard(preempt)();
+ ns->ns_id = gen_cookie_next(&namespace_cookie);
+ return ns->ns_id;
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index 2dbcc4dd90cc..4fffec767a63 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,13 +71,13 @@ static int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = PROC_PID_INIT_INO,
+ .ns.inum = ns_init_inum(&init_pid_ns),
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
@@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = {
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
+ .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d315c89e4d62..650be58d8d18 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <linux/nstree.h>
#include <uapi/linux/wait.h>
#include "pid_sysctl.h"
@@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_idr;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto out_free_idr;
- ns->ns.ops = &pidns_operations;
ns->pid_max = PID_MAX_LIMIT;
err = register_pidns_sysctls(ns);
if (err)
goto out_free_inum;
- refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
@@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
+ ns_tree_add(ns);
return ns;
out_free_inum:
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ ns_tree_remove(ns);
unregister_pidns_sysctls(ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
@@ -168,7 +169,7 @@ static void destroy_pid_namespace_work(struct work_struct *work)
parent = ns->parent;
destroy_pid_namespace(ns);
ns = parent;
- } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
+ } while (ns != &init_pid_ns && ns_ref_put(ns));
}
struct pid_namespace *copy_pid_ns(u64 flags,
@@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
+ if (ns && ns != &init_pid_ns && ns_ref_put(ns))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
-{
- return container_of(ns, struct pid_namespace, ns);
-}
-
static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -453,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns)
const struct proc_ns_operations pidns_operations = {
.name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
@@ -464,7 +459,6 @@ const struct proc_ns_operations pidns_operations = {
const struct proc_ns_operations pidns_for_children_operations = {
.name = "pid_for_children",
.real_ns_name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_for_children_get,
.put = pidns_put,
.install = pidns_install,
@@ -481,6 +475,7 @@ static __init int pid_namespaces_init(void)
#endif
register_pid_ns_sysctl_table_vm();
+ ns_tree_add(&init_pid_ns);
return 0;
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 888872bcc5bb..5b6997f4dc3d 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
@@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
@@ -253,16 +252,13 @@ out:
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
__free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
@@ -466,7 +462,6 @@ out:
const struct proc_ns_operations timens_operations = {
.name = "time",
- .type = CLONE_NEWTIME,
.get = timens_get,
.put = timens_put,
.install = timens_install,
@@ -476,7 +471,6 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
- .type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,
.install = timens_install,
@@ -484,9 +478,15 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_time_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.user_ns = &init_user_ns,
- .ns.inum = PROC_TIME_INIT_INO,
+ .ns.inum = ns_init_inum(&init_time_ns),
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns);
+}
diff --git a/kernel/user.c b/kernel/user.c
index f46b1d41163b..0163665914c9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -65,10 +65,11 @@ struct user_namespace init_user_ns = {
.nr_extents = 1,
},
},
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_user_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .ns.inum = PROC_USER_INIT_INO,
+ .ns.inum = ns_init_inum(&init_user_ns),
#ifdef CONFIG_USER_NS
.ns.ops = &userns_operations,
#endif
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 682f40d5632d..03cb63883d04 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/nstree.h>
static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
@@ -124,12 +125,11 @@ int create_user_ns(struct cred *new)
goto fail_dec;
ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
- ret = ns_alloc_inum(&ns->ns);
+
+ ret = ns_common_init(ns);
if (ret)
goto fail_free;
- ns->ns.ops = &userns_operations;
- refcount_set(&ns->ns.count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
@@ -159,12 +159,13 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
set_cred_user_ns(new, ns);
+ ns_tree_add(ns);
return 0;
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
fail_free:
kmem_cache_free(user_ns_cachep, ns);
fail_dec:
@@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work)
do {
struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ ns_tree_remove(ns);
if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
kfree(ns->gid_map.forward);
kfree(ns->gid_map.reverse);
@@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work)
#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(user_ns_cachep, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
dec_user_namespaces(ucounts);
ns = parent;
- } while (refcount_dec_and_test(&parent->ns.count));
+ } while (ns_ref_put(parent));
}
void __put_user_ns(struct user_namespace *ns)
@@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns)
}
EXPORT_SYMBOL(current_in_userns);
-static inline struct user_namespace *to_user_ns(struct ns_common *ns)
-{
- return container_of(ns, struct user_namespace, ns);
-}
-
static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns)
const struct proc_ns_operations userns_operations = {
.name = "user",
- .type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
@@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
+ ns_tree_add(&init_user_ns);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 00d8d7922f86..ebbfc578a9d3 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
static struct kmem_cache *uts_ns_cache __ro_after_init;
@@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
}
-static struct uts_namespace *create_uts_ns(void)
-{
- struct uts_namespace *uts_ns;
-
- uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
- if (uts_ns)
- refcount_set(&uts_ns->ns.count, 1);
- return uts_ns;
-}
-
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
@@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = create_uts_ns();
+ ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL);
if (!ns)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free;
ns->ucounts = ucounts;
- ns->ns.ops = &utsns_operations;
-
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
+ ns_tree_add(ns);
return ns;
fail_free:
@@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(u64 flags,
void free_uts_ns(struct uts_namespace *ns)
{
+ ns_tree_remove(ns);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(uts_ns_cache, ns);
-}
-
-static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
-{
- return container_of(ns, struct uts_namespace, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *utsns_get(struct task_struct *task)
@@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns)
const struct proc_ns_operations utsns_operations = {
.name = "uts",
- .type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
@@ -174,4 +160,5 @@ void __init uts_ns_init(void)
offsetof(struct uts_namespace, name),
sizeof_field(struct uts_namespace, name),
NULL);
+ ns_tree_add(&init_uts_ns);
}