summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2026-01-20 15:52:35 +0100
committerChristian Brauner <brauner@kernel.org>2026-02-10 11:39:30 +0100
commit802182490445f6bcf5de0e0518fb967c2afb6da1 (patch)
tree9aac5aa5336113b048b43dd1abb92932ee396588 /include/linux
parenta344860211f5c07dc6358758e42ff70f97b364a9 (diff)
pidfs: convert rb-tree to rhashtable
Mateusz reported performance penalties [1] during task creation because pidfs uses pidmap_lock to add elements into the rbtree. Switch to an rhashtable to have separate fine-grained locking and to decouple from pidmap_lock moving all heavy manipulations outside of it. Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount protection to an rhashtable. This removes the global pidmap_lock contention from pidfs_ino_get_pid() lookups and allows the hashtable insert to happen outside the pidmap_lock. pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and initializes pid fields and is called inside pidmap_lock. pidfs_add_pid() inserts pid into rhashtable and is called outside pidmap_lock. Insertion into the rhashtable can fail and memory allocation may happen so we need to drop the spinlock. To guard against accidently opening an already reaped task pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or it already went through pidfs_exit() aka the process as already reaped. If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether the task has exited. This slightly changes visibility semantics: pidfd creation is denied after pidfs_exit() runs, which is just before the pid number is removed from the via free_pid(). That should not be an issue though. Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1] Link: https://patch.msgid.link/20260120-work-pidfs-rhashtable-v2-1-d593c4d0f576@kernel.org Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/pid.h4
-rw-r--r--include/linux/pidfs.h3
2 files changed, 4 insertions, 3 deletions
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 003a1027d219..ce9b5cb7560b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -6,6 +6,7 @@
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
+#include <linux/rhashtable-types.h>
#include <linux/sched.h>
#include <linux/wait.h>
@@ -60,7 +61,7 @@ struct pid {
spinlock_t lock;
struct {
u64 ino;
- struct rb_node pidfs_node;
+ struct rhash_head pidfs_hash;
struct dentry *stashed;
struct pidfs_attr *attr;
};
@@ -73,7 +74,6 @@ struct pid {
struct upid numbers[];
};
-extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid;
struct file;
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 3e08c33da2df..416bdff4d6ce 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -6,7 +6,8 @@ struct coredump_params;
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void);
-void pidfs_add_pid(struct pid *pid);
+void pidfs_prepare_pid(struct pid *pid);
+int pidfs_add_pid(struct pid *pid);
void pidfs_remove_pid(struct pid *pid);
void pidfs_exit(struct task_struct *tsk);
#ifdef CONFIG_COREDUMP