From 8ec7c826d97b390879df2a03dfb035c70af86779 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:39 +0200 Subject: pidfs: persist information Persist exit and coredump information independent of whether anyone currently holds a pidfd for the struct pid. The current scheme allocated pidfs dentries on-demand repeatedly. This scheme is reaching it's limits as it makes it impossible to pin information that needs to be available after the task has exited or coredumped and that should not be lost simply because the pidfd got closed temporarily. The next opener should still see the stashed information. This is also a prerequisite for supporting extended attributes on pidfds to allow attaching meta information to them. If someone opens a pidfd for a struct pid a pidfs dentry is allocated and stashed in pid->stashed. Once the last pidfd for the struct pid is closed the pidfs dentry is released and removed from pid->stashed. So if 10 callers create a pidfs dentry for the same struct pid sequentially, i.e., each closing the pidfd before the other creates a new one then a new pidfs dentry is allocated every time. Because multiple tasks acquiring and releasing a pidfd for the same struct pid can race with each another a task may still find a valid pidfs entry from the previous task in pid->stashed and reuse it. Or it might find a dead dentry in there and fail to reuse it and so stashes a new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry but only one will succeed, the other ones will put their dentry. The current scheme aims to ensure that a pidfs dentry for a struct pid can only be created if the task is still alive or if a pidfs dentry already existed before the task was reaped and so exit information has been was stashed in the pidfs inode. That's great except that it's buggy. If a pidfs dentry is stashed in pid->stashed after pidfs_exit() but before __unhash_process() is called we will return a pidfd for a reaped task without exit information being available. The pidfds_pid_valid() check does not guard against this race as it doens't sync at all with pidfs_exit(). The pid_has_task() check might be successful simply because we're before __unhash_process() but after pidfs_exit(). Introduce a new scheme where the lifetime of information associated with a pidfs entry (coredump and exit information) isn't bound to the lifetime of the pidfs inode but the struct pid itself. The first time a pidfs dentry is allocated for a struct pid a struct pidfs_attr will be allocated which will be used to store exit and coredump information. If all pidfs for the pidfs dentry are closed the dentry and inode can be cleaned up but the struct pidfs_attr will stick until the struct pid itself is freed. This will ensure minimal memory usage while persisting relevant information. The new scheme has various advantages. First, it allows to close the race where we end up handing out a pidfd for a reaped task for which no exit information is available. Second, it minimizes memory usage. Third, it allows to remove complex lifetime tracking via dentries when registering a struct pid with pidfs. There's no need to get or put a reference. Instead, the lifetime of exit and coredump information associated with a struct pid is bound to the lifetime of struct pid itself. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/pidfs.h') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 77e7db194914..8f6ed59bb3fb 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); void pidfs_get_pid(struct pid *pid); void pidfs_put_pid(struct pid *pid); +void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From 804d6794497e6f3992d156e07d01e22b037ce09e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:42 +0200 Subject: pidfs: remove pidfs_{get,put}_pid() Now that we stash persistent information in struct pid there's no need to play volatile games with pinning struct pid via dentries in pidfs. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-8-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/coredump.c | 6 ------ fs/pidfs.c | 35 +---------------------------------- include/linux/pidfs.h | 2 -- net/unix/af_unix.c | 5 ----- 4 files changed, 1 insertion(+), 47 deletions(-) (limited to 'include/linux/pidfs.h') diff --git a/fs/coredump.c b/fs/coredump.c index f217ebf2b3b6..55d6a713a0fb 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -898,12 +898,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len, O_NONBLOCK | SOCK_COREDUMP); - /* - * ... Make sure to only put our reference after connect() took - * its own reference keeping the pidfs entry alive ... - */ - pidfs_put_pid(cprm.pid); - if (retval) { if (retval == -EAGAIN) coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path); diff --git a/fs/pidfs.c b/fs/pidfs.c index c49c53d6ae51..bc2342cf4492 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -895,8 +895,7 @@ static void pidfs_put_data(void *data) * pidfs_register_pid - register a struct pid in pidfs * @pid: pid to pin * - * Register a struct pid in pidfs. Needs to be paired with - * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. + * Register a struct pid in pidfs. * * Return: On success zero, on error a negative error code is returned. */ @@ -1007,38 +1006,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) return pidfd_file; } -/** - * pidfs_get_pid - pin a struct pid through pidfs - * @pid: pid to pin - * - * Similar to pidfs_register_pid() but only valid if the caller knows - * there's a reference to the @pid through a dentry already that can't - * go away. - */ -void pidfs_get_pid(struct pid *pid) -{ - if (!pid) - return; - WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed)); -} - -/** - * pidfs_put_pid - drop a pidfs reference - * @pid: pid to drop - * - * Drop a reference to @pid via pidfs. This is only safe if the - * reference has been taken via pidfs_get_pid(). - */ -void pidfs_put_pid(struct pid *pid) -{ - might_sleep(); - - if (!pid) - return; - VFS_WARN_ON_ONCE(!pid->stashed); - dput(pid->stashed); -} - void __init pidfs_init(void) { pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 8f6ed59bb3fb..3e08c33da2df 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -14,8 +14,6 @@ void pidfs_coredump(const struct coredump_params *cprm); #endif extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); -void pidfs_get_pid(struct pid *pid); -void pidfs_put_pid(struct pid *pid); void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2e2e9997a68e..129388c309b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -646,9 +646,6 @@ static void unix_sock_destructor(struct sock *sk) return; } - if (sk->sk_peer_pid) - pidfs_put_pid(sk->sk_peer_pid); - if (u->addr) unix_release_addr(u->addr); @@ -769,7 +766,6 @@ static void drop_peercred(struct unix_peercred *peercred) swap(peercred->peer_pid, pid); swap(peercred->peer_cred, cred); - pidfs_put_pid(pid); put_pid(pid); put_cred(cred); } @@ -802,7 +798,6 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); - pidfs_get_pid(sk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } -- cgit v1.2.3