From 3e15dcf77b23b8e9b9b7f3c0d4def8fe9c12c534 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 8 Sep 2023 16:28:59 +0300 Subject: fs: rename __mnt_{want,drop}_write*() helpers Before exporting these helpers to modules, make their names more meaningful. The names mnt_{get,put)_write_access*() were chosen, because they rhyme with the inode {get,put)_write_access() helpers, which have a very close meaning for the inode object. Suggested-by: Christian Brauner Link: https://lore.kernel.org/r/20230817-anfechtbar-ruhelosigkeit-8c6cca8443fc@brauner/ Signed-off-by: Amir Goldstein Message-Id: <20230908132900.2983519-2-amir73il@gmail.com> Signed-off-by: Christian Brauner --- include/linux/mount.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 4f40b40306d0..ac3dd2876197 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); -extern int __mnt_want_write(struct vfsmount *); -extern void __mnt_drop_write(struct vfsmount *); +int mnt_get_write_access(struct vfsmount *mnt); +void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); -- cgit v1.2.3 From 5aa8fd9cea2ee0d42c5d92c5eacf0a14bbc4c293 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Sep 2023 20:25:50 -0400 Subject: fs: add a new SB_I_NOUMASK flag SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4 also sets this flag to prevent the VFS from applying the umask on newly-created files. NFSv4 doesn't support POSIX ACLs however, which causes confusion when other subsystems try to test for them. Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask stripping without advertising support for POSIX ACLs. Set the new flag on NFSv4 instead of SB_POSIXACL. Also, move mode_strip_umask to namei.h and convert init_mknod and init_mkdir to use it. Signed-off-by: Jeff Layton Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org> Signed-off-by: Christian Brauner --- fs/init.c | 6 ++---- fs/namei.c | 19 ------------------- fs/nfs/super.c | 2 +- include/linux/fs.h | 3 ++- include/linux/namei.h | 24 ++++++++++++++++++++++++ 5 files changed, 29 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/init.c b/fs/init.c index 9684406a8416..e9387b6c4f30 100644 --- a/fs/init.c +++ b/fs/init.c @@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mknod(&path, dentry, mode, dev); if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, @@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mkdir(&path, dentry, mode); if (!error) error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, diff --git a/fs/namei.c b/fs/namei.c index 567ee547492b..94b27370f468 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3103,25 +3103,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); -/** - * mode_strip_umask - handle vfs umask stripping - * @dir: parent directory of the new inode - * @mode: mode of the new inode to be created in @dir - * - * Umask stripping depends on whether or not the filesystem supports POSIX - * ACLs. If the filesystem doesn't support it umask stripping is done directly - * in here. If the filesystem does support POSIX ACLs umask stripping is - * deferred until the filesystem calls posix_acl_create(). - * - * Returns: mode - */ -static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) -{ - if (!IS_POSIXACL(dir)) - mode &= ~current_umask(); - return mode; -} - /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d6473cb00cb..9b1cfca8112a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) sb->s_export_op = &nfs_export_ops; break; case 4: - sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_NOUMASK; sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4aeb3fa11927..58dea591a341 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1119,7 +1119,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_NOATIME BIT(10) /* Do not update access times. */ #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ #define SB_SILENT BIT(15) -#define SB_POSIXACL BIT(16) /* VFS does not apply the umask */ +#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ #define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ #define SB_I_VERSION BIT(23) /* Update inode I_version field */ @@ -1166,6 +1166,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/namei.h b/include/linux/namei.h index 1463cbda4888..e3619920f9f0 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -92,6 +92,30 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); +/** + * mode_strip_umask - handle vfs umask stripping + * @dir: parent directory of the new inode + * @mode: mode of the new inode to be created in @dir + * + * In most filesystems, umask stripping depends on whether or not the + * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask + * stripping is done directly in here. If the filesystem does support POSIX + * ACLs umask stripping is deferred until the filesystem calls + * posix_acl_create(). + * + * Some filesystems (like NFSv4) also want to avoid umask stripping by the + * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK + * to get this effect without declaring that they support POSIX ACLs. + * + * Returns: mode + */ +static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode) +{ + if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK)) + mode &= ~current_umask(); + return mode; +} + extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) -- cgit v1.2.3 From 61105aab4edb59bf8177f005eb2923fe5c4deb3c Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 21 Sep 2023 09:57:52 +0200 Subject: pipe: reduce padding in struct pipe_inode_info This has no effect on 64 bit because there are 10 32-bit integers surrounding the two bools, but on 32 bit architectures, this reduces the struct size by 4 bytes by merging the two bools into one word. Signed-off-by: Max Kellermann Message-Id: <20230921075755.1378787-1-max.kellermann@ionos.com> Signed-off-by: Christian Brauner --- include/linux/pipe_fs_i.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 608a9eb86bff..598a411d7da2 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -62,9 +62,6 @@ struct pipe_inode_info { unsigned int tail; unsigned int max_usage; unsigned int ring_size; -#ifdef CONFIG_WATCH_QUEUE - bool note_loss; -#endif unsigned int nr_accounted; unsigned int readers; unsigned int writers; @@ -72,6 +69,9 @@ struct pipe_inode_info { unsigned int r_counter; unsigned int w_counter; bool poll_usage; +#ifdef CONFIG_WATCH_QUEUE + bool note_loss; +#endif struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; -- cgit v1.2.3 From b4bd6b4bac8edd61eb8f7b836969d12c0c6af165 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 21 Sep 2023 09:57:53 +0200 Subject: fs/pipe: move check to pipe_has_watch_queue() This declutters the code by reducing the number of #ifdefs and makes the watch_queue checks simpler. This has no runtime effect; the machine code is identical. Signed-off-by: Max Kellermann Message-Id: <20230921075755.1378787-2-max.kellermann@ionos.com> Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/pipe.c | 12 +++--------- include/linux/pipe_fs_i.h | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 6c1a9b1db907..6ecaccb48738 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -437,12 +437,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe_has_watch_queue(pipe)) { ret = -EXDEV; goto out; } -#endif /* * If it wasn't empty we try to merge new data into @@ -1325,10 +1323,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) unsigned int nr_slots, size; long ret = 0; -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) + if (pipe_has_watch_queue(pipe)) return -EBUSY; -#endif size = round_pipe_size(arg); nr_slots = size >> PAGE_SHIFT; @@ -1380,10 +1376,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) if (file->f_op != &pipefifo_fops || !pipe) return NULL; -#ifdef CONFIG_WATCH_QUEUE - if (for_splice && pipe->watch_queue) + if (for_splice && pipe_has_watch_queue(pipe)) return NULL; -#endif return pipe; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 598a411d7da2..8ff23bf5a819 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -124,6 +124,22 @@ struct pipe_buf_operations { bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; +/** + * pipe_has_watch_queue - Check whether the pipe is a watch_queue, + * i.e. it was created with O_NOTIFICATION_PIPE + * @pipe: The pipe to check + * + * Return: true if pipe is a watch queue, false otherwise. + */ +static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) +{ +#ifdef CONFIG_WATCH_QUEUE + return pipe->watch_queue != NULL; +#else + return false; +#endif +} + /** * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer -- cgit v1.2.3 From 85fadf89e5708d74c4923e9ceeca4b9df0e000bb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 22 Sep 2023 10:54:08 -0700 Subject: watch_queue: Annotate struct watch_filter with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct watch_filter. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Cc: David Howells Cc: Randy Dunlap Cc: Al Viro Cc: Christian Brauner Cc: Jonathan Corbet Cc: Siddh Raman Pant Cc: Mauro Carvalho Chehab Cc: Qian Cai Signed-off-by: Kees Cook Tested-by: Siddh Raman Pant Reviewed-by: "Gustavo A. R. Silva" Message-Id: <20230922175407.work.754-kees@kernel.org> Signed-off-by: Christian Brauner --- include/linux/watch_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index 45cd42f55d49..429c7b6afead 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -32,7 +32,7 @@ struct watch_filter { DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ - struct watch_type_filter filters[]; + struct watch_type_filter filters[] __counted_by(nr_filters); }; struct watch_queue { -- cgit v1.2.3 From 0ede61d8589cc2d93aa78230d74ac58b5b8d0244 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Sep 2023 08:45:59 +0200 Subject: file: convert to SLAB_TYPESAFE_BY_RCU In recent discussions around some performance improvements in the file handling area we discussed switching the file cache to rely on SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based freeing for files completely. This is a pretty sensitive change overall but it might actually be worth doing. The main downside is the subtlety. The other one is that we should really wait for Jann's patch to land that enables KASAN to handle SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this exists. With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times which requires a few changes. So it isn't sufficient anymore to just acquire a reference to the file in question under rcu using atomic_long_inc_not_zero() since the file might have already been recycled and someone else might have bumped the reference. In other words, callers might see reference count bumps from newer users. For this reason it is necessary to verify that the pointer is the same before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file without first aqcuiring a reference on it. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a reference under rcu or they must hold the files_lock of the fdtable. Failing to do either one of this is a bug. Thanks to Jann for pointing out that we need to ensure memory ordering between reallocations and pointer check by ensuring that all subsequent loads have a dependency on the second load in get_file_rcu() and providing a fixup that was folded into this patch. Cc: Jann Horn Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- Documentation/filesystems/files.rst | 53 +++++------- arch/powerpc/platforms/cell/spufs/coredump.c | 11 ++- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 +- fs/file.c | 125 +++++++++++++++++++++++---- fs/file_table.c | 40 +++++---- fs/gfs2/glock.c | 11 ++- fs/notify/dnotify/dnotify.c | 6 +- fs/proc/fd.c | 11 ++- include/linux/fdtable.h | 17 +--- include/linux/fs.h | 4 +- kernel/bpf/task_iter.c | 4 +- kernel/fork.c | 4 +- kernel/kcmp.c | 4 +- 13 files changed, 191 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index bcf84459917f..9e38e4c221ca 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -62,7 +62,7 @@ the fdtable structure - be held. 4. To look up the file structure given an fd, a reader - must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These + must use either lookup_fdget_rcu() or files_lookup_fdget_rcu() APIs. These take care of barrier requirements due to lock-free lookup. An example:: @@ -70,43 +70,22 @@ the fdtable structure - struct file *file; rcu_read_lock(); - file = lookup_fd_rcu(fd); - if (file) { - ... - } - .... + file = lookup_fdget_rcu(fd); rcu_read_unlock(); - -5. Handling of the file structures is special. Since the look-up - of the fd (fget()/fget_light()) are lock-free, it is possible - that look-up may race with the last put() operation on the - file structure. This is avoided using atomic_long_inc_not_zero() - on ->f_count:: - - rcu_read_lock(); - file = files_lookup_fd_rcu(files, fd); if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; + ... + fput(file); } - rcu_read_unlock(); .... - return file; - - atomic_long_inc_not_zero() detects if refcounts is already zero or - goes to zero during increment. If it does, we fail - fget()/fget_light(). -6. Since both fdtable and file structures can be looked up +5. Since both fdtable and file structures can be looked up lock-free, they must be installed using rcu_assign_pointer() API. If they are looked up lock-free, rcu_dereference() must be used. However it is advisable to use files_fdtable() - and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues. + and lookup_fdget_rcu()/files_lookup_fdget_rcu() which take care of these + issues. -7. While updating, the fdtable pointer must be looked up while +6. While updating, the fdtable pointer must be looked up while holding files->file_lock. If ->file_lock is dropped, then another thread expand the files thereby creating a new fdtable and making the earlier fdtable pointer stale. @@ -126,3 +105,19 @@ the fdtable structure - Since locate_fd() can drop ->file_lock (and reacquire ->file_lock), the fdtable pointer (fdt) must be loaded after locate_fd(). +On newer kernels rcu based file lookup has been switched to rely on +SLAB_TYPESAFE_BY_RCU instead of call_rcu(). It isn't sufficient anymore +to just acquire a reference to the file in question under rcu using +atomic_long_inc_not_zero() since the file might have already been +recycled and someone else might have bumped the reference. In other +words, callers might see reference count bumps from newer users. For +this is reason it is necessary to verify that the pointer is the same +before and after the reference count increment. This pattern can be seen +in get_file_rcu() and __files_get_rcu(). + +In addition, it isn't possible to access or check fields in struct file +without first aqcuiring a reference on it under rcu lookup. Not doing +that was always very dodgy and it was only usable for non-pointer data +in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers +either first acquire a reference or they must hold the files_lock of the +fdtable. diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 1a587618015c..18daafbe2e65 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -66,7 +66,7 @@ static int match_context(const void *v, struct file *file, unsigned fd) */ static struct spu_context *coredump_next_context(int *fd) { - struct spu_context *ctx; + struct spu_context *ctx = NULL; struct file *file; int n = iterate_fd(current->files, *fd, match_context, NULL); if (!n) @@ -74,10 +74,13 @@ static struct spu_context *coredump_next_context(int *fd) *fd = n - 1; rcu_read_lock(); - file = lookup_fd_rcu(*fd); - ctx = SPUFS_I(file_inode(file))->i_ctx; - get_spu_context(ctx); + file = lookup_fdget_rcu(*fd); rcu_read_unlock(); + if (file) { + ctx = SPUFS_I(file_inode(file))->i_ctx; + get_spu_context(ctx); + fput(file); + } return ctx; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index aa4d842d4c5a..97bf10861cad 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -916,9 +916,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915) struct file *file; rcu_read_lock(); - file = READ_ONCE(i915->gem.mmap_singleton); - if (file && !get_file_rcu(file)) - file = NULL; + file = get_file_rcu(&i915->gem.mmap_singleton); rcu_read_unlock(); if (file) return file; diff --git a/fs/file.c b/fs/file.c index 3e4a4dfa38fc..4eb026631673 100644 --- a/fs/file.c +++ b/fs/file.c @@ -853,8 +853,79 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } +static struct file *__get_file_rcu(struct file __rcu **f) +{ + struct file __rcu *file; + struct file __rcu *file_reloaded; + struct file __rcu *file_reloaded_cmp; + + file = rcu_dereference_raw(*f); + if (!file) + return NULL; + + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + return ERR_PTR(-EAGAIN); + + file_reloaded = rcu_dereference_raw(*f); + + /* + * Ensure that all accesses have a dependency on the load from + * rcu_dereference_raw() above so we get correct ordering + * between reuse/allocation and the pointer check below. + */ + file_reloaded_cmp = file_reloaded; + OPTIMIZER_HIDE_VAR(file_reloaded_cmp); + + /* + * atomic_long_inc_not_zero() above provided a full memory + * barrier when we acquired a reference. + * + * This is paired with the write barrier from assigning to the + * __rcu protected file pointer so that if that pointer still + * matches the current file, we know we have successfully + * acquired a reference to the right file. + * + * If the pointers don't match the file has been reallocated by + * SLAB_TYPESAFE_BY_RCU. + */ + if (file == file_reloaded_cmp) + return file_reloaded; + + fput(file); + return ERR_PTR(-EAGAIN); +} + +/** + * get_file_rcu - try go get a reference to a file under rcu + * @f: the file to get a reference on + * + * This function tries to get a reference on @f carefully verifying that + * @f hasn't been reused. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_rcu(struct file __rcu **f) +{ + for (;;) { + struct file __rcu *file; + + file = __get_file_rcu(f); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) + continue; + + return file; + } +} +EXPORT_SYMBOL_GPL(get_file_rcu); + static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; @@ -865,12 +936,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, return NULL; fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); - file = rcu_dereference_raw(*fdentry); - if (unlikely(!file)) - return NULL; - - if (unlikely(file->f_mode & mask)) - return NULL; /* * Ok, we have a file pointer. However, because we do @@ -879,10 +944,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * Such a race can take two forms: * - * (a) the file ref already went down to zero, - * and get_file_rcu() fails. Just try again: + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. */ - if (unlikely(!get_file_rcu(file))) + file = __get_file_rcu(fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) continue; /* @@ -893,12 +963,20 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || - unlikely(rcu_dereference_raw(*fdentry) != file)) { + if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } + /* + * This isn't the file we're looking for or we're not + * allowed to get a reference to it. + */ + if (unlikely(file->f_mode & mask)) { + fput(file); + return NULL; + } + /* * Ok, we have a ref to the file, and checked that it * still exists. @@ -948,7 +1026,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) +struct file *lookup_fdget_rcu(unsigned int fd) +{ + return __fget_files_rcu(current->files, fd, 0); + +} +EXPORT_SYMBOL_GPL(lookup_fdget_rcu); + +struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -957,13 +1042,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) task_lock(task); files = task->files; if (files) - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); task_unlock(task); return file; } -struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -974,7 +1059,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret files = task->files; if (files) { for (; fd < files_fdtable(files)->max_fds; fd++) { - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); if (file) break; } @@ -983,7 +1068,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fd_rcu); +EXPORT_SYMBOL(task_lookup_next_fdget_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1272,12 +1357,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; + struct file *f; int retval = oldfd; rcu_read_lock(); - if (!files_lookup_fd_rcu(files, oldfd)) + f = __fget_files_rcu(files, oldfd, 0); + if (!f) retval = -EBADF; rcu_read_unlock(); + if (f) + fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); diff --git a/fs/file_table.c b/fs/file_table.c index e68e97d4f00a..a79a80031343 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -65,33 +65,33 @@ static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_rcuhead); - put_cred(f->f_cred); - if (unlikely(f->f_mode & FMODE_BACKING)) - kfree(backing_file(f)); - else - kmem_cache_free(filp_cachep, f); + kfree(backing_file(f)); } static inline void file_free(struct file *f) { security_file_free(f); - if (unlikely(f->f_mode & FMODE_BACKING)) - path_put(backing_file_real_path(f)); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); - call_rcu(&f->f_rcuhead, file_free_rcu); + put_cred(f->f_cred); + if (unlikely(f->f_mode & FMODE_BACKING)) { + path_put(backing_file_real_path(f)); + call_rcu(&f->f_rcuhead, file_free_rcu); + } else { + kmem_cache_free(filp_cachep, f); + } } void release_empty_file(struct file *f) { WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); - /* Uhm, we better find out who grabs references to an unopened file. */ - WARN_ON_ONCE(atomic_long_cmpxchg(&f->f_count, 1, 0) != 1); - security_file_free(f); - put_cred(f->f_cred); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) - percpu_counter_dec(&nr_files); - kmem_cache_free(filp_cachep, f); + if (atomic_long_dec_and_test(&f->f_count)) { + security_file_free(f); + put_cred(f->f_cred); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + percpu_counter_dec(&nr_files); + kmem_cache_free(filp_cachep, f); + } } /* @@ -176,7 +176,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); @@ -184,6 +183,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ + /* + * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While + * fget-rcu pattern users need to be able to handle spurious + * refcount bumps we should reinitialize the reused file first. + */ + atomic_long_set(&f->f_count, 1); return 0; } @@ -483,7 +488,8 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | + SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9cbf8d98489a..b4bc873aab7d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2717,16 +2717,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) for(;; i->fd++) { struct inode *inode; - i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } + inode = file_inode(i->file); - if (inode->i_sb != i->sb) - continue; - if (get_file_rcu(i->file)) + if (inode->i_sb == i->sb) break; + + rcu_read_unlock(); + fput(i->file); + rcu_read_lock(); } rcu_read_unlock(); return i->file; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ebdcc25df0f7..869b016014d2 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; - struct file *f; + struct file *f = NULL; int destroy = 0, error = 0; __u32 mask; @@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) } rcu_read_lock(); - f = lookup_fd_rcu(fd); + f = lookup_fdget_rcu(fd); rcu_read_unlock(); /* if (f != filp) means that we lost a race and another task/thread @@ -392,6 +392,8 @@ out_err: fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); + if (f) + fput(f); return error; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6276b3938842..6e72e5ad42bc 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, fd); - if (file) - *mode = file->f_mode; + file = task_lookup_fdget_rcu(task, fd); rcu_read_unlock(); + if (file) { + *mode = file->f_mode; + fput(file); + } return !!file; } @@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, char name[10 + 1]; unsigned int len; - f = task_lookup_next_fd_rcu(p, &fd); + f = task_lookup_next_fdget_rcu(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; rcu_read_unlock(); + fput(f); data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e066816f3519..bc4c3287a65e 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un return files_lookup_fd_raw(files, fd); } -static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd) -{ - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "suspicious rcu_dereference_check() usage"); - return files_lookup_fd_raw(files, fd); -} - -static inline struct file *lookup_fd_rcu(unsigned int fd) -{ - return files_lookup_fd_rcu(current->files, fd); -} - -struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd); -struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd); +struct file *lookup_fdget_rcu(unsigned int fd); +struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd); +struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd); struct task_struct; diff --git a/include/linux/fs.h b/include/linux/fs.h index 58dea591a341..ceafc40cc25f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1042,7 +1042,9 @@ static inline struct file *get_file(struct file *f) atomic_long_inc(&f->f_count); return f; } -#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count) + +struct file *get_file_rcu(struct file __rcu **f); + #define file_count(x) atomic_long_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c4ab9d6cdbe9..82ad23b1d257 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -308,11 +308,9 @@ again: rcu_read_lock(); for (;; curr_fd++) { struct file *f; - f = task_lookup_next_fd_rcu(curr_task, &curr_fd); + f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); if (!f) break; - if (!get_file_rcu(f)) - continue; /* set info->fd */ info->fd = curr_fd; diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..640123767726 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm) struct file *exe_file; rcu_read_lock(); - exe_file = rcu_dereference(mm->exe_file); - if (exe_file && !get_file_rcu(exe_file)) - exe_file = NULL; + exe_file = get_file_rcu(&mm->exe_file); rcu_read_unlock(); return exe_file; } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 5353edfad8e1..b0639f21041f 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, idx); + file = task_lookup_fdget_rcu(task, idx); rcu_read_unlock(); + if (file) + fput(file); return file; } -- cgit v1.2.3 From 95e93d17cb113a3b097774874572b69c058acab5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 4 Oct 2023 13:19:15 +0200 Subject: vfs: predict the error in retry_estale as unlikely Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20231004111916.728135-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index e3619920f9f0..3100371b5e32 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -136,7 +136,7 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) static inline bool retry_estale(const long error, const unsigned int flags) { - return error == -ESTALE && !(flags & LOOKUP_REVAL); + return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL)); } #endif /* _LINUX_NAMEI_H */ -- cgit v1.2.3 From 08582d678fcf11fc86188f0b92239d3d49667d8e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 9 Oct 2023 18:37:11 +0300 Subject: fs: create helper file_user_path() for user displayed mapped file path Overlayfs uses backing files with "fake" overlayfs f_path and "real" underlying f_inode, in order to use underlying inode aops for mapped files and to display the overlayfs path in /proc//maps. In preparation for storing the overlayfs "fake" path instead of the underlying "real" path in struct backing_file, define a noop helper file_user_path() that returns f_path for now. Use the new helper in procfs and kernel logs whenever a path of a mapped file is displayed to users. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231009153712.1566422-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- arch/arc/kernel/troubleshoot.c | 6 ++++-- fs/proc/base.c | 2 +- fs/proc/nommu.c | 2 +- fs/proc/task_mmu.c | 4 ++-- fs/proc/task_nommu.c | 2 +- include/linux/fs.h | 14 ++++++++++++++ kernel/trace/trace_output.c | 2 +- 7 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index d5b3ed2c58f5..c380d8c30704 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address) */ if (vma) { char buf[ARC_PATH_MAX]; - char *nm = "?"; + char *nm = "anon"; if (vma->vm_file) { - nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1); + /* XXX: can we use %pD below and get rid of buf? */ + nm = d_path(file_user_path(vma->vm_file), buf, + ARC_PATH_MAX-1); if (IS_ERR(nm)) nm = "?"; } diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..20695c928ee6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 4d3493579458..c6e7ebc63756 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..1593940ca01e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (anon_name) seq_printf(m, "[anon_shmem:%s]", anon_name->name); else - seq_file_path(m, file, "\n"); + seq_path(m, file_user_path(file), "\n"); goto done; } @@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); + seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a8ac0dd8041e..2d5862c13399 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); diff --git a/include/linux/fs.h b/include/linux/fs.h index ceafc40cc25f..057a3bbd4d27 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2513,6 +2513,20 @@ static inline const struct path *file_real_path(struct file *f) return &f->f_path; } +/* + * file_user_path - get the path to display for memory mapped file + * + * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file + * stored in ->vm_file is a backing file whose f_inode is on the underlying + * filesystem. When the mapped file path is displayed to user (e.g. via + * /proc//maps), this helper should be used to get the path to display + * to the user, which is the path of the fd that user has requested to map. + */ +static inline const struct path *file_user_path(struct file *f) +{ + return &f->f_path; +} + static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index db575094c498..d8b302d01083 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, vmstart = vma->vm_start; } if (file) { - ret = trace_seq_path(s, &file->f_path); + ret = trace_seq_path(s, file_user_path(file)); if (ret) trace_seq_printf(s, "[+0x%lx]", ip - vmstart); -- cgit v1.2.3 From def3ae83da02f87005210fa3d448c5dd37ba4105 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 9 Oct 2023 18:37:12 +0300 Subject: fs: store real path instead of fake path in backing file f_path A backing file struct stores two path's, one "real" path that is referring to f_inode and one "fake" path, which should be displayed to users in /proc//maps. There is a lot more potential code that needs to know the "real" path, then code that needs to know the "fake" path. Instead of code having to request the "real" path with file_real_path(), store the "real" path in f_path and require code that needs to know the "fake" path request it with file_user_path(). Replace the file_real_path() helper with a simple const accessor f_path(). After this change, file_dentry() is not expected to observe any files with overlayfs f_path and real f_inode, so the call to ->d_real() should not be needed. Leave the ->d_real() call for now and add an assertion in ovl_d_real() to catch if we made wrong assumptions. Suggested-by: Miklos Szeredi Link: https://lore.kernel.org/r/CAJfpegtt48eXhhjDFA1ojcHPNKj3Go6joryCPtEFAKpocyBsnw@mail.gmail.com/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231009153712.1566422-4-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/file_table.c | 12 ++++++------ fs/internal.h | 2 +- fs/open.c | 23 +++++++++++------------ fs/overlayfs/super.c | 16 ++++++++++++---- include/linux/fs.h | 22 ++++------------------ include/linux/fsnotify.h | 3 +-- 6 files changed, 35 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 08fd1dd6d863..fa92743ba6a9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* Container for backing file with optional real path */ +/* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path real_path; + struct path user_path; }; static inline struct backing_file *backing_file(struct file *f) @@ -55,11 +55,11 @@ static inline struct backing_file *backing_file(struct file *f) return container_of(f, struct backing_file, file); } -struct path *backing_file_real_path(struct file *f) +struct path *backing_file_user_path(struct file *f) { - return &backing_file(f)->real_path; + return &backing_file(f)->user_path; } -EXPORT_SYMBOL_GPL(backing_file_real_path); +EXPORT_SYMBOL_GPL(backing_file_user_path); static inline void file_free(struct file *f) { @@ -68,7 +68,7 @@ static inline void file_free(struct file *f) percpu_counter_dec(&nr_files); put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { - path_put(backing_file_real_path(f)); + path_put(backing_file_user_path(f)); kfree(backing_file(f)); } else { kmem_cache_free(filp_cachep, f); diff --git a/fs/internal.h b/fs/internal.h index 4e93a685bdaa..58e43341aebf 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -101,7 +101,7 @@ static inline void file_put_write_access(struct file *file) put_write_access(file->f_inode); mnt_put_write_access(file->f_path.mnt); if (unlikely(file->f_mode & FMODE_BACKING)) - mnt_put_write_access(backing_file_real_path(file)->mnt); + mnt_put_write_access(backing_file_user_path(file)->mnt); } static inline void put_file_access(struct file *file) diff --git a/fs/open.c b/fs/open.c index fe63e236da22..02dc608d40d8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -881,7 +881,7 @@ static inline int file_get_write_access(struct file *f) if (unlikely(error)) goto cleanup_inode; if (unlikely(f->f_mode & FMODE_BACKING)) { - error = mnt_get_write_access(backing_file_real_path(f)->mnt); + error = mnt_get_write_access(backing_file_user_path(f)->mnt); if (unlikely(error)) goto cleanup_mnt; } @@ -1182,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open); /** * backing_file_open - open a backing file for kernel internal use - * @path: path of the file to open + * @user_path: path that the user reuqested to open * @flags: open flags * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @path may be on the stackable filesystem and backing inode on the - * underlying filesystem. In this case, we want to be able to return - * the @real_path of the backing inode. This is done by embedding the - * returned file into a container structure that also stores the path of - * the backing inode on the underlying filesystem, which can be - * retrieved using backing_file_real_path(). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). */ -struct file *backing_file_open(const struct path *path, int flags, +struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred) { @@ -1206,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags, if (IS_ERR(f)) return f; - f->f_path = *path; - path_get(real_path); - *backing_file_real_path(f) = *real_path; + path_get(user_path); + *backing_file_user_path(f) = *user_path; + f->f_path = *real_path; error = do_dentry_open(f, d_inode(real_path->dentry), NULL); if (error) { fput(f); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index def266b5e2a3..9f43f0d303ad 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct dentry *real = NULL, *lower; int err; - /* It's an overlay file */ + /* + * vfs is only expected to call d_real() with NULL from d_real_inode() + * and with overlay inode from file_dentry() on an overlay file. + * + * TODO: remove @inode argument from d_real() API, remove code in this + * function that deals with non-NULL @inode and remove d_real() call + * from file_dentry(). + */ if (inode && d_inode(dentry) == inode) return dentry; + else if (inode) + goto bug; if (!d_is_reg(dentry)) { - if (!inode || inode == d_inode(dentry)) - return dentry; - goto bug; + /* d_real_inode() is only relevant for regular files */ + return dentry; } real = ovl_dentry_upper(dentry); diff --git a/include/linux/fs.h b/include/linux/fs.h index 057a3bbd4d27..b37dccfb4534 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2492,26 +2492,10 @@ struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct file *backing_file_open(const struct path *path, int flags, +struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred); -struct path *backing_file_real_path(struct file *f); - -/* - * file_real_path - get the path corresponding to f_inode - * - * When opening a backing file for a stackable filesystem (e.g., - * overlayfs) f_path may be on the stackable filesystem and f_inode on - * the underlying filesystem. When the path associated with f_inode is - * needed, this helper should be used instead of accessing f_path - * directly. -*/ -static inline const struct path *file_real_path(struct file *f) -{ - if (unlikely(f->f_mode & FMODE_BACKING)) - return backing_file_real_path(f); - return &f->f_path; -} +struct path *backing_file_user_path(struct file *f); /* * file_user_path - get the path to display for memory mapped file @@ -2524,6 +2508,8 @@ static inline const struct path *file_real_path(struct file *f) */ static inline const struct path *file_user_path(struct file *f) { + if (unlikely(f->f_mode & FMODE_BACKING)) + return backing_file_user_path(f); return &f->f_path; } diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index ed48e4f1e755..bcb6609b54b3 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -96,8 +96,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) if (file->f_mode & FMODE_NONOTIFY) return 0; - /* Overlayfs internal files have fake f_path */ - path = file_real_path(file); + path = &file->f_path; return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } -- cgit v1.2.3 From e4e8b47a34a432c3f65534d12d5c132b6639da71 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 15 Jan 2018 18:30:46 +0100 Subject: fs: fix umask on NFS with CONFIG_FS_POSIX_ACL=n Make IS_POSIXACL() return false if POSIX ACL support is disabled. Never skip applying the umask in namei.c and never bother to do any ACL specific checks if the filesystem falsely indicates it has ACLs enabled when the feature is completely disabled in the kernel. This fixes a problem where the umask is always ignored in the NFS client when compiled without CONFIG_FS_POSIX_ACL. This is a 4 year old regression caused by commit 013cdf1088d723 which itself was not completely wrong, but failed to consider all the side effects by misdesigned VFS code. Prior to that commit, there were two places where the umask could be applied, for example when creating a directory: 1. in the VFS layer in SYSCALL_DEFINE3(mkdirat), but only if !IS_POSIXACL() 2. again (unconditionally) in nfs3_proc_mkdir() The first one does not apply, because even without CONFIG_FS_POSIX_ACL, the NFS client sets SB_POSIXACL in nfs_fill_super(). After that commit, (2.) was replaced by: 2b. in posix_acl_create(), called by nfs3_proc_mkdir() There's one branch in posix_acl_create() which applies the umask; however, without CONFIG_FS_POSIX_ACL, posix_acl_create() is an empty dummy function which does not apply the umask. The approach chosen by this patch is to make IS_POSIXACL() always return false when POSIX ACL support is disabled, so the umask always gets applied by the VFS layer. This is consistent with the (regular) behavior of posix_acl_create(): that function returns early if IS_POSIXACL() is false, before applying the umask. Therefore, posix_acl_create() is responsible for applying the umask if there is ACL support enabled in the file system (SB_POSIXACL), and the VFS layer is responsible for all other cases (no SB_POSIXACL or no CONFIG_FS_POSIX_ACL). Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b37dccfb4534..cb8bfa1f8ecb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2113,7 +2113,12 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) + +#ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) +#else +#define IS_POSIXACL(inode) 0 +#endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) -- cgit v1.2.3 From 61d4fb0b349ec1b33119913c3b0bd109de30142c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2023 12:14:37 +0200 Subject: file, i915: fix file reference for mmap_singleton() Today we got a report at [1] for rcu stalls on the i915 testsuite in [2] due to the conversion of files to SLAB_TYPSSAFE_BY_RCU. Afaict, get_file_rcu() goes into an infinite loop trying to carefully verify that i915->gem.mmap_singleton hasn't changed - see the splat below. So I stared at this code to figure out what it actually does. It seems that the i915->gem.mmap_singleton pointer itself never had rcu semantics. The i915->gem.mmap_singleton is replaced in file->f_op->release::singleton_release(): static int singleton_release(struct inode *inode, struct file *file) { struct drm_i915_private *i915 = file->private_data; cmpxchg(&i915->gem.mmap_singleton, file, NULL); drm_dev_put(&i915->drm); return 0; } The cmpxchg() is ordered against a concurrent update of i915->gem.mmap_singleton from mmap_singleton(). IOW, when mmap_singleton() fails to get a reference on i915->gem.mmap_singleton: While mmap_singleton() does rcu_read_lock(); file = get_file_rcu(&i915->gem.mmap_singleton); rcu_read_unlock(); it allocates a new file via anon_inode_getfile() and does smp_store_mb(i915->gem.mmap_singleton, file); So, then what happens in the case of this bug is that at some point fput() is called and drops the file->f_count to zero leaving the pointer in i915->gem.mmap_singleton in tact. Now, there might be delays until file->f_op->release::singleton_release() is called and i915->gem.mmap_singleton is set to NULL. Say concurrently another task hits mmap_singleton() and does: rcu_read_lock(); file = get_file_rcu(&i915->gem.mmap_singleton); rcu_read_unlock(); When get_file_rcu() fails to get a reference via atomic_inc_not_zero() it will try the reload from i915->gem.mmap_singleton expecting it to be NULL, assuming it has comparable semantics as we expect in __fget_files_rcu(). But it hasn't so it reloads the same pointer again, trying the same atomic_inc_not_zero() again and doing so until file->f_op->release::singleton_release() of the old file has been called. So, in contrast to __fget_files_rcu() here we want to not retry when atomic_inc_not_zero() has failed. We only want to retry in case we managed to get a reference but the pointer did change on reload. <3> [511.395679] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: <3> [511.395716] rcu: Tasks blocked on level-1 rcu_node (CPUs 0-9): P6238 <3> [511.395934] rcu: (detected by 16, t=65002 jiffies, g=123977, q=439 ncpus=20) <6> [511.395944] task:i915_selftest state:R running task stack:10568 pid:6238 tgid:6238 ppid:1001 flags:0x00004002 <6> [511.395962] Call Trace: <6> [511.395966] <6> [511.395974] ? __schedule+0x3a8/0xd70 <6> [511.395995] ? asm_sysvec_apic_timer_interrupt+0x1a/0x20 <6> [511.396003] ? lockdep_hardirqs_on+0xc3/0x140 <6> [511.396013] ? asm_sysvec_apic_timer_interrupt+0x1a/0x20 <6> [511.396029] ? get_file_rcu+0x10/0x30 <6> [511.396039] ? get_file_rcu+0x10/0x30 <6> [511.396046] ? i915_gem_object_mmap+0xbc/0x450 [i915] <6> [511.396509] ? i915_gem_mmap+0x272/0x480 [i915] <6> [511.396903] ? mmap_region+0x253/0xb60 <6> [511.396925] ? do_mmap+0x334/0x5c0 <6> [511.396939] ? vm_mmap_pgoff+0x9f/0x1c0 <6> [511.396949] ? rcu_is_watching+0x11/0x50 <6> [511.396962] ? igt_mmap_offset+0xfc/0x110 [i915] <6> [511.397376] ? __igt_mmap+0xb3/0x570 [i915] <6> [511.397762] ? igt_mmap+0x11e/0x150 [i915] <6> [511.398139] ? __trace_bprintk+0x76/0x90 <6> [511.398156] ? __i915_subtests+0xbf/0x240 [i915] <6> [511.398586] ? __pfx___i915_live_setup+0x10/0x10 [i915] <6> [511.399001] ? __pfx___i915_live_teardown+0x10/0x10 [i915] <6> [511.399433] ? __run_selftests+0xbc/0x1a0 [i915] <6> [511.399875] ? i915_live_selftests+0x4b/0x90 [i915] <6> [511.400308] ? i915_pci_probe+0x106/0x200 [i915] <6> [511.400692] ? pci_device_probe+0x95/0x120 <6> [511.400704] ? really_probe+0x164/0x3c0 <6> [511.400715] ? __pfx___driver_attach+0x10/0x10 <6> [511.400722] ? __driver_probe_device+0x73/0x160 <6> [511.400731] ? driver_probe_device+0x19/0xa0 <6> [511.400741] ? __driver_attach+0xb6/0x180 <6> [511.400749] ? __pfx___driver_attach+0x10/0x10 <6> [511.400756] ? bus_for_each_dev+0x77/0xd0 <6> [511.400770] ? bus_add_driver+0x114/0x210 <6> [511.400781] ? driver_register+0x5b/0x110 <6> [511.400791] ? i915_init+0x23/0xc0 [i915] <6> [511.401153] ? __pfx_i915_init+0x10/0x10 [i915] <6> [511.401503] ? do_one_initcall+0x57/0x270 <6> [511.401515] ? rcu_is_watching+0x11/0x50 <6> [511.401521] ? kmalloc_trace+0xa3/0xb0 <6> [511.401532] ? do_init_module+0x5f/0x210 <6> [511.401544] ? load_module+0x1d00/0x1f60 <6> [511.401581] ? init_module_from_file+0x86/0xd0 <6> [511.401590] ? init_module_from_file+0x86/0xd0 <6> [511.401613] ? idempotent_init_module+0x17c/0x230 <6> [511.401639] ? __x64_sys_finit_module+0x56/0xb0 <6> [511.401650] ? do_syscall_64+0x3c/0x90 <6> [511.401659] ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8 <6> [511.401684] Link: [1]: https://lore.kernel.org/intel-gfx/SJ1PR11MB6129CB39EED831784C331BAFB9DEA@SJ1PR11MB6129.namprd11.prod.outlook.com Link: [2]: https://intel-gfx-ci.01.org/tree/linux-next/next-20231013/bat-dg2-11/igt@i915_selftest@live@mman.html#dmesg-warnings10963 Cc: Jann Horn , Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231025-formfrage-watscheln-84526cd3bd7d@brauner Signed-off-by: Christian Brauner --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 +--- fs/file.c | 25 +++++++++++++++++++++++++ include/linux/fs.h | 1 + 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 97bf10861cad..72b353737334 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -915,9 +915,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915) { struct file *file; - rcu_read_lock(); - file = get_file_rcu(&i915->gem.mmap_singleton); - rcu_read_unlock(); + file = get_file_active(&i915->gem.mmap_singleton); if (file) return file; diff --git a/fs/file.c b/fs/file.c index 1a475d7d636e..5fb0b146e79e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -927,6 +927,31 @@ struct file *get_file_rcu(struct file __rcu **f) } EXPORT_SYMBOL_GPL(get_file_rcu); +/** + * get_file_active - try go get a reference to a file + * @f: the file to get a reference on + * + * In contast to get_file_rcu() the pointer itself isn't part of the + * reference counting. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_active(struct file **f) +{ + struct file __rcu *file; + + rcu_read_lock(); + file = __get_file_rcu(f); + rcu_read_unlock(); + if (IS_ERR(file)) + file = NULL; + return file; +} +EXPORT_SYMBOL_GPL(get_file_active); + static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { diff --git a/include/linux/fs.h b/include/linux/fs.h index cb8bfa1f8ecb..b35815277cc6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1044,6 +1044,7 @@ static inline struct file *get_file(struct file *f) } struct file *get_file_rcu(struct file __rcu **f); +struct file *get_file_active(struct file **f); #define file_count(x) atomic_long_read(&(x)->f_count) -- cgit v1.2.3