From 1f6df5847454dee8608f78ee0df7352472cb2447 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 18:45:02 -0400 Subject: drop_collected_paths(): constify arguments ... and use that to constify the pointers in callers Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 5f9c053b0897..c09032463b36 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -105,7 +105,7 @@ extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(struct path *, struct path *); +extern void drop_collected_paths(const struct path *, struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); -- cgit v1.2.3 From b42ffcd5069d5cfb777b8982a1c55c7e2f1d3998 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 19:34:37 -0400 Subject: collect_paths(): constify the return value callers have no business modifying the paths they get Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 346e577073bb..cf8dbf0741f0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2300,7 +2300,7 @@ static inline bool extend_array(struct path **res, struct path **to_free, return p; } -struct path *collect_paths(const struct path *path, +const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); @@ -2334,7 +2334,7 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(const struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, const struct path *prealloc) { for (const struct path *p = paths; p->mnt; p++) path_put(p); diff --git a/include/linux/mount.h b/include/linux/mount.h index c09032463b36..18e4b97f8a98 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -104,8 +104,8 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(const struct path *, struct path *); +extern const struct path *collect_paths(const struct path *, struct path *, unsigned); +extern void drop_collected_paths(const struct path *, const struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); -- cgit v1.2.3 From 09a1b33c080f6ac700fadc67c8471e67bf75fda4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 12:33:11 -0400 Subject: preparations to taking MNT_WRITE_HOLD out of ->mnt_flags We have an unpleasant wart in accessibility rules for struct mount. There are per-superblock lists of mounts, used by sb_prepare_remount_readonly() to check if any of those is currently claimed for write access and to block further attempts to get write access on those until we are done. As soon as it is attached to a filesystem, mount becomes reachable via that list. Only sb_prepare_remount_readonly() traverses it and it only accesses a few members of struct mount. Unfortunately, ->mnt_flags is one of those and it is modified - MNT_WRITE_HOLD set and then cleared. It is done under mount_lock, so from the locking rules POV everything's fine. However, it has easily overlooked implications - once mount has been attached to a filesystem, it has to be treated as globally visible. In particular, initializing ->mnt_flags *must* be done either prior to that point or under mount_lock. All other members are still private at that point. Life gets simpler if we move that bit (and that's *all* that can get touched by access via this list) out of ->mnt_flags. It's not even hard to do - currently the list is implemented as list_head one, anchored in super_block->s_mounts and linked via mount->mnt_instance. As the first step, switch it to hlist-like open-coded structure - address of the first mount in the set is stored in ->s_mounts and ->mnt_instance replaced with ->mnt_next_for_sb and ->mnt_pprev_for_sb - the former either NULL or pointing to the next mount in set, the latter - address of either ->s_mounts or ->mnt_next_for_sb in the previous element of the set. In the next commit we'll steal the LSB of ->mnt_pprev_for_sb as replacement for MNT_WRITE_HOLD. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 4 +++- fs/namespace.c | 38 +++++++++++++++++++++++++++++--------- fs/super.c | 3 +-- include/linux/fs.h | 4 +++- 4 files changed, 36 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/mount.h b/fs/mount.h index 04d0eadc4c10..b208f69f69d7 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -64,7 +64,9 @@ struct mount { #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ + struct mount * __aligned(1) *mnt_pprev_for_sb; + /* except that LSB of pprev will be stolen */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ diff --git a/fs/namespace.c b/fs/namespace.c index 4e9ed4edd854..342dfd882b13 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -730,6 +730,27 @@ static inline void mnt_unhold_writers(struct mount *mnt) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } +static inline void mnt_del_instance(struct mount *m) +{ + struct mount **p = m->mnt_pprev_for_sb; + struct mount *next = m->mnt_next_for_sb; + + if (next) + next->mnt_pprev_for_sb = p; + *p = next; +} + +static inline void mnt_add_instance(struct mount *m, struct super_block *s) +{ + struct mount *first = s->s_mounts; + + if (first) + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; + m->mnt_next_for_sb = first; + m->mnt_pprev_for_sb = &s->s_mounts; + s->s_mounts = m; +} + static int mnt_make_readonly(struct mount *mnt) { int ret; @@ -743,7 +764,6 @@ static int mnt_make_readonly(struct mount *mnt) int sb_prepare_remount_readonly(struct super_block *sb) { - struct mount *mnt; int err = 0; /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ @@ -751,9 +771,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) return -EBUSY; lock_mount_hash(); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - err = mnt_hold_writers(mnt); + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (!(m->mnt.mnt_flags & MNT_READONLY)) { + err = mnt_hold_writers(m); if (err) break; } @@ -763,9 +783,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) + m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } unlock_mount_hash(); @@ -1207,7 +1227,7 @@ static void setup_mnt(struct mount *m, struct dentry *root) m->mnt_parent = m; lock_mount_hash(); - list_add_tail(&m->mnt_instance, &s->s_mounts); + mnt_add_instance(m, s); unlock_mount_hash(); } @@ -1425,7 +1445,7 @@ static void mntput_no_expire(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); - list_del(&mnt->mnt_instance); + mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..3b0f49e1b817 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, if (!s) return NULL; - INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -408,7 +407,7 @@ static void __put_super(struct super_block *s) list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); - WARN_ON(!list_empty(&s->s_mounts)); + WARN_ON(s->s_mounts); call_rcu(&s->rcu, destroy_super_rcu); } } diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..0e9c7f1460dc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1324,6 +1324,8 @@ struct sb_writers { struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; +struct mount; + struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@ -1358,7 +1360,7 @@ struct super_block { __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct list_head s_mounts; /* list of mounts; _not_ for fs use */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; -- cgit v1.2.3 From 3371fa2f27134fc4ec7d40b2ae7b9e92c3b2527e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 13:37:12 -0400 Subject: struct mount: relocate MNT_WRITE_HOLD bit ... from ->mnt_flags to LSB of ->mnt_pprev_for_sb. This is safe - we always set and clear it within the same mount_lock scope, so we won't interfere with list operations - traversals are always forward, so they don't even look at ->mnt_prev_for_sb and both insertions and removals are in mount_lock scopes of their own, so that bit will be clear in *all* mount instances during those. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 25 ++++++++++++++++++++++++- fs/namespace.c | 34 +++++++++++++++++----------------- include/linux/mount.h | 3 +-- 3 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/mount.h b/fs/mount.h index b208f69f69d7..40cf16544317 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -66,7 +66,8 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ struct mount * __aligned(1) *mnt_pprev_for_sb; - /* except that LSB of pprev will be stolen */ + /* except that LSB of pprev is stolen */ +#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ @@ -244,4 +245,26 @@ static inline struct mount *topmost_overmount(struct mount *m) return m; } +static inline bool __test_write_hold(struct mount * __aligned(1) *val) +{ + return (unsigned long)val & WRITE_HOLD; +} + +static inline bool test_write_hold(const struct mount *m) +{ + return __test_write_hold(m->mnt_pprev_for_sb); +} + +static inline void set_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + | WRITE_HOLD); +} + +static inline void clear_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + & ~WRITE_HOLD); +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 342dfd882b13..714e159ed9cd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -509,20 +509,20 @@ int mnt_get_write_access(struct vfsmount *m) mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass - * MNT_WRITE_HOLD loop below, so that the slowpath can see our - * incremented count after it has set MNT_WRITE_HOLD. + * WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task - * setting MNT_WRITE_HOLD got preempted on a remote + * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting - * MNT_WRITE_HOLD has a lower priority and is bound to + * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); @@ -533,7 +533,7 @@ int mnt_get_write_access(struct vfsmount *m) } /* * The barrier pairs with the barrier sb_start_ro_state_change() making - * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. @@ -672,15 +672,15 @@ EXPORT_SYMBOL(mnt_drop_write_file); * @mnt. * * Context: This function expects lock_mount_hash() to be held serializing - * setting MNT_WRITE_HOLD. + * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + set_write_hold(mnt); /* - * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); @@ -696,9 +696,9 @@ static inline int mnt_hold_writers(struct mount *mnt) * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. - * MNT_WRITE_HOLD protects against this scenario, because + * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on - * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) @@ -720,14 +720,14 @@ static inline int mnt_hold_writers(struct mount *mnt) */ static inline void mnt_unhold_writers(struct mount *mnt) { - if (!(mnt->mnt_flags & MNT_WRITE_HOLD)) + if (!test_write_hold(mnt)) return; /* - * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + clear_write_hold(mnt); } static inline void mnt_del_instance(struct mount *m) @@ -766,7 +766,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) { int err = 0; - /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; @@ -784,8 +784,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { - if (m->mnt.mnt_flags & MNT_WRITE_HOLD) - m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + if (test_write_hold(m)) + clear_write_hold(m); } unlock_mount_hash(); diff --git a/include/linux/mount.h b/include/linux/mount.h index 18e4b97f8a98..85e97b9340ff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -33,7 +33,6 @@ enum mount_flags { MNT_NOSYMFOLLOW = 0x80, MNT_SHRINKABLE = 0x100, - MNT_WRITE_HOLD = 0x200, MNT_INTERNAL = 0x4000, @@ -52,7 +51,7 @@ enum mount_flags { | MNT_READONLY | MNT_NOSYMFOLLOW, MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, - MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | + MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_LOCKED }; -- cgit v1.2.3 From a79765248649de77771c24f7be08ff4c96f16f7a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Aug 2025 02:48:13 -0400 Subject: constify {__,}mnt_is_readonly() Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 54066c9b8da0..4b74fabced43 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -428,7 +428,7 @@ out_free_cache: * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ -bool __mnt_is_readonly(struct vfsmount *mnt) +bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } @@ -468,7 +468,7 @@ static unsigned int mnt_get_writers(struct mount *mnt) #endif } -static int mnt_is_readonly(struct vfsmount *mnt) +static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; diff --git a/include/linux/mount.h b/include/linux/mount.h index 85e97b9340ff..acfe7ef86a1b 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); -extern bool __mnt_is_readonly(struct vfsmount *mnt); +extern bool __mnt_is_readonly(const struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); -- cgit v1.2.3