From 880b9577855edddda1e732748e849c63199d489b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2023 09:00:09 -0700 Subject: fs: distinguish between user initiated freeze and kernel initiated freeze Userspace can freeze a filesystem using the FIFREEZE ioctl or by suspending the block device; this state persists until userspace thaws the filesystem with the FITHAW ioctl or resuming the block device. Since commit 18e9e5104fcd ("Introduce freeze_super and thaw_super for the fsfreeze ioctl") we only allow the first freeze command to succeed. The kernel may decide that it is necessary to freeze a filesystem for its own internal purposes, such as suspends in progress, filesystem fsck activities, or quiescing a device prior to removal. Userspace thaw commands must never break a kernel freeze, and kernel thaw commands shouldn't undo userspace's freeze command. Introduce a couple of freeze holder flags and wire it into the sb_writers state. One kernel and one userspace freeze are allowed to coexist at the same time; the filesystem will not thaw until both are lifted. I wonder if the f2fs/gfs2 code should be using a kernel freeze here, but for now we'll use FREEZE_HOLDER_USERSPACE to preserve existing behaviors. Cc: mcgrof@kernel.org Cc: jack@suse.cz Cc: hch@infradead.org Cc: ruansy.fnst@fujitsu.com Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Jan Kara --- include/linux/fs.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..5e8d81212abc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1147,7 +1147,8 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - int frozen; /* Is sb frozen? */ + unsigned short frozen; /* Is sb frozen? */ + unsigned short freeze_holders; /* Who froze fs? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -1902,6 +1903,10 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); +enum freeze_holder { + FREEZE_HOLDER_KERNEL = (1U << 0), + FREEZE_HOLDER_USERSPACE = (1U << 1), +}; struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); @@ -1914,9 +1919,9 @@ struct super_operations { void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_super) (struct super_block *); + int (*freeze_super) (struct super_block *, enum freeze_holder who); int (*freeze_fs) (struct super_block *); - int (*thaw_super) (struct super_block *); + int (*thaw_super) (struct super_block *, enum freeze_holder who); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); @@ -2290,8 +2295,8 @@ extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -extern int freeze_super(struct super_block *super); -extern int thaw_super(struct super_block *super); +int freeze_super(struct super_block *super, enum freeze_holder who); +int thaw_super(struct super_block *super, enum freeze_holder who); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); -- cgit v1.2.3 From 6a3207395563f724d91231ec0aa7c4d95bf9591d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Aug 2023 12:26:25 +0100 Subject: fs, block: remove bdev->bd_super bdev->bd_super is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Message-Id: <20230807112625.652089-5-hch@lst.de> Signed-off-by: Christian Brauner --- fs/ext4/super.c | 1 - fs/super.c | 3 --- include/linux/blk_types.h | 1 - 3 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ebf704616..b24f5b9f63e0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5572,7 +5572,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); - sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; diff --git a/fs/super.c b/fs/super.c index e781226e2880..7755cc2a3607 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1322,7 +1322,6 @@ int get_tree_bdev(struct fs_context *fc, } s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } BUG_ON(fc->root); @@ -1395,7 +1394,6 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } return dget(s->s_root); @@ -1413,7 +1411,6 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); blkdev_put(bdev, sb->s_type); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0bad62cca3d0..d5c5e59ddbd2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -52,7 +52,6 @@ struct block_device { atomic_t bd_openers; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct inode * bd_inode; /* will die */ - struct super_block * bd_super; void * bd_claiming; void * bd_holder; const struct blk_holder_ops *bd_holder_ops; -- cgit v1.2.3 From cf6da236c27a73ab91b657232cd3841aab27c37a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Aug 2023 17:41:20 +0200 Subject: fs: export setup_bdev_super We'll want to use setup_bdev_super instead of duplicating it in nilfs2. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Message-Id: <20230802154131.2221419-2-hch@lst.de> Signed-off-by: Christian Brauner --- fs/super.c | 3 ++- include/linux/fs_context.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 249558ecfd77..a366bc65886e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1243,7 +1243,7 @@ static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) s->s_dev == *(dev_t *)fc->sget_key; } -static int setup_bdev_super(struct super_block *sb, int sb_flags, +int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); @@ -1295,6 +1295,7 @@ static int setup_bdev_super(struct super_block *sb, int sb_flags, sb_set_blocksize(sb, block_size(bdev)); return 0; } +EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev - Get a superblock based on a single block device diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index ff6341e09925..58ef8433a94b 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -158,6 +158,8 @@ extern int get_tree_keyed(struct fs_context *fc, struct fs_context *fc), void *key); +int setup_bdev_super(struct super_block *sb, int sb_flags, + struct fs_context *fc); extern int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)); -- cgit v1.2.3 From 7ecd0b6f510005e120f5bc198bacb5951814cf36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Aug 2023 17:41:27 +0200 Subject: fs: export fs_holder_ops Export fs_holder_ops so that file systems that open additional block devices can use it as well. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Message-Id: <20230802154131.2221419-9-hch@lst.de> Signed-off-by: Christian Brauner --- fs/super.c | 3 ++- include/linux/blkdev.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 9cf7fc67727b..f72a1112a31b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1244,9 +1244,10 @@ static void fs_mark_dead(struct block_device *bdev) up_read(&sb->s_umount); } -static const struct blk_holder_ops fs_holder_ops = { +const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_mark_dead, }; +EXPORT_SYMBOL_GPL(fs_holder_ops); static int set_bdev_super(struct super_block *s, void *data) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ed44a997f629..83262702eea7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1464,6 +1464,8 @@ struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev); }; +extern const struct blk_holder_ops fs_holder_ops; + /* * Return the correct open flags for blkdev_get_by_* for super block flags * as stored in sb->s_flags. -- cgit v1.2.3 From ab6860f62bfe329e11e5b5b7295b673c9c3a62d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:19 +0200 Subject: block: simplify the disk_force_media_change interface Hard code the events to DISK_EVENT_MEDIA_CHANGE as that is the only useful use case, and drop the superfluous return value. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-9-hch@lst.de> Signed-off-by: Christian Brauner --- block/disk-events.c | 15 ++++----------- drivers/block/loop.c | 6 +++--- include/linux/blkdev.h | 2 +- 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/disk-events.c b/block/disk-events.c index 0cfac464e6d1..6189b819b235 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -294,25 +294,18 @@ EXPORT_SYMBOL(disk_check_media_change); * @disk: the disk which will raise the event * @events: the events to raise * - * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present, - * attempt to free all dentries and inodes and invalidates all block + * Should be called when the media changes for @disk. Generates a uevent + * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. - * - * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not. */ -bool disk_force_media_change(struct gendisk *disk, unsigned int events) +void disk_force_media_change(struct gendisk *disk) { - disk_event_uevent(disk, events); - - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return false; - + disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); if (__invalidate_device(disk->part0, true)) pr_warn("VFS: busy inodes on changed media %s\n", disk->disk_name); set_bit(GD_NEED_PART_SCAN, &disk->state); - return true; } EXPORT_SYMBOL_GPL(disk_force_media_change); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 37511d2b2caf..705a0effa7d8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -603,7 +603,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_err; /* and ... switch */ - disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); + disk_force_media_change(lo->lo_disk); blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; @@ -1067,7 +1067,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, /* suppress uevents while reconfiguring the device */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); - disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); + disk_force_media_change(lo->lo_disk); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; @@ -1171,7 +1171,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) if (!release) blk_mq_unfreeze_queue(lo->lo_queue); - disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); + disk_force_media_change(lo->lo_disk); if (lo->lo_flags & LO_FLAGS_PARTSCAN) { int err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83262702eea7..c8eab6effc22 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -750,7 +750,7 @@ static inline int bdev_read_only(struct block_device *bdev) } bool set_capacity_and_notify(struct gendisk *disk, sector_t size); -bool disk_force_media_change(struct gendisk *disk, unsigned int events); +void disk_force_media_change(struct gendisk *disk); void add_disk_randomness(struct gendisk *disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); -- cgit v1.2.3 From 560e20e4bf6484a0c12f9f3c7a1aa55056948e1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:24 +0200 Subject: block: consolidate __invalidate_device and fsync_bdev We currently have two interfaces that take a block_devices and the find a mounted file systems to flush or invaldidate data on it. Both are a bit problematic because they only work for the "main" block devices that is used as s_dev for the super_block, and because they don't call into the file system at all. Merge the two into a new bdev_mark_dead helper that does both the syncing and invalidation and which is properly documented. This is in preparation of merging the functionality into the ->mark_dead holder operation so that it will work on additional block devices used by a file systems and give us a single entry point for invalidation of dead devices or media. Note that a single standalone fsync_bdev call for an obscure ioctl remains for now, but that one will also be deal with in a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-14-hch@lst.de> Signed-off-by: Christian Brauner --- block/bdev.c | 33 ++++++++++++++++++++++++++++----- block/disk-events.c | 4 ++-- block/genhd.c | 3 +-- block/partitions/core.c | 5 +---- drivers/s390/block/dasd.c | 6 ++---- fs/super.c | 4 ++-- include/linux/blkdev.h | 2 +- 7 files changed, 37 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/bdev.c b/block/bdev.c index 979e28a46b98..074a9ffa9d8b 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -221,7 +221,6 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } -EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev - lock a filesystem and force it into a consistent state @@ -960,12 +959,27 @@ out_path_put: } EXPORT_SYMBOL(lookup_bdev); -int __invalidate_device(struct block_device *bdev, bool kill_dirty) +/** + * bdev_mark_dead - mark a block device as dead + * @bdev: block device to operate on + * @surprise: indicate a surprise removal + * + * Tell the file system that this devices or media is dead. If @surprise is set + * to %true the device or media is already gone, if not we are preparing for an + * orderly removal. + * + * This syncs out all dirty data and writes back inodes and then invalidates any + * cached data in the inodes on the file system, the inodes themselves and the + * block device mapping. + */ +void bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb = get_super(bdev); int res = 0; if (sb) { + if (!surprise) + sync_filesystem(sb); /* * no need to lock the super, get_super holds the * read mutex so the filesystem cannot go away @@ -973,13 +987,22 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb, kill_dirty); + res = invalidate_inodes(sb, true); drop_super(sb); + } else { + if (!surprise) + sync_blockdev(bdev); } invalidate_bdev(bdev); - return res; } -EXPORT_SYMBOL(__invalidate_device); +#ifdef CONFIG_DASD_MODULE +/* + * Drivers should not use this directly, but the DASD driver has historically + * had a shutdown to offline mode that doesn't actually remove the gendisk + * that otherwise looks a lot like a safe device removal. + */ +EXPORT_SYMBOL_GPL(bdev_mark_dead); +#endif void sync_bdevs(bool wait) { diff --git a/block/disk-events.c b/block/disk-events.c index 6b858d350477..422db8292d09 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -281,7 +281,7 @@ bool disk_check_media_change(struct gendisk *disk) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return false; - __invalidate_device(disk->part0, true); + bdev_mark_dead(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } @@ -300,7 +300,7 @@ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); - __invalidate_device(disk->part0, true); + bdev_mark_dead(disk->part0, true); set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); diff --git a/block/genhd.c b/block/genhd.c index 3d287b32d50d..afc2cb09eb94 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -647,8 +647,7 @@ void del_gendisk(struct gendisk *disk) mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) { remove_inode_hash(part->bd_inode); - fsync_bdev(part); - __invalidate_device(part, true); + bdev_mark_dead(part, false); } mutex_unlock(&disk->open_mutex); diff --git a/block/partitions/core.c b/block/partitions/core.c index 13a7341299a9..e137a87f4db0 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -281,10 +281,7 @@ static void delete_partition(struct block_device *part) * looked up any more even when openers still hold references. */ remove_inode_hash(part->bd_inode); - - fsync_bdev(part); - __invalidate_device(part, true); - + bdev_mark_dead(part, false); drop_partition(part); } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 675b38ad00dc..1f642be840c3 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3626,10 +3626,8 @@ int dasd_generic_set_offline(struct ccw_device *cdev) * so sync bdev first and then wait for our queues to become * empty */ - if (device->block) { - fsync_bdev(device->block->bdev); - __invalidate_device(device->block->bdev, true); - } + if (device->block) + bdev_mark_dead(device->block->bdev, false); dasd_schedule_device_bh(device); rc = wait_event_interruptible(shutdown_waitq, _wait_for_empty_queues(device)); diff --git a/fs/super.c b/fs/super.c index f72a1112a31b..9b2188e08bcc 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1359,7 +1359,7 @@ int get_tree_bdev(struct fs_context *fc, /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * __invalidate_device()). It is safe because we have active sb + * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ up_write(&s->s_umount); @@ -1411,7 +1411,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * __invalidate_device()). It is safe because we have active sb + * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ up_write(&s->s_umount); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8eab6effc22..6721595b9f97 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -751,6 +751,7 @@ static inline int bdev_read_only(struct block_device *bdev) bool set_capacity_and_notify(struct gendisk *disk, sector_t size); void disk_force_media_change(struct gendisk *disk); +void bdev_mark_dead(struct block_device *bdev, bool surprise); void add_disk_randomness(struct gendisk *disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); @@ -809,7 +810,6 @@ int __register_blkdev(unsigned int major, const char *name, void unregister_blkdev(unsigned int major, const char *name); bool disk_check_media_change(struct gendisk *disk); -int __invalidate_device(struct block_device *bdev, bool kill_dirty); void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED -- cgit v1.2.3 From d8530de5a6e82be0ce17a5fdf727a394bcf6444c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:25 +0200 Subject: block: call into the file system for bdev_mark_dead Combine the newly merged bdev_mark_dead helper with the existing mark_dead holder operation so that all operations that invalidate a device that is dead or being removed now go through the holder ops. This allows file systems to explicitly shutdown either ASAP (for a surprise removal) or after writing back data (for an orderly removal), and do so not only for the main device. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-15-hch@lst.de> Signed-off-by: Christian Brauner --- block/bdev.c | 30 +++++++++--------------------- block/genhd.c | 44 ++++++++++++++++++++++++-------------------- fs/super.c | 8 ++++++-- include/linux/blkdev.h | 2 +- 4 files changed, 40 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/block/bdev.c b/block/bdev.c index 074a9ffa9d8b..4d580083a114 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -968,31 +968,19 @@ EXPORT_SYMBOL(lookup_bdev); * to %true the device or media is already gone, if not we are preparing for an * orderly removal. * - * This syncs out all dirty data and writes back inodes and then invalidates any - * cached data in the inodes on the file system, the inodes themselves and the - * block device mapping. + * This calls into the file system, which then typicall syncs out all dirty data + * and writes back inodes and then invalidates any cached data in the inodes on + * the file system. In addition we also invalidate the block device mapping. */ void bdev_mark_dead(struct block_device *bdev, bool surprise) { - struct super_block *sb = get_super(bdev); - int res = 0; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) + bdev->bd_holder_ops->mark_dead(bdev, surprise); + else + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_holder_lock); - if (sb) { - if (!surprise) - sync_filesystem(sb); - /* - * no need to lock the super, get_super holds the - * read mutex so the filesystem cannot go away - * under us (->put_super runs with the write lock - * hold). - */ - shrink_dcache_sb(sb); - res = invalidate_inodes(sb, true); - drop_super(sb); - } else { - if (!surprise) - sync_blockdev(bdev); - } invalidate_bdev(bdev); } #ifdef CONFIG_DASD_MODULE diff --git a/block/genhd.c b/block/genhd.c index afc2cb09eb94..cc32a0c704eb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -554,7 +554,7 @@ out_exit_elevator: } EXPORT_SYMBOL(device_add_disk); -static void blk_report_disk_dead(struct gendisk *disk) +static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { struct block_device *bdev; unsigned long idx; @@ -565,10 +565,7 @@ static void blk_report_disk_dead(struct gendisk *disk) continue; rcu_read_unlock(); - mutex_lock(&bdev->bd_holder_lock); - if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) - bdev->bd_holder_ops->mark_dead(bdev); - mutex_unlock(&bdev->bd_holder_lock); + bdev_mark_dead(bdev, surprise); put_device(&bdev->bd_device); rcu_read_lock(); @@ -576,14 +573,7 @@ static void blk_report_disk_dead(struct gendisk *disk) rcu_read_unlock(); } -/** - * blk_mark_disk_dead - mark a disk as dead - * @disk: disk to mark as dead - * - * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O - * to this disk. - */ -void blk_mark_disk_dead(struct gendisk *disk) +static void __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. @@ -603,8 +593,19 @@ void blk_mark_disk_dead(struct gendisk *disk) * Prevent new I/O from crossing bio_queue_enter(). */ blk_queue_start_drain(disk->queue); +} - blk_report_disk_dead(disk); +/** + * blk_mark_disk_dead - mark a disk as dead + * @disk: disk to mark as dead + * + * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O + * to this disk. + */ +void blk_mark_disk_dead(struct gendisk *disk) +{ + __blk_mark_disk_dead(disk); + blk_report_disk_dead(disk, true); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); @@ -641,17 +642,20 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); /* - * Prevent new openers by unlinked the bdev inode, and write out - * dirty data before marking the disk dead and stopping all I/O. + * Prevent new openers by unlinked the bdev inode. */ mutex_lock(&disk->open_mutex); - xa_for_each(&disk->part_tbl, idx, part) { + xa_for_each(&disk->part_tbl, idx, part) remove_inode_hash(part->bd_inode); - bdev_mark_dead(part, false); - } mutex_unlock(&disk->open_mutex); - blk_mark_disk_dead(disk); + /* + * Tell the file system to write back all dirty data and shut down if + * it hasn't been notified earlier. + */ + if (!test_bit(GD_DEAD, &disk->state)) + blk_report_disk_dead(disk, false); + __blk_mark_disk_dead(disk); /* * Drop all partitions now that the disk is marked dead. diff --git a/fs/super.c b/fs/super.c index 9b2188e08bcc..11fa21da130c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1228,7 +1228,7 @@ static bool lock_active_super(struct super_block *sb) return true; } -static void fs_mark_dead(struct block_device *bdev) +static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb = bdev->bd_holder; @@ -1238,6 +1238,10 @@ static void fs_mark_dead(struct block_device *bdev) if (!lock_active_super(sb)) return; + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + invalidate_inodes(sb, true); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); @@ -1245,7 +1249,7 @@ static void fs_mark_dead(struct block_device *bdev) } const struct blk_holder_ops fs_holder_ops = { - .mark_dead = fs_mark_dead, + .mark_dead = fs_bdev_mark_dead, }; EXPORT_SYMBOL_GPL(fs_holder_ops); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6721595b9f97..cdd03c612d39 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1461,7 +1461,7 @@ void blkdev_show(struct seq_file *seqf, off_t offset); #endif struct blk_holder_ops { - void (*mark_dead)(struct block_device *bdev); + void (*mark_dead)(struct block_device *bdev, bool surprise); }; extern const struct blk_holder_ops fs_holder_ops; -- cgit v1.2.3 From 2142b88c37a3e49fbca4a36b8674626917d9bf40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:26 +0200 Subject: block: call into the file system for ioctl BLKFLSBUF BLKFLSBUF is a historic ioctl that is called on a file handle to a block device and syncs either the file system mounted on that block device if there is one, or otherwise the just the data on the block device. Replace the get_super based syncing with a holder operation to remove the last usage of get_super, and to also support syncing the file system if the block device is not the main block device stored in s_dev. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-16-hch@lst.de> Signed-off-by: Christian Brauner --- block/bdev.c | 16 ---------------- block/ioctl.c | 9 ++++++++- fs/super.c | 13 +++++++++++++ include/linux/blkdev.h | 7 +++++-- 4 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/bdev.c b/block/bdev.c index 4d580083a114..a20263fa27a4 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -206,22 +206,6 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) } EXPORT_SYMBOL(sync_blockdev_range); -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = sync_filesystem(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} - /** * freeze_bdev - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock diff --git a/block/ioctl.c b/block/ioctl.c index 3be11941fb2d..648670ddb164 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -364,7 +364,14 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, { if (!capable(CAP_SYS_ADMIN)) return -EACCES; - fsync_bdev(bdev); + + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) + bdev->bd_holder_ops->sync(bdev); + else + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_holder_lock); + invalidate_bdev(bdev); return 0; } diff --git a/fs/super.c b/fs/super.c index 11fa21da130c..1a369fa3f0a6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1248,8 +1248,21 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) up_read(&sb->s_umount); } +static void fs_bdev_sync(struct block_device *bdev) +{ + struct super_block *sb = bdev->bd_holder; + + lockdep_assert_held(&bdev->bd_holder_lock); + + if (!lock_active_super(sb)) + return; + sync_filesystem(sb); + up_read(&sb->s_umount); +} + const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, + .sync = fs_bdev_sync, }; EXPORT_SYMBOL_GPL(fs_holder_ops); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cdd03c612d39..fa462d48ee96 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,6 +1462,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset); struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev, bool surprise); + + /* + * Sync the file system mounted on the block device. + */ + void (*sync)(struct block_device *bdev); }; extern const struct blk_holder_ops fs_holder_ops; @@ -1524,8 +1529,6 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) } #endif /* CONFIG_BLOCK */ -int fsync_bdev(struct block_device *bdev); - int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); -- cgit v1.2.3 From 38bcdd38935350abceace901313e007e84d84456 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2023 12:08:27 +0200 Subject: fs: remove get_super get_super is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Josef Bacik Message-Id: <20230811100828.1897174-17-hch@lst.de> Signed-off-by: Christian Brauner --- fs/super.c | 37 ------------------------------------- include/linux/fs.h | 1 - 2 files changed, 38 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 1a369fa3f0a6..1c7c74855437 100644 --- a/fs/super.c +++ b/fs/super.c @@ -790,43 +790,6 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -/** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) -{ - struct super_block *sb; - - if (!bdev) - return NULL; - - spin_lock(&sb_lock); -rescan: - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - if (sb->s_bdev == bdev) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) - return sb; - up_read(&sb->s_umount); - /* nope, got unmounted */ - spin_lock(&sb_lock); - __put_super(sb); - goto rescan; - } - } - spin_unlock(&sb_lock); - return NULL; -} - /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..14b5777a24a0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2913,7 +2913,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); -extern struct super_block *get_super(struct block_device *); extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -- cgit v1.2.3 From 5e87491415217d6bec0bcae08a3156622be2b177 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 18 Aug 2023 16:00:50 +0200 Subject: super: wait for nascent superblocks Recent patches experiment with making it possible to allocate a new superblock before opening the relevant block device. Naturally this has intricate side-effects that we get to learn about while developing this. Superblock allocators such as sget{_fc}() return with s_umount of the new superblock held and lock ordering currently requires that block level locks such as bdev_lock and open_mutex rank above s_umount. Before aca740cecbe5 ("fs: open block device after superblock creation") ordering was guaranteed to be correct as block devices were opened prior to superblock allocation and thus s_umount wasn't held. But now s_umount must be dropped before opening block devices to avoid locking violations. This has consequences. The main one being that iterators over @super_blocks and @fs_supers that grab a temporary reference to the superblock can now also grab s_umount before the caller has managed to open block devices and called fill_super(). So whereas before such iterators or concurrent mounts would have simply slept on s_umount until SB_BORN was set or the superblock was discard due to initalization failure they can now needlessly spin through sget{_fc}(). If the caller is sleeping on bdev_lock or open_mutex one caller waiting on SB_BORN will always spin somewhere and potentially this can go on for quite a while. It should be possible to drop s_umount while allowing iterators to wait on a nascent superblock to either be born or discarded. This patch implements a wait_var_event() mechanism allowing iterators to sleep until they are woken when the superblock is born or discarded. This also allows us to avoid relooping through @fs_supers and @super_blocks if a superblock isn't yet born or dying. Link: aca740cecbe5 ("fs: open block device after superblock creation") Reviewed-by: Jan Kara Message-Id: <20230818-vfs-super-fixes-v3-v3-3-9f0b1876e46b@kernel.org> Signed-off-by: Christian Brauner --- fs/super.c | 204 +++++++++++++++++++++++++++++++++++++++-------------- include/linux/fs.h | 1 + 2 files changed, 154 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index ba5d813c5804..e2630fe4928a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -50,7 +50,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_internal", }; -static inline void super_lock(struct super_block *sb, bool excl) +static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); @@ -66,14 +66,9 @@ static inline void super_unlock(struct super_block *sb, bool excl) up_read(&sb->s_umount); } -static inline void super_lock_excl(struct super_block *sb) +static inline void __super_lock_excl(struct super_block *sb) { - super_lock(sb, true); -} - -static inline void super_lock_shared(struct super_block *sb) -{ - super_lock(sb, false); + __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) @@ -86,6 +81,99 @@ static inline void super_unlock_shared(struct super_block *sb) super_unlock(sb, false); } +static inline bool wait_born(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with smp_store_release() in super_wake() and ensures + * that we see SB_BORN or SB_DYING after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & (SB_BORN | SB_DYING); +} + +/** + * super_lock - wait for superblock to become ready and lock it + * @sb: superblock to wait for + * @excl: whether exclusive access is required + * + * If the superblock has neither passed through vfs_get_tree() or + * generic_shutdown_super() yet wait for it to happen. Either superblock + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're + * woken and we'll see SB_DYING. + * + * The caller must have acquired a temporary reference on @sb->s_count. + * + * Return: This returns true if SB_BORN was set, false if SB_DYING was + * set. The function acquires s_umount and returns with it held. + */ +static __must_check bool super_lock(struct super_block *sb, bool excl) +{ + + lockdep_assert_not_held(&sb->s_umount); + +relock: + __super_lock(sb, excl); + + /* + * Has gone through generic_shutdown_super() in the meantime. + * @sb->s_root is NULL and @sb->s_active is 0. No one needs to + * grab a reference to this. Tell them so. + */ + if (sb->s_flags & SB_DYING) + return false; + + /* Has called ->get_tree() successfully. */ + if (sb->s_flags & SB_BORN) + return true; + + super_unlock(sb, excl); + + /* wait until the superblock is ready or dying */ + wait_var_event(&sb->s_flags, wait_born(sb)); + + /* + * Neither SB_BORN nor SB_DYING are ever unset so we never loop. + * Just reacquire @sb->s_umount for the caller. + */ + goto relock; +} + +/* wait and acquire read-side of @sb->s_umount */ +static inline bool super_lock_shared(struct super_block *sb) +{ + return super_lock(sb, false); +} + +/* wait and acquire write-side of @sb->s_umount */ +static inline bool super_lock_excl(struct super_block *sb) +{ + return super_lock(sb, true); +} + +/* wake waiters */ +#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING) +static void super_wake(struct super_block *sb, unsigned int flag) +{ + WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); + WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); + + /* + * Pairs with smp_load_acquire() in super_lock() to make sure + * all initializations in the superblock are seen by the user + * seeing SB_BORN sent. + */ + smp_store_release(&sb->s_flags, sb->s_flags | flag); + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees SB_BORN set or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ + smp_mb(); + wake_up_var(&sb->s_flags); +} + /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. @@ -393,7 +481,7 @@ EXPORT_SYMBOL(deactivate_locked_super); void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { - super_lock_excl(s); + __super_lock_excl(s); deactivate_locked_super(s); } } @@ -415,10 +503,12 @@ EXPORT_SYMBOL(deactivate_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { + bool born; + s->s_count++; spin_unlock(&sb_lock); - super_lock_excl(s); - if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { + born = super_lock_excl(s); + if (born && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } @@ -447,8 +537,8 @@ static int grab_super(struct super_block *s) __releases(sb_lock) bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { - if (!hlist_unhashed(&sb->s_instances) && - sb->s_root && (sb->s_flags & SB_BORN)) + if (!(sb->s_flags & SB_DYING) && sb->s_root && + (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } @@ -475,7 +565,7 @@ bool super_trylock_shared(struct super_block *sb) void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); - super_lock_excl(sb); + __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; @@ -557,6 +647,13 @@ void generic_shutdown_super(struct super_block *sb) /* should be initialized for __put_super_and_need_restart() */ hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); + /* + * Broadcast to everyone that grabbed a temporary reference to this + * superblock before we removed it from @fs_supers that the superblock + * is dying. Every walker of @fs_supers outside of sget{_fc}() will now + * discard this superblock and treat it as dead. + */ + super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) @@ -631,6 +728,11 @@ retry: s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + /* + * Make the superblock visible on @super_blocks and @fs_supers. + * It's in a nascent state and users should wait on SB_BORN or + * SB_DYING to be set. + */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); @@ -740,7 +842,8 @@ static void __iterate_supers(void (*f)(struct super_block *)) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) + /* Pairs with memory marrier in super_wake(). */ + if (smp_load_acquire(&sb->s_flags) & SB_DYING) continue; sb->s_count++; spin_unlock(&sb_lock); @@ -770,13 +873,13 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; + bool born; + sb->s_count++; spin_unlock(&sb_lock); - super_lock_shared(sb); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); @@ -806,11 +909,13 @@ void iterate_supers_type(struct file_system_type *type, spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - super_lock_shared(sb); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); @@ -841,14 +946,11 @@ struct super_block *get_active_super(struct block_device *bdev) if (!bdev) return NULL; -restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_bdev == bdev) { if (!grab_super(sb)) - goto restart; + return NULL; super_unlock_excl(sb); return sb; } @@ -862,22 +964,21 @@ struct super_block *user_get_super(dev_t dev, bool excl) struct super_block *sb; spin_lock(&sb_lock); -rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_dev == dev) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - super_lock(sb, excl); /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock(sb, excl); + if (born && sb->s_root) return sb; super_unlock(sb, excl); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); - goto rescan; + break; } } spin_unlock(&sb_lock); @@ -921,7 +1022,7 @@ int reconfigure_super(struct fs_context *fc) if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); - super_lock_excl(sb); + __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) @@ -984,9 +1085,9 @@ cancel_readonly: static void do_emergency_remount_callback(struct super_block *sb) { - super_lock_excl(sb); - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && - !sb_rdonly(sb)) { + bool born = super_lock_excl(sb); + + if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1020,8 +1121,9 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { - super_lock_excl(sb); - if (sb->s_root && sb->s_flags & SB_BORN) { + bool born = super_lock_excl(sb); + + if (born && sb->s_root) { emergency_thaw_bdev(sb); thaw_super_locked(sb); } else { @@ -1212,9 +1314,9 @@ EXPORT_SYMBOL(get_tree_keyed); */ static bool super_lock_shared_active(struct super_block *sb) { - super_lock_shared(sb); - if (!sb->s_root || - (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) { + bool born = super_lock_shared(sb); + + if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock_shared(sb); return false; } @@ -1374,7 +1476,7 @@ int get_tree_bdev(struct fs_context *fc, */ super_unlock_excl(s); error = setup_bdev_super(s, fc->sb_flags, fc); - super_lock_excl(s); + __super_lock_excl(s); if (!error) error = fill_super(s, fc); if (error) { @@ -1426,7 +1528,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, */ super_unlock_excl(s); error = setup_bdev_super(s, flags, NULL); - super_lock_excl(s); + __super_lock_excl(s); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { @@ -1566,13 +1668,13 @@ int vfs_get_tree(struct fs_context *fc) WARN_ON(!sb->s_bdi); /* - * Write barrier is for super_cache_count(). We place it before setting - * SB_BORN as the data dependency between the two functions is the - * superblock structure contents that we just set up, not the SB_BORN - * flag. + * super_wake() contains a memory barrier which also care of + * ordering for super_cache_count(). We place it before setting + * SB_BORN as the data dependency between the two functions is + * the superblock structure contents that we just set up, not + * the SB_BORN flag. */ - smp_wmb(); - sb->s_flags |= SB_BORN; + super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { @@ -1715,7 +1817,7 @@ int freeze_super(struct super_block *sb) int ret; atomic_inc(&sb->s_active); - super_lock_excl(sb); + __super_lock_excl(sb); if (sb->s_writers.frozen != SB_UNFROZEN) { deactivate_locked_super(sb); return -EBUSY; @@ -1737,7 +1839,7 @@ int freeze_super(struct super_block *sb) /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - super_lock_excl(sb); + __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; @@ -1820,7 +1922,7 @@ out: */ int thaw_super(struct super_block *sb) { - super_lock_excl(sb); + __super_lock_excl(sb); return thaw_super_locked(sb); } EXPORT_SYMBOL(thaw_super); diff --git a/include/linux/fs.h b/include/linux/fs.h index 14b5777a24a0..173672645156 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1095,6 +1095,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define SB_DYING BIT(24) #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) -- cgit v1.2.3 From 2c18a63b760a0f68f14cb8bb4c3840bb0b63b73e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 18 Aug 2023 16:00:51 +0200 Subject: super: wait until we passed kill super Recent rework moved block device closing out of sb->put_super() and into sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and blkdev_put() can end up taking s_umount again. That means we need to move the removal of the superblock from @fs_supers out of generic_shutdown_super() and into deactivate_locked_super() to ensure that concurrent mounters don't fail to open block devices that are still in use because blkdev_put() in sb->kill_sb() hasn't been called yet. We can now do this as we can make iterators through @fs_super and @super_blocks wait without holding s_umount. Concurrent mounts will wait until a dying superblock is fully dead so until sb->kill_sb() has been called and SB_DEAD been set. Concurrent iterators can already discard any SB_DYING superblock. Reviewed-by: Jan Kara Message-Id: <20230818-vfs-super-fixes-v3-v3-4-9f0b1876e46b@kernel.org> Signed-off-by: Christian Brauner --- fs/super.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++------ include/linux/fs.h | 1 + 2 files changed, 65 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index e2630fe4928a..2f604a6494fb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -153,7 +153,7 @@ static inline bool super_lock_excl(struct super_block *sb) } /* wake waiters */ -#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING) +#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); @@ -461,6 +461,25 @@ void deactivate_locked_super(struct super_block *s) list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); + /* + * Remove it from @fs_supers so it isn't found by new + * sget{_fc}() walkers anymore. Any concurrent mounter still + * managing to grab a temporary reference is guaranteed to + * already see SB_DYING and will wait until we notify them about + * SB_DEAD. + */ + spin_lock(&sb_lock); + hlist_del_init(&s->s_instances); + spin_unlock(&sb_lock); + + /* + * Let concurrent mounts know that this thing is really dead. + * We don't need @sb->s_umount here as every concurrent caller + * will see SB_DYING and either discard the superblock or wait + * for SB_DEAD. + */ + super_wake(s, SB_DEAD); + put_filesystem(fs); put_super(s); } else { @@ -517,6 +536,45 @@ static int grab_super(struct super_block *s) __releases(sb_lock) return 0; } +static inline bool wait_dead(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with memory barrier in super_wake() and ensures + * that we see SB_DEAD after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & SB_DEAD; +} + +/** + * grab_super_dead - acquire an active reference to a superblock + * @sb: superblock to acquire + * + * Acquire a temporary reference on a superblock and try to trade it for + * an active reference. This is used in sget{_fc}() to wait for a + * superblock to either become SB_BORN or for it to pass through + * sb->kill() and be marked as SB_DEAD. + * + * Return: This returns true if an active reference could be acquired, + * false if not. + */ +static bool grab_super_dead(struct super_block *sb) +{ + + sb->s_count++; + if (grab_super(sb)) { + put_super(sb); + lockdep_assert_held(&sb->s_umount); + return true; + } + wait_var_event(&sb->s_flags, wait_dead(sb)); + put_super(sb); + lockdep_assert_not_held(&sb->s_umount); + return false; +} + /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab @@ -643,15 +701,14 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb->s_inode_list_lock); } } - spin_lock(&sb_lock); - /* should be initialized for __put_super_and_need_restart() */ - hlist_del_init(&sb->s_instances); - spin_unlock(&sb_lock); /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. + * + * We leave the superblock on @fs_supers so it can be found by + * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); @@ -746,7 +803,7 @@ share_extant_sb: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; @@ -790,7 +847,7 @@ retry: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; diff --git a/include/linux/fs.h b/include/linux/fs.h index 173672645156..a63da68305e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1095,6 +1095,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define SB_DEAD BIT(21) #define SB_DYING BIT(24) #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) -- cgit v1.2.3