summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorCarlos Maiolino <cem@kernel.org>2025-05-14 12:20:57 +0200
committerCarlos Maiolino <cem@kernel.org>2025-05-14 12:20:57 +0200
commit6475ece803e7b41d5a3dcffffc35f6482d4cbb4d (patch)
treede64e29c352d00d104e1b3ad7a2553557a9d4a0f /block
parentf0447f80aec83f1699d599c94618bb5c323963e6 (diff)
parent8098514bd5ca98beca6ec725751d82d0d5b492d8 (diff)
Merge branch 'block-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block into xfs-6.16-merge
Merging block tree into XFS because of some dependencies like bdev_validate_blocksize() Signed-off-by: Carlos Maiolino <cem@kernel.org>
Diffstat (limited to 'block')
-rw-r--r--block/bdev.c67
-rw-r--r--block/bio-integrity-auto.c62
-rw-r--r--block/bio-integrity.c17
-rw-r--r--block/blk-cgroup.c2
-rw-r--r--block/blk-settings.c8
-rw-r--r--block/blk-sysfs.c2
-rw-r--r--block/blk-throttle.h1
-rw-r--r--block/blk-zoned.c5
-rw-r--r--block/blk.h6
-rw-r--r--block/fops.c18
-rw-r--r--block/ioctl.c6
-rw-r--r--block/ioprio.c6
12 files changed, 148 insertions, 52 deletions
diff --git a/block/bdev.c b/block/bdev.c
index 4844d1e27b6f..520515e4e64e 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -152,27 +152,65 @@ static void set_init_blocksize(struct block_device *bdev)
get_order(bsize));
}
-int set_blocksize(struct file *file, int size)
+/**
+ * bdev_validate_blocksize - check that this block size is acceptable
+ * @bdev: blockdevice to check
+ * @block_size: block size to check
+ *
+ * For block device users that do not use buffer heads or the block device
+ * page cache, make sure that this block size can be used with the device.
+ *
+ * Return: On success zero is returned, negative error code on failure.
+ */
+int bdev_validate_blocksize(struct block_device *bdev, int block_size)
{
- struct inode *inode = file->f_mapping->host;
- struct block_device *bdev = I_BDEV(inode);
-
- if (blk_validate_block_size(size))
+ if (blk_validate_block_size(block_size))
return -EINVAL;
/* Size cannot be smaller than the size supported by the device */
- if (size < bdev_logical_block_size(bdev))
+ if (block_size < bdev_logical_block_size(bdev))
return -EINVAL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_validate_blocksize);
+
+int set_blocksize(struct file *file, int size)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct block_device *bdev = I_BDEV(inode);
+ int ret;
+
+ ret = bdev_validate_blocksize(bdev, size);
+ if (ret)
+ return ret;
+
if (!file->private_data)
return -EINVAL;
/* Don't change the size if it is same as current */
if (inode->i_blkbits != blksize_bits(size)) {
+ /*
+ * Flush and truncate the pagecache before we reconfigure the
+ * mapping geometry because folio sizes are variable now. If a
+ * reader has already allocated a folio whose size is smaller
+ * than the new min_order but invokes readahead after the new
+ * min_order becomes visible, readahead will think there are
+ * "zero" blocks per folio and crash. Take the inode and
+ * invalidation locks to avoid racing with
+ * read/write/fallocate.
+ */
+ inode_lock(inode);
+ filemap_invalidate_lock(inode->i_mapping);
+
sync_blockdev(bdev);
+ kill_bdev(bdev);
+
inode->i_blkbits = blksize_bits(size);
mapping_set_folio_min_order(inode->i_mapping, get_order(size));
kill_bdev(bdev);
+ filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
}
return 0;
}
@@ -777,13 +815,13 @@ static void blkdev_put_part(struct block_device *part)
blkdev_put_whole(whole);
}
-struct block_device *blkdev_get_no_open(dev_t dev)
+struct block_device *blkdev_get_no_open(dev_t dev, bool autoload)
{
struct block_device *bdev;
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
- if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
+ if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
if (inode)
@@ -1005,7 +1043,7 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (ret)
return ERR_PTR(ret);
- bdev = blkdev_get_no_open(dev);
+ bdev = blkdev_get_no_open(dev, true);
if (!bdev)
return ERR_PTR(-ENXIO);
@@ -1275,18 +1313,15 @@ void sync_bdevs(bool wait)
void bdev_statx(struct path *path, struct kstat *stat,
u32 request_mask)
{
- struct inode *backing_inode;
struct block_device *bdev;
- backing_inode = d_backing_inode(path->dentry);
-
/*
- * Note that backing_inode is the inode of a block device node file,
- * not the block device's internal inode. Therefore it is *not* valid
- * to use I_BDEV() here; the block device has to be looked up by i_rdev
+ * Note that d_backing_inode() returns the block device node inode, not
+ * the block device's internal inode. Therefore it is *not* valid to
+ * use I_BDEV() here; the block device has to be looked up by i_rdev
* instead.
*/
- bdev = blkdev_get_no_open(backing_inode->i_rdev);
+ bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false);
if (!bdev)
return;
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
index e524c609be50..9c6657664792 100644
--- a/block/bio-integrity-auto.c
+++ b/block/bio-integrity-auto.c
@@ -9,6 +9,7 @@
* not aware of PI.
*/
#include <linux/blk-integrity.h>
+#include <linux/t10-pi.h>
#include <linux/workqueue.h>
#include "blk.h"
@@ -43,6 +44,29 @@ static void bio_integrity_verify_fn(struct work_struct *work)
bio_endio(bio);
}
+#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
+static bool bip_should_check(struct bio_integrity_payload *bip)
+{
+ return bip->bip_flags & BIP_CHECK_FLAGS;
+}
+
+static bool bi_offload_capable(struct blk_integrity *bi)
+{
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ return bi->tuple_size == sizeof(struct crc64_pi_tuple);
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ return bi->tuple_size == sizeof(struct t10_pi_tuple);
+ default:
+ pr_warn_once("%s: unknown integrity checksum type:%d\n",
+ __func__, bi->csum_type);
+ fallthrough;
+ case BLK_INTEGRITY_CSUM_NONE:
+ return false;
+ }
+}
+
/**
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
@@ -54,12 +78,12 @@ static void bio_integrity_verify_fn(struct work_struct *work)
*/
bool __bio_integrity_endio(struct bio *bio)
{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_integrity_data *bid =
container_of(bip, struct bio_integrity_data, bip);
- if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) {
+ if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
+ bip_should_check(bip)) {
INIT_WORK(&bid->work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bid->work);
return false;
@@ -84,6 +108,7 @@ bool bio_integrity_prep(struct bio *bio)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_data *bid;
+ bool set_flags = true;
gfp_t gfp = GFP_NOIO;
unsigned int len;
void *buf;
@@ -100,19 +125,24 @@ bool bio_integrity_prep(struct bio *bio)
switch (bio_op(bio)) {
case REQ_OP_READ:
- if (bi->flags & BLK_INTEGRITY_NOVERIFY)
- return true;
+ if (bi->flags & BLK_INTEGRITY_NOVERIFY) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ }
break;
case REQ_OP_WRITE:
- if (bi->flags & BLK_INTEGRITY_NOGENERATE)
- return true;
-
/*
* Zero the memory allocated to not leak uninitialized kernel
* memory to disk for non-integrity metadata where nothing else
* initializes the memory.
*/
- if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
+ if (bi->flags & BLK_INTEGRITY_NOGENERATE) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ gfp |= __GFP_ZERO;
+ } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
gfp |= __GFP_ZERO;
break;
default:
@@ -137,19 +167,21 @@ bool bio_integrity_prep(struct bio *bio)
bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY;
bip_set_seed(&bid->bip, bio->bi_iter.bi_sector);
- if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
- bid->bip.bip_flags |= BIP_IP_CHECKSUM;
- if (bi->csum_type)
- bid->bip.bip_flags |= BIP_CHECK_GUARD;
- if (bi->flags & BLK_INTEGRITY_REF_TAG)
- bid->bip.bip_flags |= BIP_CHECK_REFTAG;
+ if (set_flags) {
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
+ bid->bip.bip_flags |= BIP_IP_CHECKSUM;
+ if (bi->csum_type)
+ bid->bip.bip_flags |= BIP_CHECK_GUARD;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ bid->bip.bip_flags |= BIP_CHECK_REFTAG;
+ }
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf)) < len)
goto err_end_io;
/* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
+ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip))
blk_integrity_generate(bio);
else
bid->saved_bio_iter = bio->bi_iter;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 608594a154a5..43ef6bd06c85 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -66,16 +66,12 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
}
EXPORT_SYMBOL(bio_integrity_alloc);
-static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
- bool dirty)
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs)
{
int i;
- for (i = 0; i < nr_vecs; i++) {
- if (dirty && !PageCompound(bv[i].bv_page))
- set_page_dirty_lock(bv[i].bv_page);
+ for (i = 0; i < nr_vecs; i++)
unpin_user_page(bv[i].bv_page);
- }
}
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
@@ -91,7 +87,7 @@ static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
WARN_ON_ONCE(ret != bytes);
- bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
+ bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs);
}
/**
@@ -111,8 +107,7 @@ void bio_integrity_unmap_user(struct bio *bio)
return;
}
- bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt,
- bio_data_dir(bio) == READ);
+ bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt);
}
/**
@@ -198,7 +193,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
}
if (write)
- bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+ bio_integrity_unpin_bvec(bvec, nr_vecs);
else
memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
@@ -319,7 +314,7 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
return 0;
release_pages:
- bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+ bio_integrity_unpin_bvec(bvec, nr_bvecs);
free_bvec:
if (bvec != stack_vec)
kfree(bvec);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5905f277057b..ce93706555c5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -797,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
return -EINVAL;
input = skip_spaces(input);
- bdev = blkdev_get_no_open(MKDEV(major, minor));
+ bdev = blkdev_get_no_open(MKDEV(major, minor), false);
if (!bdev)
return -ENODEV;
if (bdev_is_partition(bdev)) {
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6b2dbe645d23..4817e7ca03f8 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -61,8 +61,14 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
+ *
+ * There is no hardware limitation for the read-ahead size and the user
+ * might have increased the read-ahead size through sysfs, so don't ever
+ * decrease it.
*/
- bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
+ bdi->ra_pages = max3(bdi->ra_pages,
+ lim->io_opt * 2 / PAGE_SIZE,
+ VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index a2882751f0d2..1f9b45b0b9ee 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -909,6 +909,8 @@ out_unregister_ia_ranges:
out_debugfs_remove:
blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
+ if (queue_is_mq(q))
+ blk_mq_sysfs_unregister(disk);
out_put_queue_kobj:
kobject_put(&disk->queue_kobj);
return ret;
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 7964cc041e06..f9f8666891ab 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 0c77244a35c9..8f15d1aa6eb8 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -343,6 +343,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
@@ -364,8 +365,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
- if (cmd == BLKRESETZONE)
+ if (cmd == BLKRESETZONE) {
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
+ }
return ret;
}
diff --git a/block/blk.h b/block/blk.h
index 006e3be433d2..594eeba7b949 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -94,6 +94,9 @@ static inline void blk_wait_io(struct completion *done)
wait_for_completion_io(done);
}
+struct block_device *blkdev_get_no_open(dev_t dev, bool autoload);
+void blkdev_put_no_open(struct block_device *bdev);
+
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
@@ -477,7 +480,8 @@ static inline void blk_zone_update_request_bio(struct request *rq,
* the original BIO sector so that blk_zone_write_plug_bio_endio() can
* lookup the zone write plug.
*/
- if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+ if (req_op(rq) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
bio->bi_iter.bi_sector = rq->__sector;
}
void blk_zone_write_plug_bio_endio(struct bio *bio);
diff --git a/block/fops.c b/block/fops.c
index be9f1dbea9ce..82b672d15ea4 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -642,7 +642,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- bdev = blkdev_get_no_open(inode->i_rdev);
+ bdev = blkdev_get_no_open(inode->i_rdev, true);
if (!bdev)
return -ENXIO;
@@ -746,7 +746,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = direct_write_fallback(iocb, from, ret,
blkdev_buffered_write(iocb, from));
} else {
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with
+ * set_blocksize changing i_blkbits/folio order and punching
+ * out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = blkdev_buffered_write(iocb, from);
+ inode_unlock_shared(bd_inode);
}
if (ret > 0)
@@ -757,6 +764,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
@@ -793,7 +801,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto reexpand;
}
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
+ * changing i_blkbits/folio order and punching out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = filemap_read(iocb, to, ret);
+ inode_unlock_shared(bd_inode);
reexpand:
if (unlikely(shorted))
@@ -836,6 +850,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
/*
@@ -868,6 +883,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
fail:
filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
return error;
}
diff --git a/block/ioctl.c b/block/ioctl.c
index faa40f383e27..e472cc1030c6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -142,6 +142,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
if (err)
return err;
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
@@ -174,6 +175,7 @@ out_unplug:
blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -199,12 +201,14 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
end > bdev_nr_bytes(bdev))
return -EINVAL;
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -236,6 +240,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end);
if (err)
@@ -246,6 +251,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
diff --git a/block/ioprio.c b/block/ioprio.c
index 73301a261429..f0ee2798539c 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -46,12 +46,8 @@ int ioprio_check_cap(int ioprio)
*/
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
- fallthrough;
- /* rt has prio field too */
- case IOPRIO_CLASS_BE:
- if (level >= IOPRIO_NR_LEVELS)
- return -EINVAL;
break;
+ case IOPRIO_CLASS_BE:
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE: