From 0c40d7cb5ef3af260e8c7f88e0e5d7ae15d6ce57 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 19 Jun 2025 19:17:58 +0800 Subject: block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits Currently, disks primarily implement the write zeroes command (aka REQ_OP_WRITE_ZEROES) through two mechanisms: the first involves physically writing zeros to the disk media (e.g., HDDs), while the second performs an unmap operation on the logical blocks, effectively putting them into a deallocated state (e.g., SSDs). The first method is generally slow, while the second method is typically very fast. For example, on certain NVMe SSDs that support NVME_NS_DEAC, submitting REQ_OP_WRITE_ZEROES requests with the NVME_WZ_DEAC bit can accelerate the write zeros operation by placing disk blocks into a deallocated state, which opportunistically avoids writing zeroes to media while still guaranteeing that subsequent reads from the specified block range will return zeroed data. This is a best-effort optimization, not a mandatory requirement, some devices may partially fall back to writing physical zeroes due to factors such as misalignment or being asked to clear a block range smaller than the device's internal allocation unit. Therefore, the speed of this operation is not guaranteed. It is difficult to determine whether the storage device supports unmap write zeroes operation. We cannot determine this by only querying bdev_limits(bdev)->max_write_zeroes_sectors. Therefore, first, add a new hardware queue limit parameters, max_hw_wzeroes_unmap_sectors, to indicate whether a device supports this unmap write zeroes operation. Then, add two new counterpart software queue limits, max_wzeroes_unmap_sectors and max_user_wzeroes_unmap_sectors, which allow users to disable this operation if the speed is very slow on some sepcial devices. Finally, for the stacked devices cases, initialize these two parameters to UINT_MAX. This operation should be enabled by both the stacking driver and all underlying devices. Thanks to Martin K. Petersen for optimizing the documentation of the write_zeroes_unmap sysfs interface. Signed-off-by: Zhang Yi Link: https://lore.kernel.org/20250619111806.3546162-2-yi.zhang@huaweicloud.com Reviewed-by: Christoph Hellwig Reviewed-by: "Martin K. Petersen" Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a59880c809c7..1a5725c1f93d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -383,6 +383,9 @@ struct queue_limits { unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_wzeroes_unmap_sectors; + unsigned int max_hw_wzeroes_unmap_sectors; + unsigned int max_user_wzeroes_unmap_sectors; unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; @@ -1042,6 +1045,7 @@ static inline void blk_queue_disable_secure_erase(struct request_queue *q) static inline void blk_queue_disable_write_zeroes(struct request_queue *q) { q->limits.max_write_zeroes_sectors = 0; + q->limits.max_wzeroes_unmap_sectors = 0; } /* @@ -1378,6 +1382,12 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) return bdev_limits(bdev)->max_write_zeroes_sectors; } +static inline unsigned int +bdev_write_zeroes_unmap_sectors(struct block_device *bdev) +{ + return bdev_limits(bdev)->max_wzeroes_unmap_sectors; +} + static inline bool bdev_nonrot(struct block_device *bdev) { return blk_queue_nonrot(bdev_get_queue(bdev)); -- cgit v1.2.3 From 9b8b84879d4adc506b0d3944e20b28d9f3f6994b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 18 Jun 2025 15:00:45 +0900 Subject: block: Increase BLK_DEF_MAX_SECTORS_CAP Back in 2015, commit d2be537c3ba3 ("block: bump BLK_DEF_MAX_SECTORS to 2560") increased the default maximum size of a block device I/O to 2560 sectors (1280 KiB) to "accommodate a 10-data-disk stripe write with chunk size 128k". This choice is rather arbitrary and since then, improvements to the block layer have software RAID drivers correctly advertize their stripe width through chunk_sectors and abuses of BLK_DEF_MAX_SECTORS_CAP by drivers (to set the HW limit rather than the default user controlled maximum I/O size) have been fixed. Since many block devices can benefit from a larger value of BLK_DEF_MAX_SECTORS_CAP, and in particular HDDs, increase this value to be 4MiB, or 8192 sectors. And given that BLK_DEF_MAX_SECTORS_CAP is only used in the block layer and should not be used by drivers directly, move this macro definition to the block layer internal header file block/blk.h. Suggested-by: Martin K . Petersen Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20250618060045.37593-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk.h | 9 +++++++++ include/linux/blkdev.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk.h b/block/blk.h index 37ec459fe656..1141b343d0b5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,15 @@ struct elevator_type; +/* + * Default upper limit for the software max_sectors limit used for regular I/Os. + * This can be increased through sysfs. + * + * This should not be confused with the max_hw_sector limit that is entirely + * controlled by the block device driver, usually based on hardware limits. + */ +#define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT) + #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) #define BLK_MIN_SEGMENT_SIZE 4096 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a59880c809c7..5c626d23cbb2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1220,15 +1220,6 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -/* - * Default upper limit for the software max_sectors limit used for - * regular file system I/O. This can be increased through sysfs. - * - * Not to be confused with the max_hw_sector limit that is entirely - * controlled by the driver, usually based on hardware limits. - */ -#define BLK_DEF_MAX_SECTORS_CAP 2560u - static inline struct queue_limits *bdev_limits(struct block_device *bdev) { return &bdev_get_queue(bdev)->limits; -- cgit v1.2.3 From f70291411ba20d50008db90a6f0731efac27872c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 25 Jun 2025 18:33:24 +0900 Subject: block: Introduce bio_needs_zone_write_plugging() In preparation for fixing device mapper zone write handling, introduce the inline helper function bio_needs_zone_write_plugging() to test if a BIO requires handling through zone write plugging using the function blk_zone_plug_bio(). This function returns true for any write (op_is_write(bio) == true) operation directed at a zoned block device using zone write plugging, that is, a block device with a disk that has a zone write plug hash table. This helper allows simplifying the check on entry to blk_zone_plug_bio() and used in to protect calls to it for blk-mq devices and DM devices. Fixes: f211268ed1f9 ("dm: Use the block layer zone append emulation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250625093327.548866-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- block/blk-zoned.c | 20 +----------------- drivers/md/dm.c | 4 +++- include/linux/blkdev.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 22 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4806b867e37d..0c61492724d2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3169,8 +3169,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) - goto queue_exit; + if (bio_needs_zone_write_plugging(bio)) { + if (blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + } new_request: if (rq) { diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 351d659280e1..efe71b1a1da1 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1116,25 +1116,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { struct block_device *bdev = bio->bi_bdev; - if (!bdev->bd_disk->zone_wplugs_hash) - return false; - - /* - * If the BIO already has the plugging flag set, then it was already - * handled through this path and this is a submission from the zone - * plug bio submit work. - */ - if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) - return false; - - /* - * We do not need to do anything special for empty flush BIOs, e.g - * BIOs such as issued by blkdev_issue_flush(). The is because it is - * the responsibility of the user to first wait for the completion of - * write operations for flush to have any effect on the persistence of - * the written data. - */ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) return false; /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1726f0f828cc..6e3de50eae47 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1788,7 +1788,9 @@ static inline bool dm_zone_bio_needs_split(struct mapped_device *md, } static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { - return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); + if (!bio_needs_zone_write_plugging(bio)) + return false; + return blk_zone_plug_bio(bio, 0); } static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c626d23cbb2..a51f92b6c340 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -837,6 +837,55 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return disk->nr_zones; } + +/** + * bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone + * write plugging + * @bio: The BIO being submitted + * + * Return true whenever @bio execution needs to be handled through zone + * write plugging (using blk_zone_plug_bio()). Return false otherwise. + */ +static inline bool bio_needs_zone_write_plugging(struct bio *bio) +{ + enum req_op op = bio_op(bio); + + /* + * Only zoned block devices have a zone write plug hash table. But not + * all of them have one (e.g. DM devices may not need one). + */ + if (!bio->bi_bdev->bd_disk->zone_wplugs_hash) + return false; + + /* Only write operations need zone write plugging. */ + if (!op_is_write(op)) + return false; + + /* Ignore empty flush */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + + /* Ignore BIOs that already have been handled by zone write plugging. */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return false; + + /* + * All zone write operations must be handled through zone write plugging + * using blk_zone_plug_bio(). + */ + switch (op) { + case REQ_OP_ZONE_APPEND: + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + return true; + default: + return false; + } +} + bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); /** @@ -866,6 +915,12 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } + +static inline bool bio_needs_zone_write_plugging(struct bio *bio) +{ + return false; +} + static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { return false; -- cgit v1.2.3 From c6603b1d6556cc02d0169f74508ab0e3e3e4bd76 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 30 Jun 2025 14:35:45 +0530 Subject: block: rename tuple_size field in blk_integrity to metadata_size The tuple_size field in blk_integrity currently represents the total size of metadata associated with each data interval. To make the meaning more explicit, rename tuple_size to metadata_size. This is a purely mechanical rename with no functional changes. Suggested-by: Martin K. Petersen Signed-off-by: Anuj Gupta Link: https://lore.kernel.org/20250630090548.3317-2-anuj20.g@samsung.com Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Christian Brauner --- block/bio-integrity-auto.c | 4 ++-- block/blk-integrity.c | 2 +- block/blk-settings.c | 6 +++--- block/t10-pi.c | 16 ++++++++-------- drivers/md/dm-crypt.c | 4 ++-- drivers/md/dm-integrity.c | 12 ++++++------ drivers/nvdimm/btt.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/nvme/target/io-cmd-bdev.c | 2 +- drivers/scsi/sd_dif.c | 2 +- include/linux/blk-integrity.h | 4 ++-- include/linux/blkdev.h | 2 +- 12 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 9c6657664792..687952f63bbb 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -54,10 +54,10 @@ static bool bi_offload_capable(struct blk_integrity *bi) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_CRC64: - return bi->tuple_size == sizeof(struct crc64_pi_tuple); + return bi->metadata_size == sizeof(struct crc64_pi_tuple); case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: - return bi->tuple_size == sizeof(struct t10_pi_tuple); + return bi->metadata_size == sizeof(struct t10_pi_tuple); default: pr_warn_once("%s: unknown integrity checksum type:%d\n", __func__, bi->csum_type); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index e4e2567061f9..c1102bf4cd8d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -239,7 +239,7 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr, { struct blk_integrity *bi = dev_to_bi(dev); - if (!bi->tuple_size) + if (!bi->metadata_size) return sysfs_emit(page, "none\n"); return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } diff --git a/block/blk-settings.c b/block/blk-settings.c index a000daafbfb4..787500ff00c3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -114,7 +114,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; - if (!bi->tuple_size) { + if (!bi->metadata_size) { if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { pr_warn("invalid PI settings.\n"); @@ -875,7 +875,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t, return true; if (ti->flags & BLK_INTEGRITY_STACKED) { - if (ti->tuple_size != bi->tuple_size) + if (ti->metadata_size != bi->metadata_size) goto incompatible; if (ti->interval_exp != bi->interval_exp) goto incompatible; @@ -891,7 +891,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t, ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | (bi->flags & BLK_INTEGRITY_REF_TAG); ti->csum_type = bi->csum_type; - ti->tuple_size = bi->tuple_size; + ti->metadata_size = bi->metadata_size; ti->pi_offset = bi->pi_offset; ti->interval_exp = bi->interval_exp; ti->tag_size = bi->tag_size; diff --git a/block/t10-pi.c b/block/t10-pi.c index 851db518ee5e..0c4ed9702146 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -56,7 +56,7 @@ static void t10_pi_generate(struct blk_integrity_iter *iter, pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } } @@ -105,7 +105,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } @@ -125,7 +125,7 @@ next: static void t10_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -177,7 +177,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -234,7 +234,7 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, put_unaligned_be48(0ULL, pi->ref_tag); iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } } @@ -289,7 +289,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } @@ -299,7 +299,7 @@ next: static void ext_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -340,7 +340,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9dfdb63220d7..3d6d06b94c9f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1189,11 +1189,11 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } - if (bi->tuple_size < cc->used_tag_size) { + if (bi->metadata_size < cc->used_tag_size) { ti->error = "Integrity profile tag size mismatch."; return -EINVAL; } - cc->tuple_size = bi->tuple_size; + cc->tuple_size = bi->metadata_size; if (1 << bi->interval_exp != cc->sector_size) { ti->error = "Integrity profile sector size mismatch."; return -EINVAL; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4395657fa583..efeee0a873c0 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3906,8 +3906,8 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim struct blk_integrity *bi = &limits->integrity; memset(bi, 0, sizeof(*bi)); - bi->tuple_size = ic->tag_size; - bi->tag_size = bi->tuple_size; + bi->metadata_size = ic->tag_size; + bi->tag_size = bi->metadata_size; bi->interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; } @@ -4746,18 +4746,18 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Integrity profile not supported"; goto bad; } - /*printk("tag_size: %u, tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/ - if (bi->tuple_size < ic->tag_size) { + /*printk("tag_size: %u, metadata_size: %u\n", bi->tag_size, bi->metadata_size);*/ + if (bi->metadata_size < ic->tag_size) { r = -EINVAL; ti->error = "The integrity profile is smaller than tag size"; goto bad; } - if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) { + if ((unsigned long)bi->metadata_size > PAGE_SIZE / 2) { r = -EINVAL; ti->error = "Too big tuple size"; goto bad; } - ic->tuple_size = bi->tuple_size; + ic->tuple_size = bi->metadata_size; if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) { r = -EINVAL; ti->error = "Integrity profile sector size mismatch"; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 423dcd190906..2a1aa32e6693 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1506,7 +1506,7 @@ static int btt_blk_init(struct btt *btt) int rc; if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { - lim.integrity.tuple_size = btt_meta_size(btt); + lim.integrity.metadata_size = btt_meta_size(btt); lim.integrity.tag_size = btt_meta_size(btt); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 92697f98c601..b027dda38e69 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1866,7 +1866,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, break; } - bi->tuple_size = head->ms; + bi->metadata_size = head->ms; bi->pi_offset = info->pi_offset; return true; } diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index eba42df2f821..42fb19f94ab8 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -65,7 +65,7 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) return; if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) { - ns->metadata_size = bi->tuple_size; + ns->metadata_size = bi->metadata_size; if (bi->flags & BLK_INTEGRITY_REF_TAG) ns->pi_type = NVME_NS_DPS_PI_TYPE1; else diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index ae6ce6f5d622..18bfca1f1c78 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -52,7 +52,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim) if (type != T10_PI_TYPE3_PROTECTION) bi->flags |= BLK_INTEGRITY_REF_TAG; - bi->tuple_size = sizeof(struct t10_pi_tuple); + bi->metadata_size = sizeof(struct t10_pi_tuple); if (dif && type) { bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c7eae0bfb013..d27730da47f3 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -33,7 +33,7 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { - return q->limits.integrity.tuple_size; + return q->limits.integrity.metadata_size; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) @@ -74,7 +74,7 @@ static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { - return bio_integrity_intervals(bi, sectors) * bi->tuple_size; + return bio_integrity_intervals(bi, sectors) * bi->metadata_size; } static inline bool blk_integrity_rq(struct request *rq) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a59880c809c7..ccda87d06a38 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -116,7 +116,7 @@ enum blk_integrity_checksum { struct blk_integrity { unsigned char flags; enum blk_integrity_checksum csum_type; - unsigned char tuple_size; + unsigned char metadata_size; unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; -- cgit v1.2.3 From 76e45252a4cefa205439eb6610a244771e7d88da Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 30 Jun 2025 14:35:46 +0530 Subject: block: introduce pi_tuple_size field in blk_integrity Introduce a new pi_tuple_size field in struct blk_integrity to explicitly represent the size (in bytes) of the protection information (PI) tuple. This is a prep patch. Add validation in blk_validate_integrity_limits() to ensure that pi size matches the expected size for known checksum types and never exceeds the pi_tuple_size. Suggested-by: Christoph Hellwig Signed-off-by: Anuj Gupta Link: https://lore.kernel.org/20250630090548.3317-3-anuj20.g@samsung.com Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Christian Brauner --- block/blk-settings.c | 38 ++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/core.c | 2 ++ drivers/scsi/sd_dif.c | 1 + include/linux/blkdev.h | 1 + 4 files changed, 42 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-settings.c b/block/blk-settings.c index 787500ff00c3..32f3cdc9835a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "blk.h" #include "blk-rq-qos.h" @@ -135,6 +137,42 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return -EINVAL; } + if (bi->pi_tuple_size > bi->metadata_size) { + pr_warn("pi_tuple_size (%u) exceeds metadata_size (%u)\n", + bi->pi_tuple_size, + bi->metadata_size); + return -EINVAL; + } + + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_NONE: + if (bi->pi_tuple_size) { + pr_warn("pi_tuple_size must be 0 when checksum type \ + is none\n"); + return -EINVAL; + } + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) { + pr_warn("pi_tuple_size mismatch for T10 PI: expected \ + %zu, got %u\n", + sizeof(struct t10_pi_tuple), + bi->pi_tuple_size); + return -EINVAL; + } + break; + case BLK_INTEGRITY_CSUM_CRC64: + if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) { + pr_warn("pi_tuple_size mismatch for CRC64 PI: \ + expected %zu, got %u\n", + sizeof(struct crc64_pi_tuple), + bi->pi_tuple_size); + return -EINVAL; + } + break; + } + if (!bi->interval_exp) bi->interval_exp = ilog2(lim->logical_block_size); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b027dda38e69..fe72accab516 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1867,6 +1867,8 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, } bi->metadata_size = head->ms; + if (bi->csum_type) + bi->pi_tuple_size = head->pi_size; bi->pi_offset = info->pi_offset; return true; } diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 18bfca1f1c78..ff4217fef93b 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -53,6 +53,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim) bi->flags |= BLK_INTEGRITY_REF_TAG; bi->metadata_size = sizeof(struct t10_pi_tuple); + bi->pi_tuple_size = bi->metadata_size; if (dif && type) { bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ccda87d06a38..0d4011dcfed3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -120,6 +120,7 @@ struct blk_integrity { unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; + unsigned char pi_tuple_size; }; typedef unsigned int __bitwise blk_mode_t; -- cgit v1.2.3 From 4cdf1bdd45ac78a088773722f009883af30ad318 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 4 Jul 2025 11:21:34 +0200 Subject: block: reject bs > ps block devices when THP is disabled If THP is disabled and when a block device with logical block size > page size is present, the following null ptr deref panic happens during boot: [ [13.2 mK AOSAN: null-ptr-deref in range [0x0000000000000000-0x0000000000K0 0 0[07] [ 13.017749] RIP: 0010:create_empty_buffers+0x3b/0x380 [ 13.025448] Call Trace: [ 13.025692] [ 13.025895] block_read_full_folio+0x610/0x780 [ 13.026379] ? __pfx_blkdev_get_block+0x10/0x10 [ 13.027008] ? __folio_batch_add_and_move+0x1fa/0x2b0 [ 13.027548] ? __pfx_blkdev_read_folio+0x10/0x10 [ 13.028080] filemap_read_folio+0x9b/0x200 [ 13.028526] ? __pfx_filemap_read_folio+0x10/0x10 [ 13.029030] ? __filemap_get_folio+0x43/0x620 [ 13.029497] do_read_cache_folio+0x155/0x3b0 [ 13.029962] ? __pfx_blkdev_read_folio+0x10/0x10 [ 13.030381] read_part_sector+0xb7/0x2a0 [ 13.030805] read_lba+0x174/0x2c0 [ 13.045348] nvme_scan_ns+0x684/0x850 [nvme_core] [ 13.045858] ? __pfx_nvme_scan_ns+0x10/0x10 [nvme_core] [ 13.046414] ? _raw_spin_unlock+0x15/0x40 [ 13.046843] ? __switch_to+0x523/0x10a0 [ 13.047253] ? kvm_clock_get_cycles+0x14/0x30 [ 13.047742] ? __pfx_nvme_scan_ns_async+0x10/0x10 [nvme_core] [ 13.048353] async_run_entry_fn+0x96/0x4f0 [ 13.048787] process_one_work+0x667/0x10a0 [ 13.049219] worker_thread+0x63c/0xf60 As large folio support depends on THP, only allow bs > ps block devices if THP is enabled. Fixes: 47dd67532303 ("block/bdev: lift block size restrictions to 64k") Signed-off-by: Pankaj Raghav Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20250704092134.289491-1-p.raghav@samsung.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 332b56f323d9..369a8e63c865 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -269,11 +269,16 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) * however we constrain this to what we can validate and test. */ #define BLK_MAX_BLOCK_SIZE SZ_64K +#else +#define BLK_MAX_BLOCK_SIZE PAGE_SIZE +#endif + /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) -- cgit v1.2.3