From ebc4176551cdd021d02f4d2ed734e7b65e44442a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Feb 2025 22:00:35 -0800 Subject: blk-crypto: add basic hardware-wrapped key support To prevent keys from being compromised if an attacker acquires read access to kernel memory, some inline encryption hardware can accept keys which are wrapped by a per-boot hardware-internal key. This avoids needing to keep the raw keys in kernel memory, without limiting the number of keys that can be used. Such hardware also supports deriving a "software secret" for cryptographic tasks that can't be handled by inline encryption; this is needed for fscrypt to work properly. To support this hardware, allow struct blk_crypto_key to represent a hardware-wrapped key as an alternative to a raw key, and make drivers set flags in struct blk_crypto_profile to indicate which types of keys they support. Also add the ->derive_sw_secret() low-level operation, which drivers supporting wrapped keys must implement. For more information, see the detailed documentation which this patch adds to Documentation/block/inline-encryption.rst. Signed-off-by: Eric Biggers Tested-by: Bartosz Golaszewski # sm8650 Link: https://lore.kernel.org/r/20250204060041.409950-2-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 7 +++--- block/blk-crypto-internal.h | 1 + block/blk-crypto-profile.c | 46 ++++++++++++++++++++++++++++++++++ block/blk-crypto.c | 61 ++++++++++++++++++++++++++++++++++----------- 4 files changed, 98 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 29a205482617..f154be0b575a 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -87,7 +87,7 @@ static struct bio_set crypto_bio_split; * This is the key we set when evicting a keyslot. This *should* be the all 0's * key, but AES-XTS rejects that key, so we use some random bytes instead. 
*/ -static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; +static u8 blank_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { @@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, key->size); if (err) { blk_crypto_fallback_evict_keyslot(slot); @@ -539,7 +539,7 @@ static int blk_crypto_fallback_init(void) if (blk_crypto_fallback_inited) return 0; - get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + get_random_bytes(blank_key, sizeof(blank_key)); err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) @@ -561,6 +561,7 @@ static int blk_crypto_fallback_init(void) blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops; blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + blk_crypto_fallback_profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; /* All blk-crypto modes have a crypto API fallback. 
*/ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 93a141979694..1893df9a8f06 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -14,6 +14,7 @@ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ + unsigned int security_strength; /* security strength in bytes */ unsigned int ivsize; /* iv size in bytes */ }; diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 7fabc883e39f..a990d9026c83 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, return false; if (profile->max_dun_bytes_supported < cfg->dun_bytes) return false; + if (!(profile->key_types_supported & cfg->key_type)) + return false; return true; } @@ -462,6 +464,44 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, } EXPORT_SYMBOL_GPL(blk_crypto_register); +/** + * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key + * @bdev: a block device that supports hardware-wrapped keys + * @eph_key: a hardware-wrapped key in ephemerally-wrapped form + * @eph_key_size: size of @eph_key in bytes + * @sw_secret: (output) the software secret + * + * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that + * it is used for I/O), ask the hardware to derive the secret which software can + * use for cryptographic tasks other than inline encryption. This secret is + * guaranteed to be cryptographically isolated from the inline encryption key, + * i.e. derived with a different KDF context. + * + * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support + * hardware-wrapped keys, -EBADMSG if the key isn't a valid + * ephemerally-wrapped key, or another -errno code. 
+ */ +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.derive_sw_secret) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size, + sw_secret); + blk_crypto_hw_exit(profile); + return err; +} + /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device @@ -485,10 +525,12 @@ void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, child->max_dun_bytes_supported); for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++) parent->modes_supported[i] &= child->modes_supported[i]; + parent->key_types_supported &= child->key_types_supported; } else { parent->max_dun_bytes_supported = 0; memset(parent->modes_supported, 0, sizeof(parent->modes_supported)); + parent->key_types_supported = 0; } } EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities); @@ -521,6 +563,9 @@ bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, target->max_dun_bytes_supported) return false; + if (reference->key_types_supported & ~target->key_types_supported) + return false; + return true; } EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities); @@ -555,5 +600,6 @@ void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, sizeof(dst->modes_supported)); dst->max_dun_bytes_supported = src->max_dun_bytes_supported; + dst->key_types_supported = src->key_types_supported; } EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 4d760b092deb..72975a980fbc 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -23,24 +23,28 @@ const struct blk_crypto_mode 
blk_crypto_modes[] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, + .security_strength = 32, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, + .security_strength = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, + .security_strength = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, + .security_strength = 16, .ivsize = 16, }, }; @@ -76,9 +80,15 @@ static int __init bio_crypt_ctx_init(void) /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); - /* Sanity check that no algorithm exceeds the defined limits. */ + /* + * Validate the crypto mode properties. This ideally would be done with + * static assertions, but boot-time checks are the next best thing. + */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { - BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].keysize > + BLK_CRYPTO_MAX_RAW_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].security_strength > + blk_crypto_modes[i].keysize); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } @@ -315,17 +325,20 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. - * @raw_key: Pointer to the raw key. Must be the correct length for the chosen - * @crypto_mode; see blk_crypto_modes[]. 
+ * @key_bytes: the bytes of the key + * @key_size: size of the key in bytes + * @key_type: type of the key -- either raw or hardware-wrapped * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for - * zeroizing both blk_key and raw_key when done with them. + * zeroizing both blk_key and key_bytes when done with them. */ -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *key_bytes, size_t key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) @@ -338,8 +351,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; - if (mode->keysize == 0) + switch (key_type) { + case BLK_CRYPTO_KEY_TYPE_RAW: + if (key_size != mode->keysize) + return -EINVAL; + break; + case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED: + if (key_size < mode->security_strength || + key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + return -EINVAL; + break; + default: return -EINVAL; + } if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; @@ -350,9 +374,10 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->crypto_cfg.key_type = key_type; blk_key->data_unit_size_bits = ilog2(data_unit_size); - blk_key->size = mode->keysize; - memcpy(blk_key->raw, raw_key, mode->keysize); + blk_key->size = key_size; + memcpy(blk_key->bytes, key_bytes, key_size); return 0; } @@ -372,8 +397,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, 
bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { - return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_crypto_config_supported_natively(bdev, cfg); + if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && + cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW) + return true; + return blk_crypto_config_supported_natively(bdev, cfg); } /** @@ -387,15 +414,21 @@ bool blk_crypto_config_supported(struct block_device *bdev, * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * - * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and - * blk-crypto-fallback is either disabled or the needed algorithm - * is disabled in the crypto API; or another -errno code. + * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does + * not support wrapped keys; -ENOPKG if the key is a raw key but the + * hardware does not support raw keys and blk-crypto-fallback is either + * disabled or the needed algorithm is disabled in the crypto API; or + * another -errno code if something else went wrong. 
*/ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; + if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) { + pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev); + return -EOPNOTSUPP; + } return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } -- cgit v1.2.3 From e35fde43e25ad725d27315992fba8088d1210b01 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Feb 2025 22:00:36 -0800 Subject: blk-crypto: show supported key types in sysfs Add sysfs files that indicate which type(s) of keys are supported by the inline encryption hardware associated with a particular request queue: /sys/block/$disk/queue/crypto/hw_wrapped_keys /sys/block/$disk/queue/crypto/raw_keys Userspace can use the presence or absence of these files to decide what encryption settings to use. Don't use a single key_type file, as devices might support both key types at the same time. Signed-off-by: Eric Biggers Tested-by: Bartosz Golaszewski # sm8650 Link: https://lore.kernel.org/r/20250204060041.409950-3-ebiggers@kernel.org Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 20 ++++++++++++++++++++ block/blk-crypto-sysfs.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'block') diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 0cceb2badc83..890cde28bf90 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -229,6 +229,17 @@ Description: encryption, refer to Documentation/block/inline-encryption.rst. +What: /sys/block/<disk>/queue/crypto/hw_wrapped_keys +Date: February 2025 +Contact: linux-block@vger.kernel.org +Description: + [RO] The presence of this file indicates that the device + supports hardware-wrapped inline encryption keys, i.e. key blobs + that can only be unwrapped and used by dedicated hardware.
For + more information about hardware-wrapped inline encryption keys, + see Documentation/block/inline-encryption.rst. + + What: /sys/block/<disk>/queue/crypto/max_dun_bits Date: February 2022 Contact: linux-block@vger.kernel.org @@ -267,6 +278,15 @@ Description: use with inline encryption. +What: /sys/block/<disk>/queue/crypto/raw_keys +Date: February 2025 +Contact: linux-block@vger.kernel.org +Description: + [RO] The presence of this file indicates that the device + supports raw inline encryption keys, i.e. keys that are managed + in raw, plaintext form in software. + + What: /sys/block/<disk>/queue/dax Date: June 2016 Contact: linux-block@vger.kernel.org diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index a304434489ba..e832f403f200 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -31,6 +31,13 @@ static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) return container_of(attr, struct blk_crypto_attr, attr); } +static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise. */ + return sysfs_emit(page, "supported\n"); +} + static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page) { @@ -43,20 +50,48 @@ static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, return sysfs_emit(page, "%u\n", profile->num_slots); } +static ssize_t raw_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise.
*/ + return sysfs_emit(page, "supported\n"); +} + #define BLK_CRYPTO_RO_ATTR(_name) \ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) +BLK_CRYPTO_RO_ATTR(hw_wrapped_keys); BLK_CRYPTO_RO_ATTR(max_dun_bits); BLK_CRYPTO_RO_ATTR(num_keyslots); +BLK_CRYPTO_RO_ATTR(raw_keys); + +static umode_t blk_crypto_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + if (a == &hw_wrapped_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return 0; + if (a == &raw_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_RAW)) + return 0; + + return 0444; +} static struct attribute *blk_crypto_attrs[] = { + &hw_wrapped_keys_attr.attr, &max_dun_bits_attr.attr, &num_keyslots_attr.attr, + &raw_keys_attr.attr, NULL, }; static const struct attribute_group blk_crypto_attr_group = { .attrs = blk_crypto_attrs, + .is_visible = blk_crypto_is_visible, }; /* -- cgit v1.2.3 From 1ebd4a3c095cd538d3c1c7c12738ef47d8e71f96 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Feb 2025 22:00:37 -0800 Subject: blk-crypto: add ioctls to create and prepare hardware-wrapped keys Until this point, the kernel can use hardware-wrapped keys to do encryption if userspace provides one -- specifically a key in ephemerally-wrapped form. However, no generic way has been provided for userspace to get such a key in the first place. Getting such a key is a two-step process. First, the key needs to be imported from a raw key or generated by the hardware, producing a key in long-term wrapped form. This happens once in the whole lifetime of the key. Second, the long-term wrapped key needs to be converted into ephemerally-wrapped form. This happens each time the key is "unlocked". In Android, these operations are supported in a generic way through KeyMint, a userspace abstraction layer. 
However, that method is Android-specific and can't be used on other Linux systems, may rely on proprietary libraries, and also misleads people into supporting KeyMint features like rollback resistance that make sense for other KeyMint keys but don't make sense for hardware-wrapped inline encryption keys. Therefore, this patch provides a generic kernel interface for these operations by introducing new block device ioctls: - BLKCRYPTOIMPORTKEY: convert a raw key to long-term wrapped form. - BLKCRYPTOGENERATEKEY: have the hardware generate a new key, then return it in long-term wrapped form. - BLKCRYPTOPREPAREKEY: convert a key from long-term wrapped form to ephemerally-wrapped form. These ioctls are implemented using new operations in blk_crypto_ll_ops. Signed-off-by: Eric Biggers Tested-by: Bartosz Golaszewski # sm8650 Link: https://lore.kernel.org/r/20250204060041.409950-4-ebiggers@kernel.org Signed-off-by: Jens Axboe --- Documentation/block/inline-encryption.rst | 36 ++++++ Documentation/userspace-api/ioctl/ioctl-number.rst | 2 + block/blk-crypto-internal.h | 9 ++ block/blk-crypto-profile.c | 55 ++++++++ block/blk-crypto.c | 143 +++++++++++++++++++++ block/ioctl.c | 5 + include/linux/blk-crypto-profile.h | 53 ++++++++ include/linux/blk-crypto.h | 1 + include/uapi/linux/blk-crypto.h | 44 +++++++ include/uapi/linux/fs.h | 6 +- 10 files changed, 350 insertions(+), 4 deletions(-) create mode 100644 include/uapi/linux/blk-crypto.h (limited to 'block') diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index f03bd5b090d8..6380e6ab492b 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -492,6 +492,42 @@ when hardware support is available. This works in the following way: blk-crypto-fallback doesn't support hardware-wrapped keys. Therefore, hardware-wrapped keys can only be used with actual inline encryption hardware. 
+All the above deals with hardware-wrapped keys in ephemerally-wrapped form only. +To get such keys in the first place, new block device ioctls have been added to +provide a generic interface to creating and preparing such keys: + +- ``BLKCRYPTOIMPORTKEY`` converts a raw key to long-term wrapped form. It takes + in a pointer to a ``struct blk_crypto_import_key_arg``. The caller must set + ``raw_key_ptr`` and ``raw_key_size`` to the pointer and size (in bytes) of the + raw key to import. On success, ``BLKCRYPTOIMPORTKEY`` returns 0 and writes + the resulting long-term wrapped key blob to the buffer pointed to by + ``lt_key_ptr``, which is of maximum size ``lt_key_size``. It also updates + ``lt_key_size`` to be the actual size of the key. On failure, it returns -1 + and sets errno. An errno of ``EOPNOTSUPP`` indicates that the block device + does not support hardware-wrapped keys. An errno of ``EOVERFLOW`` indicates + that the output buffer did not have enough space for the key blob. + +- ``BLKCRYPTOGENERATEKEY`` is like ``BLKCRYPTOIMPORTKEY``, but it has the + hardware generate the key instead of importing one. It takes in a pointer to + a ``struct blk_crypto_generate_key_arg``. + +- ``BLKCRYPTOPREPAREKEY`` converts a key from long-term wrapped form to + ephemerally-wrapped form. It takes in a pointer to a ``struct + blk_crypto_prepare_key_arg``. The caller must set ``lt_key_ptr`` and + ``lt_key_size`` to the pointer and size (in bytes) of the long-term wrapped + key blob to convert. On success, ``BLKCRYPTOPREPAREKEY`` returns 0 and writes + the resulting ephemerally-wrapped key blob to the buffer pointed to by + ``eph_key_ptr``, which is of maximum size ``eph_key_size``. It also updates + ``eph_key_size`` to be the actual size of the key. On failure, it returns -1 + and sets errno. Errno values of ``EOPNOTSUPP`` and ``EOVERFLOW`` mean the + same as they do for ``BLKCRYPTOIMPORTKEY``. An errno of ``EBADMSG`` indicates + that the long-term wrapped key is invalid. 
+ +Userspace needs to use either ``BLKCRYPTOIMPORTKEY`` or ``BLKCRYPTOGENERATEKEY`` +once to create a key, and then ``BLKCRYPTOPREPAREKEY`` each time the key is +unlocked and added to the kernel. Note that these ioctls have no relevance for +raw keys; they are only for hardware-wrapped keys. + Testability ----------- diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 6d1465315df3..d448c010d307 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -85,6 +85,8 @@ Code Seq# Include File Comments 0x10 20-2F arch/s390/include/uapi/asm/hypfs.h 0x12 all linux/fs.h BLK* ioctls linux/blkpg.h + linux/blkzoned.h + linux/blk-crypto.h 0x15 all linux/fs.h FS_IOC_* ioctls 0x1b all InfiniBand Subsystem diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 1893df9a8f06..ccf6dff6ff6b 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -83,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -130,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq) return false; } +static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index a990d9026c83..94a155912bf1 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -502,6 +502,61 @@ int blk_crypto_derive_sw_secret(struct block_device *bdev, return 
err; } +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.import_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size, + lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.generate_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.generate_key(profile, lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.prepare_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size, + eph_key); + blk_crypto_hw_exit(profile); + return ret; +} + /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 72975a980fbc..4b1ad84d1b5a 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -469,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev, pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); + +static int blk_crypto_ioctl_import_key(struct blk_crypto_profile 
*profile, + void __user *argp) +{ + struct blk_crypto_import_key_arg arg; + u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key)) + return -EINVAL; + + if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr), + arg.raw_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(raw_key, sizeof(raw_key)); + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_generate_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + ret = blk_crypto_generate_key(profile, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_prepare_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + u8 
eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.lt_key_size > sizeof(lt_key)) + return -EINVAL; + + if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr), + arg.lt_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key); + if (ret < 0) + goto out; + if (ret > arg.eph_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.eph_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key, + arg.eph_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + memzero_explicit(eph_key, sizeof(eph_key)); + return ret; +} + +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + + if (!profile) + return -EOPNOTSUPP; + + switch (cmd) { + case BLKCRYPTOIMPORTKEY: + return blk_crypto_ioctl_import_key(profile, argp); + case BLKCRYPTOGENERATEKEY: + return blk_crypto_ioctl_generate_key(profile, argp); + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl_prepare_key(profile, argp); + default: + return -ENOTTY; + } +} diff --git a/block/ioctl.c b/block/ioctl.c index 6554b728bae6..faa40f383e27 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -15,6 +15,7 @@ #include #include #include "blk.h" +#include "blk-crypto-internal.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) @@ -620,6 +621,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKCRYPTOIMPORTKEY: + case BLKCRYPTOGENERATEKEY: + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl(bdev, cmd, argp); case 
IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 7764b4f7b45b..4f39e9cd7576 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -71,6 +71,48 @@ struct blk_crypto_ll_ops { int (*derive_sw_secret)(struct blk_crypto_profile *profile, const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + + /** + * @import_key: Create a hardware-wrapped key by importing a raw key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. + */ + int (*import_key)(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @generate_key: Generate a hardware-wrapped key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. + */ + int (*generate_key)(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @prepare_key: Prepare a hardware-wrapped key to be used. + * + * Prepare a hardware-wrapped key to be used by converting it from + * long-term wrapped form to ephemerally-wrapped form. This only needs + * to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED is supported. + * + * On success, must write the key in ephemerally-wrapped form to + * @eph_key and return its size in bytes. On failure, must return + * -EBADMSG if the key is invalid, or another -errno on other error. 
+ */ + int (*prepare_key)(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); }; /** @@ -163,6 +205,17 @@ void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, const struct blk_crypto_profile *child); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 81f932b3ea37..58b0c5254a67 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -8,6 +8,7 @@ #include #include +#include enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, diff --git a/include/uapi/linux/blk-crypto.h b/include/uapi/linux/blk-crypto.h new file mode 100644 index 000000000000..97302c6eb6af --- /dev/null +++ b/include/uapi/linux/blk-crypto.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_BLK_CRYPTO_H +#define _UAPI_LINUX_BLK_CRYPTO_H + +#include +#include + +struct blk_crypto_import_key_arg { + /* Raw key (input) */ + __u64 raw_key_ptr; + __u64 raw_key_size; + /* Long-term wrapped key blob (output) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + __u64 reserved[4]; +}; + +struct blk_crypto_generate_key_arg { + /* Long-term wrapped key blob (output) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + __u64 reserved[4]; +}; + +struct blk_crypto_prepare_key_arg { + /* Long-term wrapped key blob (input) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + /* 
Ephemerally-wrapped key blob (output) */ + __u64 eph_key_ptr; + __u64 eph_key_size; + __u64 reserved[4]; +}; + +/* + * These ioctls share the block device ioctl space; see uapi/linux/fs.h. + * 140-141 are reserved for future blk-crypto ioctls; any more than that would + * require an additional allocation from the block device ioctl space. + */ +#define BLKCRYPTOIMPORTKEY _IOWR(0x12, 137, struct blk_crypto_import_key_arg) +#define BLKCRYPTOGENERATEKEY _IOWR(0x12, 138, struct blk_crypto_generate_key_arg) +#define BLKCRYPTOPREPAREKEY _IOWR(0x12, 139, struct blk_crypto_prepare_key_arg) + +#endif /* _UAPI_LINUX_BLK_CRYPTO_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 2bbe00cf1248..e762e1af650c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -212,10 +212,8 @@ struct fsxattr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) #define BLKGETDISKSEQ _IOR(0x12,128,__u64) -/* - * A jump here: 130-136 are reserved for zoned block devices - * (see uapi/linux/blkzoned.h) - */ +/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ +/* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From 36d03cb3277e29beedb87b8efb1e4da02b26e0c0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sat, 8 Feb 2025 17:04:15 +0800 Subject: block: introduce init_wait_func() There is already a macro DEFINE_WAIT_FUNC() to declare a wait_queue_entry with a specified waking function. But there is not a counterpart for initializing one wait_queue_entry with a specified waking function. So introducing init_wait_func() for this, which also could be used in iocost and rq-qos. Using default_wake_function() in rq_qos_wait() to wake up waiters, which could remove ->task field from rq_qos_wait_data. 
Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Muchun Song Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250208090416.38642-1-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 +-- block/blk-rq-qos.c | 14 +++++++------- include/linux/wait.h | 6 ++++-- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 65a1d4427ccf..6be46e28459b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2718,8 +2718,7 @@ retry_lock: * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ - init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); - wait.wait.private = current; + init_wait_func(&wait.wait, iocg_wake_fn); wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d4d4f4dc0e23..0ed3c81723bb 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -196,7 +196,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) struct rq_qos_wait_data { struct wait_queue_entry wq; - struct task_struct *task; struct rq_wait *rqw; acquire_inflight_cb_t *cb; void *private_data; @@ -218,7 +217,12 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; - wake_up_process(data->task); + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). 
+ */ + default_wake_function(curr, mode, wake_flags, key); list_del_init_careful(&curr->entry); return 1; } @@ -244,11 +248,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { - .wq = { - .func = rq_qos_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, .rqw = rqw, .cb = acquire_inflight_cb, .private_data = private_data, @@ -259,6 +258,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; + init_wait_func(&data.wq, rq_qos_wake_function); has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..2bdc8f47963b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1207,14 +1207,16 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) -#define init_wait(wait) \ +#define init_wait_func(wait, function) \ do { \ (wait)->private = current; \ - (wait)->func = autoremove_wake_function; \ + (wait)->func = function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) +#define init_wait(wait) init_wait_func(wait, autoremove_wake_function) + typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); -- cgit v1.2.3 From a052bfa636bb763786b9dc13a301a59afb03787a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sat, 8 Feb 2025 17:04:16 +0800 Subject: block: refactor rq_qos_wait() When rq_qos_wait() is first introduced, it is easy to understand. But with some bug fixes applied, it is not easy for newcomers to understand the whole logic under those fixes. In this patch, rq_qos_wait() is refactored and more comments are added for better understanding. 
There are 3 points for the improvement: 1) Use waitqueue_active() instead of wq_has_sleeper() to eliminate unnecessary memory barrier in wq_has_sleeper() which is supposed to be used in waker side. In this case, we do not need the barrier. So use the cheaper one to locklessly test for waiters on the queue. 2) Remove acquire_inflight_cb() logic for the first waiter out of the while loop to make the code clear. 3) Add more comments to explain how to sync with different waiters and the waker. Signed-off-by: Muchun Song Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250208090416.38642-2-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 68 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 0ed3c81723bb..95982bc46ba1 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -223,6 +223,14 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, * Remove explicitly and use default_wake_function(). */ default_wake_function(curr, mode, wake_flags, key); + /* + * Note that the order of operations is important as finish_wait() + * tests whether @curr is removed without grabbing the lock. This + * should be the last thing to do to make sure we will not have a + * UAF access to @data. And the semantics of memory barrier in it + * also make sure the waiter will see the latest @data->got_token + * once list_empty_careful() in finish_wait() returns true. 
+ */ list_del_init_careful(&curr->entry); return 1; } @@ -248,37 +256,55 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { - .rqw = rqw, - .cb = acquire_inflight_cb, - .private_data = private_data, + .rqw = rqw, + .cb = acquire_inflight_cb, + .private_data = private_data, + .got_token = false, }; - bool has_sleeper; + bool first_waiter; - has_sleeper = wq_has_sleeper(&rqw->wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + /* + * If there are no waiters in the waiting queue, try to increase the + * inflight counter if we can. Otherwise, prepare for adding ourselves + * to the waiting queue. + */ + if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data)) return; init_wait_func(&data.wq, rq_qos_wake_function); - has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + /* + * Make sure there is at least one inflight process; otherwise, waiters + * will never be woken up. Since there may be no inflight process before + * adding ourselves to the waiting queue above, we need to try to + * increase the inflight counter for ourselves. And it is sufficient to + * guarantee that at least the first waiter to enter the waiting queue + * will re-check the waiting condition before going to sleep, thus + * ensuring forward progress. + */ + if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + /* + * We raced with rq_qos_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. + * + * Enough memory barrier in list_empty_careful() in + * finish_wait() is paired with list_del_init_careful() + * in rq_qos_wake_function() to make sure we will see + * the latest @data->got_token. 
+ */ + if (data.got_token) + cleanup_cb(rqw, private_data); + return; + } + + /* we are now relying on the waker to increase our inflight counter. */ do { - /* The memory barrier in set_current_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(&rqw->wait, &data.wq); - - /* - * We raced with rq_qos_wake_function() getting a token, - * which means we now have two. Put our local token - * and wake anyone else potentially waiting for one. - */ - if (data.got_token) - cleanup_cb(rqw, private_data); - return; - } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); -- cgit v1.2.3 From 8985c4298733a56d38c11948dc3b1dd24f4fcd6b Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 19 Feb 2025 21:53:25 +0100 Subject: block: Remove commented out code Remove commented out code. Signed-off-by: Thorsten Blum Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250219205328.28462-2-thorsten.blum@linux.dev Signed-off-by: Jens Axboe --- block/partitions/sgi.c | 2 -- block/partitions/sun.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'block') diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 9cc6b8c1eea4..b5ecddd5181a 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -50,8 +50,6 @@ int sgi_partition(struct parsed_partitions *state) p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { - /*printk("Dev %s SGI disklabel: bad magic %08x\n", - state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index ddf9e6def4b2..2419af76120f 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -74,8 +74,6 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { -/* 
printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } -- cgit v1.2.3 From 5d01d2df85f01ce083e0372bd3bd4968155e2911 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Thu, 13 Feb 2025 18:06:10 +0800 Subject: blk-wbt: Fix some comments wbt_wait() no longer uses a spinlock as a parameter. Update the function comments accordingly. RWB_UNKNOWN_BUMP is used when we gradually adjust scale_steps toward the center state, which is a value of 0. Signed-off-by: Tang Yizhou Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250213100611.209997-2-yizhou.tang@shopee.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6dfc659d22e2..8b73c0c11aec 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -136,8 +136,9 @@ enum { RWB_MIN_WRITE_SAMPLES = 3, /* - * If we have this number of consecutive windows with not enough - * information to scale up or down, scale up. + * If we have this number of consecutive windows without enough + * information to scale up or down, slowly return to center state + * (step == 0). */ RWB_UNKNOWN_BUMP = 5, }; @@ -638,11 +639,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) __wbt_done(rqos, flags); } -/* - * May sleep, if we have exceeded the writeback limits. Caller can pass - * in an irq held spinlock, if it holds one when calling this function. - * If we do sleep, we'll release and re-grab it. - */ +/* May sleep, if we have exceeded the writeback limits. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); -- cgit v1.2.3 From 8ac17e6ae1bf4625b8fa457f135865c1fd86beae Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Thu, 13 Feb 2025 18:06:11 +0800 Subject: blk-wbt: Cleanup a comment in wb_timer_fn The original comment contains a grammatical error. 
Rewrite it into a more easily understandable sentence. Signed-off-by: Tang Yizhou Link: https://lore.kernel.org/r/20250213100611.209997-3-yizhou.tang@shopee.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 8b73c0c11aec..f1754d07f7e0 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -447,9 +447,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) break; case LAT_UNKNOWN_WRITES: /* - * We started a the center step, but don't have a valid - * read/write sample, but we do have writes going on. - * Allow step to go negative, to increase write perf. + * We don't have a valid read/write sample, but we do have + * writes going on. Allow step to go negative, to increase + * write performance. */ scale_up(rwb); break; -- cgit v1.2.3 From 5fd0268a8806d35dcaf89139bfcda92be51b2b2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:31 -0800 Subject: block: mark bounce buffering as incompatible with integrity None of the few drivers still using the legacy block layer bounce buffering support integrity metadata. Explicitly mark the features as incompatible and stop creating the slab and mempool for integrity buffers for the bounce bio_set. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250225154449.422989-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 5 +++++ block/bounce.c | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index c44dadc35e1e..2763a34a9d56 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -117,6 +117,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return 0; } + if (lim->features & BLK_FEAT_BOUNCE_HIGH) { + pr_warn("no bounce buffer support for integrity metadata\n"); + return -EINVAL; + } + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; diff --git a/block/bounce.c b/block/bounce.c index 0d898cd5ec49..09a9616cf209 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -41,8 +41,6 @@ static void init_bounce_bioset(void) ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); BUG_ON(ret); - if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) - BUG_ON(1); ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); BUG_ON(ret); -- cgit v1.2.3 From e51679112c56ce327d6143caea0f0d2bd4618c4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:32 -0800 Subject: block: move the block layer auto-integrity code into a new file The code that automatically creates an integrity payload and generates and verifies the checksums for bios that don't have a submitter-provided integrity payload currently sits right in the middle of the block integrity metadata infrastructure. Split it into a separate file to make the different layers clear. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250225154449.422989-3-hch@lst.de Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/bio-integrity-auto.c | 162 +++++++++++++++++++++++++++++++++++++++++++++ block/bio-integrity.c | 159 -------------------------------------------- 3 files changed, 164 insertions(+), 160 deletions(-) create mode 100644 block/bio-integrity-auto.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 33748123710b..3a941dc0d27f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,7 +26,8 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \ + bio-integrity-auto.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c new file mode 100644 index 000000000000..357241fa0f20 --- /dev/null +++ b/block/bio-integrity-auto.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen + * + * Automatically generate and verify integrity data on PI capable devices if the + * bio submitter didn't provide PI itself. This ensures that kernel verifies + * data integrity even if the file system (or other user of the block device) is + * not aware of PI. 
+ */ +#include +#include +#include "blk.h" + +static struct workqueue_struct *kintegrityd_wq; + +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + + blk_integrity_verify(bio); + + kfree(bvec_virt(bip->bip_vec)); + bio_integrity_free(bio); + bio_endio(bio); +} + +/** + * __bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * + * Normally I/O completion is done in interrupt context. However, verifying I/O + * integrity is a time-consuming task which must be run in process context. + * + * This function postpones completion accordingly. + */ +bool __bio_integrity_endio(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); + return false; + } + + kfree(bvec_virt(bip->bip_vec)); + bio_integrity_free(bio); + return true; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Checks if the bio already has an integrity payload attached. If it does, the + * payload has been generated by another kernel subsystem, and we just pass it + * through. + * Otherwise allocates integrity payload and for writes the integrity metadata + * will be generated. For reads, the completion handler will verify the + * metadata. + */ +bool bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + gfp_t gfp = GFP_NOIO; + unsigned int len; + void *buf; + + if (!bi) + return true; + + if (!bio_sectors(bio)) + return true; + + /* Already protected? 
*/ + if (bio_integrity(bio)) + return true; + + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) + return true; + break; + case REQ_OP_WRITE: + if (bi->flags & BLK_INTEGRITY_NOGENERATE) + return true; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. + */ + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + gfp |= __GFP_ZERO; + break; + default: + return true; + } + + /* Allocate kernel buffer for protection data */ + len = bio_integrity_bytes(bi, bio_sectors(bio)); + buf = kmalloc(len, gfp); + if (!buf) + goto err_end_io; + + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); + if (IS_ERR(bip)) { + kfree(buf); + goto err_end_io; + } + + bip->bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(bip, bio->bi_iter.bi_sector); + + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bip->bip_flags |= BIP_IP_CHECKSUM; + if (bi->csum_type) + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; + + if (bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)) < len) + goto err_end_io; + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + blk_integrity_generate(bio); + else + bip->bio_iter = bio->bi_iter; + return true; + +err_end_io: + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return false; +} +EXPORT_SYMBOL(bio_integrity_prep); + +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + +static int __init blk_integrity_auto_init(void) +{ + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. 
+ */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + return 0; +} +subsys_initcall(blk_integrity_auto_init); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5d81ad9a3d20..aa9f96612319 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -10,17 +10,10 @@ #include #include #include -#include #include #include "blk.h" static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; - -void blk_flush_integrity(void) -{ - flush_workqueue(kintegrityd_wq); -} /** * bio_integrity_free - Free bio integrity payload @@ -413,149 +406,6 @@ int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) return ret; } -/** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Checks if the bio already has an integrity payload attached. - * If it does, the payload has been generated by another kernel subsystem, - * and we just pass it through. Otherwise allocates integrity payload. - * The bio must have data direction, target device and start sector set priot - * to calling. In the WRITE case, integrity metadata will be generated using - * the block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -bool bio_integrity_prep(struct bio *bio) -{ - struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - unsigned int len; - void *buf; - gfp_t gfp = GFP_NOIO; - - if (!bi) - return true; - - if (!bio_sectors(bio)) - return true; - - /* Already protected? 
*/ - if (bio_integrity(bio)) - return true; - - switch (bio_op(bio)) { - case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) - return true; - break; - case REQ_OP_WRITE: - if (bi->flags & BLK_INTEGRITY_NOGENERATE) - return true; - - /* - * Zero the memory allocated to not leak uninitialized kernel - * memory to disk for non-integrity metadata where nothing else - * initializes the memory. - */ - if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) - gfp |= __GFP_ZERO; - break; - default: - return true; - } - - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, gfp); - if (unlikely(buf == NULL)) { - goto err_end_io; - } - - bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (IS_ERR(bip)) { - kfree(buf); - goto err_end_io; - } - - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, bio->bi_iter.bi_sector); - - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bip->bip_flags |= BIP_IP_CHECKSUM; - - /* describe what tags to check in payload */ - if (bi->csum_type) - bip->bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bip->bip_flags |= BIP_CHECK_REFTAG; - if (bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)) < len) { - printk(KERN_ERR "could not attach integrity payload\n"); - goto err_end_io; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) - blk_integrity_generate(bio); - else - bip->bio_iter = bio->bi_iter; - return true; - -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. 
- */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - - blk_integrity_verify(bio); - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - bio_endio(bio); -} - -/** - * __bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. - */ -bool __bio_integrity_endio(struct bio *bio) -{ - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); - return false; - } - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - return true; -} - /** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update @@ -644,15 +494,6 @@ void bioset_integrity_free(struct bio_set *bs) void __init bio_integrity_init(void) { - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. 
- */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIO_INLINE_VECS, -- cgit v1.2.3 From 105ca2a2c2ff2c8df0e334d6913d62eec1973dd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:33 -0800 Subject: block: split struct bio_integrity_payload Many of the fields in struct bio_integrity_payload are only needed for the default integrity buffer in the block layer, and the variable sized array at the end of the structure makes it very hard to embed into caller allocated structures. Reduce struct bio_integrity_payload to the minimal structure needed in common code and create two separate containing structures for the automatically generated payload and the caller allocated payload. The latter is a simple wrapper for struct bio_integrity_payload and the bvecs, while the former contains the additional fields moved out of struct bio_integrity_payload. Always use a dedicated mempool for automatic integrity metadata instead of depending on bio_set that is submitter controlled and thus often doesn't have the mempool initialized and stop using mempools for the submitter buffers as they aren't in the NOIO I/O submission path where we need to guarantee forward progress. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Tested-by: Anuj Gupta Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250225154449.422989-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 75 +++++++++++++++++-------- block/bio-integrity.c | 107 ++++++++---------------------------- block/bio.c | 6 -- block/blk.h | 2 +- block/t10-pi.c | 6 +- drivers/md/dm-integrity.c | 12 ---- drivers/md/dm-table.c | 6 -- drivers/md/md.c | 13 ----- drivers/target/target_core_iblock.c | 12 ---- include/linux/bio-integrity.h | 25 +-------- include/linux/bio.h | 4 -- 11 files changed, 80 insertions(+), 188 deletions(-) (limited to 'block') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 357241fa0f20..e524c609be50 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -12,18 +12,34 @@ #include #include "blk.h" +struct bio_integrity_data { + struct bio *bio; + struct bvec_iter saved_bio_iter; + struct work_struct work; + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *bid_slab; +static mempool_t bid_pool; static struct workqueue_struct *kintegrityd_wq; -static void bio_integrity_verify_fn(struct work_struct *work) +static void bio_integrity_finish(struct bio_integrity_data *bid) { - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; + bid->bio->bi_integrity = NULL; + bid->bio->bi_opf &= ~REQ_INTEGRITY; + kfree(bvec_virt(bid->bip.bip_vec)); + mempool_free(bid, &bid_pool); +} - blk_integrity_verify(bio); +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_data *bid = + container_of(work, struct bio_integrity_data, work); + struct bio *bio = bid->bio; - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); + blk_integrity_verify_iter(bio, &bid->saved_bio_iter); + bio_integrity_finish(bid); bio_endio(bio); } @@ -40,15 +56,16 @@ bool 
__bio_integrity_endio(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_data *bid = + container_of(bip, struct bio_integrity_data, bip); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); + INIT_WORK(&bid->work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bid->work); return false; } - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); + bio_integrity_finish(bid); return true; } @@ -65,8 +82,8 @@ bool __bio_integrity_endio(struct bio *bio) */ bool bio_integrity_prep(struct bio *bio) { - struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_data *bid; gfp_t gfp = GFP_NOIO; unsigned int len; void *buf; @@ -102,27 +119,30 @@ bool bio_integrity_prep(struct bio *bio) return true; } + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return true; + /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (!buf) goto err_end_io; + bid = mempool_alloc(&bid_pool, GFP_NOIO); + if (!bid) + goto err_free_buf; + bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); - bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (IS_ERR(bip)) { - kfree(buf); - goto err_end_io; - } + bid->bio = bio; - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, bio->bi_iter.bi_sector); + bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bip->bip_flags |= BIP_IP_CHECKSUM; + bid->bip.bip_flags |= BIP_IP_CHECKSUM; if (bi->csum_type) - bip->bip_flags |= BIP_CHECK_GUARD; + bid->bip.bip_flags |= BIP_CHECK_GUARD; if (bi->flags & BLK_INTEGRITY_REF_TAG) - bip->bip_flags |= BIP_CHECK_REFTAG; + bid->bip.bip_flags |= BIP_CHECK_REFTAG; 
if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) @@ -132,9 +152,11 @@ bool bio_integrity_prep(struct bio *bio) if (bio_data_dir(bio) == WRITE) blk_integrity_generate(bio); else - bip->bio_iter = bio->bi_iter; + bid->saved_bio_iter = bio->bi_iter; return true; +err_free_buf: + kfree(buf); err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); @@ -149,6 +171,13 @@ void blk_flush_integrity(void) static int __init blk_integrity_auto_init(void) { + bid_slab = kmem_cache_create("bio_integrity_data", + sizeof(struct bio_integrity_data), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) + panic("bio: can't create integrity pool\n"); + /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. diff --git a/block/bio-integrity.c b/block/bio-integrity.c index aa9f96612319..608594a154a5 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,13 +7,12 @@ */ #include -#include -#include -#include -#include #include "blk.h" -static struct kmem_cache *bip_slab; +struct bio_integrity_alloc { + struct bio_integrity_payload bip; + struct bio_vec bvecs[]; +}; /** * bio_integrity_free - Free bio integrity payload @@ -23,21 +22,23 @@ static struct kmem_cache *bip_slab; */ void bio_integrity_free(struct bio *bio) { - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - - if (bs && mempool_initialized(&bs->bio_integrity_pool)) { - if (bip->bip_vec) - bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_max_vcnt); - mempool_free(bip, &bs->bio_integrity_pool); - } else { - kfree(bip); - } + kfree(bio_integrity(bio)); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs) +{ + memset(bip, 0, sizeof(*bip)); + bip->bip_max_vcnt = 
nr_vecs; + if (nr_vecs) + bip->bip_vec = bvecs; + + bio->bi_integrity = bip; + bio->bi_opf |= REQ_INTEGRITY; +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -52,48 +53,16 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned inline_vecs; + struct bio_integrity_alloc *bia; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); - if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { - bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!bip)) + bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask); + if (unlikely(!bia)) return ERR_PTR(-ENOMEM); - - memset(bip, 0, sizeof(*bip)); - - /* always report as many vecs as asked explicitly, not inline vecs */ - bip->bip_max_vcnt = nr_vecs; - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, - &bip->bip_max_vcnt, gfp_mask); - if (!bip->bip_vec) - goto err; - } else if (nr_vecs) { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_bio = bio; - bio->bi_integrity = bip; - bio->bi_opf |= REQ_INTEGRITY; - - return bip; -err: - if (bs && mempool_initialized(&bs->bio_integrity_pool)) - mempool_free(bip, &bs->bio_integrity_pool); - else - kfree(bip); - return ERR_PTR(-ENOMEM); + bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs); + return &bia->bip; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -467,35 +436,3 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (mempool_initialized(&bs->bio_integrity_pool)) - return 0; - - if (mempool_init_slab_pool(&bs->bio_integrity_pool, - pool_size, bip_slab)) 
- return -1; - - if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { - mempool_exit(&bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - mempool_exit(&bs->bio_integrity_pool); - mempool_exit(&bs->bvec_integrity_pool); -} - -void __init bio_integrity_init(void) -{ - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIO_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} diff --git a/block/bio.c b/block/bio.c index f0c416e5931d..dabc1a6c41b1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1657,7 +1657,6 @@ void bioset_exit(struct bio_set *bs) mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); - bioset_integrity_free(bs); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; @@ -1737,8 +1736,6 @@ static int __init init_bio(void) BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); - bio_integrity_init(); - for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; @@ -1754,9 +1751,6 @@ static int __init init_bio(void) BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); - if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/block/blk.h b/block/blk.h index 90fa5f28ccab..8f5554a6989e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -710,7 +710,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void blk_integrity_generate(struct bio *bio); -void blk_integrity_verify(struct bio *bio); +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); diff --git a/block/t10-pi.c 
b/block/t10-pi.c index 2d05421f0fa5..de172d56b1f3 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -404,7 +404,7 @@ void blk_integrity_generate(struct bio *bio) } } -void blk_integrity_verify(struct bio *bio) +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); @@ -418,9 +418,9 @@ void blk_integrity_verify(struct bio *bio) */ iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; - iter.seed = bip->bio_iter.bi_sector; + iter.seed = saved_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) { + __bio_for_each_segment(bv, bio, bviter, *saved_iter) { void *kaddr = bvec_kmap_local(&bv); blk_status_t ret = BLK_STS_OK; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee9f7cecd78e..e743657379f7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4808,23 +4808,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS); if (r) { ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recalc_bios, 1); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bf9a61191e9a..453803f1edf5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1081,15 +1081,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; if 
(bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->io_bs, pool_size)) - goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->bs, pool_size)) - goto out_free_pools; t->mempools = pools; return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index 30b3dbbce2d2..79cabe4be77d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2359,19 +2359,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. 
- */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index c8dc92a7d63e..73564efd11d2 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -167,18 +167,6 @@ static int iblock_configure_device(struct se_device *dev) break; } - if (dev->dev_attrib.pi_prot_type) { - struct bio_set *bs = &ib_dev->ibd_bio_set; - - if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { - pr_err("Unable to allocate bioset for PI\n"); - ret = -ENOMEM; - goto out_blkdev_put; - } - pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - &bs->bio_integrity_pool); - } - dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; return 0; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 802f52e38efd..0a25716820fe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -16,8 +16,6 @@ enum bip_flags { }; struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ @@ -25,12 +23,7 @@ struct bio_integrity_payload { unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ @@ -74,6 +67,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, bip->bip_iter.bi_sector = seed; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int 
bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, @@ -85,9 +80,6 @@ bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); -int bioset_integrity_create(struct bio_set *bs, int pool_size); -void bioset_integrity_free(struct bio_set *bs); -void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ @@ -96,15 +88,6 @@ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) return NULL; } -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free(struct bio_set *bs) -{ -} - static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; @@ -139,10 +122,6 @@ static inline void bio_integrity_trim(struct bio *bio) { } -static inline void bio_integrity_init(void) -{ -} - static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b79bf50f4f0..cafc7c215de8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -625,10 +625,6 @@ struct bio_set { mempool_t bio_pool; mempool_t bvec_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t bio_integrity_pool; - mempool_t bvec_integrity_pool; -#endif unsigned int back_pad; /* -- cgit v1.2.3 From 29cb955934302a5da525db6b327c795572538426 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 27 Feb 2025 20:06:45 +0800 Subject: blk-throttle: fix lower bps rate by throtl_trim_slice() The bio submission time may be a few jiffies more than the expected waiting time, due to 'extra_bytes' can't be divided in tg_within_bps_limit(), and also due to timer wakeup delay. In this case, adjust slice_start to jiffies will discard the extra wait time, causing lower rate than expected. 
The current in-tree code already covers deviation by rounddown(), but it turns out that is not enough, because jiffies - slice_start can be a multiple of throtl_slice. For example, assume bps_limit is 1000 bytes/s, 1 jiffy is 10ms, and a slice is 20ms (2 jiffies); the expected rate is 1000 / 1000 * 20 = 20 bytes per slice. If the user issues two 21-byte IOs, then the wait time will be 30ms for the first IO: bytes_allowed = 20, extra_bytes = 1; jiffy_wait = 1 + 2 = 3 jiffies and, allowing for 1 extra jiffy of timer delay, throtl_trim_slice() will be called at: jiffies = 40ms slice_start = 0ms, slice_end = 40ms bytes_disp = 21 In this case, before the patch, the real rate in the first two slices is 10.5 bytes per slice, and the slice will be updated to: jiffies = 40ms slice_start = 40ms, slice_end = 60ms, bytes_disp = 0; Hence the second IO will have to wait another 30ms; With the patch, the real rate in the first slice is 20 bytes per slice, which is the same as expected, and the slice will be updated to: jiffies = 40ms, slice_start = 20ms, slice_end = 60ms, bytes_disp = 1; And now, there are still 19 bytes allowed in the second slice, and the second IO will only have to wait 10ms; This problem causes blktests throtl/001 to fail when CONFIG_HZ_100=y; fix it by preserving one extra finished slice in throtl_trim_slice().
Fixes: e43473b7f223 ("blkio: Core implementation of throttle policy") Reported-by: Ming Lei Closes: https://lore.kernel.org/linux-block/20250222092823.210318-3-yukuai1@huaweicloud.com/ Reviewed-by: Ming Lei Acked-by: Tejun Heo Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20250227120645.812815-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8d149aff9fd0..a52f0d6b40ad 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -599,14 +599,23 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); - if (!time_elapsed) + /* Don't trim slice until at least 2 slices are used */ + if (time_elapsed < tg->td->throtl_slice * 2) return; + /* + * The bio submission time may be a few jiffies more than the expected + * waiting time, due to 'extra_bytes' can't be divided in + * tg_within_bps_limit(), and also due to timer wakeup delay. In this + * case, adjust slice_start will discard the extra wait time, causing + * lower rate than expected. Therefore, other than the above rounddown, + * one extra slice is preserved for deviation. + */ + time_elapsed -= tg->td->throtl_slice; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), time_elapsed) + tg->carryover_bytes[rw]; -- cgit v1.2.3 From 483a393e7e6189aac7d47b5295029159ab7a1cf1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 5 Mar 2025 12:31:19 +0800 Subject: blk-throttle: remove last_bytes_disp and last_ios_disp The two fields are not used any more, so remove them. 
Cc: Tejun Heo Cc: Josef Bacik Cc: Yu Kuai Signed-off-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250305043123.3938491-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 5 +---- block/blk-throttle.h | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a52f0d6b40ad..213e7b04617a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -819,13 +819,10 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) tg->bytes_disp[rw] += bio_size; - tg->last_bytes_disp[rw] += bio_size; - } tg->io_disp[rw]++; - tg->last_io_disp[rw]++; } /** diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 1a36d1278eea..ba8f6e986994 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -106,9 +106,6 @@ struct throtl_grp { /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; - uint64_t last_bytes_disp[2]; - unsigned int last_io_disp[2]; - /* * The following two fields are updated when new configuration is * submitted while some bios are still throttled, they record how many -- cgit v1.2.3 From a9fc8868b350cbf4ff730a4ea9651319cc669516 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 5 Mar 2025 12:31:20 +0800 Subject: blk-throttle: don't take carryover for prioritized processing of metadata Commit 29390bb5661d ("blk-throttle: support prioritized processing of metadata") takes bytes/ios carryover for prioritized processing of metadata. Turns out we can support it by charging it directly without trimming slice, and the result is same with carryover. 
Cc: Tejun Heo Cc: Josef Bacik Cc: Yu Kuai Signed-off-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250305043123.3938491-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 213e7b04617a..7271aee94faf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1620,13 +1620,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) return tg_may_dispatch(tg, bio, NULL); } -static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) -{ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) - tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); - tg->carryover_ios[rw]--; -} - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -1663,10 +1656,12 @@ bool __blk_throtl_bio(struct bio *bio) /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. - * Debts are handled by carryover_bytes/ios while - * calculating wait time. + * + * Charge and dispatch directly, and our throttle + * control algorithm is adaptive, and extra IO bytes + * will be throttled for paying the debt */ - tg_dispatch_in_debt(tg, bio, rw); + throtl_charge_bio(tg, bio); } else { /* if above limits, break to queue */ break; -- cgit v1.2.3 From 6cc477c36875ea5329b8bfbdf4d91f83dc653c91 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 5 Mar 2025 12:31:21 +0800 Subject: blk-throttle: carry over directly Now ->carryover_bytes[] and ->carryover_ios[] only covers limit/config update. Actually the carryover bytes/ios can be carried to ->bytes_disp[] and ->io_disp[] directly, since the carryover is one-shot thing and only valid in current slice. Then we can remove the two fields and simplify code much. 
Type of ->bytes_disp[] and ->io_disp[] has to change as signed because the two fields may become negative when updating limits or config, but both are big enough for holding bytes/ios dispatched in single slice Cc: Tejun Heo Cc: Josef Bacik Cc: Yu Kuai Signed-off-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250305043123.3938491-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 49 +++++++++++++++++++++---------------------------- block/blk-throttle.h | 4 ++-- 2 files changed, 23 insertions(+), 30 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7271aee94faf..91dab43c65ab 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -478,8 +478,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last @@ -498,16 +496,14 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, - bool clear_carryover) + bool clear) { - tg->bytes_disp[rw] = 0; - tg->io_disp[rw] = 0; + if (clear) { + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + } tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; - if (clear_carryover) { - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; - } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -617,20 +613,16 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) */ time_elapsed -= tg->td->throtl_slice; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed) + - tg->carryover_bytes[rw]; - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + - tg->carryover_ios[rw]; + time_elapsed); + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), 
time_elapsed); if (bytes_trim <= 0 && io_trim <= 0) return; - tg->carryover_bytes[rw] = 0; if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; - tg->carryover_ios[rw] = 0; if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else @@ -645,7 +637,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) jiffies); } -static void __tg_update_carryover(struct throtl_grp *tg, bool rw) +static void __tg_update_carryover(struct throtl_grp *tg, bool rw, + long long *bytes, int *ios) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); @@ -658,26 +651,28 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw) * configuration. */ if (bps_limit != U64_MAX) - tg->carryover_bytes[rw] += - calculate_bytes_allowed(bps_limit, jiffy_elapsed) - + *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) - tg->bytes_disp[rw]; if (iops_limit != UINT_MAX) - tg->carryover_ios[rw] += - calculate_io_allowed(iops_limit, jiffy_elapsed) - + *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) - tg->io_disp[rw]; + tg->bytes_disp[rw] -= *bytes; + tg->io_disp[rw] -= *ios; } static void tg_update_carryover(struct throtl_grp *tg) { + long long bytes[2] = {0}; + int ios[2] = {0}; + if (tg->service_queue.nr_queued[READ]) - __tg_update_carryover(tg, READ); + __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); if (tg->service_queue.nr_queued[WRITE]) - __tg_update_carryover(tg, WRITE); + __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); /* see comments in struct throtl_grp for meaning of these fields. 
*/ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, - tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], - tg->carryover_ios[READ], tg->carryover_ios[WRITE]); + bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, @@ -695,8 +690,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + - tg->carryover_ios[rw]; + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; @@ -729,8 +723,7 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + - tg->carryover_bytes[rw]; + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index ba8f6e986994..7964cc041e06 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -102,9 +102,9 @@ struct throtl_grp { unsigned int iops[2]; /* Number of bytes dispatched in current slice */ - uint64_t bytes_disp[2]; + int64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ - unsigned int io_disp[2]; + int io_disp[2]; /* * The following two fields are updated when new configuration is -- cgit v1.2.3 From 677e332e4885a17def5efa4788b6e725a737b63c Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Wed, 5 Mar 2025 12:00:32 +0530 Subject: block: ensure correct integrity capability propagation in stacked devices queue_limits_stack_integrity() incorrectly 
sets BLK_INTEGRITY_DEVICE_CAPABLE for a DM device even when none of its underlying devices support integrity. This happens because the flag is inherited unconditionally. Ensure that integrity capabilities are correctly propagated only when the underlying devices actually support integrity. Reported-by: M Nikhil Link: https://lore.kernel.org/linux-block/f6130475-3ccd-45d2-abde-3ccceada0f0a@linux.ibm.com/ Fixes: c6e56cf6b2e7 ("block: move integrity information into queue_limits") Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250305063033.1813-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 2763a34a9d56..25fd7793fd9d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -864,36 +864,28 @@ bool queue_limits_stack_integrity(struct queue_limits *t, if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) return true; - if (!ti->tuple_size) { - /* inherit the settings from the first underlying device */ - if (!(ti->flags & BLK_INTEGRITY_STACKED)) { - ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE | - (bi->flags & BLK_INTEGRITY_REF_TAG); - ti->csum_type = bi->csum_type; - ti->tuple_size = bi->tuple_size; - ti->pi_offset = bi->pi_offset; - ti->interval_exp = bi->interval_exp; - ti->tag_size = bi->tag_size; - goto done; - } - if (!bi->tuple_size) - goto done; + if (ti->flags & BLK_INTEGRITY_STACKED) { + if (ti->tuple_size != bi->tuple_size) + goto incompatible; + if (ti->interval_exp != bi->interval_exp) + goto incompatible; + if (ti->tag_size != bi->tag_size) + goto incompatible; + if (ti->csum_type != bi->csum_type) + goto incompatible; + if ((ti->flags & BLK_INTEGRITY_REF_TAG) != + (bi->flags & BLK_INTEGRITY_REF_TAG)) + goto incompatible; + } else { + ti->flags = BLK_INTEGRITY_STACKED; + ti->flags |= (bi->flags & 
BLK_INTEGRITY_DEVICE_CAPABLE) | + (bi->flags & BLK_INTEGRITY_REF_TAG); + ti->csum_type = bi->csum_type; + ti->tuple_size = bi->tuple_size; + ti->pi_offset = bi->pi_offset; + ti->interval_exp = bi->interval_exp; + ti->tag_size = bi->tag_size; } - - if (ti->tuple_size != bi->tuple_size) - goto incompatible; - if (ti->interval_exp != bi->interval_exp) - goto incompatible; - if (ti->tag_size != bi->tag_size) - goto incompatible; - if (ti->csum_type != bi->csum_type) - goto incompatible; - if ((ti->flags & BLK_INTEGRITY_REF_TAG) != - (bi->flags & BLK_INTEGRITY_REF_TAG)) - goto incompatible; - -done: - ti->flags |= BLK_INTEGRITY_STACKED; return true; incompatible: -- cgit v1.2.3 From 85f72925000e924291a0ebf63d2234994a4f22bd Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Wed, 5 Mar 2025 12:00:33 +0530 Subject: block: Correctly initialize BLK_INTEGRITY_NOGENERATE and BLK_INTEGRITY_NOVERIFY Currently, BLK_INTEGRITY_NOGENERATE and BLK_INTEGRITY_NOVERIFY are not explicitly set during integrity initialization. This can lead to incorrect reporting of read_verify and write_generate sysfs values, particularly when a device does not support integrity. Ensure that these flags are correctly initialized by default. 
Reported-by: M Nikhil Link: https://lore.kernel.org/linux-block/f6130475-3ccd-45d2-abde-3ccceada0f0a@linux.ibm.com/ Fixes: 9f4aa46f2a74 ("block: invert the BLK_INTEGRITY_{GENERATE,VERIFY} flags") Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250305063033.1813-3-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 25fd7793fd9d..008947a13541 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -114,6 +114,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) pr_warn("invalid PI settings.\n"); return -EINVAL; } + bi->flags |= BLK_INTEGRITY_NOGENERATE | BLK_INTEGRITY_NOVERIFY; return 0; } -- cgit v1.2.3 From 7d83c5d73c1a3c7b71ba70d0ad2ae66e7a0e7ace Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:54:56 +0800 Subject: badblocks: Fix error shift ops 'bb->shift' is passed directly to rounddown()/roundup() as the rounding granularity in badblocks, but it is a shift count, not a size. That is wrong; use '1 << bb->shift' instead.
Fixes: 3ea3354cb9f0 ("badblocks: improve badblocks_check() for multiple ranges handling") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-2-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index db4ec8b9b2a8..bcee057efc47 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -880,8 +880,8 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, /* round the start down, and the end up */ sector_t next = s + sectors; - rounddown(s, bb->shift); - roundup(next, bb->shift); + rounddown(s, 1 << bb->shift); + roundup(next, 1 << bb->shift); sectors = next - s; } @@ -1157,8 +1157,8 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) * isn't than to think a block is not bad when it is. */ target = s + sectors; - roundup(s, bb->shift); - rounddown(target, bb->shift); + roundup(s, 1 << bb->shift); + rounddown(target, 1 << bb->shift); sectors = target - s; } @@ -1288,8 +1288,8 @@ static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, /* round the start down, and the end up */ target = s + sectors; - rounddown(s, bb->shift); - roundup(target, bb->shift); + rounddown(s, 1 << bb->shift); + roundup(target, 1 << bb->shift); sectors = target - s; } -- cgit v1.2.3 From 270b68fee9688428e0a98d4a2c3e6d4c434a84ba Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:54:57 +0800 Subject: badblocks: factor out a helper try_adjacent_combine Factor out try_adjacent_combine(), and it will be used in the later patch. 
Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250227075507.151331-3-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index bcee057efc47..f069c93e986d 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -855,6 +855,31 @@ static void badblocks_update_acked(struct badblocks *bb) bb->unacked_exist = 0; } +/* + * Return 'true' if the range indicated by 'bad' is exactly backward + * overlapped with the bad range (from bad table) indexed by 'behind'. + */ +static bool try_adjacent_combine(struct badblocks *bb, int prev) +{ + u64 *p = bb->page; + + if (prev >= 0 && (prev + 1) < bb->count && + BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && + (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && + BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), + BB_ACK(p[prev])); + + if ((prev + 2) < bb->count) + memmove(p + prev + 1, p + prev + 2, + (bb->count - (prev + 2)) * 8); + bb->count--; + return true; + } + return false; +} + /* Do exact work to set bad block range into the bad block table */ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, int acknowledged) @@ -1022,20 +1047,7 @@ update_sectors: * merged. (prev < 0) condition is not handled here, * because it's already complicated enough. 
*/ - if (prev >= 0 && - (prev + 1) < bb->count && - BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && - (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && - BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { - p[prev] = BB_MAKE(BB_OFFSET(p[prev]), - BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), - BB_ACK(p[prev])); - - if ((prev + 2) < bb->count) - memmove(p + prev + 1, p + prev + 2, - (bb->count - (prev + 2)) * 8); - bb->count--; - } + try_adjacent_combine(bb, prev); if (space_desired && !badblocks_full(bb)) { s = orig_start; -- cgit v1.2.3 From 32e9ad4d11f69949ff331e35a417871ee0d31d99 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:54:58 +0800 Subject: badblocks: attempt to merge adjacent badblocks during ack_all_badblocks If ack and unack badblocks are adjacent, they will not be merged and will remain as two separate badblocks. Even after the bad blocks are written to disk and both become ack, they will still remain as two independent bad blocks. This is not ideal as it wastes the limited space for badblocks. Therefore, during ack_all_badblocks(), attempt to merge badblocks if they are adjacent. 
Fixes: aa511ff8218b ("badblocks: switch to the improved badblock handling code") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-4-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index f069c93e986d..ad8652fbe1c8 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -1491,6 +1491,11 @@ void ack_all_badblocks(struct badblocks *bb) p[i] = BB_MAKE(start, len, 1); } } + + for (i = 0; i < bb->count ; i++) + while (try_adjacent_combine(bb, i)) + ; + bb->unacked_exist = 0; } write_sequnlock_irq(&bb->lock); -- cgit v1.2.3 From 28243dcd1f49cc8be398a1396d16a45527882ce5 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:54:59 +0800 Subject: badblocks: return error directly when setting badblocks exceeds 512 In the current handling of badblocks settings, a lot of processing has been done for scenarios where the number of badblocks exceeds 512. This makes the code look quite complex and also introduces some issues. For example, if there are 512 badblocks already: for((i=0; i<510; i++)); do ((sector=i*2)); echo "$sector 1" > bad_blocks; done echo 2100 10 > bad_blocks echo 2200 10 > bad_blocks Set new one, exceed 512: echo 2000 500 > bad_blocks Expected: 2000 500 Actual: 2100 400 In fact, a disk shouldn't have too many badblocks, and for disks with 512 badblocks, attempting to set more bad blocks doesn't make much sense. At that point, the more appropriate action would be to replace the disk. Therefore, to resolve these issues and simplify the code somewhat, return an error directly when setting badblocks exceeds 512. 
Fixes: aa511ff8218b ("badblocks: switch to the improved badblock handling code") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250227075507.151331-5-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 121 +++++++++--------------------------------------------- 1 file changed, 19 insertions(+), 102 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index ad8652fbe1c8..1c8b8f65f6df 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -527,51 +527,6 @@ out: return ret; } -/* - * Return 'true' if the range indicated by 'bad' can be backward merged - * with the bad range (from the bad table) index by 'behind'. - */ -static bool can_merge_behind(struct badblocks *bb, - struct badblocks_context *bad, int behind) -{ - sector_t sectors = bad->len; - sector_t s = bad->start; - u64 *p = bb->page; - - if ((s < BB_OFFSET(p[behind])) && - ((s + sectors) >= BB_OFFSET(p[behind])) && - ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && - BB_ACK(p[behind]) == bad->ack) - return true; - return false; -} - -/* - * Do backward merge for range indicated by 'bad' and the bad range - * (from the bad table) indexed by 'behind'. The return value is merged - * sectors from bad->len. - */ -static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, - int behind) -{ - sector_t sectors = bad->len; - sector_t s = bad->start; - u64 *p = bb->page; - int merged = 0; - - WARN_ON(s >= BB_OFFSET(p[behind])); - WARN_ON((s + sectors) < BB_OFFSET(p[behind])); - - if (s < BB_OFFSET(p[behind])) { - merged = BB_OFFSET(p[behind]) - s; - p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); - - WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); - } - - return merged; -} - /* * Return 'true' if the range indicated by 'bad' can be forward * merged with the bad range (from the bad table) indexed by 'prev'. 
@@ -884,11 +839,9 @@ static bool try_adjacent_combine(struct badblocks *bb, int prev) static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, int acknowledged) { - int retried = 0, space_desired = 0; - int orig_len, len = 0, added = 0; + int len = 0, added = 0; struct badblocks_context bad; int prev = -1, hint = -1; - sector_t orig_start; unsigned long flags; int rv = 0; u64 *p; @@ -912,8 +865,6 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, write_seqlock_irqsave(&bb->lock, flags); - orig_start = s; - orig_len = sectors; bad.ack = acknowledged; p = bb->page; @@ -922,6 +873,11 @@ re_insert: bad.len = sectors; len = 0; + if (badblocks_full(bb)) { + rv = 1; + goto out; + } + if (badblocks_empty(bb)) { len = insert_at(bb, 0, &bad); bb->count++; @@ -933,32 +889,14 @@ re_insert: /* start before all badblocks */ if (prev < 0) { - if (!badblocks_full(bb)) { - /* insert on the first */ - if (bad.len > (BB_OFFSET(p[0]) - bad.start)) - bad.len = BB_OFFSET(p[0]) - bad.start; - len = insert_at(bb, 0, &bad); - bb->count++; - added++; - hint = 0; - goto update_sectors; - } - - /* No sapce, try to merge */ - if (overlap_behind(bb, &bad, 0)) { - if (can_merge_behind(bb, &bad, 0)) { - len = behind_merge(bb, &bad, 0); - added++; - } else { - len = BB_OFFSET(p[0]) - s; - space_desired = 1; - } - hint = 0; - goto update_sectors; - } - - /* no table space and give up */ - goto out; + /* insert on the first */ + if (bad.len > (BB_OFFSET(p[0]) - bad.start)) + bad.len = BB_OFFSET(p[0]) - bad.start; + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + hint = 0; + goto update_sectors; } /* in case p[prev-1] can be merged with p[prev] */ @@ -978,6 +916,11 @@ re_insert: int extra = 0; if (!can_front_overwrite(bb, prev, &bad, &extra)) { + if (extra > 0) { + rv = 1; + goto out; + } + len = min_t(sector_t, BB_END(p[prev]) - s, sectors); hint = prev; @@ -1004,24 +947,6 @@ re_insert: goto update_sectors; } - /* if no space in table, still try to 
merge in the covered range */ - if (badblocks_full(bb)) { - /* skip the cannot-merge range */ - if (((prev + 1) < bb->count) && - overlap_behind(bb, &bad, prev + 1) && - ((s + sectors) >= BB_END(p[prev + 1]))) { - len = BB_END(p[prev + 1]) - s; - hint = prev + 1; - goto update_sectors; - } - - /* no retry any more */ - len = sectors; - space_desired = 1; - hint = -1; - goto update_sectors; - } - /* cannot merge and there is space in bad table */ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) @@ -1049,14 +974,6 @@ update_sectors: */ try_adjacent_combine(bb, prev); - if (space_desired && !badblocks_full(bb)) { - s = orig_start; - sectors = orig_len; - space_desired = 0; - if (retried++ < 3) - goto re_insert; - } - out: if (added) { set_changed(bb); -- cgit v1.2.3 From 7f500f0a59b1d7345a05ec4ae703babf34b7e470 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:55:00 +0800 Subject: badblocks: return error if any badblock set fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _badblocks_set() returns success if at least one badblock is set successfully, even if others fail. This can lead to data inconsistencies in raid, where a failed badblock set should trigger the disk to be kicked out to prevent future reads from failed write areas. _badblocks_set() should return error if any badblock set fails. Instead of relying on 'rv', directly return 'sectors' for clearer logic. If all badblocks are successfully set, 'sectors' will be 0, otherwise it indicates the number of badblocks that have not been set yet, thus signaling failure. By the way, it can also fix an issue: when a newly set unack badblock is included in an existing ack badblock, the setting will return an error. ``` echo "0 100" > /sys/block/md0/md/dev-loop1/bad_blocks echo "0 100" > /sys/block/md0/md/dev-loop1/unacknowledged_bad_blocks -bash: echo: write error: No space left on device ``` After the fix, it will return success. 
Fixes: aa511ff8218b ("badblocks: switch to the improved badblock handling code") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-6-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index 1c8b8f65f6df..88f27d4f3856 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -843,7 +843,6 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, struct badblocks_context bad; int prev = -1, hint = -1; unsigned long flags; - int rv = 0; u64 *p; if (bb->shift < 0) @@ -873,10 +872,8 @@ re_insert: bad.len = sectors; len = 0; - if (badblocks_full(bb)) { - rv = 1; + if (badblocks_full(bb)) goto out; - } if (badblocks_empty(bb)) { len = insert_at(bb, 0, &bad); @@ -916,10 +913,8 @@ re_insert: int extra = 0; if (!can_front_overwrite(bb, prev, &bad, &extra)) { - if (extra > 0) { - rv = 1; + if (extra > 0) goto out; - } len = min_t(sector_t, BB_END(p[prev]) - s, sectors); @@ -986,10 +981,7 @@ out: write_sequnlock_irqrestore(&bb->lock, flags); - if (!added) - rv = 1; - - return rv; + return sectors; } /* @@ -1353,7 +1345,8 @@ EXPORT_SYMBOL_GPL(badblocks_check); * * Return: * 0: success - * 1: failed to set badblocks (out of space) + * other: failed to set badblocks (out of space). Parital setting will be + * treated as failure. */ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, int acknowledged) -- cgit v1.2.3 From 37446680dfbfbba7cbedd680047182f70a0b857b Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:55:01 +0800 Subject: badblocks: fix the using of MAX_BADBLOCKS The number of badblocks cannot exceed MAX_BADBLOCKS, but it should be allowed to equal MAX_BADBLOCKS. 
Fixes: aa511ff8218b ("badblocks: switch to the improved badblock handling code") Fixes: c3c6a86e9efc ("badblocks: add helper routines for badblock ranges handling") Signed-off-by: Li Nan Reviewed-by: Zhu Yanjun Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-7-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index 88f27d4f3856..43430bd3efa7 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -700,7 +700,7 @@ static bool can_front_overwrite(struct badblocks *bb, int prev, *extra = 2; } - if ((bb->count + (*extra)) >= MAX_BADBLOCKS) + if ((bb->count + (*extra)) > MAX_BADBLOCKS) return false; return true; @@ -1135,7 +1135,7 @@ re_clear: if ((BB_OFFSET(p[prev]) < bad.start) && (BB_END(p[prev]) > (bad.start + bad.len))) { /* Splitting */ - if ((bb->count + 1) < MAX_BADBLOCKS) { + if ((bb->count + 1) <= MAX_BADBLOCKS) { len = front_splitting_clear(bb, prev, &bad); bb->count += 1; cleared++; -- cgit v1.2.3 From 3a23d05f9c1abf8238fe48167ab5574062d1606e Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:55:02 +0800 Subject: badblocks: try can_merge_front before overlap_front Regardless of whether overlap_front() returns true or false, can_merge_front() will be executed first. Therefore, move can_merge_front() in front of overlap_front() to simplify the code. 
Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250227075507.151331-8-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index 43430bd3efa7..57e9edf9b848 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -905,39 +905,35 @@ re_insert: goto update_sectors; } + if (can_merge_front(bb, prev, &bad)) { + len = front_merge(bb, prev, &bad); + added++; + hint = prev; + goto update_sectors; + } + if (overlap_front(bb, prev, &bad)) { - if (can_merge_front(bb, prev, &bad)) { - len = front_merge(bb, prev, &bad); - added++; - } else { - int extra = 0; + int extra = 0; - if (!can_front_overwrite(bb, prev, &bad, &extra)) { - if (extra > 0) - goto out; + if (!can_front_overwrite(bb, prev, &bad, &extra)) { + if (extra > 0) + goto out; - len = min_t(sector_t, - BB_END(p[prev]) - s, sectors); - hint = prev; - goto update_sectors; - } + len = min_t(sector_t, + BB_END(p[prev]) - s, sectors); + hint = prev; + goto update_sectors; + } - len = front_overwrite(bb, prev, &bad, extra); - added++; - bb->count += extra; + len = front_overwrite(bb, prev, &bad, extra); + added++; + bb->count += extra; - if (can_combine_front(bb, prev, &bad)) { - front_combine(bb, prev); - bb->count--; - } + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; } - hint = prev; - goto update_sectors; - } - if (can_merge_front(bb, prev, &bad)) { - len = front_merge(bb, prev, &bad); - added++; hint = prev; goto update_sectors; } -- cgit v1.2.3 From 9ec65dec634a752ab0a1203510ee190356e4cf1a Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 27 Feb 2025 15:55:03 +0800 Subject: badblocks: fix merge issue when new badblocks align with pre+1 There is a merge issue when adding badblocks as follow: echo 0 10 > bad_blocks echo 30 10 > bad_blocks echo 20 10 > 
bad_blocks cat bad_blocks 0 10 20 10 //should be merged with (30 10) 30 10 In this case, if the new badblocks range does not intersect with prev, it is added by insert_at(). If there is an intersection with prev+1, the merge will be processed in the next re_insert loop. However, when the end of the new badblocks is exactly equal to the offset of prev+1, no further re_insert loop occurs, and the two badblocks are not merged. Fix it by incrementing prev, so that the badblocks can be merged by the subsequent code. Fixes: aa511ff8218b ("badblocks: switch to the improved badblock handling code") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250227075507.151331-9-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index 57e9edf9b848..92bd43f7fff1 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -892,7 +892,7 @@ re_insert: len = insert_at(bb, 0, &bad); bb->count++; added++; - hint = 0; + hint = ++prev; goto update_sectors; } @@ -947,7 +947,7 @@ re_insert: len = insert_at(bb, prev + 1, &bad); bb->count++; added++; - hint = prev + 1; + hint = ++prev; update_sectors: s += len; -- cgit v1.2.3 From 5236f041fa6c81c71eabad44897e54a0d6d5bbf6 Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Thu, 27 Feb 2025 15:55:04 +0800 Subject: badblocks: fix missing bad blocks on retry in _badblocks_check() The bad blocks check would miss bad blocks when retrying under contention, as checking parameters are not reset. These stale values from the previous attempt could lead to incorrect scanning in the subsequent retry. Move seqlock to outer function and reinitialize checking state for each retry. This ensures a clean state for each check attempt, preventing any missed bad blocks. 
Fixes: 3ea3354cb9f0 ("badblocks: improve badblocks_check() for multiple ranges handling") Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-10-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index 92bd43f7fff1..b66d5f12a766 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -1191,31 +1191,12 @@ update_sectors: static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { - int unacked_badblocks, acked_badblocks; int prev = -1, hint = -1, set = 0; struct badblocks_context bad; - unsigned int seq; + int unacked_badblocks = 0; + int acked_badblocks = 0; + u64 *p = bb->page; int len, rv; - u64 *p; - - WARN_ON(bb->shift < 0 || sectors == 0); - - if (bb->shift > 0) { - sector_t target; - - /* round the start down, and the end up */ - target = s + sectors; - rounddown(s, 1 << bb->shift); - roundup(target, 1 << bb->shift); - sectors = target - s; - } - -retry: - seq = read_seqbegin(&bb->lock); - - p = bb->page; - unacked_badblocks = 0; - acked_badblocks = 0; re_check: bad.start = s; @@ -1281,9 +1262,6 @@ update_sectors: else rv = 0; - if (read_seqretry(&bb->lock, seq)) - goto retry; - return rv; } @@ -1324,7 +1302,27 @@ update_sectors: int badblocks_check(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { - return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + unsigned int seq; + int rv; + + WARN_ON(bb->shift < 0 || sectors == 0); + + if (bb->shift > 0) { + /* round the start down, and the end up */ + sector_t target = s + sectors; + + rounddown(s, 1 << bb->shift); + roundup(target, 1 << bb->shift); + sectors = target - s; + } + +retry: + seq = read_seqbegin(&bb->lock); + rv = 
_badblocks_check(bb, s, sectors, first_bad, bad_sectors); + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; } EXPORT_SYMBOL_GPL(badblocks_check); -- cgit v1.2.3 From c8775aefba959cdfbaa25408a84d3dd15bbeb991 Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Thu, 27 Feb 2025 15:55:05 +0800 Subject: badblocks: return boolean from badblocks_set() and badblocks_clear() Change the return type of badblocks_set() and badblocks_clear() from int to bool, indicating success or failure. Specifically: - _badblocks_set() and _badblocks_clear() functions now return true for success and false for failure. - All calls to these functions are updated to handle the new boolean return type. - This change improves code clarity and ensures a more consistent handling of success and failure states. Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Acked-by: Coly Li Acked-by: Ira Weiny Link: https://lore.kernel.org/r/20250227075507.151331-11-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 41 ++++++++++++++++++++--------------------- drivers/block/null_blk/main.c | 14 +++++++------- drivers/md/md.c | 35 ++++++++++++++++++----------------- drivers/nvdimm/badrange.c | 2 +- include/linux/badblocks.h | 6 +++--- 5 files changed, 49 insertions(+), 49 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index b66d5f12a766..e326a16fd056 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -836,8 +836,8 @@ static bool try_adjacent_combine(struct badblocks *bb, int prev) } /* Do exact work to set bad block range into the bad block table */ -static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +static bool _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) { int len = 0, added = 0; struct badblocks_context bad; @@ -847,11 +847,11 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, if (bb->shift < 0) /* badblocks are disabled */ 
- return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { /* round the start down, and the end up */ @@ -977,7 +977,7 @@ out: write_sequnlock_irqrestore(&bb->lock, flags); - return sectors; + return sectors == 0; } /* @@ -1048,21 +1048,20 @@ static int front_splitting_clear(struct badblocks *bb, int prev, } /* Do the exact work to clear bad block range from the bad block table */ -static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static bool _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) { struct badblocks_context bad; int prev = -1, hint = -1; int len = 0, cleared = 0; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { sector_t target; @@ -1182,9 +1181,9 @@ update_sectors: write_sequnlock_irq(&bb->lock); if (!cleared) - rv = 1; + return false; - return rv; + return true; } /* Do the exact work to check bad blocks range from the bad block table */ @@ -1338,12 +1337,12 @@ EXPORT_SYMBOL_GPL(badblocks_check); * decide how best to handle it. * * Return: - * 0: success - * other: failed to set badblocks (out of space). Parital setting will be + * true: success + * false: failed to set badblocks (out of space). Parital setting will be * treated as failure. */ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) { return _badblocks_set(bb, s, sectors, acknowledged); } @@ -1360,10 +1359,10 @@ EXPORT_SYMBOL_GPL(badblocks_set); * drop the remove request. 
* * Return: - * 0: success - * 1: failed to clear badblocks + * true: success + * false: failed to clear badblocks */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors) { return _badblocks_clear(bb, s, sectors); } @@ -1485,10 +1484,10 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, return -EINVAL; } - if (badblocks_set(bb, sector, length, !unack)) + if (!badblocks_set(bb, sector, length, !unack)) return -ENOSPC; - else - return len; + + return len; } EXPORT_SYMBOL_GPL(badblocks_store); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 31d44cef6841..8f6025efc543 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -561,14 +561,14 @@ static ssize_t nullb_device_badblocks_store(struct config_item *item, goto out; /* enable badblocks */ cmpxchg(&t_dev->badblocks.shift, -1, 0); - if (buf[0] == '+') - ret = badblocks_set(&t_dev->badblocks, start, - end - start + 1, 1); - else - ret = badblocks_clear(&t_dev->badblocks, start, - end - start + 1); - if (ret == 0) + if (buf[0] == '+') { + if (badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1)) + ret = count; + } else if (badblocks_clear(&t_dev->badblocks, start, + end - start + 1)) { ret = count; + } out: kfree(orig); return ret; diff --git a/drivers/md/md.c b/drivers/md/md.c index 79cabe4be77d..95ceb6052360 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1748,7 +1748,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (badblocks_set(&rdev->badblocks, sector, count, 1)) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -9833,7 +9833,6 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { struct mddev *mddev = rdev->mddev; - int rv; /* * Recording new 
badblocks for faulty rdev will force unnecessary @@ -9849,33 +9848,35 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_set(&rdev->badblocks, s, sectors, 0); - if (rv == 0) { - /* Make sure they get written out promptly */ - if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->sb_flags, 0, - BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); - md_wakeup_thread(rdev->mddev->thread); - return 1; - } else + + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) return 0; + + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return 1; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_clear(&rdev->badblocks, s, sectors); - if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + + if (!badblocks_clear(&rdev->badblocks, s, sectors)) + return 0; + + if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return rv; + return 1; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c index a002ea6fdd84..ee478ccde7c6 100644 --- a/drivers/nvdimm/badrange.c +++ b/drivers/nvdimm/badrange.c @@ -167,7 +167,7 @@ static void set_badblock(struct badblocks *bb, sector_t s, int num) dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n", (u64) s * 512, (u64) num * 512); /* this isn't an error as the hardware will still throw an exception */ 
- if (badblocks_set(bb, s, num, 1)) + if (!badblocks_set(bb, s, num, 1)) dev_info_once(bb->dev, "%s: failed for sector %llx\n", __func__, (u64) s); } diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 670f2dae692f..8764bed9ff16 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -50,9 +50,9 @@ struct badblocks_context { int badblocks_check(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors); -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors); void ack_all_badblocks(struct badblocks *bb); ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, -- cgit v1.2.3 From d301f164c3fbff611bd71f57dfa553b9219f0f5e Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Thu, 27 Feb 2025 15:55:07 +0800 Subject: badblocks: use sector_t instead of int to avoid truncation of badblocks length The badblocks length is truncated when setting badblocks as follows: echo "2055 4294967299" > bad_blocks cat bad_blocks 2055 3 Change 'sectors' argument type from 'int' to 'sector_t'. This change avoids truncation of badblocks length for large sectors by replacing 'int' with 'sector_t' (u64), enabling proper handling of larger disk sizes and ensuring compatibility with 64-bit sector addressing. 
Fixes: 9e0e252a048b ("badblocks: Add core badblock management code") Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-13-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 20 ++++++++------------ drivers/block/null_blk/main.c | 3 +-- drivers/md/md.h | 6 +++--- drivers/md/raid1-10.c | 2 +- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 8 ++++---- drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/pfn_devs.c | 7 ++++--- drivers/nvdimm/pmem.c | 2 +- include/linux/badblocks.h | 8 ++++---- 10 files changed, 29 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index e326a16fd056..673ef068423a 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -836,7 +836,7 @@ static bool try_adjacent_combine(struct badblocks *bb, int prev) } /* Do exact work to set bad block range into the bad block table */ -static bool _badblocks_set(struct badblocks *bb, sector_t s, int sectors, +static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, int acknowledged) { int len = 0, added = 0; @@ -956,8 +956,6 @@ update_sectors: if (sectors > 0) goto re_insert; - WARN_ON(sectors < 0); - /* * Check whether the following already set range can be * merged. 
(prev < 0) condition is not handled here, @@ -1048,7 +1046,7 @@ static int front_splitting_clear(struct badblocks *bb, int prev, } /* Do the exact work to clear bad block range from the bad block table */ -static bool _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { struct badblocks_context bad; int prev = -1, hint = -1; @@ -1171,8 +1169,6 @@ update_sectors: if (sectors > 0) goto re_clear; - WARN_ON(sectors < 0); - if (cleared) { badblocks_update_acked(bb); set_changed(bb); @@ -1187,8 +1183,8 @@ update_sectors: } /* Do the exact work to check bad blocks range from the bad block table */ -static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { int prev = -1, hint = -1, set = 0; struct badblocks_context bad; @@ -1298,8 +1294,8 @@ update_sectors: * -1: there are bad blocks which have not yet been acknowledged in metadata. * plus the start/length of the first bad section we overlap. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { unsigned int seq; int rv; @@ -1341,7 +1337,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); * false: failed to set badblocks (out of space). Parital setting will be * treated as failure. 
*/ -bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, int acknowledged) { return _badblocks_set(bb, s, sectors, acknowledged); @@ -1362,7 +1358,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); * true: success * false: failed to clear badblocks */ -bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { return _badblocks_clear(bb, s, sectors); } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 8f6025efc543..4a5cd288dc07 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1339,8 +1339,7 @@ blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, struct badblocks *bb = &cmd->nq->dev->badblocks; struct nullb_device *dev = cmd->nq->dev; unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT; - sector_t first_bad; - int bad_sectors; + sector_t first_bad, bad_sectors; unsigned int partial_io_sectors = 0; if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors)) diff --git a/drivers/md/md.h b/drivers/md/md.h index 923a0ef51efe..6edc0f71b7d4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -266,8 +266,8 @@ enum flag_bits { Nonrot, /* non-rotational device (SSD) */ }; -static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, @@ -284,7 +284,7 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c 
index 4378d3250bd7..62b980b12f93 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, sector_t this_sector, int *len) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; /* no bad block overlap */ if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8beb8cccc6af..0b2839105857 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1537,7 +1537,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&rdev->nr_pending); if (test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, @@ -2886,7 +2886,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { /* may need to read from here */ sector_t first_bad = MaxSector; - int bad_sectors; + sector_t bad_sectors; if (is_badblock(rdev, sector_nr, good_sectors, &first_bad, &bad_sectors)) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7ed933181712..a8664e29aada 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -747,7 +747,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, for (slot = 0; slot < conf->copies ; slot++) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; sector_t dev_sector; unsigned int pending; bool nonrot; @@ -1438,7 +1438,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, dev_sector, max_sectors, @@ -3413,7 +3413,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t from_addr, to_addr; struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t 
sector, first_bad; - int bad_sectors; + sector_t bad_sectors; if (!rdev || !test_bit(In_sync, &rdev->flags)) continue; @@ -3609,7 +3609,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; - int bad_sectors; + sector_t bad_sectors; struct md_rdev *rdev; if (r10_bio->devs[i].repl_bio) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 5ca06e9a2d29..cc5c8f3f81e8 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -673,7 +673,7 @@ static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, { if (bb->count) { sector_t first_bad; - int num_bad; + sector_t num_bad; return !!badblocks_check(bb, sector, len / 512, &first_bad, &num_bad); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index cfdfe0eaa512..8f3e816e805d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -367,9 +367,10 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) struct nd_namespace_common *ndns = nd_pfn->ndns; void *zero_page = page_address(ZERO_PAGE(0)); struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - int num_bad, meta_num, rc, bb_present; + int meta_num, rc, bb_present; sector_t first_bad, meta_start; struct nd_namespace_io *nsio; + sector_t num_bad; if (nd_pfn->mode != PFN_MODE_PMEM) return 0; @@ -394,7 +395,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) bb_present = badblocks_check(&nd_region->bb, meta_start, meta_num, &first_bad, &num_bad); if (bb_present) { - dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n", + dev_dbg(&nd_pfn->dev, "meta: %llx badblocks at %llx\n", num_bad, first_bad); nsoff = ALIGN_DOWN((nd_region->ndr_start + (first_bad << 9)) - nsio->res.start, @@ -413,7 +414,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) } if (rc) { dev_err(&nd_pfn->dev, - "error clearing %x badblocks at %llx\n", + "error clearing %llx badblocks at %llx\n", 
num_bad, first_bad); return rc; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d81faa9d89c9..43156e1576c9 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -249,7 +249,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT; struct badblocks *bb = &pmem->bb; sector_t first_bad; - int num_bad; + sector_t num_bad; if (kaddr) *kaddr = pmem->virt_addr + offset; diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 8764bed9ff16..996493917f36 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -48,11 +48,11 @@ struct badblocks_context { int ack; }; -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); -bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors); +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, int acknowledged); -bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors); void ack_all_badblocks(struct badblocks *bb); ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, -- cgit v1.2.3 From 6e51a1279cd60cb93e3379ff140d8fa6c39ecf20 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:30 +0530 Subject: block: acquire q->limits_lock while reading sysfs attributes There're few sysfs attributes(RW) whose store method is protected with q->limits_lock, however the corresponding show method of these attributes run holding q->sysfs_lock and that doesn't make sense as ideally the show method of these attributes should also run holding q->limits_lock instead of q->sysfs_lock. 
Hence update the show method of these sysfs attributes so that reading these attributes acquires q->limits_lock instead of q->sysfs_lock. Similarly, there are a few sysfs attributes (RO) whose show method is currently protected with q->sysfs_lock; however, updates to these attributes could occur using atomic limit update APIs such as queue_limits_start_update() and queue_limits_commit_update(), which run holding q->limits_lock. That means reading these attributes while holding q->sysfs_lock doesn't make sense. Hence update the show method of these sysfs attributes (RO) so that they run holding q->limits_lock instead of q->sysfs_lock. We have defined a new macro QUEUE_LIM_RO_ENTRY() which uses the new ->show_limit() method, and it runs holding q->limits_lock. All existing sysfs attributes (RO) which need protection using q->limits_lock while reading have now been updated to use this new macro for initialization. Also, the existing QUEUE_LIM_RW_ENTRY() is updated to use the new ->show_limit() method for reading attributes instead of the existing ->show() method. As ->show_limit() runs holding q->limits_lock, the existing sysfs attributes (RW) requiring protection are now inherently protected using q->limits_lock instead of q->sysfs_lock.
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-2-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 102 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6f548a4376aa..eba5121690af 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -23,9 +23,12 @@ struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); + ssize_t (*show_limit)(struct gendisk *disk, char *page); + ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); + void (*load_module)(struct gendisk *disk, const char *page, size_t count); }; @@ -412,10 +415,16 @@ static struct queue_sysfs_entry _prefix##_entry = { \ .store = _prefix##_store, \ }; +#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0444 }, \ + .show_limit = _prefix##_show, \ +} + #define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ + .show_limit = _prefix##_show, \ .store_limit = _prefix##_store, \ } @@ -430,39 +439,39 @@ static struct queue_sysfs_entry _prefix##_entry = { \ QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); -QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); -QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, 
"max_hw_sectors_kb"); +QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler"); -QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); -QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); -QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); -QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); -QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); +QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); +QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size"); +QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size"); -QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); -QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); -QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); +QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); -QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors, +QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors, "atomic_write_boundary_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); -QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); 
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); -QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); -QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); -QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); -QUEUE_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); -QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); -QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); @@ -470,16 +479,16 @@ QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); -QUEUE_RO_ENTRY(queue_fua, "fua"); -QUEUE_RO_ENTRY(queue_dax, "dax"); +QUEUE_LIM_RO_ENTRY(queue_fua, "fua"); +QUEUE_LIM_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); -QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); -QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); +QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); +QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { - .attr = {.name = "hw_sector_size", .mode = 0444 }, - .show = queue_logical_block_size_show, + .attr = {.name = "hw_sector_size", .mode = 0444 }, + .show_limit 
= queue_logical_block_size_show, }; QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); @@ -561,7 +570,9 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); /* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { - &queue_ra_entry.attr, + /* + * Attributes which are protected with q->limits_lock. + */ &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, @@ -577,37 +588,46 @@ static struct attribute *queue_attrs[] = { &queue_discard_granularity_entry.attr, &queue_max_discard_sectors_entry.attr, &queue_max_hw_discard_sectors_entry.attr, - &queue_discard_zeroes_data_entry.attr, &queue_atomic_write_max_sectors_entry.attr, &queue_atomic_write_boundary_sectors_entry.attr, &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, - &queue_write_same_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, &queue_zoned_entry.attr, - &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, - &queue_nomerges_entry.attr, &queue_iostats_passthrough_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_add_random_entry.attr, - &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, - &queue_poll_delay_entry.attr, &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, + + /* + * Attributes which are protected with q->sysfs_lock. + */ + &queue_ra_entry.attr, + &queue_discard_zeroes_data_entry.attr, + &queue_write_same_max_entry.attr, + &queue_nr_zones_entry.attr, + &queue_nomerges_entry.attr, + &queue_poll_entry.attr, + &queue_poll_delay_entry.attr, + NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. 
*/ static struct attribute *blk_mq_queue_attrs[] = { + /* + * Attributes which are protected with q->sysfs_lock. + */ &queue_requests_entry.attr, &elv_iosched_entry.attr, &queue_rq_affinity_entry.attr, @@ -666,8 +686,16 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); ssize_t res; - if (!entry->show) + if (!entry->show && !entry->show_limit) return -EIO; + + if (entry->show_limit) { + mutex_lock(&disk->queue->limits_lock); + res = entry->show_limit(disk, page); + mutex_unlock(&disk->queue->limits_lock); + return res; + } + mutex_lock(&disk->queue->sysfs_lock); res = entry->show(disk, page); mutex_unlock(&disk->queue->sysfs_lock); -- cgit v1.2.3 From b07a889e833555735ce72ca4a6d39f4c2ca725ba Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:31 +0530 Subject: block: move q->sysfs_lock and queue-freeze under show/store method In preparation to further simplify and group sysfs attributes which don't require locking or require some form of locking other than q-> limits_lock, move acquire/release of q->sysfs_lock and queue freeze/ unfreeze under each attributes' respective show/store method. While we are at it, also remove ->load_module() as it's used to load the module before queue is freezed. Now as we moved queue-freeze under ->store(), we could load module directly from the attributes' store method before we actually start freezing the queue. Currently, the ->load_module() is only used by "scheduler" attribute, so we now load the relevant elevator module before we start freezing the queue in elv_iosched_store(). 
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-3-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 210 +++++++++++++++++++++++++++++++++++++----------------- block/elevator.c | 20 +++++- 2 files changed, 162 insertions(+), 68 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index eba5121690af..4700ee168ed5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -28,8 +28,6 @@ struct queue_sysfs_entry { ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); - - void (*load_module)(struct gendisk *disk, const char *page, size_t count); }; static ssize_t @@ -55,7 +53,12 @@ queue_var_store(unsigned long *var, const char *page, size_t count) static ssize_t queue_requests_show(struct gendisk *disk, char *page) { - return queue_var_show(disk->queue->nr_requests, page); + ssize_t ret; + + mutex_lock(&disk->queue->sysfs_lock); + ret = queue_var_show(disk->queue->nr_requests, page); + mutex_unlock(&disk->queue->sysfs_lock); + return ret; } static ssize_t @@ -63,27 +66,38 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; + unsigned int memflags; + struct request_queue *q = disk->queue; - if (!queue_is_mq(disk->queue)) + if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) - return err; - + ret = err; + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); return ret; } static ssize_t queue_ra_show(struct gendisk *disk, char *page) { - return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), 
page); + ssize_t ret; + + mutex_lock(&disk->queue->sysfs_lock); + ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); + mutex_unlock(&disk->queue->sysfs_lock); + + return ret; } static ssize_t @@ -91,11 +105,19 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; + unsigned int memflags; + struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; + + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); + return ret; } @@ -150,7 +172,12 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - return sysfs_emit(page, "%d\n", _val); \ + ssize_t ret; \ + \ + mutex_lock(&disk->queue->sysfs_lock); \ + ret = sysfs_emit(page, "%d\n", _val); \ + mutex_unlock(&disk->queue->sysfs_lock); \ + return ret; \ } /* deprecated fields */ @@ -239,10 +266,17 @@ QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); static ssize_t queue_poll_show(struct gendisk *disk, char *page) { - if (queue_is_mq(disk->queue)) - return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); - return sysfs_emit(page, "%u\n", - !!(disk->queue->limits.features & BLK_FEAT_POLL)); + ssize_t ret; + + mutex_lock(&disk->queue->sysfs_lock); + if (queue_is_mq(disk->queue)) { + ret = sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); + } else { + ret = sysfs_emit(page, "%u\n", + !!(disk->queue->limits.features & BLK_FEAT_POLL)); + } + mutex_unlock(&disk->queue->sysfs_lock); + return ret; } static ssize_t queue_zoned_show(struct gendisk *disk, char *page) @@ -254,7 +288,12 @@ static ssize_t queue_zoned_show(struct gendisk *disk, char *page) static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { - return 
queue_var_show(disk_nr_zones(disk), page); + ssize_t ret; + + mutex_lock(&disk->queue->sysfs_lock); + ret = queue_var_show(disk_nr_zones(disk), page); + mutex_unlock(&disk->queue->sysfs_lock); + return ret; } static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) @@ -281,35 +320,51 @@ static int queue_iostats_passthrough_store(struct gendisk *disk, static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { - return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | + ssize_t ret; + + mutex_lock(&disk->queue->sysfs_lock); + ret = queue_var_show((blk_queue_nomerges(disk->queue) << 1) | blk_queue_noxmerges(disk->queue), page); + mutex_unlock(&disk->queue->sysfs_lock); + return ret; } static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; + unsigned int memflags; + struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue); + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) - blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); return ret; } static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { - bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); - bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); + ssize_t ret; + bool set, force; - return queue_var_show(set << force, page); + mutex_lock(&disk->queue->sysfs_lock); + set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); + force = 
test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); + ret = queue_var_show(set << force, page); + mutex_unlock(&disk->queue->sysfs_lock); + return ret; } static ssize_t @@ -319,11 +374,14 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; + unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); @@ -334,6 +392,8 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); #endif return ret; } @@ -347,29 +407,52 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - if (!(disk->queue->limits.features & BLK_FEAT_POLL)) - return -EINVAL; + unsigned int memflags; + ssize_t ret = count; + struct request_queue *q = disk->queue; + + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); + if (!(q->limits.features & BLK_FEAT_POLL)) { + ret = -EINVAL; + goto out; + } pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); - return count; +out: + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); + + return ret; } static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); + ssize_t ret; + + mutex_lock(&disk->queue->sysfs_lock); + ret = sysfs_emit(page, "%u\n", + jiffies_to_msecs(disk->queue->rq_timeout)); + mutex_unlock(&disk->queue->sysfs_lock); + return ret; } 
static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int val; + unsigned int val, memflags; int err; + struct request_queue *q = disk->queue; err = kstrtou32(page, 10, &val); if (err || val == 0) return -EINVAL; - blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val)); + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); + blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); return count; } @@ -428,14 +511,6 @@ static struct queue_sysfs_entry _prefix##_entry = { \ .store_limit = _prefix##_store, \ } -#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ - .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ - .load_module = _prefix##_load_module, \ - .store = _prefix##_store, \ -} - QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); @@ -443,7 +518,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); -QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler"); +QUEUE_RW_ENTRY(elv_iosched, "scheduler"); QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size"); @@ -512,14 +587,24 @@ static ssize_t queue_var_store64(s64 *var, const char *page) static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { - if (!wbt_rq_qos(disk->queue)) - return -EINVAL; + ssize_t ret; + struct request_queue *q = disk->queue; - if (wbt_disabled(disk->queue)) - return sysfs_emit(page, "0\n"); + mutex_lock(&q->sysfs_lock); + if (!wbt_rq_qos(q)) { + ret = -EINVAL; + goto out; + } - return sysfs_emit(page, 
"%llu\n", - div_u64(wbt_get_min_lat(disk->queue), 1000)); + if (wbt_disabled(q)) { + ret = sysfs_emit(page, "0\n"); + goto out; + } + + ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +out: + mutex_unlock(&q->sysfs_lock); + return ret; } static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, @@ -529,6 +614,7 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, struct rq_qos *rqos; ssize_t ret; s64 val; + unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) @@ -536,20 +622,24 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); + rqos = wbt_rq_qos(q); if (!rqos) { ret = wbt_init(disk); if (ret) - return ret; + goto out; } + ret = count; if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) - return count; + goto out; /* * Ensure that the queue is idled, in case the latency update @@ -561,8 +651,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, wbt_set_min_lat(q, val); blk_mq_unquiesce_queue(q); +out: + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); - return count; + return ret; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); @@ -684,22 +777,20 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); - ssize_t res; if (!entry->show && !entry->show_limit) return -EIO; if (entry->show_limit) { + ssize_t res; + mutex_lock(&disk->queue->limits_lock); res = entry->show_limit(disk, page); mutex_unlock(&disk->queue->limits_lock); return res; } - mutex_lock(&disk->queue->sysfs_lock); - res = entry->show(disk, page); - mutex_unlock(&disk->queue->sysfs_lock); - return res; + return entry->show(disk, page); } static ssize_t 
@@ -709,21 +800,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; - unsigned int memflags; - ssize_t res; if (!entry->store_limit && !entry->store) return -EIO; - /* - * If the attribute needs to load a module, do it before freezing the - * queue to ensure that the module file can be read when the request - * queue is the one for the device storing the module file. - */ - if (entry->load_module) - entry->load_module(disk, page, length); - if (entry->store_limit) { + ssize_t res; + struct queue_limits lim = queue_limits_start_update(q); res = entry->store_limit(disk, page, length, &lim); @@ -738,12 +821,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, return length; } - mutex_lock(&q->sysfs_lock); - memflags = blk_mq_freeze_queue(q); - res = entry->store(disk, page, length); - blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); - return res; + return entry->store(disk, page, length); } static const struct sysfs_ops queue_sysfs_ops = { diff --git a/block/elevator.c b/block/elevator.c index cd2ce4921601..041f1d983bc7 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -723,11 +723,24 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, { char elevator_name[ELV_NAME_MAX]; int ret; + unsigned int memflags; + struct request_queue *q = disk->queue; + /* + * If the attribute needs to load a module, do it before freezing the + * queue to ensure that the module file can be read when the request + * queue is the one for the device storing the module file. 
+ */ + elv_iosched_load_module(disk, buf, count); strscpy(elevator_name, buf, sizeof(elevator_name)); - ret = elevator_change(disk->queue, strstrip(elevator_name)); + + mutex_lock(&q->sysfs_lock); + memflags = blk_mq_freeze_queue(q); + ret = elevator_change(q, strstrip(elevator_name)); if (!ret) - return count; + ret = count; + blk_mq_unfreeze_queue(q, memflags); + mutex_unlock(&q->sysfs_lock); return ret; } @@ -738,6 +751,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; + mutex_lock(&q->sysfs_lock); if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { @@ -755,6 +769,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); + mutex_unlock(&q->sysfs_lock); + return len; } -- cgit v1.2.3 From d23977fee1ee838316fb1b00945064a146460843 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:32 +0530 Subject: block: remove q->sysfs_lock for attributes which don't need it There're few sysfs attributes in block layer which don't really need acquiring q->sysfs_lock while accessing it. The reason being, reading/ writing a value from/to such attributes are either atomic or could be easily protected using READ_ONCE()/WRITE_ONCE(). Moreover, sysfs attributes are inherently protected with sysfs/kernfs internal locking. So this change help segregate all existing sysfs attributes for which we could avoid acquiring q->sysfs_lock. For all read-only attributes we removed the q->sysfs_lock from show method of such attributes. In case attribute is read/write then we removed the q->sysfs_lock from both show and store methods of these attributes. We audited all block sysfs attributes and found following list of attributes which shouldn't require q->sysfs_lock protection: 1. io_poll: Write to this attribute is ignored. So, we don't need q->sysfs_lock. 2. 
io_poll_delay: Write to this attribute is NOP, so we don't need q->sysfs_lock. 3. io_timeout: Write to this attribute updates q->rq_timeout and read of this attribute returns the value stored in q->rq_timeout. Moreover, the q->rq_timeout is set only once when we init the queue (under blk_mq_init_allocated_queue()) even before disk is added. So that means that we don't need to protect it with q->sysfs_lock. As this attribute is not directly correlated with anything else simply using READ_ONCE/WRITE_ONCE should be enough. 4. nomerges: Write to this attribute file updates two q->flags: QUEUE_FLAG_NOMERGES and QUEUE_FLAG_NOXMERGES. These flags are accessed during bio-merge which anyways doesn't run with q->sysfs_lock held. Moreover, the q->flags are updated/accessed with bitops which are atomic. So, protecting it with q->sysfs_lock is not necessary. 5. rq_affinity: Write to this attribute file makes atomic updates to q->flags: QUEUE_FLAG_SAME_COMP and QUEUE_FLAG_SAME_FORCE. These flags are also accessed from blk_mq_complete_need_ipi() using test_bit macro. As read/write to q->flags uses bitops which are atomic, protecting it with q->sysfs_lock is not necessary. 6. nr_zones: Write to this attribute happens in the driver probe method (except nvme) before disk is added and outside of q->sysfs_lock or any other lock. Moreover nr_zones is defined as "unsigned int" and so reading this attribute, even when it's simultaneously being updated on other cpu, should not return torn value on any architecture supported by linux. So we can avoid using q->sysfs_lock or any other lock/protection while reading this attribute. 7. discard_zeroes_data: Reading of this attribute always returns 0, so we don't require holding q->sysfs_lock. 8. write_same_max_bytes: Reading of this attribute always returns 0, so we don't require holding q->sysfs_lock.
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-4-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- block/blk-sysfs.c | 81 ++++++++++++++++++---------------------------------- 2 files changed, 29 insertions(+), 54 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 008947a13541..3c3e87bad6d3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -21,7 +21,7 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { - q->rq_timeout = timeout; + WRITE_ONCE(q->rq_timeout, timeout); } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4700ee168ed5..bc641ac71cde 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -172,12 +172,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - ssize_t ret; \ - \ - mutex_lock(&disk->queue->sysfs_lock); \ - ret = sysfs_emit(page, "%d\n", _val); \ - mutex_unlock(&disk->queue->sysfs_lock); \ - return ret; \ + return sysfs_emit(page, "%d\n", _val); \ } /* deprecated fields */ @@ -266,17 +261,11 @@ QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); static ssize_t queue_poll_show(struct gendisk *disk, char *page) { - ssize_t ret; + if (queue_is_mq(disk->queue)) + return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); - mutex_lock(&disk->queue->sysfs_lock); - if (queue_is_mq(disk->queue)) { - ret = sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); - } else { - ret = sysfs_emit(page, "%u\n", + return sysfs_emit(page, "%u\n", !!(disk->queue->limits.features & BLK_FEAT_POLL)); - } - mutex_unlock(&disk->queue->sysfs_lock); - return ret; } static ssize_t queue_zoned_show(struct gendisk *disk, char *page) @@ -288,12 +277,7 @@ static ssize_t 
queue_zoned_show(struct gendisk *disk, char *page) static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { - ssize_t ret; - - mutex_lock(&disk->queue->sysfs_lock); - ret = queue_var_show(disk_nr_zones(disk), page); - mutex_unlock(&disk->queue->sysfs_lock); - return ret; + return queue_var_show(disk_nr_zones(disk), page); } static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) @@ -320,13 +304,8 @@ static int queue_iostats_passthrough_store(struct gendisk *disk, static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { - ssize_t ret; - - mutex_lock(&disk->queue->sysfs_lock); - ret = queue_var_show((blk_queue_nomerges(disk->queue) << 1) | + return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | blk_queue_noxmerges(disk->queue), page); - mutex_unlock(&disk->queue->sysfs_lock); - return ret; } static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, @@ -340,7 +319,6 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, if (ret < 0) return ret; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); @@ -349,22 +327,16 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { - ssize_t ret; - bool set, force; + bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); - mutex_lock(&disk->queue->sysfs_lock); - set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); - force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); - ret = queue_var_show(set << force, page); - mutex_unlock(&disk->queue->sysfs_lock); - return ret; + return 
queue_var_show(set << force, page); } static ssize_t @@ -380,7 +352,12 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) if (ret < 0) return ret; - mutex_lock(&q->sysfs_lock); + /* + * Here we update two queue flags each using atomic bitops, although + * updating two flags isn't atomic it should be harmless as those flags + * are accessed individually using atomic test_bit operation. So we + * don't grab any lock while updating these flags. + */ memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); @@ -393,7 +370,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); #endif return ret; } @@ -411,30 +387,23 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page, ssize_t ret = count; struct request_queue *q = disk->queue; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); if (!(q->limits.features & BLK_FEAT_POLL)) { ret = -EINVAL; goto out; } + pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); out: blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); - return ret; } static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - ssize_t ret; - - mutex_lock(&disk->queue->sysfs_lock); - ret = sysfs_emit(page, "%u\n", - jiffies_to_msecs(disk->queue->rq_timeout)); - mutex_unlock(&disk->queue->sysfs_lock); - return ret; + return sysfs_emit(page, "%u\n", + jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout))); } static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, @@ -448,11 +417,9 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, if (err || val == 0) return -EINVAL; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); blk_queue_rq_timeout(q, 
msecs_to_jiffies(val)); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return count; } @@ -706,6 +673,10 @@ static struct attribute *queue_attrs[] = { * Attributes which are protected with q->sysfs_lock. */ &queue_ra_entry.attr, + + /* + * Attributes which don't require locking. + */ &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nr_zones_entry.attr, @@ -723,11 +694,15 @@ static struct attribute *blk_mq_queue_attrs[] = { */ &queue_requests_entry.attr, &elv_iosched_entry.attr, - &queue_rq_affinity_entry.attr, - &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif + /* + * Attributes which don't require locking. + */ + &queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, + NULL, }; -- cgit v1.2.3 From 1bf70d08cc3b55abd1763e6dff5855cb8dd8318b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:33 +0530 Subject: block: introduce a dedicated lock for protecting queue elevator updates A queue's elevator can be updated either when modifying nr_hw_queues or through the sysfs scheduler attribute. Currently, elevator switching/ updating is protected using q->sysfs_lock, but this has led to lockdep splats[1] due to inconsistent lock ordering between q->sysfs_lock and the freeze-lock in multiple block layer call sites. As the scope of q->sysfs_lock is not well-defined, its (mis)use has resulted in numerous lockdep warnings. To address this, introduce a new q->elevator_lock, dedicated specifically for protecting elevator switches/updates. And we'd now use this new q->elevator_lock instead of q->sysfs_lock for protecting elevator switches/updates. While at it, make elv_iosched_load_module() a static function, as it is only called from elv_iosched_store(). Also, remove redundant parameters from elv_iosched_load_module() function signature. 
[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-5-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-mq.c | 15 +++++++-------- block/blk-sysfs.c | 32 ++++++++++++++++++++++---------- block/elevator.c | 35 ++++++++++++++++------------------- block/elevator.h | 2 -- block/genhd.c | 9 ++++++--- include/linux/blkdev.h | 8 ++++++++ 7 files changed, 60 insertions(+), 42 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d6c4fa3943b5..362d0a55b07a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -429,6 +429,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); + mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); diff --git a/block/blk-mq.c b/block/blk-mq.c index 40490ac88045..5a2d63927525 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4467,7 +4467,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4500,7 +4500,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4933,10 +4933,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, if (!qe) return false; - /* q->elevator needs protection from ->sysfs_lock */ - mutex_lock(&q->sysfs_lock); + /* Accessing 
q->elevator needs protection from ->elevator_lock. */ + mutex_lock(&q->elevator_lock); - /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); goto unlock; @@ -4950,7 +4949,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, list_add(&qe->node, head); elevator_disable(q); unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return true; } @@ -4980,11 +4979,11 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index bc641ac71cde..1562e22877e1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -693,10 +693,15 @@ static struct attribute *blk_mq_queue_attrs[] = { * Attributes which are protected with q->sysfs_lock. */ &queue_requests_entry.attr, - &elv_iosched_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif + /* + * Attributes which require some form of locking other than + * q->sysfs_lock. + */ + &elv_iosched_entry.attr, + /* * Attributes which don't require locking. 
*/ @@ -865,15 +870,19 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_debugfs_remove; + ret = blk_crypto_sysfs_register(disk); + if (ret) + goto out_unregister_ia_ranges; + + mutex_lock(&q->elevator_lock); if (q->elevator) { ret = elv_register_queue(q, false); - if (ret) - goto out_unregister_ia_ranges; + if (ret) { + mutex_unlock(&q->elevator_lock); + goto out_crypto_sysfs_unregister; + } } - - ret = blk_crypto_sysfs_register(disk); - if (ret) - goto out_elv_unregister; + mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); @@ -898,8 +907,8 @@ int blk_register_queue(struct gendisk *disk) return ret; -out_elv_unregister: - elv_unregister_queue(q); +out_crypto_sysfs_unregister: + blk_crypto_sysfs_unregister(disk); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: @@ -945,8 +954,11 @@ void blk_unregister_queue(struct gendisk *disk) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elv_unregister_queue(q); + mutex_unlock(&q->elevator_lock); + + mutex_lock(&q->sysfs_lock); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); diff --git a/block/elevator.c b/block/elevator.c index 041f1d983bc7..b4d08026b02c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -457,7 +457,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { @@ -481,7 +481,7 @@ void elv_unregister_queue(struct request_queue *q) { struct elevator_queue *e = q->elevator; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); @@ -618,7 
+618,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) unsigned int memflags; int ret; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -655,7 +655,7 @@ void elevator_disable(struct request_queue *q) { unsigned int memflags; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -700,28 +700,23 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return ret; } -void elv_iosched_load_module(struct gendisk *disk, const char *buf, - size_t count) +static void elv_iosched_load_module(char *elevator_name) { - char elevator_name[ELV_NAME_MAX]; struct elevator_type *found; - const char *name; - - strscpy(elevator_name, buf, sizeof(elevator_name)); - name = strstrip(elevator_name); spin_lock(&elv_list_lock); - found = __elevator_find(name); + found = __elevator_find(elevator_name); spin_unlock(&elv_list_lock); if (!found) - request_module("%s-iosched", name); + request_module("%s-iosched", elevator_name); } ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; + char *name; int ret; unsigned int memflags; struct request_queue *q = disk->queue; @@ -731,16 +726,18 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, * queue to ensure that the module file can be read when the request * queue is the one for the device storing the module file. 
*/ - elv_iosched_load_module(disk, buf, count); strscpy(elevator_name, buf, sizeof(elevator_name)); + name = strstrip(elevator_name); + + elv_iosched_load_module(name); - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); - ret = elevator_change(q, strstrip(elevator_name)); + mutex_lock(&q->elevator_lock); + ret = elevator_change(q, name); if (!ret) ret = count; + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -751,7 +748,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { @@ -769,7 +766,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return len; } diff --git a/block/elevator.h b/block/elevator.h index e526662c5dbb..e4e44dfac503 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -148,8 +148,6 @@ extern void elv_unregister(struct elevator_type *); * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); -void elv_iosched_load_module(struct gendisk *disk, const char *page, - size_t count); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); diff --git a/block/genhd.c b/block/genhd.c index e9375e20d866..c2bd86cd09de 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -565,8 +565,11 @@ out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out_exit_elevator: - if (disk->queue->elevator) + if (disk->queue->elevator) { + mutex_lock(&disk->queue->elevator_lock); elevator_exit(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); + } return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); @@ 
-742,9 +745,9 @@ void del_gendisk(struct gendisk *disk) blk_mq_quiesce_queue(q); if (q->elevator) { - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_exit(q); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } rq_qos_exit(q); blk_mq_unquiesce_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..31b1b635c710 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -560,6 +560,14 @@ struct request_queue { struct blk_flush_queue *fq; struct list_head flush_list; + /* + * Protects against I/O scheduler switching, specifically when + * updating q->elevator. To ensure proper locking order during + * an elevator update, first freeze the queue, then acquire + * ->elevator_lock. + */ + struct mutex elevator_lock; + struct mutex sysfs_lock; struct mutex limits_lock; -- cgit v1.2.3 From 3efe7571c3ae2b6481253a2616c2bb3fbadd503b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:34 +0530 Subject: block: protect nr_requests update using q->elevator_lock The sysfs attribute nr_requests could be simultaneously updated from elevator switch/update or nr_hw_queue update code path. The update to nr_requests for each of those code paths runs holding q->elevator_lock. So we should protect access to sysfs attribute nr_requests using q->elevator_lock instead of q->sysfs_lock.
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-6-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 10 +++++----- include/linux/blkdev.h | 10 ++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1562e22877e1..f1fa57de29ed 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -55,9 +55,9 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) { ssize_t ret; - mutex_lock(&disk->queue->sysfs_lock); + mutex_lock(&disk->queue->elevator_lock); ret = queue_var_show(disk->queue->nr_requests, page); - mutex_unlock(&disk->queue->sysfs_lock); + mutex_unlock(&disk->queue->elevator_lock); return ret; } @@ -76,16 +76,16 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) if (ret < 0) return ret; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) ret = err; + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -692,7 +692,6 @@ static struct attribute *blk_mq_queue_attrs[] = { /* * Attributes which are protected with q->sysfs_lock. */ - &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif @@ -701,6 +700,7 @@ static struct attribute *blk_mq_queue_attrs[] = { * q->sysfs_lock. */ &elv_iosched_entry.attr, + &queue_requests_entry.attr, /* * Attributes which don't require locking. 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31b1b635c710..3e66ad016a23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,10 +561,12 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, specifically when - * updating q->elevator. To ensure proper locking order during - * an elevator update, first freeze the queue, then acquire - * ->elevator_lock. + * Protects against I/O scheduler switching, particularly when + * updating q->elevator. Since the elevator update code path may + * also modify q->nr_requests, this lock also protects the sysfs + * attribute nr_requests. + * To ensure proper locking order during an elevator update, first + * freeze the queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From 245618f8e45ff4f79327627b474b563da71c2c75 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:35 +0530 Subject: block: protect wbt_lat_usec using q->elevator_lock The wbt latency and state could be updated while initializing the elevator or exiting the elevator. It could be also updated while configuring IO latency QoS parameters using cgroup. The elevator code path is now protected with q->elevator_lock. So we should protect the access to sysfs attribute wbt_lat_usec using q->elevator_lock instead of q->sysfs_lock. While we're at it, also protect ioc_qos_write(), which configures wbt parameters via cgroup, using q->elevator_lock.
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-7-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 ++ block/blk-sysfs.c | 20 ++++++++------------ include/linux/blkdev.h | 4 ++-- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6be46e28459b..38e7bf3c3b4f 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3248,6 +3248,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, } memflags = blk_mq_freeze_queue(disk->queue); + mutex_lock(&disk->queue->elevator_lock); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3355,6 +3356,7 @@ einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); blk_mq_unfreeze_queue(disk->queue, memflags); ret = -EINVAL; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f1fa57de29ed..223da196a548 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -557,7 +557,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ssize_t ret; struct request_queue *q = disk->queue; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; @@ -570,7 +570,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return ret; } @@ -589,8 +589,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); rqos = wbt_rq_qos(q); if (!rqos) { @@ -619,8 +619,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, blk_mq_unquiesce_queue(q); out: + 
mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -689,19 +689,15 @@ static struct attribute *queue_attrs[] = { /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { - /* - * Attributes which are protected with q->sysfs_lock. - */ -#ifdef CONFIG_BLK_WBT - &queue_wb_lat_entry.attr, -#endif /* * Attributes which require some form of locking other than * q->sysfs_lock. */ &elv_iosched_entry.attr, &queue_requests_entry.attr, - +#ifdef CONFIG_BLK_WBT + &queue_wb_lat_entry.attr, +#endif /* * Attributes which don't require locking. */ @@ -882,10 +878,10 @@ int blk_register_queue(struct gendisk *disk) goto out_crypto_sysfs_unregister; } } + wbt_enable_default(disk); mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3e66ad016a23..0ee3b5c9388e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -563,8 +563,8 @@ struct request_queue { /* * Protects against I/O scheduler switching, particularly when * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests, this lock also protects the sysfs - * attribute nr_requests. + * also modify q->nr_requests and wbt latency, this lock also + * protects the sysfs attributes nr_requests and wbt_lat_usec. * To ensure proper locking order during an elevator update, first * freeze the queue, then acquire ->elevator_lock. 
*/ -- cgit v1.2.3 From 5e40f4452dc9a3fb44d13bb6bc7032f3911a2675 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:36 +0530 Subject: block: protect read_ahead_kb using q->limits_lock The bdi->ra_pages could be updated under q->limits_lock because it's usually calculated from the queue limits by queue_limits_commit_update. So protect reading/writing the sysfs attribute read_ahead_kb using q->limits_lock instead of q->sysfs_lock. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-8-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 16 ++++++++++------ include/linux/blkdev.h | 3 +++ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 223da196a548..d584461a1d84 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -93,9 +93,9 @@ static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; - mutex_lock(&disk->queue->sysfs_lock); + mutex_lock(&disk->queue->limits_lock); ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); - mutex_unlock(&disk->queue->sysfs_lock); + mutex_unlock(&disk->queue->limits_lock); return ret; } @@ -111,12 +111,15 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - mutex_lock(&q->sysfs_lock); + /* + * ->ra_pages is protected by ->limits_lock because it is usually + * calculated from the queue limits by queue_limits_commit_update. 
+ */ + mutex_lock(&q->limits_lock); memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + mutex_unlock(&q->limits_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -670,7 +673,8 @@ static struct attribute *queue_attrs[] = { &queue_dma_alignment_entry.attr, /* - * Attributes which are protected with q->sysfs_lock. + * Attributes which require some form of locking other than + * q->sysfs_lock. */ &queue_ra_entry.attr, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ee3b5c9388e..3bee1b4858b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,9 @@ struct request_queue { struct mutex elevator_lock; struct mutex sysfs_lock; + /* + * Protects queue limits and also sysfs attribute read_ahead_kb. + */ struct mutex limits_lock; /* -- cgit v1.2.3 From 5abba4cebec0a591ca7e7f55701e42cd5dc059af Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 6 Mar 2025 15:09:53 +0530 Subject: block: protect hctx attributes/params using q->elevator_lock Currently, hctx attributes (nr_tags, nr_reserved_tags, and cpu_list) are protected using `q->sysfs_lock`. However, these attributes can be updated in multiple scenarios: - During the driver's probe method. - When updating nr_hw_queues. - When writing to the sysfs attribute nr_requests, which can modify nr_tags. The nr_requests attribute is already protected using q->elevator_lock, but none of the update paths actually use q->sysfs_lock to protect hctx attributes. So to ensure proper synchronization, replace q->sysfs_lock with q->elevator_lock when reading hctx attributes through sysfs. Additionally, blk_mq_update_nr_hw_queues allocates and updates hctx. The allocation of hctx is protected using q->elevator_lock, however, updating hctx params happens without any protection, so safeguard hctx param update path by also using q->elevator_lock. 
Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250306093956.2818808-1-nilay@linux.ibm.com [axboe: wrap comment at 80 chars] Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 4 ++-- block/blk-mq.c | 4 ++++ include/linux/blkdev.h | 14 ++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 3feeeccf8a99..24656980f443 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return res; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 5a2d63927525..b9550a127c8e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4094,6 +4094,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&q->elevator_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -4198,6 +4200,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + + mutex_unlock(&q->elevator_lock); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3bee1b4858b6..dcf8fce15e23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,12 +561,14 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, particularly when - * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests and wbt latency, this lock also - * protects the sysfs attributes nr_requests and wbt_lat_usec. - * To ensure proper locking order during an elevator update, first - * freeze the queue, then acquire ->elevator_lock. 
+ * Protects against I/O scheduler switching, particularly when updating + * q->elevator. Since the elevator update code path may also modify q-> + * nr_requests and wbt latency, this lock also protects the sysfs attrs + * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update + * may modify hctx tags, reserved-tags and cpumask, so this lock also + * helps protect the hctx attrs. To ensure proper locking order during + * an elevator or nr_hw_queue update, first freeze the queue, then + * acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From fc0e982b8a3a169b1c654d9a1aa45bf292943ef2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 10 Mar 2025 19:54:53 +0800 Subject: block: make sure ->nr_integrity_segments is cloned in blk_rq_prep_clone Make sure ->nr_integrity_segments is cloned in blk_rq_prep_clone(), otherwise requests cloned by device-mapper multipath will not have the proper nr_integrity_segments values set, then BUG() is hit from sg_alloc_table_chained(). 
Fixes: b0fd271d5fba ("block: add request clone interface (v2)") Cc: stable@vger.kernel.org Cc: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250310115453.2271109-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index b9550a127c8e..f1030d589a1b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3314,6 +3314,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->nr_integrity_segments = rq_src->nr_integrity_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; -- cgit v1.2.3 From 7e76336e14de9a2b67af96012ddd46c5676cf340 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 9 Mar 2025 12:05:56 -0400 Subject: badblocks: Fix a nonsense WARN_ON() which checks whether a u64 variable < 0 In _badblocks_check(), there are lines of code like this, 1246 sectors -= len; [snipped] 1251 WARN_ON(sectors < 0); The WARN_ON() at line 1251 doesn't make sense because sectors is of type unsigned long long and can never be < 0. Fix it by directly checking whether sectors is less than len. 
Reported-by: Dan Carpenter Signed-off-by: Coly Li Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250309160556.42854-1-colyli@kernel.org Signed-off-by: Jens Axboe --- block/badblocks.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index 673ef068423a..ece64e76fe8f 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -1242,14 +1242,15 @@ re_check: len = sectors; update_sectors: + /* This situation should never happen */ + WARN_ON(sectors < len); + s += len; sectors -= len; if (sectors > 0) goto re_check; - WARN_ON(sectors < 0); - if (unacked_badblocks > 0) rv = -1; else if (acked_badblocks > 0) -- cgit v1.2.3 From 61667cb6644f6fb01eb8baa928e381c016b5ed7b Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 12 Mar 2025 16:47:22 +0800 Subject: block: remove unused parameter The blk_mq_map_queue()'s request_queue param is not used anymore, remove it, same with blk_get_flush_queue(). Signed-off-by: Guixin Liu Link: https://lore.kernel.org/r/20250312084722.129680-1-kanie@linux.alibaba.com Signed-off-by: Jens Axboe --- block/blk-flush.c | 10 +++++----- block/blk-mq-sched.c | 2 +- block/blk-mq-tag.c | 3 +-- block/blk-mq.c | 2 +- block/blk-mq.h | 4 +--- block/kyber-iosched.c | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index a72e2a83d075..43d6152897a4 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,9 +95,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +blk_get_flush_queue(struct blk_mq_ctx *ctx) { - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; + return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) @@ -205,7 +205,7 @@ static enum rq_end_io_ret flush_end_io(struct 
request *flush_rq, struct list_head *running; struct request *rq, *n; unsigned long flags = 0; - struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); @@ -341,7 +341,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; - struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(ctx); if (q->elevator) { WARN_ON(rq->tag < 0); @@ -382,7 +382,7 @@ static void blk_rq_init_flush(struct request *rq) bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7442ca27c2bf..109611445d40 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -349,7 +349,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, } ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + hctx = blk_mq_map_queue(bio->bi_opf, ctx); type = hctx->type; if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index b9f417d980b4..d880c50629d6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -190,8 +190,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, - data->ctx); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = 
&tags->breserved_tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index f1030d589a1b..ae8494d88897 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -508,7 +508,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); - data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); if (q->elevator) { /* diff --git a/block/blk-mq.h b/block/blk-mq.h index 44979e92b79f..3011a78cf16a 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -100,12 +100,10 @@ static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue - * @q: request queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ -static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - blk_opf_t opf, +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index dc31f2dfa414..0f0f8452609a 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -568,7 +568,7 @@ static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx); struct kyber_hctx_data *khd = hctx->sched_data; struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); -- cgit v1.2.3 From 26064d3e2b4d9a14df1072980e558c636fb023ea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 12 Mar 2025 22:51:36 +0800 Subject: block: fix adding folio to bio >4GB folio is possible on some ARCHs, such as aarch64, 16GB hugepage is supported, then 'offset' of folio can't be held in 
'unsigned int', cause warning in bio_add_folio_nofail() and IO failure. Fix it by adjusting 'page' & trimming 'offset' so that `->bi_offset` won't be overflow, and folio can be added to bio successfully. Fixes: ed9832bc08db ("block: introduce folio awareness and add a bigger size from folio") Cc: Kundan Kumar Cc: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Luis Chamberlain Cc: Gavin Shan Signed-off-by: Ming Lei Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20250312145136.2891229-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bio.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index dabc1a6c41b1..3761600f3e04 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1026,9 +1026,10 @@ EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { + unsigned long nr = off / PAGE_SIZE; + WARN_ON_ONCE(len > UINT_MAX); - WARN_ON_ONCE(off > UINT_MAX); - __bio_add_page(bio, &folio->page, len, off); + __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); @@ -1049,9 +1050,11 @@ EXPORT_SYMBOL_GPL(bio_add_folio_nofail); bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { - if (len > UINT_MAX || off > UINT_MAX) + unsigned long nr = off / PAGE_SIZE; + + if (len > UINT_MAX) return false; - return bio_add_page(bio, &folio->page, len, off) > 0; + return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); -- cgit v1.2.3 From 75618ac6e98faee6ed1f17ae64875cc2d7784204 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 13 Mar 2025 09:23:18 +0530 Subject: block: remove unused parameter 'q' parameter in __blk_rq_map_sg() request_queue param is no longer used by blk_rq_map_sg and __blk_rq_map_sg. Remove it. 
Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313035322.243239-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 4 ++-- block/bsg-lib.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/block/sunvdc.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/memstick/core/ms_block.c | 2 +- drivers/memstick/core/mspro_block.c | 4 +--- drivers/mmc/core/queue.c | 2 +- drivers/mtd/ubi/block.c | 2 +- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 3 +-- drivers/nvme/target/loop.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 9 ++++----- 18 files changed, 22 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 15cd231d560c..8bfe54f23e5e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -551,8 +551,8 @@ static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, * Map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries. 
*/ -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg) { struct req_iterator iter = { .bio = rq->bio, diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 93523d8f8195..9ceb5d0832f5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); - buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 95361099a2dc..0d619df03fa9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2056,7 +2056,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, unsigned int nents; /* Map the scatter list for DMA access */ - nents = blk_rq_map_sg(hctx->queue, rq, command->sg); + nents = blk_rq_map_sg(rq, command->sg); nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); prefetch(&port->flags); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 82467ecde7ec..15627417f12e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1010,7 +1010,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, * See queue limits. 
*/ if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES)) - sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl); + sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl); if (sg_cnt == 0) sg_mark_end(&iu->sgt.sgl[0]); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 282f81616a78..2b33fb5b949b 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -485,7 +485,7 @@ static int __send_request(struct request *req) } sg_init_table(sg, port->ring_cookies); - nsg = blk_rq_map_sg(req->q, req, sg); + nsg = blk_rq_map_sg(req, sg); len = 0; for (i = 0; i < nsg; i++) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6a61ec35f426..a3df4d49bd46 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -226,7 +226,7 @@ static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, if (unlikely(err)) return -ENOMEM; - return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); + return blk_rq_map_sg(req, vbr->sg_table.sgl); } static void virtblk_cleanup_cmd(struct request *req) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index edcd08a9dcef..5babe575c288 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -751,7 +751,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri id = blkif_ring_get_request(rinfo, req, &final_ring_req); ring_req = &rinfo->shadow[id].req; - num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); + num_sg = blk_rq_map_sg(req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 5b617c1f6789..f4398383ae06 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1904,7 +1904,7 @@ static void msb_io_work(struct work_struct *work) /* process the request */ dbg_verbose("IO: processing new 
request"); - blk_rq_map_sg(msb->queue, req, sg); + blk_rq_map_sg(req, sg); lba = blk_rq_pos(req); diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 634d343b6bdb..c9853d887d28 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -627,9 +627,7 @@ static int mspro_block_issue_req(struct memstick_dev *card) while (true) { msb->current_page = 0; msb->current_seg = 0; - msb->seg_count = blk_rq_map_sg(msb->block_req->q, - msb->block_req, - msb->req_sg); + msb->seg_count = blk_rq_map_sg(msb->block_req, msb->req_sg); if (!msb->seg_count) { unsigned int bytes = blk_rq_cur_bytes(msb->block_req); diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index ab662f502fe7..3ba62f825b84 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -523,5 +523,5 @@ unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq) { struct request *req = mmc_queue_req_to_req(mqrq); - return blk_rq_map_sg(mq->queue, req, mqrq->sg); + return blk_rq_map_sg(req, mqrq->sg); } diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 2836905f0152..39cc0a6a4d37 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -199,7 +199,7 @@ static blk_status_t ubiblock_read(struct request *req) * and ubi_read_sg() will check that limit. 
*/ ubi_sgl_init(&pdu->usgl); - blk_rq_map_sg(req->q, req, pdu->usgl.sg); + blk_rq_map_sg(req, pdu->usgl.sg); while (bytes_left) { /* diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 1de11b722f04..fe2f9b143c9f 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -525,7 +525,7 @@ static blk_status_t apple_nvme_map_data(struct apple_nvme *anv, if (!iod->sg) return BLK_STS_RESOURCE; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + iod->nents = blk_rq_map_sg(req, iod->sg); if (!iod->nents) goto out_free_sg; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4f1866fbd5b..7de29dae8e74 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2620,7 +2620,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, if (ret) return -ENOMEM; - op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); + op->nents = blk_rq_map_sg(rq, freq->sg_table.sgl); WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, rq_dma_dir(rq)); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9197a5b173fd..a65978b6cdd8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -812,7 +812,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (!iod->sgt.sgl) return BLK_STS_RESOURCE; sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); + iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl); if (!iod->sgt.orig_nents) goto out_free_sg; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 86a2891d9bcc..b5a0295b5bf4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1476,8 +1476,7 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, if (ret) return -ENOMEM; - req->data_sgl.nents = 
blk_rq_map_sg(rq->q, rq, - req->data_sgl.sg_table.sgl); + req->data_sgl.nents = blk_rq_map_sg(rq, req->data_sgl.sg_table.sgl); *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a9d112d34d4f..a5c41144667c 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -162,7 +162,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, } iod->req.sg = iod->sg_table.sgl; - iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.sg_cnt = blk_rq_map_sg(req, iod->sg_table.sgl); iod->req.transfer_len = blk_rq_payload_bytes(req); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index be0890e4e706..02576c98a833 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1149,7 +1149,7 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) * Next, walk the list, and fill in the addresses and sizes of * each segment. 
*/ - count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + count = __blk_rq_map_sg(rq, cmd->sdb.table.sgl, &last_sg); if (blk_rq_bytes(rq) & rq->q->limits.dma_pad_mask) { unsigned int pad_len = diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cd..d99024423355 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1155,14 +1155,13 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) return max_t(unsigned short, rq->nr_phys_segments, 1); } -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg); -static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg); +static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; - return __blk_rq_map_sg(q, rq, sglist, &last_sg); + return __blk_rq_map_sg(rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); -- cgit v1.2.3 From a3996d11f3ab743e6cc4e3529ce9459c2cd27139 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Mar 2025 17:21:50 +0530 Subject: block: protect debugfs attrs using elevator_lock instead of sysfs_lock Currently, the block debugfs attributes (tags, tags_bitmap, sched_tags, and sched_tags_bitmap) are protected using q->sysfs_lock. However, these attributes are updated in multiple scenarios: - During driver probe method - During an elevator switch/update - During an nr_hw_queues update - When writing to the sysfs attribute nr_requests All these update paths (except driver probe method, which doesn't require any protection) are already protected using q->elevator_lock. To ensure consistency and proper synchronization, replace q->sysfs_lock with q->elevator_lock for protecting these debugfs attributes. 
Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313115235.3707600-2-nilay@linux.ibm.com [axboe: some commit message rewording/fixes] Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 16 ++++++++-------- include/linux/blkdev.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index adf5f0697b6b..62775b132d4c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -400,12 +400,12 @@ static int hctx_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -417,12 +417,12 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -434,12 +434,12 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -451,12 +451,12 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if 
(hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dcf8fce15e23..8d072042790e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -566,9 +566,9 @@ struct request_queue { * nr_requests and wbt latency, this lock also protects the sysfs attrs * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update * may modify hctx tags, reserved-tags and cpumask, so this lock also - * helps protect the hctx attrs. To ensure proper locking order during - * an elevator or nr_hw_queue update, first freeze the queue, then - * acquire ->elevator_lock. + * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking + * order during an elevator or nr_hw_queue update, first freeze the + * queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From 78800f5997d8ae0f20d4aced66a524f0f2fc4c7f Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Mar 2025 17:21:51 +0530 Subject: block: remove unnecessary goto labels in debugfs attribute read methods In some debugfs attribute read methods, failure to acquire the mutex lock results in jumping to a label before returning an error code. However this is unnecessary, as we can return the failure code directly, improving code readability and reducing complexity. This commit removes the goto labels and ensures that the method returns immediately upon failing to acquire the mutex lock. 
Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313115235.3707600-3-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 62775b132d4c..1c958bbaddce 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -402,13 +402,12 @@ static int hctx_tags_show(void *data, struct seq_file *m) res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) @@ -419,13 +418,12 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_show(void *data, struct seq_file *m) @@ -436,13 +434,12 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m) res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) @@ -453,13 +450,12 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_active_show(void *data, struct seq_file *m) -- cgit v1.2.3 From 0e94ed33681424a6dce65c62837e08e4c7aa09ac 
Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Mar 2025 17:21:52 +0530 Subject: block: protect debugfs attribute method hctx_busy_show The hctx_busy_show method in debugfs is currently unprotected. This method iterates over all started requests in a tagset and prints them. However, the tags can be updated concurrently via the sysfs attributes 'nr_requests' or 'scheduler' (elevator switch), leading to potential race conditions. Since sysfs attributes 'nr_requests' and 'scheduler' are already protected using q->elevator_lock, extend this protection to the debugfs 'busy' attribute as well to ensure consistency. Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313115235.3707600-4-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1c958bbaddce..3421b5521fe2 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -347,9 +347,14 @@ static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; + int res; + res = mutex_lock_interruptible(&hctx->queue->elevator_lock); + if (res) + return res; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, &params); + mutex_unlock(&hctx->queue->elevator_lock); return 0; } -- cgit v1.2.3 From b0d42581195603f38184d7c130d0e2f43f40fb33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 12 Mar 2025 16:01:27 +0100 Subject: block: fix a comment in the queue_attrs[] array queue_ra_entry uses limits_lock just like the attributes above it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Nilay Shroff Link: https://lore.kernel.org/r/20250312150127.703534-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d584461a1d84..a2882751f0d2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -671,11 +671,6 @@ static struct attribute *queue_attrs[] = { &queue_dax_entry.attr, &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, - - /* - * Attributes which require some form of locking other than - * q->sysfs_lock. - */ &queue_ra_entry.attr, /* -- cgit v1.2.3 From ffa1e7ada456087c2402b37cd6b2863ced29aff0 Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Tue, 18 Mar 2025 10:55:48 +0100 Subject: block: Make request_queue lockdep splats show up earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In recent kernels, there are lockdep splats around the struct request_queue::io_lockdep_map, similar to [1], but they typically don't show up until reclaim with writeback happens. Having multiple kernel versions released with a known risk of kernel deadlock during reclaim writeback should IMHO be addressed and backported to -stable with the highest priority. In order to have these lockdep splats show up earlier, preferably during system initialization, prime the struct request_queue::io_lockdep_map as GFP_KERNEL reclaim-tainted. This will instead lead to lockdep splats looking similar to [2], but without the need for reclaim + writeback happening. 
[1]: [ 189.762244] ====================================================== [ 189.762432] WARNING: possible circular locking dependency detected [ 189.762441] 6.14.0-rc6-xe+ #6 Tainted: G U [ 189.762450] ------------------------------------------------------ [ 189.762459] kswapd0/119 is trying to acquire lock: [ 189.762467] ffff888110ceb710 (&q->q_usage_counter(io)#26){++++}-{0:0}, at: __submit_bio+0x76/0x230 [ 189.762485] but task is already holding lock: [ 189.762494] ffffffff834c97c0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0xbe/0xb00 [ 189.762507] which lock already depends on the new lock. [ 189.762519] the existing dependency chain (in reverse order) is: [ 189.762529] -> #2 (fs_reclaim){+.+.}-{0:0}: [ 189.762540] fs_reclaim_acquire+0xc5/0x100 [ 189.762548] kmem_cache_alloc_lru_noprof+0x4a/0x480 [ 189.762558] alloc_inode+0xaa/0xe0 [ 189.762566] iget_locked+0x157/0x330 [ 189.762573] kernfs_get_inode+0x1b/0x110 [ 189.762582] kernfs_get_tree+0x1b0/0x2e0 [ 189.762590] sysfs_get_tree+0x1f/0x60 [ 189.762597] vfs_get_tree+0x2a/0xf0 [ 189.762605] path_mount+0x4cd/0xc00 [ 189.762613] __x64_sys_mount+0x119/0x150 [ 189.762621] x64_sys_call+0x14f2/0x2310 [ 189.762630] do_syscall_64+0x91/0x180 [ 189.762637] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 189.762647] -> #1 (&root->kernfs_rwsem){++++}-{3:3}: [ 189.762659] down_write+0x3e/0xf0 [ 189.762667] kernfs_remove+0x32/0x60 [ 189.762676] sysfs_remove_dir+0x4f/0x60 [ 189.762685] __kobject_del+0x33/0xa0 [ 189.762709] kobject_del+0x13/0x30 [ 189.762716] elv_unregister_queue+0x52/0x80 [ 189.762725] elevator_switch+0x68/0x360 [ 189.762733] elv_iosched_store+0x14b/0x1b0 [ 189.762756] queue_attr_store+0x181/0x1e0 [ 189.762765] sysfs_kf_write+0x49/0x80 [ 189.762773] kernfs_fop_write_iter+0x17d/0x250 [ 189.762781] vfs_write+0x281/0x540 [ 189.762790] ksys_write+0x72/0xf0 [ 189.762798] __x64_sys_write+0x19/0x30 [ 189.762807] x64_sys_call+0x2a3/0x2310 [ 189.762815] do_syscall_64+0x91/0x180 [ 189.762823] 
entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 189.762833] -> #0 (&q->q_usage_counter(io)#26){++++}-{0:0}: [ 189.762845] __lock_acquire+0x1525/0x2760 [ 189.762854] lock_acquire+0xca/0x310 [ 189.762861] blk_mq_submit_bio+0x8a2/0xba0 [ 189.762870] __submit_bio+0x76/0x230 [ 189.762878] submit_bio_noacct_nocheck+0x323/0x430 [ 189.762888] submit_bio_noacct+0x2cc/0x620 [ 189.762896] submit_bio+0x38/0x110 [ 189.762904] __swap_writepage+0xf5/0x380 [ 189.762912] swap_writepage+0x3c7/0x600 [ 189.762920] shmem_writepage+0x3da/0x4f0 [ 189.762929] pageout+0x13f/0x310 [ 189.762937] shrink_folio_list+0x61c/0xf60 [ 189.763261] evict_folios+0x378/0xcd0 [ 189.763584] try_to_shrink_lruvec+0x1b0/0x360 [ 189.763946] shrink_one+0x10e/0x200 [ 189.764266] shrink_node+0xc02/0x1490 [ 189.764586] balance_pgdat+0x563/0xb00 [ 189.764934] kswapd+0x1e8/0x430 [ 189.765249] kthread+0x10b/0x260 [ 189.765559] ret_from_fork+0x44/0x70 [ 189.765889] ret_from_fork_asm+0x1a/0x30 [ 189.766198] other info that might help us debug this: [ 189.767089] Chain exists of: &q->q_usage_counter(io)#26 --> &root->kernfs_rwsem --> fs_reclaim [ 189.767971] Possible unsafe locking scenario: [ 189.768555] CPU0 CPU1 [ 189.768849] ---- ---- [ 189.769136] lock(fs_reclaim); [ 189.769421] lock(&root->kernfs_rwsem); [ 189.769714] lock(fs_reclaim); [ 189.770016] rlock(&q->q_usage_counter(io)#26); [ 189.770305] *** DEADLOCK *** [ 189.771167] 1 lock held by kswapd0/119: [ 189.771453] #0: ffffffff834c97c0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0xbe/0xb00 [ 189.771770] stack backtrace: [ 189.772351] CPU: 4 UID: 0 PID: 119 Comm: kswapd0 Tainted: G U 6.14.0-rc6-xe+ #6 [ 189.772353] Tainted: [U]=USER [ 189.772354] Hardware name: ASUS System Product Name/PRIME B560M-A AC, BIOS 2001 02/01/2023 [ 189.772354] Call Trace: [ 189.772355] [ 189.772356] dump_stack_lvl+0x6e/0xa0 [ 189.772359] dump_stack+0x10/0x18 [ 189.772360] print_circular_bug.cold+0x17a/0x1b7 [ 189.772363] check_noncircular+0x13a/0x150 [ 189.772365] ? 
__pfx_stack_trace_consume_entry+0x10/0x10 [ 189.772368] __lock_acquire+0x1525/0x2760 [ 189.772368] ? ret_from_fork_asm+0x1a/0x30 [ 189.772371] lock_acquire+0xca/0x310 [ 189.772372] ? __submit_bio+0x76/0x230 [ 189.772375] ? lock_release+0xd5/0x2c0 [ 189.772376] blk_mq_submit_bio+0x8a2/0xba0 [ 189.772378] ? __submit_bio+0x76/0x230 [ 189.772380] __submit_bio+0x76/0x230 [ 189.772382] ? trace_hardirqs_on+0x1e/0xe0 [ 189.772384] submit_bio_noacct_nocheck+0x323/0x430 [ 189.772386] ? submit_bio_noacct_nocheck+0x323/0x430 [ 189.772387] ? __might_sleep+0x58/0xa0 [ 189.772390] submit_bio_noacct+0x2cc/0x620 [ 189.772391] ? count_memcg_events+0x68/0x90 [ 189.772393] submit_bio+0x38/0x110 [ 189.772395] __swap_writepage+0xf5/0x380 [ 189.772396] swap_writepage+0x3c7/0x600 [ 189.772397] shmem_writepage+0x3da/0x4f0 [ 189.772401] pageout+0x13f/0x310 [ 189.772406] shrink_folio_list+0x61c/0xf60 [ 189.772409] ? isolate_folios+0xe80/0x16b0 [ 189.772410] ? mark_held_locks+0x46/0x90 [ 189.772412] evict_folios+0x378/0xcd0 [ 189.772414] ? evict_folios+0x34a/0xcd0 [ 189.772415] ? lock_is_held_type+0xa3/0x130 [ 189.772417] try_to_shrink_lruvec+0x1b0/0x360 [ 189.772420] shrink_one+0x10e/0x200 [ 189.772421] shrink_node+0xc02/0x1490 [ 189.772423] ? shrink_node+0xa08/0x1490 [ 189.772424] ? shrink_node+0xbd8/0x1490 [ 189.772425] ? mem_cgroup_iter+0x366/0x480 [ 189.772427] balance_pgdat+0x563/0xb00 [ 189.772428] ? balance_pgdat+0x563/0xb00 [ 189.772430] ? trace_hardirqs_on+0x1e/0xe0 [ 189.772431] ? finish_task_switch.isra.0+0xcb/0x330 [ 189.772433] ? __switch_to_asm+0x33/0x70 [ 189.772437] kswapd+0x1e8/0x430 [ 189.772438] ? __pfx_autoremove_wake_function+0x10/0x10 [ 189.772440] ? __pfx_kswapd+0x10/0x10 [ 189.772441] kthread+0x10b/0x260 [ 189.772443] ? __pfx_kthread+0x10/0x10 [ 189.772444] ret_from_fork+0x44/0x70 [ 189.772446] ? 
__pfx_kthread+0x10/0x10 [ 189.772447] ret_from_fork_asm+0x1a/0x30 [ 189.772450] [2]: [ 8.760253] ====================================================== [ 8.760254] WARNING: possible circular locking dependency detected [ 8.760255] 6.14.0-rc6-xe+ #7 Tainted: G U [ 8.760256] ------------------------------------------------------ [ 8.760257] (udev-worker)/674 is trying to acquire lock: [ 8.760259] ffff888100e39148 (&root->kernfs_rwsem){++++}-{3:3}, at: kernfs_remove+0x32/0x60 [ 8.760265] but task is already holding lock: [ 8.760266] ffff888110dc7680 (&q->q_usage_counter(io)#27){++++}-{0:0}, at: blk_mq_freeze_queue_nomemsave+0x12/0x30 [ 8.760272] which lock already depends on the new lock. [ 8.760272] the existing dependency chain (in reverse order) is: [ 8.760273] -> #2 (&q->q_usage_counter(io)#27){++++}-{0:0}: [ 8.760276] blk_alloc_queue+0x30a/0x350 [ 8.760279] blk_mq_alloc_queue+0x6b/0xe0 [ 8.760281] scsi_alloc_sdev+0x276/0x3c0 [ 8.760284] scsi_probe_and_add_lun+0x22a/0x440 [ 8.760286] __scsi_scan_target+0x109/0x230 [ 8.760288] scsi_scan_channel+0x65/0xc0 [ 8.760290] scsi_scan_host_selected+0xff/0x140 [ 8.760292] do_scsi_scan_host+0xa7/0xc0 [ 8.760293] do_scan_async+0x1c/0x160 [ 8.760295] async_run_entry_fn+0x32/0x150 [ 8.760299] process_one_work+0x224/0x5f0 [ 8.760302] worker_thread+0x1d4/0x3e0 [ 8.760304] kthread+0x10b/0x260 [ 8.760306] ret_from_fork+0x44/0x70 [ 8.760309] ret_from_fork_asm+0x1a/0x30 [ 8.760312] -> #1 (fs_reclaim){+.+.}-{0:0}: [ 8.760315] fs_reclaim_acquire+0xc5/0x100 [ 8.760317] kmem_cache_alloc_lru_noprof+0x4a/0x480 [ 8.760319] alloc_inode+0xaa/0xe0 [ 8.760322] iget_locked+0x157/0x330 [ 8.760323] kernfs_get_inode+0x1b/0x110 [ 8.760325] kernfs_get_tree+0x1b0/0x2e0 [ 8.760327] sysfs_get_tree+0x1f/0x60 [ 8.760329] vfs_get_tree+0x2a/0xf0 [ 8.760332] path_mount+0x4cd/0xc00 [ 8.760334] __x64_sys_mount+0x119/0x150 [ 8.760336] x64_sys_call+0x14f2/0x2310 [ 8.760338] do_syscall_64+0x91/0x180 [ 8.760340] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 8.760342] 
-> #0 (&root->kernfs_rwsem){++++}-{3:3}: [ 8.760345] __lock_acquire+0x1525/0x2760 [ 8.760347] lock_acquire+0xca/0x310 [ 8.760348] down_write+0x3e/0xf0 [ 8.760350] kernfs_remove+0x32/0x60 [ 8.760351] sysfs_remove_dir+0x4f/0x60 [ 8.760353] __kobject_del+0x33/0xa0 [ 8.760355] kobject_del+0x13/0x30 [ 8.760356] elv_unregister_queue+0x52/0x80 [ 8.760358] elevator_switch+0x68/0x360 [ 8.760360] elv_iosched_store+0x14b/0x1b0 [ 8.760362] queue_attr_store+0x181/0x1e0 [ 8.760364] sysfs_kf_write+0x49/0x80 [ 8.760366] kernfs_fop_write_iter+0x17d/0x250 [ 8.760367] vfs_write+0x281/0x540 [ 8.760370] ksys_write+0x72/0xf0 [ 8.760372] __x64_sys_write+0x19/0x30 [ 8.760374] x64_sys_call+0x2a3/0x2310 [ 8.760376] do_syscall_64+0x91/0x180 [ 8.760377] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 8.760380] other info that might help us debug this: [ 8.760380] Chain exists of: &root->kernfs_rwsem --> fs_reclaim --> &q->q_usage_counter(io)#27 [ 8.760384] Possible unsafe locking scenario: [ 8.760384] CPU0 CPU1 [ 8.760385] ---- ---- [ 8.760385] lock(&q->q_usage_counter(io)#27); [ 8.760387] lock(fs_reclaim); [ 8.760388] lock(&q->q_usage_counter(io)#27); [ 8.760390] lock(&root->kernfs_rwsem); [ 8.760391] *** DEADLOCK *** [ 8.760391] 6 locks held by (udev-worker)/674: [ 8.760392] #0: ffff8881209ac420 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0x72/0xf0 [ 8.760398] #1: ffff88810c80f488 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x136/0x250 [ 8.760402] #2: ffff888125d1d330 (kn->active#101){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x13f/0x250 [ 8.760406] #3: ffff888110dc7bb0 (&q->sysfs_lock){+.+.}-{3:3}, at: queue_attr_store+0x148/0x1e0 [ 8.760411] #4: ffff888110dc7680 (&q->q_usage_counter(io)#27){++++}-{0:0}, at: blk_mq_freeze_queue_nomemsave+0x12/0x30 [ 8.760416] #5: ffff888110dc76b8 (&q->q_usage_counter(queue)#27){++++}-{0:0}, at: blk_mq_freeze_queue_nomemsave+0x12/0x30 [ 8.760421] stack backtrace: [ 8.760422] CPU: 7 UID: 0 PID: 674 Comm: (udev-worker) Tainted: G U 6.14.0-rc6-xe+ #7 [ 
8.760424] Tainted: [U]=USER [ 8.760425] Hardware name: ASUS System Product Name/PRIME B560M-A AC, BIOS 2001 02/01/2023 [ 8.760426] Call Trace: [ 8.760427] [ 8.760428] dump_stack_lvl+0x6e/0xa0 [ 8.760431] dump_stack+0x10/0x18 [ 8.760433] print_circular_bug.cold+0x17a/0x1b7 [ 8.760437] check_noncircular+0x13a/0x150 [ 8.760441] ? save_trace+0x54/0x360 [ 8.760445] __lock_acquire+0x1525/0x2760 [ 8.760446] ? irqentry_exit+0x3a/0xb0 [ 8.760448] ? sysvec_apic_timer_interrupt+0x57/0xc0 [ 8.760452] lock_acquire+0xca/0x310 [ 8.760453] ? kernfs_remove+0x32/0x60 [ 8.760457] down_write+0x3e/0xf0 [ 8.760459] ? kernfs_remove+0x32/0x60 [ 8.760460] kernfs_remove+0x32/0x60 [ 8.760462] sysfs_remove_dir+0x4f/0x60 [ 8.760464] __kobject_del+0x33/0xa0 [ 8.760466] kobject_del+0x13/0x30 [ 8.760467] elv_unregister_queue+0x52/0x80 [ 8.760470] elevator_switch+0x68/0x360 [ 8.760472] elv_iosched_store+0x14b/0x1b0 [ 8.760475] queue_attr_store+0x181/0x1e0 [ 8.760479] ? lock_acquire+0xca/0x310 [ 8.760480] ? kernfs_fop_write_iter+0x13f/0x250 [ 8.760482] ? lock_is_held_type+0xa3/0x130 [ 8.760485] sysfs_kf_write+0x49/0x80 [ 8.760487] kernfs_fop_write_iter+0x17d/0x250 [ 8.760489] vfs_write+0x281/0x540 [ 8.760494] ksys_write+0x72/0xf0 [ 8.760497] __x64_sys_write+0x19/0x30 [ 8.760499] x64_sys_call+0x2a3/0x2310 [ 8.760502] do_syscall_64+0x91/0x180 [ 8.760504] ? trace_hardirqs_off+0x5d/0xe0 [ 8.760506] ? handle_softirqs+0x479/0x4d0 [ 8.760508] ? hrtimer_interrupt+0x13f/0x280 [ 8.760511] ? irqentry_exit_to_user_mode+0x8b/0x260 [ 8.760513] ? clear_bhb_loop+0x15/0x70 [ 8.760515] ? clear_bhb_loop+0x15/0x70 [ 8.760516] ? 
clear_bhb_loop+0x15/0x70 [ 8.760518] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 8.760520] RIP: 0033:0x7aa3bf2f5504 [ 8.760522] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d c5 8b 10 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 [ 8.760523] RSP: 002b:00007ffc1e3697d8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ 8.760526] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007aa3bf2f5504 [ 8.760527] RDX: 0000000000000003 RSI: 00007ffc1e369ae0 RDI: 000000000000001c [ 8.760528] RBP: 00007ffc1e369800 R08: 00007aa3bf3f51c8 R09: 00007ffc1e3698b0 [ 8.760528] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000003 [ 8.760529] R13: 00007ffc1e369ae0 R14: 0000613ccf21f2f0 R15: 00007aa3bf3f4e80 [ 8.760533] v2: - Update a code comment to increase readability (Ming Lei). Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Ming Lei Signed-off-by: Thomas Hellström Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250318095548.5187-1-thomas.hellstrom@linux.intel.com Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 362d0a55b07a..4623de79effa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -456,6 +456,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", &q->q_lock_cls_key, 0); + /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). 
*/ + fs_reclaim_acquire(GFP_KERNEL); + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); + fs_reclaim_release(GFP_KERNEL); + q->nr_requests = BLKDEV_DEFAULT_RQ; return q; -- cgit v1.2.3 From e1a0202c6bfda24002a3ae2115154fa90104c649 Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Mon, 17 Mar 2025 10:29:24 +0800 Subject: blk-cgroup: improve policy registration error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch improves the error codes returned by blkcg_policy_register(). 1. Move the validation check for cpd/pd_alloc_fn and cpd/pd_free_fn function pairs to the start of blkcg_policy_register(). This ensures we immediately return -EINVAL if the function pairs are not correctly provided, rather than returning -ENOSPC after locking and unlocking mutexes unnecessarily. Those locks should not cause any contention problems, as an error in policy registration is a super cold path. 2. Return -ENOMEM when cpd_alloc_fn() fails. Co-authored-by: Wen Tao Signed-off-by: Wen Tao Signed-off-by: Chen Linxuan Reviewed-by: Michal Koutný Acked-by: Tejun Heo Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/3E333A73B6B6DFC0+20250317022924.150907-1-chenlinxuan@uniontech.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9ed93d91d754..2609f7294427 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1727,27 +1727,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; + /* + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy + * without pd_alloc_fn/pd_free_fn can't be activated. 
+ */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ - ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); + ret = -ENOSPC; goto err_unlock; } - /* - * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy - * without pd_alloc_fn/pd_free_fn can't be activated. - */ - if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) - goto err_unlock; - /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; @@ -1758,8 +1758,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) + if (!cpd) { + ret = -ENOMEM; goto err_free_cpds; + } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; -- cgit v1.2.3 From 89ed5fa3b5419f04452051fbcb6d3e5b801cdb1b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 19 Mar 2025 16:23:45 +0530 Subject: block: release q->elevator_lock in ioc_qos_write The ioc_qos_write method acquires q->elevator_lock to protect updates to blk-wbt parameters. Once these updates are complete, the lock should be released before returning from ioc_qos_write. However, in one code path, the release of q->elevator_lock was mistakenly omitted, potentially leading to a lock leak. This commit fixes the issue by ensuring that q->elevator_lock is properly released in all return paths of ioc_qos_write. 
Fixes: 245618f8e45f ("block: protect wbt_lat_usec using q->elevator_lock") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202503171650.cc082b66-lkp@intel.com Signed-off-by: Nilay Shroff Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250319105518.468941-2-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 38e7bf3c3b4f..56e6fb51316d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3348,6 +3348,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); blk_mq_unfreeze_queue(disk->queue, memflags); blkg_conf_exit(&ctx); -- cgit v1.2.3 From 9730763f4756e32520cb86778331465e8d063a8f Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 19 Mar 2025 16:23:46 +0530 Subject: block: correct locking order for protecting blk-wbt parameters The commit '245618f8e45f ("block: protect wbt_lat_usec using q->elevator_lock")' introduced q->elevator_lock to protect updates to blk-wbt parameters when writing to the sysfs attribute wbt_lat_usec and the cgroup attribute io.cost.qos. However, both these attributes also acquire q->rq_qos_mutex, leading to the following lockdep warning: ====================================================== WARNING: possible circular locking dependency detected 6.14.0-rc5+ #138 Not tainted ------------------------------------------------------ bash/5902 is trying to acquire lock: c000000085d495a0 (&q->rq_qos_mutex){+.+.}-{4:4}, at: wbt_init+0x164/0x238 but task is already holding lock: c000000085d498c8 (&q->elevator_lock){+.+.}-{4:4}, at: queue_wb_lat_store+0xb0/0x20c which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&q->elevator_lock){+.+.}-{4:4}: __mutex_lock+0xf0/0xa58 ioc_qos_write+0x16c/0x85c cgroup_file_write+0xc4/0x32c kernfs_fop_write_iter+0x1b8/0x29c vfs_write+0x410/0x584 ksys_write+0x84/0x140 system_call_exception+0x134/0x360 system_call_vectored_common+0x15c/0x2ec -> #0 (&q->rq_qos_mutex){+.+.}-{4:4}: __lock_acquire+0x1b6c/0x2ae0 lock_acquire+0x140/0x430 __mutex_lock+0xf0/0xa58 wbt_init+0x164/0x238 queue_wb_lat_store+0x1dc/0x20c queue_attr_store+0x12c/0x164 sysfs_kf_write+0x6c/0xb0 kernfs_fop_write_iter+0x1b8/0x29c vfs_write+0x410/0x584 ksys_write+0x84/0x140 system_call_exception+0x134/0x360 system_call_vectored_common+0x15c/0x2ec other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&q->elevator_lock); lock(&q->rq_qos_mutex); lock(&q->elevator_lock); lock(&q->rq_qos_mutex); *** DEADLOCK *** 6 locks held by bash/5902: #0: c000000051122400 (sb_writers#3){.+.+}-{0:0}, at: ksys_write+0x84/0x140 #1: c00000007383f088 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0x174/0x29c #2: c000000008550428 (kn->active#182){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x180/0x29c #3: c000000085d493a8 (&q->q_usage_counter(io)#5){++++}-{0:0}, at: blk_mq_freeze_queue_nomemsave+0x28/0x40 #4: c000000085d493e0 (&q->q_usage_counter(queue)#5){++++}-{0:0}, at: blk_mq_freeze_queue_nomemsave+0x28/0x40 #5: c000000085d498c8 (&q->elevator_lock){+.+.}-{4:4}, at: queue_wb_lat_store+0xb0/0x20c stack backtrace: CPU: 17 UID: 0 PID: 5902 Comm: bash Kdump: loaded Not tainted 6.14.0-rc5+ #138 Hardware name: IBM,9043-MRX POWER10 (architected) 0x800200 0xf000006 of:IBM,FW1060.00 (NM1060_028) hv:phyp pSeries Call Trace: [c0000000721ef590] [c00000000118f8a8] dump_stack_lvl+0x108/0x18c (unreliable) [c0000000721ef5c0] [c00000000022563c] print_circular_bug+0x448/0x604 [c0000000721ef670] [c000000000225a44] check_noncircular+0x24c/0x26c [c0000000721ef740] [c00000000022bf28] __lock_acquire+0x1b6c/0x2ae0 
[c0000000721ef870] [c000000000229240] lock_acquire+0x140/0x430 [c0000000721ef970] [c0000000011cfbec] __mutex_lock+0xf0/0xa58 [c0000000721efaa0] [c00000000096c46c] wbt_init+0x164/0x238 [c0000000721efaf0] [c0000000008f8cd8] queue_wb_lat_store+0x1dc/0x20c [c0000000721efb50] [c0000000008f8fa0] queue_attr_store+0x12c/0x164 [c0000000721efc60] [c0000000007c11cc] sysfs_kf_write+0x6c/0xb0 [c0000000721efca0] [c0000000007bfa4c] kernfs_fop_write_iter+0x1b8/0x29c [c0000000721efcf0] [c0000000006a281c] vfs_write+0x410/0x584 [c0000000721efdc0] [c0000000006a2cc8] ksys_write+0x84/0x140 [c0000000721efe10] [c000000000031b64] system_call_exception+0x134/0x360 [c0000000721efe50] [c00000000000cedc] system_call_vectored_common+0x15c/0x2ec From the above log it's apparent that the method which writes to the sysfs attribute wbt_lat_usec acquires q->elevator_lock first, and then acquires q->rq_qos_mutex. However, the other method, which writes to io.cost.qos, acquires q->rq_qos_mutex first, and then acquires q->elevator_lock. So this could potentially cause a deadlock. A closer look at ioc_qos_write shows that correcting the lock order is non-trivial because q->rq_qos_mutex is acquired in blkg_conf_open_bdev and released in blkg_conf_exit. The function blkg_conf_open_bdev is responsible for parsing user input and finding the corresponding block device (bdev) from the user provided major:minor number. Since we do not know the bdev until blkg_conf_open_bdev completes, we cannot simply move q->elevator_lock acquisition before blkg_conf_open_bdev. So to address this, we introduce new helpers blkg_conf_open_bdev_frozen and blkg_conf_exit_frozen which are just wrappers around blkg_conf_open_bdev and blkg_conf_exit respectively. The helper blkg_conf_open_bdev_frozen is similar to blkg_conf_open_bdev, but additionally freezes the queue, acquires q->elevator_lock and ensures the correct locking order is followed between q->elevator_lock and q->rq_qos_mutex. 
Similarly another helper blkg_conf_exit_frozen in addition to unfreezing the queue ensures that we release the locks in correct order. By using these helpers, now we maintain the same locking order in all code paths where we update blk-wbt parameters. Fixes: 245618f8e45f ("block: protect wbt_lat_usec using q->elevator_lock") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202503171650.cc082b66-lkp@intel.com Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250319105518.468941-3-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-cgroup.h | 2 ++ block/blk-iocost.c | 18 +++++------------- 3 files changed, 58 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2609f7294427..88ff27697380 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -815,6 +815,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) ctx->bdev = bdev; return 0; } +/* + * Similar to blkg_conf_open_bdev, but additionally freezes the queue, + * acquires q->elevator_lock, and ensures the correct locking order + * between q->elevator_lock and q->rq_qos_mutex. + * + * This function returns negative error on failure. On success it returns + * memflags which must be saved and later passed to blkg_conf_exit_frozen + * for restoring the memalloc scope. + */ +unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) +{ + int ret; + unsigned long memflags; + + if (ctx->bdev) + return -EINVAL; + + ret = blkg_conf_open_bdev(ctx); + if (ret < 0) + return ret; + /* + * At this point, we haven’t started protecting anything related to QoS, + * so we release q->rq_qos_mutex here, which was first acquired in blkg_ + * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing + * the queue and acquiring q->elevator_lock to maintain the correct + * locking order. 
+ */ + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + + memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); + mutex_lock(&ctx->bdev->bd_queue->elevator_lock); + mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); + + return memflags; +} /** * blkg_conf_prep - parse and prepare for per-blkg config update @@ -971,6 +1006,22 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_exit); +/* + * Similar to blkg_conf_exit, but also unfreezes the queue and releases + * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen + * is used to open the bdev. + */ +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) +{ + if (ctx->bdev) { + struct request_queue *q = ctx->bdev->bd_queue; + + blkg_conf_exit(ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + } +} + static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2c4663bd993a..81868ad86330 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -219,9 +219,11 @@ struct blkg_conf_ctx { void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); +unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 56e6fb51316d..3724b0308cd8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3223,13 +3223,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; - unsigned int memflags; + unsigned long memflags; int ret; blkg_conf_init(&ctx, 
input); - ret = blkg_conf_open_bdev(&ctx); - if (ret) + memflags = blkg_conf_open_bdev_frozen(&ctx); + if (IS_ERR_VALUE(memflags)) goto err; body = ctx.body; @@ -3247,8 +3247,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } - memflags = blk_mq_freeze_queue(disk->queue); - mutex_lock(&disk->queue->elevator_lock); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3348,21 +3346,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); - mutex_unlock(&disk->queue->elevator_lock); - blk_mq_unfreeze_queue(disk->queue, memflags); - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return nbytes; einval: spin_unlock_irq(&ioc->lock); - blk_mq_unquiesce_queue(disk->queue); - mutex_unlock(&disk->queue->elevator_lock); - blk_mq_unfreeze_queue(disk->queue, memflags); - ret = -EINVAL; err: - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return ret; } -- cgit v1.2.3 From 03c90afb21b45edb87533fa6f11c5f914d26298b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Mar 2025 14:36:33 -0600 Subject: block/blk-iocost: ensure 'ret' is set on error In case blkg_conf_open_bdev_frozen() fails, ioc_qos_write() jumps to the error path without assigning a value to 'ret'. Ensure that it inherits the error from the passed back error value. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503200454.QWpwKeJu-lkp@intel.com/ Fixes: 9730763f4756 ("block: correct locking order for protecting blk-wbt parameters") Signed-off-by: Jens Axboe --- block/blk-iocost.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 3724b0308cd8..f798f2c9ca35 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3229,8 +3229,10 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blkg_conf_init(&ctx, input); memflags = blkg_conf_open_bdev_frozen(&ctx); - if (IS_ERR_VALUE(memflags)) + if (IS_ERR_VALUE(memflags)) { + ret = memflags; goto err; + } body = ctx.body; disk = ctx.bdev->bd_disk; -- cgit v1.2.3