summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:57:21 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:57:21 -0800
commit0c00ed308d0559fc216be0442a3df124e9e13533 (patch)
treea41c8509b8543ce8681d0aa9c06a9f94c2b6e458 /include
parent591beb0e3a03258ef9c01893a5209845799a7c33 (diff)
parent72f4d6fca699a1e35b39c5e5dacac2926d254135 (diff)
Merge tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe: - Support for batch request processing for ublk, improving the efficiency of the kernel/ublk server communication. This can yield nice 7-12% performance improvements - Support for integrity data for ublk - Various other ublk improvements and additions, including a ton of selftests additions and updated - Move the handling of blk-crypto software fallback from below the block layer to above it. This reduces the complexity of dealing with bio splitting - Series fixing a number of potential deadlocks in blk-mq related to the queue usage counter and writeback throttling and rq-qos debugfs handling - Add an async_depth queue attribute, to resolve a performance regression that's been around for a qhilw related to the scheduler depth handling - Only use task_work for IOPOLL completions on NVMe, if it is necessary to do so. An earlier fix for an issue resulted in all these completions being punted to task_work, to guarantee that completions were only run for a given io_uring ring when it was local to that ring. With the new changes, we can detect if it's necessary to use task_work or not, and avoid it if possible. - rnbd fixes: - Fix refcount underflow in device unmap path - Handle PREFLUSH and NOUNMAP flags properly in protocol - Fix server-side bi_size for special IOs - Zero response buffer before use - Fix trace format for flags - Add .release to rnbd_dev_ktype - MD pull requests via Yu Kuai - Fix raid5_run() to return error when log_init() fails - Fix IO hang with degraded array with llbitmap - Fix percpu_ref not resurrected on suspend timeout in llbitmap - Fix GPF in write_page caused by resize race - Fix NULL pointer dereference in process_metadata_update - Fix hang when stopping arrays with metadata through dm-raid - Fix any_working flag handling in raid10_sync_request - Refactor sync/recovery code path, improve error handling for badblocks, and remove unused recovery_disabled field - Consolidate mddev boolean fields into mddev_flags - Use mempool to allocate stripe_request_ctx and make sure max_sectors is not less than io_opt in raid5 - Fix return value of mddev_trylock - Fix memory leak in raid1_run() - Add Li Nan as mdraid reviewer - Move phys_vec definitions to the kernel types, mostly in preparation for some VFIO and RDMA changes - Improve the speed for secure erase for some devices - Various little rust updates - Various other minor fixes, improvements, and cleanups * tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (162 commits) blk-mq: ABI/sysfs-block: fix docs build warnings selftests: ublk: organize test directories by test ID block: decouple secure erase size limit from discard size limit block: remove redundant kill_bdev() call in set_blocksize() blk-mq: add documentation for new queue attribute async_dpeth block, bfq: convert to use request_queue->async_depth mq-deadline: covert to use request_queue->async_depth kyber: covert to use request_queue->async_depth blk-mq: add a new queue sysfs attribute async_depth blk-mq: factor out a helper blk_mq_limit_depth() blk-mq-sched: unify elevators checking for async requests block: convert nr_requests to unsigned int block: don't use strcpy to copy blockdev name blk-mq-debugfs: warn about possible deadlock blk-mq-debugfs: add missing debugfs_mutex in blk_mq_debugfs_register_hctxs() blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static blk-rq-qos: fix possible debugfs_mutex deadlock blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/bio.h6
-rw-r--r--include/linux/blk-crypto.h32
-rw-r--r--include/linux/blk-integrity.h6
-rw-r--r--include/linux/blk-mq-dma.h2
-rw-r--r--include/linux/blk-mq.h4
-rw-r--r--include/linux/blk_types.h4
-rw-r--r--include/linux/blkdev.h24
-rw-r--r--include/linux/types.h5
-rw-r--r--include/uapi/linux/ublk_cmd.h121
9 files changed, 181 insertions, 23 deletions
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6156f2d66d4a..7a2f3fc5be57 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -256,12 +256,6 @@ static inline struct folio *bio_first_folio_all(struct bio *bio)
return page_folio(bio_first_page_all(bio));
}
-static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
-{
- WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
- return &bio->bi_io_vec[bio->bi_vcnt - 1];
-}
-
/**
* struct folio_iter - State for iterating all folios in a bio.
* @folio: The current folio we're iterating. NULL after the last folio.
diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h
index 58b0c5254a67..f7c3cb4a342f 100644
--- a/include/linux/blk-crypto.h
+++ b/include/linux/blk-crypto.h
@@ -132,6 +132,11 @@ static inline bool bio_has_crypt_ctx(struct bio *bio)
return bio->bi_crypt_context;
}
+static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio)
+{
+ return bio->bi_crypt_context;
+}
+
void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
gfp_t gfp_mask);
@@ -169,8 +174,35 @@ static inline bool bio_has_crypt_ctx(struct bio *bio)
return false;
}
+static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio)
+{
+ return NULL;
+}
+
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
+bool __blk_crypto_submit_bio(struct bio *bio);
+
+/**
+ * blk_crypto_submit_bio - Submit a bio that may have a crypto context
+ * @bio: bio to submit
+ *
+ * If @bio has no crypto context, or the crypt context attached to @bio is
+ * supported by the underlying device's inline encryption hardware, just submit
+ * @bio.
+ *
+ * Otherwise, try to perform en/decryption for this bio by falling back to the
+ * kernel crypto API. For encryption this means submitting newly allocated
+ * bios for the encrypted payload while keeping back the source bio until they
+ * complete, while for reads the decryption happens in-place by a hooked in
+ * completion handler.
+ */
+static inline void blk_crypto_submit_bio(struct bio *bio)
+{
+ if (!bio_has_crypt_ctx(bio) || __blk_crypto_submit_bio(bio))
+ submit_bio(bio);
+}
+
int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask);
/**
* bio_crypt_clone - clone bio encryption context
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index a6b84206eb94..c15b1ac62765 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -91,7 +91,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
return bio_integrity_intervals(bi, sectors) * bi->metadata_size;
}
-static inline bool blk_integrity_rq(struct request *rq)
+static inline bool blk_integrity_rq(const struct request *rq)
{
return rq->cmd_flags & REQ_INTEGRITY;
}
@@ -168,9 +168,9 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
{
return 0;
}
-static inline int blk_integrity_rq(struct request *rq)
+static inline bool blk_integrity_rq(const struct request *rq)
{
- return 0;
+ return false;
}
static inline struct bio_vec rq_integrity_vec(struct request *rq)
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index cb88fc791fbd..214c181ff2c9 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -28,7 +28,7 @@ struct blk_dma_iter {
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
struct dma_iova_state *state, struct blk_dma_iter *iter);
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
- struct dma_iova_state *state, struct blk_dma_iter *iter);
+ struct blk_dma_iter *iter);
/**
* blk_rq_dma_map_coalesce - were all segments coalesced?
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index cae9e857aea4..18a2388ba581 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -13,6 +13,7 @@
struct blk_mq_tags;
struct blk_flush_queue;
+struct io_comp_batch;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_DEFAULT_RQ 128
@@ -22,7 +23,8 @@ enum rq_end_io_ret {
RQ_END_IO_FREE,
};
-typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
+typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t,
+ const struct io_comp_batch *);
/*
* request flags */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 5dc061d318a4..19a888a2f104 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -232,6 +232,8 @@ struct bio {
atomic_t __bi_remaining;
+ /* The actual vec list, preserved by bio_reset() */
+ struct bio_vec *bi_io_vec;
struct bvec_iter bi_iter;
union {
@@ -275,8 +277,6 @@ struct bio {
atomic_t __bi_cnt; /* pin count */
- struct bio_vec *bi_io_vec; /* the actual vec list */
-
struct bio_set *bi_pool;
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 72e34acd439c..99ef8cd7673c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -340,14 +340,13 @@ typedef unsigned int __bitwise blk_features_t;
/* skip this queue in blk_mq_(un)quiesce_tagset */
#define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13))
+/* atomic writes enabled */
+#define BLK_FEAT_ATOMIC_WRITES ((__force blk_features_t)(1u << 14))
+
/* undocumented magic for bcache */
#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
((__force blk_features_t)(1u << 15))
-/* atomic writes enabled */
-#define BLK_FEAT_ATOMIC_WRITES \
- ((__force blk_features_t)(1u << 16))
-
/*
* Flags automatically inherited when stacking limits.
*/
@@ -551,7 +550,8 @@ struct request_queue {
/*
* queue settings
*/
- unsigned long nr_requests; /* Max # of requests */
+ unsigned int nr_requests; /* Max # of requests */
+ unsigned int async_depth; /* Max # of async requests */
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct blk_crypto_profile *crypto_profile;
@@ -681,7 +681,7 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_noxmerges(q) \
test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
-#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL))
+#define blk_queue_rot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL)
#define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT)
#define blk_queue_passthrough_stat(q) \
((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH)
@@ -1026,7 +1026,7 @@ extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
-/* Helper to convert REQ_OP_XXX to its string format XXX */
+/* Convert a request operation REQ_OP_name into the string "name" */
extern const char *blk_op_str(enum req_op op);
int blk_status_to_errno(blk_status_t status);
@@ -1044,7 +1044,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
return bdev->bd_queue; /* this is never NULL */
}
-/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
+/* Convert a zone condition BLK_ZONE_COND_name into the string "name" */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
static inline unsigned int bio_zone_no(struct bio *bio)
@@ -1462,9 +1462,14 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev)
return bdev_limits(bdev)->max_wzeroes_unmap_sectors;
}
+static inline bool bdev_rot(struct block_device *bdev)
+{
+ return blk_queue_rot(bdev_get_queue(bdev));
+}
+
static inline bool bdev_nonrot(struct block_device *bdev)
{
- return blk_queue_nonrot(bdev_get_queue(bdev));
+ return !bdev_rot(bdev);
}
static inline bool bdev_synchronous(struct block_device *bdev)
@@ -1822,6 +1827,7 @@ struct io_comp_batch {
struct rq_list req_list;
bool need_ts;
void (*complete)(struct io_comp_batch *);
+ void *poll_ctx;
};
static inline bool blk_atomic_write_start_sect_aligned(sector_t sector,
diff --git a/include/linux/types.h b/include/linux/types.h
index d4437e9c452c..d673747eda8a 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -171,6 +171,11 @@ typedef u64 phys_addr_t;
typedef u32 phys_addr_t;
#endif
+struct phys_vec {
+ phys_addr_t paddr;
+ size_t len;
+};
+
typedef phys_addr_t resource_size_t;
/*
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index ec77dabba45b..a88876756805 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -55,7 +55,8 @@
_IOWR('u', 0x15, struct ublksrv_ctrl_cmd)
#define UBLK_U_CMD_QUIESCE_DEV \
_IOWR('u', 0x16, struct ublksrv_ctrl_cmd)
-
+#define UBLK_U_CMD_TRY_STOP_DEV \
+ _IOWR('u', 0x17, struct ublksrv_ctrl_cmd)
/*
* 64bits are enough now, and it should be easy to extend in case of
* running out of feature flags
@@ -103,6 +104,30 @@
#define UBLK_U_IO_UNREGISTER_IO_BUF \
_IOWR('u', 0x24, struct ublksrv_io_cmd)
+/*
+ * return 0 if the command is run successfully, otherwise failure code
+ * is returned
+ */
+#define UBLK_U_IO_PREP_IO_CMDS \
+ _IOWR('u', 0x25, struct ublk_batch_io)
+/*
+ * If failure code is returned, nothing in the command buffer is handled.
+ * Otherwise, the returned value means how many bytes in command buffer
+ * are handled actually, then number of handled IOs can be calculated with
+ * `elem_bytes` for each IO. IOs in the remained bytes are not committed,
+ * userspace has to check return value for dealing with partial committing
+ * correctly.
+ */
+#define UBLK_U_IO_COMMIT_IO_CMDS \
+ _IOWR('u', 0x26, struct ublk_batch_io)
+
+/*
+ * Fetch io commands to provided buffer in multishot style,
+ * `IORING_URING_CMD_MULTISHOT` is required for this command.
+ */
+#define UBLK_U_IO_FETCH_IO_CMDS \
+ _IOWR('u', 0x27, struct ublk_batch_io)
+
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
#define UBLK_IO_RES_NEED_GET_DATA 1
@@ -134,6 +159,10 @@
#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS)
#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
+/* Copy to/from request integrity buffer instead of data buffer */
+#define UBLK_INTEGRITY_FLAG_OFF 62
+#define UBLKSRV_IO_INTEGRITY_FLAG (1ULL << UBLK_INTEGRITY_FLAG_OFF)
+
/*
* ublk server can register data buffers for incoming I/O requests with a sparse
* io_uring buffer table. The request buffer can then be used as the data buffer
@@ -311,6 +340,36 @@
*/
#define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14)
+/*
+ * Support the following commands for delivering & committing io command
+ * in batch.
+ *
+ * - UBLK_U_IO_PREP_IO_CMDS
+ * - UBLK_U_IO_COMMIT_IO_CMDS
+ * - UBLK_U_IO_FETCH_IO_CMDS
+ * - UBLK_U_IO_REGISTER_IO_BUF
+ * - UBLK_U_IO_UNREGISTER_IO_BUF
+ *
+ * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and
+ * UBLK_U_IO_NEED_GET_DATA uring_cmd are not supported for this feature.
+ */
+#define UBLK_F_BATCH_IO (1ULL << 15)
+
+/*
+ * ublk device supports requests with integrity/metadata buffer.
+ * Requires UBLK_F_USER_COPY.
+ */
+#define UBLK_F_INTEGRITY (1ULL << 16)
+
+/*
+ * The device supports the UBLK_CMD_TRY_STOP_DEV command, which
+ * allows stopping the device only if there are no openers.
+ */
+#define UBLK_F_SAFE_STOP_DEV (1ULL << 17)
+
+/* Disable automatic partition scanning when device is started */
+#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@@ -408,6 +467,8 @@ struct ublksrv_ctrl_dev_info {
* passed in.
*/
#define UBLK_IO_F_NEED_REG_BUF (1U << 17)
+/* Request has an integrity data buffer */
+#define UBLK_IO_F_INTEGRITY (1UL << 18)
/*
* io cmd is described by this structure, and stored in share memory, indexed
@@ -525,6 +586,51 @@ struct ublksrv_io_cmd {
};
};
+struct ublk_elem_header {
+ __u16 tag; /* IO tag */
+
+ /*
+ * Buffer index for incoming io command, only valid iff
+ * UBLK_F_AUTO_BUF_REG is set
+ */
+ __u16 buf_index;
+ __s32 result; /* I/O completion result (commit only) */
+};
+
+/*
+ * uring_cmd buffer structure for batch commands
+ *
+ * buffer includes multiple elements, which number is specified by
+ * `nr_elem`. Each element buffer is organized in the following order:
+ *
+ * struct ublk_elem_buffer {
+ * // Mandatory fields (8 bytes)
+ * struct ublk_elem_header header;
+ *
+ * // Optional fields (8 bytes each, included based on flags)
+ *
+ * // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data
+ * // between ublk request and ublk server buffer
+ * __u64 buf_addr;
+ *
+ * // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA)
+ * __u64 zone_lba;
+ * }
+ *
+ * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS`
+ */
+struct ublk_batch_io {
+ __u16 q_id;
+#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0)
+#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1)
+#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2)
+ __u16 flags;
+ __u16 nr_elem;
+ __u8 elem_bytes;
+ __u8 reserved;
+ __u64 reserved2;
+};
+
struct ublk_param_basic {
#define UBLK_ATTR_READ_ONLY (1 << 0)
#define UBLK_ATTR_ROTATIONAL (1 << 1)
@@ -600,6 +706,17 @@ struct ublk_param_segment {
__u8 pad[2];
};
+struct ublk_param_integrity {
+ __u32 flags; /* LBMD_PI_CAP_* from linux/fs.h */
+ __u16 max_integrity_segments; /* 0 means no limit */
+ __u8 interval_exp;
+ __u8 metadata_size; /* UBLK_PARAM_TYPE_INTEGRITY requires nonzero */
+ __u8 pi_offset;
+ __u8 csum_type; /* LBMD_PI_CSUM_* from linux/fs.h */
+ __u8 tag_size;
+ __u8 pad[5];
+};
+
struct ublk_params {
/*
* Total length of parameters, userspace has to set 'len' for both
@@ -614,6 +731,7 @@ struct ublk_params {
#define UBLK_PARAM_TYPE_ZONED (1 << 3)
#define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4)
#define UBLK_PARAM_TYPE_SEGMENT (1 << 5)
+#define UBLK_PARAM_TYPE_INTEGRITY (1 << 6) /* requires UBLK_F_INTEGRITY */
__u32 types; /* types of parameter included */
struct ublk_param_basic basic;
@@ -622,6 +740,7 @@ struct ublk_params {
struct ublk_param_zoned zoned;
struct ublk_param_dma_align dma;
struct ublk_param_segment seg;
+ struct ublk_param_integrity integrity;
};
#endif