summaryrefslogtreecommitdiff
path: root/include/uapi/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:57:21 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:57:21 -0800
commit0c00ed308d0559fc216be0442a3df124e9e13533 (patch)
treea41c8509b8543ce8681d0aa9c06a9f94c2b6e458 /include/uapi/linux
parent591beb0e3a03258ef9c01893a5209845799a7c33 (diff)
parent72f4d6fca699a1e35b39c5e5dacac2926d254135 (diff)
Merge tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe: - Support for batch request processing for ublk, improving the efficiency of the kernel/ublk server communication. This can yield nice 7-12% performance improvements - Support for integrity data for ublk - Various other ublk improvements and additions, including a ton of selftests additions and updated - Move the handling of blk-crypto software fallback from below the block layer to above it. This reduces the complexity of dealing with bio splitting - Series fixing a number of potential deadlocks in blk-mq related to the queue usage counter and writeback throttling and rq-qos debugfs handling - Add an async_depth queue attribute, to resolve a performance regression that's been around for a qhilw related to the scheduler depth handling - Only use task_work for IOPOLL completions on NVMe, if it is necessary to do so. An earlier fix for an issue resulted in all these completions being punted to task_work, to guarantee that completions were only run for a given io_uring ring when it was local to that ring. With the new changes, we can detect if it's necessary to use task_work or not, and avoid it if possible. - rnbd fixes: - Fix refcount underflow in device unmap path - Handle PREFLUSH and NOUNMAP flags properly in protocol - Fix server-side bi_size for special IOs - Zero response buffer before use - Fix trace format for flags - Add .release to rnbd_dev_ktype - MD pull requests via Yu Kuai - Fix raid5_run() to return error when log_init() fails - Fix IO hang with degraded array with llbitmap - Fix percpu_ref not resurrected on suspend timeout in llbitmap - Fix GPF in write_page caused by resize race - Fix NULL pointer dereference in process_metadata_update - Fix hang when stopping arrays with metadata through dm-raid - Fix any_working flag handling in raid10_sync_request - Refactor sync/recovery code path, improve error handling for badblocks, and remove unused recovery_disabled field - Consolidate mddev boolean fields into mddev_flags - Use mempool to allocate stripe_request_ctx and make sure max_sectors is not less than io_opt in raid5 - Fix return value of mddev_trylock - Fix memory leak in raid1_run() - Add Li Nan as mdraid reviewer - Move phys_vec definitions to the kernel types, mostly in preparation for some VFIO and RDMA changes - Improve the speed for secure erase for some devices - Various little rust updates - Various other minor fixes, improvements, and cleanups * tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (162 commits) blk-mq: ABI/sysfs-block: fix docs build warnings selftests: ublk: organize test directories by test ID block: decouple secure erase size limit from discard size limit block: remove redundant kill_bdev() call in set_blocksize() blk-mq: add documentation for new queue attribute async_dpeth block, bfq: convert to use request_queue->async_depth mq-deadline: covert to use request_queue->async_depth kyber: covert to use request_queue->async_depth blk-mq: add a new queue sysfs attribute async_depth blk-mq: factor out a helper blk_mq_limit_depth() blk-mq-sched: unify elevators checking for async requests block: convert nr_requests to unsigned int block: don't use strcpy to copy blockdev name blk-mq-debugfs: warn about possible deadlock blk-mq-debugfs: add missing debugfs_mutex in blk_mq_debugfs_register_hctxs() blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static blk-rq-qos: fix possible debugfs_mutex deadlock blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter ...
Diffstat (limited to 'include/uapi/linux')
-rw-r--r--include/uapi/linux/ublk_cmd.h121
1 files changed, 120 insertions, 1 deletions
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index ec77dabba45b..a88876756805 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -55,7 +55,8 @@
_IOWR('u', 0x15, struct ublksrv_ctrl_cmd)
#define UBLK_U_CMD_QUIESCE_DEV \
_IOWR('u', 0x16, struct ublksrv_ctrl_cmd)
-
+#define UBLK_U_CMD_TRY_STOP_DEV \
+ _IOWR('u', 0x17, struct ublksrv_ctrl_cmd)
/*
* 64bits are enough now, and it should be easy to extend in case of
* running out of feature flags
@@ -103,6 +104,30 @@
#define UBLK_U_IO_UNREGISTER_IO_BUF \
_IOWR('u', 0x24, struct ublksrv_io_cmd)
+/*
+ * return 0 if the command is run successfully, otherwise failure code
+ * is returned
+ */
+#define UBLK_U_IO_PREP_IO_CMDS \
+ _IOWR('u', 0x25, struct ublk_batch_io)
+/*
+ * If failure code is returned, nothing in the command buffer is handled.
+ * Otherwise, the returned value means how many bytes in command buffer
+ * are handled actually, then number of handled IOs can be calculated with
+ * `elem_bytes` for each IO. IOs in the remained bytes are not committed,
+ * userspace has to check return value for dealing with partial committing
+ * correctly.
+ */
+#define UBLK_U_IO_COMMIT_IO_CMDS \
+ _IOWR('u', 0x26, struct ublk_batch_io)
+
+/*
+ * Fetch io commands to provided buffer in multishot style,
+ * `IORING_URING_CMD_MULTISHOT` is required for this command.
+ */
+#define UBLK_U_IO_FETCH_IO_CMDS \
+ _IOWR('u', 0x27, struct ublk_batch_io)
+
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
#define UBLK_IO_RES_NEED_GET_DATA 1
@@ -134,6 +159,10 @@
#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS)
#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
+/* Copy to/from request integrity buffer instead of data buffer */
+#define UBLK_INTEGRITY_FLAG_OFF 62
+#define UBLKSRV_IO_INTEGRITY_FLAG (1ULL << UBLK_INTEGRITY_FLAG_OFF)
+
/*
* ublk server can register data buffers for incoming I/O requests with a sparse
* io_uring buffer table. The request buffer can then be used as the data buffer
@@ -311,6 +340,36 @@
*/
#define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14)
+/*
+ * Support the following commands for delivering & committing io command
+ * in batch.
+ *
+ * - UBLK_U_IO_PREP_IO_CMDS
+ * - UBLK_U_IO_COMMIT_IO_CMDS
+ * - UBLK_U_IO_FETCH_IO_CMDS
+ * - UBLK_U_IO_REGISTER_IO_BUF
+ * - UBLK_U_IO_UNREGISTER_IO_BUF
+ *
+ * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and
+ * UBLK_U_IO_NEED_GET_DATA uring_cmd are not supported for this feature.
+ */
+#define UBLK_F_BATCH_IO (1ULL << 15)
+
+/*
+ * ublk device supports requests with integrity/metadata buffer.
+ * Requires UBLK_F_USER_COPY.
+ */
+#define UBLK_F_INTEGRITY (1ULL << 16)
+
+/*
+ * The device supports the UBLK_CMD_TRY_STOP_DEV command, which
+ * allows stopping the device only if there are no openers.
+ */
+#define UBLK_F_SAFE_STOP_DEV (1ULL << 17)
+
+/* Disable automatic partition scanning when device is started */
+#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@@ -408,6 +467,8 @@ struct ublksrv_ctrl_dev_info {
* passed in.
*/
#define UBLK_IO_F_NEED_REG_BUF (1U << 17)
+/* Request has an integrity data buffer */
+#define UBLK_IO_F_INTEGRITY (1UL << 18)
/*
* io cmd is described by this structure, and stored in share memory, indexed
@@ -525,6 +586,51 @@ struct ublksrv_io_cmd {
};
};
+struct ublk_elem_header {
+ __u16 tag; /* IO tag */
+
+ /*
+ * Buffer index for incoming io command, only valid iff
+ * UBLK_F_AUTO_BUF_REG is set
+ */
+ __u16 buf_index;
+ __s32 result; /* I/O completion result (commit only) */
+};
+
+/*
+ * uring_cmd buffer structure for batch commands
+ *
+ * buffer includes multiple elements, which number is specified by
+ * `nr_elem`. Each element buffer is organized in the following order:
+ *
+ * struct ublk_elem_buffer {
+ * // Mandatory fields (8 bytes)
+ * struct ublk_elem_header header;
+ *
+ * // Optional fields (8 bytes each, included based on flags)
+ *
+ * // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data
+ * // between ublk request and ublk server buffer
+ * __u64 buf_addr;
+ *
+ * // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA)
+ * __u64 zone_lba;
+ * }
+ *
+ * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS`
+ */
+struct ublk_batch_io {
+ __u16 q_id;
+#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0)
+#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1)
+#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2)
+ __u16 flags;
+ __u16 nr_elem;
+ __u8 elem_bytes;
+ __u8 reserved;
+ __u64 reserved2;
+};
+
struct ublk_param_basic {
#define UBLK_ATTR_READ_ONLY (1 << 0)
#define UBLK_ATTR_ROTATIONAL (1 << 1)
@@ -600,6 +706,17 @@ struct ublk_param_segment {
__u8 pad[2];
};
+struct ublk_param_integrity {
+ __u32 flags; /* LBMD_PI_CAP_* from linux/fs.h */
+ __u16 max_integrity_segments; /* 0 means no limit */
+ __u8 interval_exp;
+ __u8 metadata_size; /* UBLK_PARAM_TYPE_INTEGRITY requires nonzero */
+ __u8 pi_offset;
+ __u8 csum_type; /* LBMD_PI_CSUM_* from linux/fs.h */
+ __u8 tag_size;
+ __u8 pad[5];
+};
+
struct ublk_params {
/*
* Total length of parameters, userspace has to set 'len' for both
@@ -614,6 +731,7 @@ struct ublk_params {
#define UBLK_PARAM_TYPE_ZONED (1 << 3)
#define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4)
#define UBLK_PARAM_TYPE_SEGMENT (1 << 5)
+#define UBLK_PARAM_TYPE_INTEGRITY (1 << 6) /* requires UBLK_F_INTEGRITY */
__u32 types; /* types of parameter included */
struct ublk_param_basic basic;
@@ -622,6 +740,7 @@ struct ublk_params {
struct ublk_param_zoned zoned;
struct ublk_param_dma_align dma;
struct ublk_param_segment seg;
+ struct ublk_param_integrity integrity;
};
#endif