From 3c7d76d6128a0fef68e6540754bf85a44a29bb59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Dec 2025 03:25:41 -0700 Subject: io_uring: IOPOLL polling improvements io_uring manages issued and pending IOPOLL read/write requests in a singly linked list. One downside of that is that individual items cannot easily be removed from that list, and as a result, io_uring will only complete request N in that list if requests 0..N-1 are also complete. For homogeneous IO this isn't necessarily an issue, but if different devices are involved in polling in the same ring, or if disparate IO from the same device is being polled for, this can defer completion of some requests unnecessarily. Move to a doubly linked list for iopoll completions instead, making it possible to easily complete whichever requests have been found done by polling. Co-developed-by: Fengnan Chang Link: https://lore.kernel.org/io-uring/20251210085501.84261-1-changfengnan@bytedance.com/ Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e1adb0d20a0a..54fd30abf2b8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -316,7 +316,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ bool poll_multi_queue; - struct io_wq_work_list iopoll_list; + struct list_head iopoll_list; struct io_file_table file_table; struct io_rsrc_data buf_table; @@ -708,7 +708,16 @@ struct io_kiocb { atomic_t refs; bool cancel_seq_set; - struct io_task_work io_task_work; + + /* + * IOPOLL doesn't use task_work, so use the ->iopoll_node list + * entry to manage pending iopoll requests. + */ + union { + struct io_task_work io_task_work; + struct list_head iopoll_node; + }; + union { /* * for polled requests, i.e. IORING_OP_POLL_ADD and async armed -- cgit v1.2.3 From d6406c45f14842019cfaaba19fe2a76ef9fa831c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jan 2026 08:14:45 -0700 Subject: io_uring: track restrictions separately for IORING_OP and IORING_REGISTER It's quite likely that only register opcode restrictions exist, in which case we'd never need to check the normal opcodes. Split ctx->restricted into two separate fields, one for I/O opcodes, and one for register opcodes.
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 ++++++-- io_uring/io_uring.c | 4 ++-- io_uring/register.c | 19 ++++++++++++++----- 3 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 54fd30abf2b8..e4c804f99c30 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -224,7 +224,10 @@ struct io_restriction { DECLARE_BITMAP(sqe_op, IORING_OP_LAST); u8 sqe_flags_allowed; u8 sqe_flags_required; - bool registered; + /* IORING_OP_* restrictions exist */ + bool op_registered; + /* IORING_REGISTER_* restrictions exist */ + bool reg_registered; }; struct io_submit_link { @@ -259,7 +262,8 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int drain_next: 1; - unsigned int restricted: 1; + unsigned int op_restricted: 1; + unsigned int reg_restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; unsigned int has_evfd: 1; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92ed92a44023..2cde22af78a3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2056,7 +2056,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->restricted) + if (!ctx->op_restricted) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -2159,7 +2159,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, io_init_drain(ctx); } } - if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { + if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ diff --git a/io_uring/register.c b/io_uring/register.c index 54ccf164be38..8551f13920dc 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -133,24 +133,31 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, if (res[i].register_op >= IORING_REGISTER_LAST) goto err; __set_bit(res[i].register_op, restrictions->register_op); + restrictions->reg_registered = true; break; case IORING_RESTRICTION_SQE_OP: if (res[i].sqe_op >= IORING_OP_LAST) goto err; __set_bit(res[i].sqe_op, restrictions->sqe_op); + restrictions->op_registered = true; break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: restrictions->sqe_flags_allowed = res[i].sqe_flags; + restrictions->op_registered = true; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: restrictions->sqe_flags_required = res[i].sqe_flags; + restrictions->op_registered = true; break; default: goto err; } } ret = nr_args; - restrictions->registered = true; + if (!nr_args) { + restrictions->op_registered = true; + restrictions->reg_registered = true; + } err: kfree(res); return ret; @@ -166,7 +173,7 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return -EBADFD; /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) + if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered) return -EBUSY; ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); @@ -175,8 +182,10 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); return ret; } - if (ctx->restrictions.registered) - ctx->restricted = 1; + if (ctx->restrictions.op_registered) + 
ctx->op_restricted = 1; + if (ctx->restrictions.reg_registered) + ctx->reg_restricted = 1; return 0; } @@ -626,7 +635,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; -- cgit v1.2.3 From 697a5284ad9697609324739e38e341612cd342a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Jan 2026 07:59:08 -0700 Subject: io_uring: fix IOPOLL with passthrough I/O A previous commit improving IOPOLL made an incorrect assumption that task_work isn't used with IOPOLL. This can cause crashes when doing passthrough I/O on nvme, where queueing the completion task_work will trample on the same memory that holds the completed list of requests. Fix it up by shuffling the members around, so we're not sharing any parts that end up getting used in this path. Fixes: 3c7d76d6128a ("io_uring: IOPOLL polling improvements") Reported-by: Yi Zhang Link: https://lore.kernel.org/linux-block/CAHj4cs_SLPj9v9w5MgfzHKy+983enPx3ZQY2kMuMJ1202DBefw@mail.gmail.com/ Tested-by: Yi Zhang Cc: Ming Lei Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 11 ++++------- io_uring/rw.c | 5 +++-- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e4c804f99c30..211686ad89fd 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -713,13 +713,10 @@ struct io_kiocb { atomic_t refs; bool cancel_seq_set; - /* - * IOPOLL doesn't use task_work, so use the ->iopoll_node list - * entry to manage pending iopoll requests. - */ union { struct io_task_work io_task_work; - struct list_head iopoll_node; + /* For IOPOLL setup queues, with hybrid polling */ + u64 iopoll_start; }; union { @@ -728,8 +725,8 @@ struct io_kiocb { * poll */ struct hlist_node hash_node; - /* For IOPOLL setup queues, with hybrid polling */ - u64 iopoll_start; + /* IOPOLL completion handling */ + struct list_head iopoll_node; /* for private io_kiocb freeing */ struct rcu_head rcu_head; }; diff --git a/io_uring/rw.c b/io_uring/rw.c index 307f1f39d9f3..c33c533a267e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1296,12 +1296,13 @@ static int io_uring_hybrid_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags) { struct io_ring_ctx *ctx = req->ctx; - u64 runtime, sleep_time; + u64 runtime, sleep_time, iopoll_start; int ret; + iopoll_start = READ_ONCE(req->iopoll_start); sleep_time = io_hybrid_iopoll_delay(ctx, req); ret = io_uring_classic_poll(req, iob, poll_flags); - runtime = ktime_get_ns() - req->iopoll_start - sleep_time; + runtime = ktime_get_ns() - iopoll_start - sleep_time; /* * Use minimum sleep time if we're polling devices with different -- cgit v1.2.3 From 07f3c3a1cd56c2048a92dad0c11f15e4ac3888c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Jan 2026 11:21:32 -0700 Subject: io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member A previous commit got rid of any use of this member, but forgot to remove it. Kill it. 
Fixes: f4bb2f65bb81 ("io_uring/eventfd: move ctx->evfd_last_cq_tail into io_ev_fd") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 211686ad89fd..dc6bd6940a0d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -442,6 +442,9 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; + /* protected by ->completion_lock */ + unsigned nr_req_allocated; + #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ @@ -454,10 +457,6 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif - /* protected by ->completion_lock */ - unsigned evfd_last_cq_tail; - unsigned nr_req_allocated; - /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from -- cgit v1.2.3 From 5247c034a67f5a93cc1faa15e9867eec5b22f38a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Jan 2026 20:47:40 +0000 Subject: io_uring: introduce non-circular SQ Outside of SQPOLL, normally SQ entries are consumed by the time the submission syscall returns. For those cases we don't need a circular buffer and the head/tail tracking; instead, the kernel can assume that entries always start from the beginning of the SQ at index 0. This patch introduces a setup flag doing exactly that. It's simpler and helps to keep SQEs hot in cache. The feature is optional and enabled by setting IORING_SETUP_SQ_REWIND. The flag is rejected if passed together with SQPOLL as it'd require waiting for the SQ before each submission. It also requires IORING_SETUP_NO_SQARRAY; the SQ array case could be supported, but it's unlikely there will be users, so this leaves more space for future optimisations. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 ++++++++++++ io_uring/io_uring.c | 29 ++++++++++++++++++++++------- io_uring/io_uring.h | 3 ++- 3 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b5b23c0d5283..475094c7a668 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -237,6 +237,18 @@ enum io_uring_sqe_flags_bit { */ #define IORING_SETUP_SQE_MIXED (1U << 19) +/* + * When set, io_uring ignores SQ head and tail and fetches SQEs to submit + * starting from index 0 instead of from the index stored in the head pointer. + * IOW, the user should place all SQEs at the beginning of the SQ memory + * before issuing a submission syscall. + * + * It requires IORING_SETUP_NO_SQARRAY and is incompatible with + * IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail + * values and keep them set to 0. Any other value is undefined behaviour. + */ +#define IORING_SETUP_SQ_REWIND (1U << 20) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a50459238bee..0f88ec74e55d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1945,12 +1945,16 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them.
- */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); + if (ctx->flags & IORING_SETUP_SQ_REWIND) { + ctx->cached_sq_head = 0; + } else { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); + } } /* @@ -1996,10 +2000,15 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { - unsigned int entries = io_sqring_entries(ctx); + unsigned int entries; unsigned int left; int ret; + if (ctx->flags & IORING_SETUP_SQ_REWIND) + entries = ctx->sq_entries; + else + entries = io_sqring_entries(ctx); + entries = min(nr, entries); if (unlikely(!entries)) return 0; @@ -2728,6 +2737,12 @@ static int io_uring_sanitise_params(struct io_uring_params *p) if (flags & ~IORING_SETUP_FLAGS) return -EINVAL; + if (flags & IORING_SETUP_SQ_REWIND) { + if ((flags & IORING_SETUP_SQPOLL) || + !(flags & IORING_SETUP_NO_SQARRAY)) + return -EINVAL; + } + /* There is no way to mmap rings without a real fd */ if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && !(flags & IORING_SETUP_NO_MMAP)) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 29b8f90fdabf..acdc39b9f8d6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -69,7 +69,8 @@ struct io_ctx_config { IORING_SETUP_NO_SQARRAY |\ IORING_SETUP_HYBRID_IOPOLL |\ IORING_SETUP_CQE_MIXED |\ - IORING_SETUP_SQE_MIXED) + IORING_SETUP_SQE_MIXED |\ + IORING_SETUP_SQ_REWIND) #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ IORING_ENTER_SQ_WAKEUP |\ -- cgit v1.2.3
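
For reference, below is a minimal userspace sketch of driving a ring created with the IORING_SETUP_SQ_REWIND flag introduced above. It is not part of the series and is untested; it only follows the contract spelled out in the new uapi comment (NO_SQARRAY required, SQPOLL rejected, SQEs placed from index 0, SQ head/tail left at 0). The raw syscall usage and the local fallback #define are assumptions for illustration; the fallback value mirrors the one added by the patch.

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef IORING_SETUP_SQ_REWIND
#define IORING_SETUP_SQ_REWIND	(1U << 20)	/* value taken from the patch above */
#endif

int main(void)
{
	struct io_uring_params p;
	struct io_uring_sqe *sqes;
	unsigned int i, nr = 8;
	int ring_fd;

	memset(&p, 0, sizeof(p));
	/* SQ_REWIND requires NO_SQARRAY and is rejected together with SQPOLL */
	p.flags = IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQ_REWIND;

	ring_fd = syscall(__NR_io_uring_setup, 32, &p);
	if (ring_fd < 0)
		return 1;

	/* only the SQE array is needed; SQ head/tail stay 0 and are ignored */
	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
		    PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd,
		    IORING_OFF_SQES);
	if (sqes == MAP_FAILED)
		return 1;

	/* each submission places its SQEs at the start of the SQ, index 0 up */
	for (i = 0; i < nr; i++) {
		memset(&sqes[i], 0, sizeof(sqes[i]));
		sqes[i].opcode = IORING_OP_NOP;
		sqes[i].user_data = i;
	}
	syscall(__NR_io_uring_enter, ring_fd, nr, 0, 0, NULL, 0);

	/* a later submission rewinds: refill from index 0, no tail update */
	memset(&sqes[0], 0, sizeof(sqes[0]));
	sqes[0].opcode = IORING_OP_NOP;
	sqes[0].user_data = nr;
	syscall(__NR_io_uring_enter, ring_fd, 1, 0, 0, NULL, 0);

	/* completions are reaped from the CQ ring mapping as usual (not shown) */
	close(ring_fd);
	return 0;
}

Compared to the circular SQ, the application never advances a tail here; each io_uring_enter() call simply states how many SQEs were written from the start of the array, which is what lets io_commit_sqring() reset cached_sq_head to 0 instead of publishing a new head.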