From 3c7d76d6128a0fef68e6540754bf85a44a29bb59 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 11 Dec 2025 03:25:41 -0700
Subject: io_uring: IOPOLL polling improvements

io_uring manages issued and pending IOPOLL read/write requests in a
singly linked list. One downside of that is that individual items cannot
easily be removed from the list; as a result, io_uring only completes a
finished request N in that list once requests 0..N-1 have also completed.

For homogeneous IO this isn't necessarily an issue, but if different
devices are polled in the same ring, or if disparate IO from the same
device is being polled for, this can defer completion of some requests
unnecessarily.

Move to a doubly linked list for iopoll completions instead, making it
possible to complete any request whose poll has finished, regardless of
its position in the list.

Co-developed-by: Fengnan Chang
Link: https://lore.kernel.org/io-uring/20251210085501.84261-1-changfengnan@bytedance.com/
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1adb0d20a0a..54fd30abf2b8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -316,7 +316,7 @@ struct io_ring_ctx {
 	 * manipulate the list, hence no extra locking is needed there.
 	 */
 	bool			poll_multi_queue;
-	struct io_wq_work_list	iopoll_list;
+	struct list_head	iopoll_list;
 
 	struct io_file_table	file_table;
 	struct io_rsrc_data	buf_table;
@@ -708,7 +708,16 @@ struct io_kiocb {
 	atomic_t		refs;
 	bool			cancel_seq_set;
-	struct io_task_work	io_task_work;
+
+	/*
+	 * IOPOLL doesn't use task_work, so use the ->iopoll_node list
+	 * entry to manage pending iopoll requests.
+	 */
+	union {
+		struct io_task_work	io_task_work;
+		struct list_head	iopoll_node;
+	};
+
 	union {
 		/*
 		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
--
cgit v1.2.3
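To illustrate the completion behavior the message above describes, here is a
simplified sketch of out-of-order reaping with the new list_head based list.
It is not the kernel's actual polling loop; req_poll_done() and req_complete()
are hypothetical stand-ins for io_uring's real completion check and CQE
posting.

#include <linux/list.h>
#include <linux/io_uring_types.h>

/* hypothetical helpers, not part of io_uring */
bool req_poll_done(struct io_kiocb *req);
void req_complete(struct io_kiocb *req);

/*
 * Sketch only: with a doubly linked ctx->iopoll_list, any request found
 * completed can be unlinked on the spot, even if earlier entries are
 * still in flight. The old singly linked io_wq_work_list only allowed
 * popping completed requests from the front.
 */
static int iopoll_reap_completed(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req, *tmp;
	int reaped = 0;

	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_node) {
		if (!req_poll_done(req))
			continue;
		list_del(&req->iopoll_node);	/* O(1), uses the prev pointer */
		req_complete(req);
		reaped++;
	}
	return reaped;
}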
From d6406c45f14842019cfaaba19fe2a76ef9fa831c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 12 Jan 2026 08:14:45 -0700
Subject: io_uring: track restrictions separately for IORING_OP and IORING_REGISTER

It's quite likely that only register opcode restrictions exist, in which
case we'd never need to check the normal opcodes. Split ctx->restricted
into two separate fields: one for I/O opcodes and one for register
opcodes.

Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  8 ++++++--
 io_uring/io_uring.c            |  4 ++--
 io_uring/register.c            | 19 ++++++++++++++-----
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 54fd30abf2b8..e4c804f99c30 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -224,7 +224,10 @@ struct io_restriction {
 	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 	u8 sqe_flags_allowed;
 	u8 sqe_flags_required;
-	bool registered;
+	/* IORING_OP_* restrictions exist */
+	bool op_registered;
+	/* IORING_REGISTER_* restrictions exist */
+	bool reg_registered;
 };
 
 struct io_submit_link {
@@ -259,7 +262,8 @@ struct io_ring_ctx {
 	struct {
 		unsigned int		flags;
 		unsigned int		drain_next: 1;
-		unsigned int		restricted: 1;
+		unsigned int		op_restricted: 1;
+		unsigned int		reg_restricted: 1;
 		unsigned int		off_timeout_used: 1;
 		unsigned int		drain_active: 1;
 		unsigned int		has_evfd: 1;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 92ed92a44023..2cde22af78a3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2056,7 +2056,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
 					struct io_kiocb *req,
 					unsigned int sqe_flags)
 {
-	if (!ctx->restricted)
+	if (!ctx->op_restricted)
 		return true;
 	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
 		return false;
@@ -2159,7 +2159,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			io_init_drain(ctx);
 		}
 	}
-	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+	if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) {
 		if (!io_check_restriction(ctx, req, sqe_flags))
 			return io_init_fail_req(req, -EACCES);
 		/* knock it to the slow queue path, will be drained there */
diff --git a/io_uring/register.c b/io_uring/register.c
index 54ccf164be38..8551f13920dc 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -133,24 +133,31 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
 			if (res[i].register_op >= IORING_REGISTER_LAST)
 				goto err;
 			__set_bit(res[i].register_op, restrictions->register_op);
+			restrictions->reg_registered = true;
 			break;
 		case IORING_RESTRICTION_SQE_OP:
 			if (res[i].sqe_op >= IORING_OP_LAST)
 				goto err;
 			__set_bit(res[i].sqe_op, restrictions->sqe_op);
+			restrictions->op_registered = true;
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
 			restrictions->sqe_flags_allowed = res[i].sqe_flags;
+			restrictions->op_registered = true;
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
 			restrictions->sqe_flags_required = res[i].sqe_flags;
+			restrictions->op_registered = true;
 			break;
 		default:
 			goto err;
 		}
 	}
 	ret = nr_args;
-	restrictions->registered = true;
+	if (!nr_args) {
+		restrictions->op_registered = true;
+		restrictions->reg_registered = true;
+	}
 err:
 	kfree(res);
 	return ret;
@@ -166,7 +173,7 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 		return -EBADFD;
 
 	/* We allow only a single restrictions registration */
-	if (ctx->restrictions.registered)
+	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
 		return -EBUSY;
 
 	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
@@ -175,8 +182,10 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
 		return ret;
 	}
-	if (ctx->restrictions.registered)
-		ctx->restricted = 1;
+	if (ctx->restrictions.op_registered)
+		ctx->op_restricted = 1;
+	if (ctx->restrictions.reg_registered)
+		ctx->reg_restricted = 1;
 
 	return 0;
 }
@@ -626,7 +635,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	if (ctx->submitter_task && ctx->submitter_task != current)
 		return -EEXIST;
 
-	if (ctx->restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
+	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
 		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
 		if (!test_bit(opcode, ctx->restrictions.register_op))
 			return -EACCES;
--
cgit v1.2.3
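For context on what the split gates, below is a minimal userspace sketch (not
part of this patch) that registers one SQE opcode restriction and one register
opcode restriction through the raw io_uring_register(2) syscall. It assumes a
ring created with IORING_SETUP_R_DISABLED and reasonably recent
<linux/io_uring.h> headers; ring_register() and restrict_ring() are
illustrative helper names.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int ring_register(int ring_fd, unsigned int op, void *arg,
			 unsigned int nr_args)
{
	return syscall(__NR_io_uring_register, ring_fd, op, arg, nr_args);
}

static int restrict_ring(int ring_fd)
{
	struct io_uring_restriction res[2];

	memset(res, 0, sizeof(res));
	/* submission side: only allow IORING_OP_READ SQEs */
	res[0].opcode = IORING_RESTRICTION_SQE_OP;
	res[0].sqe_op = IORING_OP_READ;
	/* register side: only allow registering buffers */
	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
	res[1].register_op = IORING_REGISTER_BUFFERS;

	if (ring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2) < 0)
		return -1;
	/* restrictions only take effect once the ring is enabled */
	return ring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}

With this patch, the first entry only marks op_registered/op_restricted and
the second only marks reg_registered/reg_restricted, so a ring that restricts
register opcodes alone never pays the per-SQE restriction check.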
From 697a5284ad9697609324739e38e341612cd342a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 14 Jan 2026 07:59:08 -0700
Subject: io_uring: fix IOPOLL with passthrough I/O

A previous commit improving IOPOLL made an incorrect assumption that
task_work isn't used with IOPOLL. This can cause crashes when doing
passthrough I/O on nvme, where queueing the completion task_work tramples
on the same memory that links the request into the list of completed
requests.

Fix it up by shuffling the members around, so that nothing used in this
path shares storage with the iopoll list entry.

Fixes: 3c7d76d6128a ("io_uring: IOPOLL polling improvements")
Reported-by: Yi Zhang
Link: https://lore.kernel.org/linux-block/CAHj4cs_SLPj9v9w5MgfzHKy+983enPx3ZQY2kMuMJ1202DBefw@mail.gmail.com/
Tested-by: Yi Zhang
Cc: Ming Lei
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 11 ++++-------
 io_uring/rw.c                  |  5 +++--
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e4c804f99c30..211686ad89fd 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -713,13 +713,10 @@ struct io_kiocb {
 	atomic_t		refs;
 	bool			cancel_seq_set;
 
-	/*
-	 * IOPOLL doesn't use task_work, so use the ->iopoll_node list
-	 * entry to manage pending iopoll requests.
-	 */
 	union {
 		struct io_task_work	io_task_work;
-		struct list_head	iopoll_node;
+		/* For IOPOLL setup queues, with hybrid polling */
+		u64			iopoll_start;
 	};
 
 	union {
@@ -728,8 +725,8 @@ struct io_kiocb {
 		 * poll
 		 */
 		struct hlist_node	hash_node;
-		/* For IOPOLL setup queues, with hybrid polling */
-		u64			iopoll_start;
+		/* IOPOLL completion handling */
+		struct list_head	iopoll_node;
 		/* for private io_kiocb freeing */
 		struct rcu_head		rcu_head;
 	};
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 307f1f39d9f3..c33c533a267e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1296,12 +1296,13 @@ static int io_uring_hybrid_poll(struct io_kiocb *req,
 				struct io_comp_batch *iob, unsigned int poll_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	u64 runtime, sleep_time;
+	u64 runtime, sleep_time, iopoll_start;
 	int ret;
 
+	iopoll_start = READ_ONCE(req->iopoll_start);
 	sleep_time = io_hybrid_iopoll_delay(ctx, req);
 	ret = io_uring_classic_poll(req, iob, poll_flags);
-	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
+	runtime = ktime_get_ns() - iopoll_start - sleep_time;
 
 	/*
 	 * Use minimum sleep time if we're polling devices with different
--
cgit v1.2.3
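To spell out the layout rule behind the member shuffle, here is a trimmed
sketch of the two unions as they look after this fix. The comments paraphrase
the patch and commit message; the struct is a cut-down illustration, not the
full io_kiocb.

#include <linux/types.h>
#include <linux/io_uring_types.h>	/* struct io_task_work */

/*
 * Sketch only: a member may share a union solely with state that is
 * never live at the same time as that member.
 */
struct io_kiocb_layout_sketch {
	union {
		/* written when a completion is punted to task_work, e.g. nvme
		 * passthrough, possibly while the request is still being polled */
		struct io_task_work	io_task_work;
		/* hybrid-poll start time; sampled with READ_ONCE() before the
		 * poll callback can complete the request and reuse this slot */
		u64			iopoll_start;
	};
	union {
		/* only for IORING_OP_POLL_ADD and async armed poll */
		struct hlist_node	hash_node;
		/* links the request on ctx->iopoll_list until it is reaped */
		struct list_head	iopoll_node;
		/* for private io_kiocb freeing, once the request is done */
		struct rcu_head		rcu_head;
	};
};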
From 07f3c3a1cd56c2048a92dad0c11f15e4ac3888c1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 21 Jan 2026 11:21:32 -0700
Subject: io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member

A previous commit got rid of any use of this member, but forgot to remove
it. Kill it.

Fixes: f4bb2f65bb81 ("io_uring/eventfd: move ctx->evfd_last_cq_tail into io_ev_fd")
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 211686ad89fd..dc6bd6940a0d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -442,6 +442,9 @@ struct io_ring_ctx {
 	struct list_head	defer_list;
 	unsigned		nr_drained;
 
+	/* protected by ->completion_lock */
+	unsigned		nr_req_allocated;
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	struct list_head	napi_list;	/* track busy poll napi_id */
 	spinlock_t		napi_lock;	/* napi_list lock */
@@ -454,10 +457,6 @@ struct io_ring_ctx {
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
 
-	/* protected by ->completion_lock */
-	unsigned		evfd_last_cq_tail;
-	unsigned		nr_req_allocated;
-
 	/*
 	 * Protection for resize vs mmap races - both the mmap and resize
 	 * side will need to grab this lock, to prevent either side from
--
cgit v1.2.3