From 3c7d76d6128a0fef68e6540754bf85a44a29bb59 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 11 Dec 2025 03:25:41 -0700
Subject: io_uring: IOPOLL polling improvements

io_uring manages issued and pending IOPOLL read/write requests in a
singly linked list. One downside of that is that individual items cannot
easily be removed from the list; as a result, io_uring only completes a
finished request N in that list once requests 0..N-1 have also completed.

For homogeneous IO this isn't necessarily an issue, but if different
devices are polled in the same ring, or if disparate IO from the same
device is being polled for, this can defer completion of some requests
unnecessarily.

Move to a doubly linked list for iopoll completions instead, making it
possible to complete any request whose poll has finished, regardless of
its position in the list.

Co-developed-by: Fengnan Chang
Link: https://lore.kernel.org/io-uring/20251210085501.84261-1-changfengnan@bytedance.com/
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1adb0d20a0a..54fd30abf2b8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -316,7 +316,7 @@ struct io_ring_ctx {
 	 * manipulate the list, hence no extra locking is needed there.
 	 */
 	bool			poll_multi_queue;
-	struct io_wq_work_list	iopoll_list;
+	struct list_head	iopoll_list;
 
 	struct io_file_table	file_table;
 	struct io_rsrc_data	buf_table;
@@ -708,7 +708,16 @@ struct io_kiocb {
 	atomic_t		refs;
 	bool			cancel_seq_set;
-	struct io_task_work	io_task_work;
+
+	/*
+	 * IOPOLL doesn't use task_work, so use the ->iopoll_node list
+	 * entry to manage pending iopoll requests.
+	 */
+	union {
+		struct io_task_work	io_task_work;
+		struct list_head	iopoll_node;
+	};
+
 	union {
 		/*
 		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
--
cgit v1.2.3
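To illustrate the completion behavior the message above describes, here is a
simplified sketch of out-of-order reaping with the new list_head based list.
It is not the kernel's actual polling loop; req_poll_done() and req_complete()
are hypothetical stand-ins for io_uring's real completion check and CQE
posting.

#include <linux/list.h>
#include <linux/io_uring_types.h>

/* hypothetical helpers, not part of io_uring */
bool req_poll_done(struct io_kiocb *req);
void req_complete(struct io_kiocb *req);

/*
 * Sketch only: with a doubly linked ctx->iopoll_list, any request found
 * completed can be unlinked on the spot, even if earlier entries are
 * still in flight. The old singly linked io_wq_work_list only allowed
 * popping completed requests from the front.
 */
static int iopoll_reap_completed(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req, *tmp;
	int reaped = 0;

	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_node) {
		if (!req_poll_done(req))
			continue;
		list_del(&req->iopoll_node);	/* O(1), uses the prev pointer */
		req_complete(req);
		reaped++;
	}
	return reaped;
}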
From d6406c45f14842019cfaaba19fe2a76ef9fa831c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 12 Jan 2026 08:14:45 -0700
Subject: io_uring: track restrictions separately for IORING_OP and IORING_REGISTER

It's quite likely that only register opcode restrictions exist, in which
case we'd never need to check the normal opcodes. Split ctx->restricted
into two separate fields: one for I/O opcodes and one for register
opcodes.

Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  8 ++++++--
 io_uring/io_uring.c            |  4 ++--
 io_uring/register.c            | 19 ++++++++++++++-----
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 54fd30abf2b8..e4c804f99c30 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -224,7 +224,10 @@ struct io_restriction {
 	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 	u8 sqe_flags_allowed;
 	u8 sqe_flags_required;
-	bool registered;
+	/* IORING_OP_* restrictions exist */
+	bool op_registered;
+	/* IORING_REGISTER_* restrictions exist */
+	bool reg_registered;
 };
 
 struct io_submit_link {
@@ -259,7 +262,8 @@ struct io_ring_ctx {
 	struct {
 		unsigned int		flags;
 		unsigned int		drain_next: 1;
-		unsigned int		restricted: 1;
+		unsigned int		op_restricted: 1;
+		unsigned int		reg_restricted: 1;
 		unsigned int		off_timeout_used: 1;
 		unsigned int		drain_active: 1;
 		unsigned int		has_evfd: 1;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 92ed92a44023..2cde22af78a3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2056,7 +2056,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
 					struct io_kiocb *req,
 					unsigned int sqe_flags)
 {
-	if (!ctx->restricted)
+	if (!ctx->op_restricted)
 		return true;
 	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
 		return false;
@@ -2159,7 +2159,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			io_init_drain(ctx);
 		}
 	}
-	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+	if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) {
 		if (!io_check_restriction(ctx, req, sqe_flags))
 			return io_init_fail_req(req, -EACCES);
 		/* knock it to the slow queue path, will be drained there */
diff --git a/io_uring/register.c b/io_uring/register.c
index 54ccf164be38..8551f13920dc 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -133,24 +133,31 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
 			if (res[i].register_op >= IORING_REGISTER_LAST)
 				goto err;
 			__set_bit(res[i].register_op, restrictions->register_op);
+			restrictions->reg_registered = true;
 			break;
 		case IORING_RESTRICTION_SQE_OP:
 			if (res[i].sqe_op >= IORING_OP_LAST)
 				goto err;
 			__set_bit(res[i].sqe_op, restrictions->sqe_op);
+			restrictions->op_registered = true;
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
 			restrictions->sqe_flags_allowed = res[i].sqe_flags;
+			restrictions->op_registered = true;
 			break;
 		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
 			restrictions->sqe_flags_required = res[i].sqe_flags;
+			restrictions->op_registered = true;
 			break;
 		default:
 			goto err;
 		}
 	}
 	ret = nr_args;
-	restrictions->registered = true;
+	if (!nr_args) {
+		restrictions->op_registered = true;
+		restrictions->reg_registered = true;
+	}
 err:
 	kfree(res);
 	return ret;
@@ -166,7 +173,7 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 		return -EBADFD;
 
 	/* We allow only a single restrictions registration */
-	if (ctx->restrictions.registered)
+	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
 		return -EBUSY;
 
 	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
@@ -175,8 +182,10 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
 		return ret;
 	}
-	if (ctx->restrictions.registered)
-		ctx->restricted = 1;
+	if (ctx->restrictions.op_registered)
+		ctx->op_restricted = 1;
+	if (ctx->restrictions.reg_registered)
+		ctx->reg_restricted = 1;
 
 	return 0;
 }
@@ -626,7 +635,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	if (ctx->submitter_task && ctx->submitter_task != current)
 		return -EEXIST;
 
-	if (ctx->restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
+	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
 		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
 		if (!test_bit(opcode, ctx->restrictions.register_op))
 			return -EACCES;
--
cgit v1.2.3
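For context on what the split gates, below is a minimal userspace sketch (not
part of this patch) that registers one SQE opcode restriction and one register
opcode restriction through the raw io_uring_register(2) syscall. It assumes a
ring created with IORING_SETUP_R_DISABLED and reasonably recent
<linux/io_uring.h> headers; ring_register() and restrict_ring() are
illustrative helper names.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int ring_register(int ring_fd, unsigned int op, void *arg,
			 unsigned int nr_args)
{
	return syscall(__NR_io_uring_register, ring_fd, op, arg, nr_args);
}

static int restrict_ring(int ring_fd)
{
	struct io_uring_restriction res[2];

	memset(res, 0, sizeof(res));
	/* submission side: only allow IORING_OP_READ SQEs */
	res[0].opcode = IORING_RESTRICTION_SQE_OP;
	res[0].sqe_op = IORING_OP_READ;
	/* register side: only allow registering buffers */
	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
	res[1].register_op = IORING_REGISTER_BUFFERS;

	if (ring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2) < 0)
		return -1;
	/* restrictions only take effect once the ring is enabled */
	return ring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}

With this patch, the first entry only marks op_registered/op_restricted and
the second only marks reg_registered/reg_restricted, so a ring that restricts
register opcodes alone never pays the per-SQE restriction check.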
From 697a5284ad9697609324739e38e341612cd342a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 14 Jan 2026 07:59:08 -0700
Subject: io_uring: fix IOPOLL with passthrough I/O

A previous commit improving IOPOLL made an incorrect assumption that
task_work isn't used with IOPOLL. This can cause crashes when doing
passthrough I/O on nvme, where queueing the completion task_work tramples
on the same memory that links the request into the list of completed
requests.

Fix it up by shuffling the members around, so that nothing used in this
path shares storage with the iopoll list entry.

Fixes: 3c7d76d6128a ("io_uring: IOPOLL polling improvements")
Reported-by: Yi Zhang
Link: https://lore.kernel.org/linux-block/CAHj4cs_SLPj9v9w5MgfzHKy+983enPx3ZQY2kMuMJ1202DBefw@mail.gmail.com/
Tested-by: Yi Zhang
Cc: Ming Lei
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 11 ++++-------
 io_uring/rw.c                  |  5 +++--
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e4c804f99c30..211686ad89fd 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -713,13 +713,10 @@ struct io_kiocb {
 	atomic_t		refs;
 	bool			cancel_seq_set;
 
-	/*
-	 * IOPOLL doesn't use task_work, so use the ->iopoll_node list
-	 * entry to manage pending iopoll requests.
-	 */
 	union {
 		struct io_task_work	io_task_work;
-		struct list_head	iopoll_node;
+		/* For IOPOLL setup queues, with hybrid polling */
+		u64			iopoll_start;
 	};
 
 	union {
@@ -728,8 +725,8 @@ struct io_kiocb {
 		 * poll
 		 */
 		struct hlist_node	hash_node;
-		/* For IOPOLL setup queues, with hybrid polling */
-		u64			iopoll_start;
+		/* IOPOLL completion handling */
+		struct list_head	iopoll_node;
 		/* for private io_kiocb freeing */
 		struct rcu_head		rcu_head;
 	};
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 307f1f39d9f3..c33c533a267e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1296,12 +1296,13 @@ static int io_uring_hybrid_poll(struct io_kiocb *req,
 				struct io_comp_batch *iob, unsigned int poll_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	u64 runtime, sleep_time;
+	u64 runtime, sleep_time, iopoll_start;
 	int ret;
 
+	iopoll_start = READ_ONCE(req->iopoll_start);
 	sleep_time = io_hybrid_iopoll_delay(ctx, req);
 	ret = io_uring_classic_poll(req, iob, poll_flags);
-	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
+	runtime = ktime_get_ns() - iopoll_start - sleep_time;
 
 	/*
 	 * Use minimum sleep time if we're polling devices with different
--
cgit v1.2.3
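To spell out the layout rule behind the member shuffle, here is a trimmed
sketch of the two unions as they look after this fix. The comments paraphrase
the patch and commit message; the struct is a cut-down illustration, not the
full io_kiocb.

#include <linux/types.h>
#include <linux/io_uring_types.h>	/* struct io_task_work */

/*
 * Sketch only: a member may share a union solely with state that is
 * never live at the same time as that member.
 */
struct io_kiocb_layout_sketch {
	union {
		/* written when a completion is punted to task_work, e.g. nvme
		 * passthrough, possibly while the request is still being polled */
		struct io_task_work	io_task_work;
		/* hybrid-poll start time; sampled with READ_ONCE() before the
		 * poll callback can complete the request and reuse this slot */
		u64			iopoll_start;
	};
	union {
		/* only for IORING_OP_POLL_ADD and async armed poll */
		struct hlist_node	hash_node;
		/* links the request on ctx->iopoll_list until it is reaped */
		struct list_head	iopoll_node;
		/* for private io_kiocb freeing, once the request is done */
		struct rcu_head		rcu_head;
	};
};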
From 07f3c3a1cd56c2048a92dad0c11f15e4ac3888c1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 21 Jan 2026 11:21:32 -0700
Subject: io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member

A previous commit got rid of any use of this member, but forgot to remove
it. Kill it.

Fixes: f4bb2f65bb81 ("io_uring/eventfd: move ctx->evfd_last_cq_tail into io_ev_fd")
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 211686ad89fd..dc6bd6940a0d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -442,6 +442,9 @@ struct io_ring_ctx {
 	struct list_head	defer_list;
 	unsigned		nr_drained;
 
+	/* protected by ->completion_lock */
+	unsigned		nr_req_allocated;
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	struct list_head	napi_list;	/* track busy poll napi_id */
 	spinlock_t		napi_lock;	/* napi_list lock */
@@ -454,10 +457,6 @@ struct io_ring_ctx {
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
 
-	/* protected by ->completion_lock */
-	unsigned		evfd_last_cq_tail;
-	unsigned		nr_req_allocated;
-
 	/*
 	 * Protection for resize vs mmap races - both the mmap and resize
 	 * side will need to grab this lock, to prevent either side from
--
cgit v1.2.3