10 files changed, 75 insertions, 66 deletions
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index fab936d31ba8..100d5da94cb9 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -33,20 +33,18 @@ static void io_eventfd_free(struct rcu_head *rcu)
 	kfree(ev_fd);
 }
 
-static void io_eventfd_do_signal(struct rcu_head *rcu)
+static void io_eventfd_put(struct io_ev_fd *ev_fd)
 {
-	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
-
-	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
-
 	if (refcount_dec_and_test(&ev_fd->refs))
-		io_eventfd_free(rcu);
+		call_rcu(&ev_fd->rcu, io_eventfd_free);
 }
 
-static void io_eventfd_put(struct io_ev_fd *ev_fd)
+static void io_eventfd_do_signal(struct rcu_head *rcu)
 {
-	if (refcount_dec_and_test(&ev_fd->refs))
-		call_rcu(&ev_fd->rcu, io_eventfd_free);
+	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
+	io_eventfd_put(ev_fd);
 }
 
 static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d3403c8216db..4758f1ba902b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -320,7 +320,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_rw));
 	ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
-			    sizeof(struct uring_cache));
+			    sizeof(struct io_uring_cmd_data));
 	spin_lock_init(&ctx->msg_lock);
 	ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_kiocb));
@@ -1226,10 +1226,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
 
 	/* SQPOLL doesn't need the task_work added, it'll run it itself */
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
-		struct io_sq_data *sqd = ctx->sq_data;
-
-		if (sqd->thread)
-			__set_notify_signal(sqd->thread);
+		__set_notify_signal(tctx->task);
 		return;
 	}
 
@@ -2813,13 +2810,12 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 
 	if (unlikely(!ctx->poll_activated))
 		io_activate_pollwq(ctx);
-
-	poll_wait(file, &ctx->poll_wq, wait);
 	/*
-	 * synchronizes with barrier from wq_has_sleeper call in
-	 * io_commit_cqring
+	 * provides mb() which pairs with barrier from wq_has_sleeper
+	 * call in io_commit_cqring
 	 */
-	smp_rmb();
+	poll_wait(file, &ctx->poll_wq, wait);
+
 	if (!io_sqring_full(ctx))
 		mask |= EPOLLOUT | EPOLLWRNORM;
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 12abee607e4a..492cbbf2c23b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -125,6 +125,9 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 #if defined(CONFIG_PROVE_LOCKING)
 	lockdep_assert(in_task());
 
+	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+		lockdep_assert_held(&ctx->uring_lock);
+
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		lockdep_assert_held(&ctx->uring_lock);
 	} else if (!ctx->task_complete) {
@@ -136,9 +139,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 		 * Not from an SQE, as those cannot be submitted, but via
 		 * updating tagged resources.
 		 */
-		if (percpu_ref_is_dying(&ctx->refs))
-			lockdep_assert(current_work());
-		else
+		if (!percpu_ref_is_dying(&ctx->refs))
 			lockdep_assert(current == ctx->submitter_task);
 	}
 #endif
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3de75eca1c92..e8baef4e5146 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 
 #include "io_uring.h"
 #include "opdef.h"
@@ -414,7 +415,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.plug			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
-		.async_size		= 2 * sizeof(struct io_uring_sqe),
+		.async_size		= sizeof(struct io_uring_cmd_data),
 		.prep			= io_uring_cmd_prep,
 		.issue			= io_uring_cmd,
 	},
diff --git a/io_uring/register.c b/io_uring/register.c
index fdd44914c39c..371aec87e078 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -405,8 +405,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {
 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
 	size_t size, sq_array_offset;
+	unsigned i, tail, old_head;
 	struct io_uring_params p;
-	unsigned i, tail;
 	void *ptr;
 	int ret;
 
@@ -449,10 +449,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	if (IS_ERR(n.rings))
 		return PTR_ERR(n.rings);
 
-	n.rings->sq_ring_mask = p.sq_entries - 1;
-	n.rings->cq_ring_mask = p.cq_entries - 1;
-	n.rings->sq_ring_entries = p.sq_entries;
-	n.rings->cq_ring_entries = p.cq_entries;
+	/*
+	 * At this point n.rings is shared with userspace, just like o.rings
+	 * is as well. While we don't expect userspace to modify it while
+	 * a resize is in progress, and it's most likely that userspace will
+	 * shoot itself in the foot if it does, we can't always assume good
+	 * intent... Use read/write once helpers from here on to indicate the
+	 * shared nature of it.
+	 */
+	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
+	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
+	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
+	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
 
 	if (copy_to_user(arg, &p, sizeof(p))) {
 		io_register_free_rings(&p, &n);
@@ -509,20 +517,22 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	 * rings can't hold what is already there, then fail the operation.
 	 */
 	n.sq_sqes = ptr;
-	tail = o.rings->sq.tail;
-	if (tail - o.rings->sq.head > p.sq_entries)
+	tail = READ_ONCE(o.rings->sq.tail);
+	old_head = READ_ONCE(o.rings->sq.head);
+	if (tail - old_head > p.sq_entries)
 		goto overflow;
-	for (i = o.rings->sq.head; i < tail; i++) {
+	for (i = old_head; i < tail; i++) {
 		unsigned src_head = i & (ctx->sq_entries - 1);
-		unsigned dst_head = i & n.rings->sq_ring_mask;
+		unsigned dst_head = i & (p.sq_entries - 1);
 
 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
 	}
-	n.rings->sq.head = o.rings->sq.head;
-	n.rings->sq.tail = o.rings->sq.tail;
+	WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
+	WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));
 
-	tail = o.rings->cq.tail;
-	if (tail - o.rings->cq.head > p.cq_entries) {
+	tail = READ_ONCE(o.rings->cq.tail);
+	old_head = READ_ONCE(o.rings->cq.head);
+	if (tail - old_head > p.cq_entries) {
 overflow:
 		/* restore old rings, and return -EOVERFLOW via cleanup path */
 		ctx->rings = o.rings;
@@ -531,21 +541,21 @@ overflow:
 		ret = -EOVERFLOW;
 		goto out;
 	}
-	for (i = o.rings->cq.head; i < tail; i++) {
+	for (i = old_head; i < tail; i++) {
 		unsigned src_head = i & (ctx->cq_entries - 1);
-		unsigned dst_head = i & n.rings->cq_ring_mask;
+		unsigned dst_head = i & (p.cq_entries - 1);
 
 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
 	}
-	n.rings->cq.head = o.rings->cq.head;
-	n.rings->cq.tail = o.rings->cq.tail;
+	WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
+	WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
 	/* invalidate cached cqe refill */
 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
 
-	n.rings->sq_dropped = o.rings->sq_dropped;
-	n.rings->sq_flags = o.rings->sq_flags;
-	n.rings->cq_flags = o.rings->cq_flags;
-	n.rings->cq_overflow = o.rings->cq_overflow;
+	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
+	WRITE_ONCE(n.rings->sq_flags, READ_ONCE(o.rings->sq_flags));
+	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
+	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));
 
 	/* all done, store old pointers and assign new ones */
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 077f84684c18..69937d0c94f9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -997,7 +997,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
 				ret = -ENOMEM;
-				goto out_put_free;
+				goto out_unlock;
 			}
 
 			refcount_inc(&src_node->buf->refs);
@@ -1033,14 +1033,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	mutex_lock(&src_ctx->uring_lock);
 	/* someone raced setting up buffers, dump ours */
 	ret = -EBUSY;
-out_put_free:
-	i = data.nr;
-	while (i--) {
-		if (data.nodes[i]) {
-			io_buffer_unmap(src_ctx, data.nodes[i]);
-			kfree(data.nodes[i]);
-		}
-	}
 out_unlock:
 	io_rsrc_data_free(ctx, &data);
 	mutex_unlock(&src_ctx->uring_lock);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 9e5bd79fd2b5..8961a3c1e73c 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -268,8 +268,12 @@ static int io_sq_thread(void *data)
 	DEFINE_WAIT(wait);
 
 	/* offload context creation failed, just exit */
-	if (!current->io_uring)
+	if (!current->io_uring) {
+		mutex_lock(&sqd->lock);
+		sqd->thread = NULL;
+		mutex_unlock(&sqd->lock);
 		goto err_out;
+	}
 
 	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
 	set_task_comm(current, buf);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 362689b17ccc..e9cec9e4dc2f 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -427,10 +427,12 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 
 	timeout->off = 0; /* noseq */
 	data = req->async_data;
+	data->ts = *ts;
+
 	list_add_tail(&timeout->list, &ctx->timeout_list);
 	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
 	data->timer.function = io_timeout_fn;
-	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
+	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode);
 	return 0;
 }
 
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index af842e9b4eb9..ce7726a04883 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -16,26 +16,35 @@
 #include "rsrc.h"
 #include "uring_cmd.h"
 
-static struct uring_cache *io_uring_async_get(struct io_kiocb *req)
+static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct uring_cache *cache;
+	struct io_uring_cmd_data *cache;
 
 	cache = io_alloc_cache_get(&ctx->uring_cache);
 	if (cache) {
+		cache->op_data = NULL;
 		req->flags |= REQ_F_ASYNC_DATA;
 		req->async_data = cache;
 		return cache;
 	}
-	if (!io_alloc_async_data(req))
-		return req->async_data;
+	if (!io_alloc_async_data(req)) {
+		cache = req->async_data;
+		cache->op_data = NULL;
+		return cache;
+	}
 	return NULL;
 }
 
 static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	struct uring_cache *cache = req->async_data;
+	struct io_uring_cmd_data *cache = req->async_data;
+
+	if (cache->op_data) {
+		kfree(cache->op_data);
+		cache->op_data = NULL;
+	}
 
 	if (issue_flags & IO_URING_F_UNLOCKED)
 		return;
@@ -183,7 +192,7 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req,
 				   const struct io_uring_sqe *sqe)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	struct uring_cache *cache;
+	struct io_uring_cmd_data *cache;
 
 	cache = io_uring_async_get(req);
 	if (unlikely(!cache))
@@ -260,7 +269,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 
 	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
 	if (ret == -EAGAIN) {
-		struct uring_cache *cache = req->async_data;
+		struct io_uring_cmd_data *cache = req->async_data;
 
 		if (ioucmd->sqe != (void *) cache)
 			memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx));
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index 7dba0f1efc58..f6837ee0955b 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -1,9 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-struct uring_cache {
-	struct io_uring_sqe sqes[2];
-};
-
 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);