path: root/io_uring
Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/Makefile        2
-rw-r--r--  io_uring/cancel.c        1
-rw-r--r--  io_uring/cmd_net.c       3
-rw-r--r--  io_uring/fdinfo.c       24
-rw-r--r--  io_uring/futex.c        13
-rw-r--r--  io_uring/io-wq.c         6
-rw-r--r--  io_uring/io_uring.c    155
-rw-r--r--  io_uring/io_uring.h    124
-rw-r--r--  io_uring/kbuf.c         67
-rw-r--r--  io_uring/kbuf.h         39
-rw-r--r--  io_uring/msg_ring.c     24
-rw-r--r--  io_uring/net.c         160
-rw-r--r--  io_uring/nop.c          17
-rw-r--r--  io_uring/notif.c         7
-rw-r--r--  io_uring/opdef.c         1
-rw-r--r--  io_uring/openclose.c     1
-rw-r--r--  io_uring/poll.c          6
-rw-r--r--  io_uring/query.c       101
-rw-r--r--  io_uring/query.h         9
-rw-r--r--  io_uring/register.c     60
-rw-r--r--  io_uring/rsrc.c          8
-rw-r--r--  io_uring/rw.c           66
-rw-r--r--  io_uring/splice.c        1
-rw-r--r--  io_uring/timeout.c       2
-rw-r--r--  io_uring/uring_cmd.c    85
-rw-r--r--  io_uring/waitid.c        4
-rw-r--r--  io_uring/zcrx.c        302
-rw-r--r--  io_uring/zcrx.h         19
28 files changed, 861 insertions, 446 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index b3f1bd492804..bc4e4a3fa0a5 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
sync.o msg_ring.o advise.o openclose.o \
statx.o timeout.o cancel.o \
waitid.o register.o truncate.o \
- memmap.o alloc_cache.o
+ memmap.o alloc_cache.o query.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 6d57602304df..64b51e82baa2 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -11,6 +11,7 @@
#include <uapi/linux/io_uring.h>
+#include "filetable.h"
#include "io_uring.h"
#include "tctx.h"
#include "poll.h"
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index 3866fe6ff541..27a09aa4c9d0 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -4,6 +4,7 @@
#include <net/sock.h>
#include "uring_cmd.h"
+#include "io_uring.h"
static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
@@ -73,7 +74,7 @@ static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk,
cqe->user_data = 0;
cqe->res = tskey;
- cqe->flags = IORING_CQE_F_MORE;
+ cqe->flags = IORING_CQE_F_MORE | ctx_cqe32_flags(cmd_to_io_kiocb(cmd)->ctx);
cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT;
if (ret == SOF_TIMESTAMPING_TX_HARDWARE)
cqe->flags |= IORING_CQE_F_TSTAMP_HW;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 9798d6fb4ec7..ff3364531c77 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -9,7 +9,7 @@
#include <uapi/linux/io_uring.h>
-#include "io_uring.h"
+#include "filetable.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "cancel.h"
@@ -65,15 +65,12 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
unsigned int sq_tail = READ_ONCE(r->sq.tail);
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
- unsigned int cq_shift = 0;
unsigned int sq_shift = 0;
- unsigned int sq_entries, cq_entries;
+ unsigned int sq_entries;
int sq_pid = -1, sq_cpu = -1;
u64 sq_total_time = 0, sq_work_time = 0;
unsigned int i;
- if (ctx->flags & IORING_SETUP_CQE32)
- cq_shift = 1;
if (ctx->flags & IORING_SETUP_SQE128)
sq_shift = 1;
@@ -125,18 +122,23 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "\n");
}
seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
- cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
- for (i = 0; i < cq_entries; i++) {
- unsigned int entry = i + cq_head;
- struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
+ while (cq_head < cq_tail) {
+ struct io_uring_cqe *cqe;
+ bool cqe32 = false;
+ cqe = &r->cqes[(cq_head & cq_mask)];
+ if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32)
+ cqe32 = true;
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
- entry & cq_mask, cqe->user_data, cqe->res,
+ cq_head & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
- if (cq_shift)
+ if (cqe32)
seq_printf(m, ", extra1:%llu, extra2:%llu\n",
cqe->big_cqe[0], cqe->big_cqe[1]);
seq_printf(m, "\n");
+ cq_head++;
+ if (cqe32)
+ cq_head++;
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
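
[Illustrative sketch, not part of the patch.] The rewritten fdinfo walk above also documents the consumer-side contract for IORING_SETUP_CQE_MIXED rings: a CQE flagged with IORING_CQE_F_32 occupies two consecutive 16b slots, so the head advances by two for it. A minimal consumer-style version of the same walk, assuming the usual mmap'd CQ ring fields; consume_cqe() is a placeholder:

	/* Reap CQEs from a ring set up with IORING_SETUP_CQE_MIXED. */
	static unsigned reap_mixed_cqes(struct io_uring_cqe *cqes, unsigned mask,
					unsigned head, unsigned tail)
	{
		while (head != tail) {
			struct io_uring_cqe *cqe = &cqes[head & mask];

			consume_cqe(cqe);	/* placeholder */
			/* a 32b CQE spans two 16b slots */
			head += (cqe->flags & IORING_CQE_F_32) ? 2 : 1;
		}
		return head;	/* caller publishes this with a release store */
	}
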
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 9113a44984f3..64f3bd51c84c 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -43,7 +43,6 @@ void io_futex_cache_free(struct io_ring_ctx *ctx)
static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw)
{
- req->async_data = NULL;
hlist_del_init(&req->hash_node);
io_req_task_complete(req, tw);
}
@@ -54,6 +53,7 @@ static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw)
io_tw_lock(ctx, tw);
io_cache_free(&ctx->futex_cache, req->async_data);
+ io_req_async_data_clear(req, 0);
__io_futex_complete(req, tw);
}
@@ -72,8 +72,7 @@ static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw)
io_req_set_res(req, res, 0);
}
- kfree(req->async_data);
- req->flags &= ~REQ_F_ASYNC_DATA;
+ io_req_async_data_free(req);
__io_futex_complete(req, tw);
}
@@ -232,9 +231,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
io_ring_submit_unlock(ctx, issue_flags);
req_set_fail(req);
io_req_set_res(req, ret, 0);
- kfree(futexv);
- req->async_data = NULL;
- req->flags &= ~REQ_F_ASYNC_DATA;
+ io_req_async_data_free(req);
return IOU_COMPLETE;
}
@@ -310,9 +307,7 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- req->async_data = NULL;
- req->flags &= ~REQ_F_ASYNC_DATA;
- kfree(ifd);
+ io_req_async_data_free(req);
return IOU_COMPLETE;
}
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 17dfaa0395c4..1d03b2fc4b25 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -352,16 +352,16 @@ static void create_worker_cb(struct callback_head *cb)
struct io_wq *wq;
struct io_wq_acct *acct;
- bool do_create = false;
+ bool activated_free_worker, do_create = false;
worker = container_of(cb, struct io_worker, create_work);
wq = worker->wq;
acct = worker->acct;
rcu_read_lock();
- do_create = !io_acct_activate_free_worker(acct);
+ activated_free_worker = io_acct_activate_free_worker(acct);
rcu_read_unlock();
- if (!do_create)
+ if (activated_free_worker)
goto no_need_create;
raw_spin_lock(&acct->workers_lock);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 93633613a165..49ebdeb5b2d9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -79,6 +79,7 @@
#include "io-wq.h"
+#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
@@ -108,9 +109,6 @@
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
-#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
- IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
-
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
@@ -179,6 +177,26 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = {
};
#endif
+static void io_poison_cached_req(struct io_kiocb *req)
+{
+ req->ctx = IO_URING_PTR_POISON;
+ req->tctx = IO_URING_PTR_POISON;
+ req->file = IO_URING_PTR_POISON;
+ req->creds = IO_URING_PTR_POISON;
+ req->io_task_work.func = IO_URING_PTR_POISON;
+ req->apoll = IO_URING_PTR_POISON;
+}
+
+static void io_poison_req(struct io_kiocb *req)
+{
+ io_poison_cached_req(req);
+ req->async_data = IO_URING_PTR_POISON;
+ req->kbuf = IO_URING_PTR_POISON;
+ req->comp_list.next = IO_URING_PTR_POISON;
+ req->file_node = IO_URING_PTR_POISON;
+ req->link = IO_URING_PTR_POISON;
+}
+
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
@@ -235,6 +253,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
+ if (IS_ENABLED(CONFIG_KASAN))
+ io_poison_cached_req(req);
wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}
@@ -290,7 +310,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
- io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_rsrc_cache_free(ctx);
}
@@ -337,9 +356,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_cmd),
sizeof(struct io_async_cmd));
- spin_lock_init(&ctx->msg_lock);
- ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_kiocb), 0);
ret |= io_futex_cache_init(ctx);
ret |= io_rsrc_cache_init(ctx);
if (ret)
@@ -598,27 +614,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{
- size_t cqe_size = sizeof(struct io_uring_cqe);
-
lockdep_assert_held(&ctx->uring_lock);
/* don't abort if we're dying, entries must get freed */
if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
return;
- if (ctx->flags & IORING_SETUP_CQE32)
- cqe_size <<= 1;
-
io_cq_lock(ctx);
while (!list_empty(&ctx->cq_overflow_list)) {
+ size_t cqe_size = sizeof(struct io_uring_cqe);
struct io_uring_cqe *cqe;
struct io_overflow_cqe *ocqe;
+ bool is_cqe32 = false;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
+ if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+ ctx->flags & IORING_SETUP_CQE32) {
+ is_cqe32 = true;
+ cqe_size <<= 1;
+ }
if (!dying) {
- if (!io_get_cqe_overflow(ctx, &cqe, true))
+ if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
break;
memcpy(cqe, &ocqe->cqe, cqe_size);
}
@@ -730,10 +748,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
{
struct io_overflow_cqe *ocqe;
size_t ocq_size = sizeof(struct io_overflow_cqe);
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+ bool is_cqe32 = false;
- if (is_cqe32)
+ if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+ is_cqe32 = true;
ocq_size += sizeof(struct io_uring_cqe);
+ }
ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
@@ -752,11 +772,29 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
}
/*
+ * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
+ * because the ring is a single 16b entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+ if (__io_cqring_events(ctx) < ctx->cq_entries) {
+ struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+ cqe->user_data = 0;
+ cqe->res = 0;
+ cqe->flags = IORING_CQE_F_SKIP;
+ ctx->cached_cq_tail++;
+ return true;
+ }
+ return false;
+}
+
+/*
* writes to the cq entry need to come after reading head; the
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
@@ -770,12 +808,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
return false;
+ /*
+ * Post dummy CQE if a 32b CQE is needed and there's only room for a
+ * 16b CQE before the ring wraps.
+ */
+ if (cqe32 && off + 1 == ctx->cq_entries) {
+ if (!io_fill_nop_cqe(ctx, off))
+ return false;
+ off = 0;
+ }
+
/* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
free = ctx->cq_entries - queued;
/* we need a contiguous range, limit based on the current array offset */
len = min(free, ctx->cq_entries - off);
- if (!len)
+ if (len < (cqe32 + 1))
return false;
if (ctx->flags & IORING_SETUP_CQE32) {
@@ -793,9 +841,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
{
struct io_uring_cqe *cqe;
- if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+ if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
return false;
- if (unlikely(!io_get_cqe(ctx, &cqe)))
+ if (unlikely(!io_get_cqe(ctx, &cqe, true)))
return false;
memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
@@ -806,14 +854,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
+ bool cqe32 = cflags & IORING_CQE_F_32;
struct io_uring_cqe *cqe;
- if (likely(io_get_cqe(ctx, &cqe))) {
+ if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (cqe32) {
WRITE_ONCE(cqe->big_cqe[0], 0);
WRITE_ONCE(cqe->big_cqe[1], 0);
}
@@ -985,7 +1034,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
lockdep_assert_held(&req->ctx->uring_lock);
req_set_fail(req);
- io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED));
+ io_req_set_res(req, res, io_put_kbuf(req, res, NULL));
if (def->fail)
def->fail(req);
io_req_complete_defer(req);
@@ -1406,8 +1455,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw)
void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
{
- io_tw_lock(req->ctx, tw);
- if (unlikely(io_should_terminate_tw()))
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_tw_lock(ctx, tw);
+ if (unlikely(io_should_terminate_tw(ctx)))
io_req_defer_failed(req, -EFAULT);
else if (req->flags & REQ_F_FORCE_ASYNC)
io_queue_iowq(req);
@@ -2003,11 +2054,9 @@ fail:
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
- io_kbuf_recycle(req, 0);
io_req_task_queue(req);
break;
case IO_APOLL_ABORTED:
- io_kbuf_recycle(req, 0);
io_queue_iowq(req);
break;
case IO_APOLL_OK:
@@ -2736,6 +2785,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
if (check_shl_overflow(off, 1, &off))
return SIZE_MAX;
}
+ if (flags & IORING_SETUP_CQE_MIXED) {
+ if (cq_entries < 2)
+ return SIZE_MAX;
+ }
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
@@ -2767,6 +2820,7 @@ static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
+ io_poison_req(req);
kmem_cache_free(req_cachep, req);
nr++;
}
@@ -3047,10 +3101,10 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
- * Use system_unbound_wq to avoid spawning tons of event kworkers
+ * Use system_dfl_wq to avoid spawning tons of event kworkers
* if we're exiting a ton of rings at the same time. It just adds
* noise and overhead, there's no discernable change in runtime
- * over using system_wq.
+ * over using system_percpu_wq.
*/
queue_work(iou_wq, &ctx->exit_work);
}
@@ -3404,12 +3458,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
struct file *file;
long ret;
- if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
- IORING_ENTER_REGISTERED_RING |
- IORING_ENTER_ABS_TIMER |
- IORING_ENTER_EXT_ARG_REG |
- IORING_ENTER_NO_IOWAIT)))
+ if (unlikely(flags & ~IORING_ENTER_FLAGS))
return -EINVAL;
/*
@@ -3659,6 +3708,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
!(flags & IORING_SETUP_SINGLE_ISSUER))
return -EINVAL;
+ /*
+ * Nonsensical to ask for CQE32 and mixed CQE support, it's not
+ * supported to post 16b CQEs on a ring setup with CQE32.
+ */
+ if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+ (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+ return -EINVAL;
+
return 0;
}
@@ -3809,15 +3866,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
- p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
- IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
- IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
- IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
- IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT;
+ p->features = IORING_FEAT_FLAGS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -3825,8 +3874,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
}
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
- && !(ctx->flags & IORING_SETUP_R_DISABLED))
- WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
+ && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
+ /*
+ * Unlike io_register_enable_rings(), don't need WRITE_ONCE()
+ * since ctx isn't yet accessible from other tasks
+ */
+ ctx->submitter_task = get_task_struct(current);
+ }
file = io_uring_get_file(ctx);
if (IS_ERR(file)) {
@@ -3877,17 +3931,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
return -EINVAL;
}
- if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
- IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
- IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
- IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
- IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
- IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
- IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+ if (p.flags & ~IORING_SETUP_FLAGS)
return -EINVAL;
-
return io_uring_create(entries, &p, params);
}
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index abc6de227f74..46d9141d772a 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -11,13 +11,69 @@
#include "alloc_cache.h"
#include "io-wq.h"
#include "slist.h"
-#include "filetable.h"
#include "opdef.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif
+#define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\
+ IORING_FEAT_NODROP |\
+ IORING_FEAT_SUBMIT_STABLE |\
+ IORING_FEAT_RW_CUR_POS |\
+ IORING_FEAT_CUR_PERSONALITY |\
+ IORING_FEAT_FAST_POLL |\
+ IORING_FEAT_POLL_32BITS |\
+ IORING_FEAT_SQPOLL_NONFIXED |\
+ IORING_FEAT_EXT_ARG |\
+ IORING_FEAT_NATIVE_WORKERS |\
+ IORING_FEAT_RSRC_TAGS |\
+ IORING_FEAT_CQE_SKIP |\
+ IORING_FEAT_LINKED_FILE |\
+ IORING_FEAT_REG_REG_RING |\
+ IORING_FEAT_RECVSEND_BUNDLE |\
+ IORING_FEAT_MIN_TIMEOUT |\
+ IORING_FEAT_RW_ATTR |\
+ IORING_FEAT_NO_IOWAIT)
+
+#define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\
+ IORING_SETUP_SQPOLL |\
+ IORING_SETUP_SQ_AFF |\
+ IORING_SETUP_CQSIZE |\
+ IORING_SETUP_CLAMP |\
+ IORING_SETUP_ATTACH_WQ |\
+ IORING_SETUP_R_DISABLED |\
+ IORING_SETUP_SUBMIT_ALL |\
+ IORING_SETUP_COOP_TASKRUN |\
+ IORING_SETUP_TASKRUN_FLAG |\
+ IORING_SETUP_SQE128 |\
+ IORING_SETUP_CQE32 |\
+ IORING_SETUP_SINGLE_ISSUER |\
+ IORING_SETUP_DEFER_TASKRUN |\
+ IORING_SETUP_NO_MMAP |\
+ IORING_SETUP_REGISTERED_FD_ONLY |\
+ IORING_SETUP_NO_SQARRAY |\
+ IORING_SETUP_HYBRID_IOPOLL |\
+ IORING_SETUP_CQE_MIXED)
+
+#define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
+ IORING_ENTER_SQ_WAKEUP |\
+ IORING_ENTER_SQ_WAIT |\
+ IORING_ENTER_EXT_ARG |\
+ IORING_ENTER_REGISTERED_RING |\
+ IORING_ENTER_ABS_TIMER |\
+ IORING_ENTER_EXT_ARG_REG |\
+ IORING_ENTER_NO_IOWAIT)
+
+
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE |\
+ IOSQE_IO_DRAIN |\
+ IOSQE_IO_LINK |\
+ IOSQE_IO_HARDLINK |\
+ IOSQE_ASYNC |\
+ IOSQE_BUFFER_SELECT |\
+ IOSQE_CQE_SKIP_SUCCESS)
+
enum {
IOU_COMPLETE = 0,
@@ -75,7 +131,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset);
int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
@@ -169,25 +225,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
struct io_uring_cqe **ret,
- bool overflow)
+ bool overflow, bool cqe32)
{
io_lockdep_assert_cq_locked(ctx);
- if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
- if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+ if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
+ if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
return false;
}
*ret = ctx->cqe_cached;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
- if (ctx->flags & IORING_SETUP_CQE32)
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ ctx->cqe_cached++;
+ } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
ctx->cqe_cached++;
+ ctx->cached_cq_tail++;
+ }
+ WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
return true;
}
-static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
+ bool cqe32)
{
- return io_get_cqe_overflow(ctx, ret, false);
+ return io_get_cqe_overflow(ctx, ret, false, cqe32);
}
static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
@@ -196,25 +258,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
io_lockdep_assert_cq_locked(ctx);
ctx->submit_state.cq_flush = true;
- return io_get_cqe(ctx, cqe_ret);
+ return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED);
}
static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
+ bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
struct io_uring_cqe *cqe;
/*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
+ * If we can't get a cq entry, userspace overflowed the submission
+ * (by quite a lot).
*/
- if (unlikely(!io_get_cqe(ctx, &cqe)))
+ if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32)))
return false;
-
memcpy(cqe, &req->cqe, sizeof(*cqe));
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) {
memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
@@ -239,6 +300,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
req->cqe.flags = cflags;
}
+static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx)
+{
+ if (ctx->flags & IORING_SETUP_CQE_MIXED)
+ return IORING_CQE_F_32;
+ return 0;
+}
+
+static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags,
+ __u64 extra1, __u64 extra2)
+{
+ req->cqe.res = res;
+ req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx);
+ req->big_cqe.extra1 = extra1;
+ req->big_cqe.extra2 = extra2;
+}
+
static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
struct io_kiocb *req)
{
@@ -260,6 +337,19 @@ static inline bool req_has_async_data(struct io_kiocb *req)
return req->flags & REQ_F_ASYNC_DATA;
}
+static inline void io_req_async_data_clear(struct io_kiocb *req,
+ io_req_flags_t extra_flags)
+{
+ req->flags &= ~(REQ_F_ASYNC_DATA|extra_flags);
+ req->async_data = NULL;
+}
+
+static inline void io_req_async_data_free(struct io_kiocb *req)
+{
+ kfree(req->async_data);
+ io_req_async_data_clear(req, 0);
+}
+
static inline void io_put_file(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_FIXED_FILE) && req->file)
@@ -476,9 +566,9 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
* 2) PF_KTHREAD is set, in which case the invoker of the task_work is
* our fallback task_work.
*/
-static inline bool io_should_terminate_tw(void)
+static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx)
{
- return current->flags & (PF_KTHREAD | PF_EXITING);
+ return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs);
}
static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
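
[Illustrative sketch, not part of the patch.] io_req_set_res32() together with ctx_cqe32_flags() lets an opcode stage a 32b completion that works on both CQE32 and CQE_MIXED rings; on a mixed ring the CQE is tagged with IORING_CQE_F_32 so io_fill_cqe_req() and the consumer account for the second slot. A hypothetical handler (io_example_issue() and the extra values are made up; it assumes the ring was created with IORING_SETUP_CQE32 or IORING_SETUP_CQE_MIXED, as the nop.c change below checks at prep time):

	static int io_example_issue(struct io_kiocb *req, unsigned int issue_flags)
	{
		/* two extra u64s that end up in cqe->big_cqe[0]/[1] */
		u64 extra1 = 1, extra2 = 2;

		io_req_set_res32(req, 0, 0, extra1, extra2);
		return IOU_COMPLETE;
	}
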
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 19a8bde5e1e1..aad655e38672 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -155,19 +155,19 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
return 1;
}
-static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
- struct io_buffer_list *bl,
- unsigned int issue_flags)
+static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
{
struct io_uring_buf_ring *br = bl->buf_ring;
__u16 tail, head = bl->head;
+ struct io_br_sel sel = { };
struct io_uring_buf *buf;
- void __user *ret;
u32 buf_len;
tail = smp_load_acquire(&br->tail);
if (unlikely(tail == head))
- return NULL;
+ return sel;
if (head + 1 == tail)
req->flags |= REQ_F_BL_EMPTY;
@@ -177,9 +177,9 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
if (*len == 0 || *len > buf_len)
*len = buf_len;
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
- req->buf_list = bl;
req->buf_index = buf->bid;
- ret = u64_to_user_ptr(buf->addr);
+ sel.buf_list = bl;
+ sel.addr = u64_to_user_ptr(buf->addr);
if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
/*
@@ -192,30 +192,30 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
* the transfer completes (or if we get -EAGAIN and must poll of
* retry).
*/
- io_kbuf_commit(req, bl, *len, 1);
- req->buf_list = NULL;
+ io_kbuf_commit(req, sel.buf_list, *len, 1);
+ sel.buf_list = NULL;
}
- return ret;
+ return sel;
}
-void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned buf_group, unsigned int issue_flags)
+struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
+ unsigned buf_group, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_br_sel sel = { };
struct io_buffer_list *bl;
- void __user *ret = NULL;
io_ring_submit_lock(req->ctx, issue_flags);
bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
if (bl->flags & IOBL_BUF_RING)
- ret = io_ring_buffer_select(req, len, bl, issue_flags);
+ sel = io_ring_buffer_select(req, len, bl, issue_flags);
else
- ret = io_provided_buffer_select(req, len, bl);
+ sel.addr = io_provided_buffer_select(req, len, bl);
}
io_ring_submit_unlock(req->ctx, issue_flags);
- return ret;
+ return sel;
}
/* cap it at a reasonable 256, will be one page even for 4K */
@@ -300,24 +300,22 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
req->flags |= REQ_F_BL_EMPTY;
req->flags |= REQ_F_BUFFER_RING;
- req->buf_list = bl;
return iov - arg->iovs;
}
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
- unsigned int issue_flags)
+ struct io_br_sel *sel, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
int ret = -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
- bl = io_buffer_get_list(ctx, arg->buf_group);
- if (unlikely(!bl))
+ sel->buf_list = io_buffer_get_list(ctx, arg->buf_group);
+ if (unlikely(!sel->buf_list))
goto out_unlock;
- if (bl->flags & IOBL_BUF_RING) {
- ret = io_ring_buffers_peek(req, arg, bl);
+ if (sel->buf_list->flags & IOBL_BUF_RING) {
+ ret = io_ring_buffers_peek(req, arg, sel->buf_list);
/*
* Don't recycle these buffers if we need to go through poll.
* Nobody else can use them anyway, and holding on to provided
@@ -327,17 +325,21 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
*/
if (ret > 0) {
req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
- io_kbuf_commit(req, bl, arg->out_len, ret);
+ io_kbuf_commit(req, sel->buf_list, arg->out_len, ret);
}
} else {
- ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
+ ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs);
}
out_unlock:
- io_ring_submit_unlock(ctx, issue_flags);
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ sel->buf_list = NULL;
+ mutex_unlock(&ctx->uring_lock);
+ }
return ret;
}
-int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
+int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
+ struct io_br_sel *sel)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
@@ -353,16 +355,18 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
ret = io_ring_buffers_peek(req, arg, bl);
if (ret > 0)
req->flags |= REQ_F_BUFFERS_COMMIT;
+ sel->buf_list = bl;
return ret;
}
/* don't support multiple buffer selections for legacy */
+ sel->buf_list = NULL;
return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
}
-static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
+static inline bool __io_put_kbuf_ring(struct io_kiocb *req,
+ struct io_buffer_list *bl, int len, int nr)
{
- struct io_buffer_list *bl = req->buf_list;
bool ret = true;
if (bl)
@@ -372,7 +376,8 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
return ret;
}
-unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs)
+unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl,
+ int len, int nbufs)
{
unsigned int ret;
@@ -383,7 +388,7 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs)
return ret;
}
- if (!__io_put_kbuf_ring(req, len, nbufs))
+ if (!__io_put_kbuf_ring(req, bl, len, nbufs))
ret |= IORING_CQE_F_BUF_MORE;
return ret;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 723d0361898e..ada382ff38d7 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -62,11 +62,12 @@ struct buf_sel_arg {
unsigned short partial_map;
};
-void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned buf_group, unsigned int issue_flags);
+struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
+ unsigned buf_group, unsigned int issue_flags);
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
- unsigned int issue_flags);
-int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
+ struct io_br_sel *sel, unsigned int issue_flags);
+int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
+ struct io_br_sel *sel);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
@@ -80,23 +81,18 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
void io_kbuf_drop_legacy(struct io_kiocb *req);
-unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs);
+unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl,
+ int len, int nbufs);
bool io_kbuf_commit(struct io_kiocb *req,
struct io_buffer_list *bl, int len, int nr);
struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
unsigned int bgid);
-static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
+static inline bool io_kbuf_recycle_ring(struct io_kiocb *req,
+ struct io_buffer_list *bl)
{
- /*
- * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
- * the flag and hence ensure that bl->head doesn't get incremented.
- * If the tail has already been incremented, hang on to it.
- * The exception is partial io, that case we should increment bl->head
- * to monopolize the buffer.
- */
- if (req->buf_list) {
+ if (bl) {
req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
return true;
}
@@ -110,30 +106,31 @@ static inline bool io_do_buffer_select(struct io_kiocb *req)
return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
}
-static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
+static inline bool io_kbuf_recycle(struct io_kiocb *req, struct io_buffer_list *bl,
+ unsigned issue_flags)
{
if (req->flags & REQ_F_BL_NO_RECYCLE)
return false;
+ if (req->flags & REQ_F_BUFFER_RING)
+ return io_kbuf_recycle_ring(req, bl);
if (req->flags & REQ_F_BUFFER_SELECTED)
return io_kbuf_recycle_legacy(req, issue_flags);
- if (req->flags & REQ_F_BUFFER_RING)
- return io_kbuf_recycle_ring(req);
return false;
}
static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len,
- unsigned issue_flags)
+ struct io_buffer_list *bl)
{
if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED)))
return 0;
- return __io_put_kbufs(req, len, 1);
+ return __io_put_kbufs(req, bl, len, 1);
}
static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
- int nbufs, unsigned issue_flags)
+ struct io_buffer_list *bl, int nbufs)
{
if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED)))
return 0;
- return __io_put_kbufs(req, len, nbufs);
+ return __io_put_kbufs(req, bl, len, nbufs);
}
#endif
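
[Illustrative sketch, not part of the patch.] The kbuf API now hands the selected buffer list back to the issuer in a struct io_br_sel on the stack instead of stashing it in req->buf_list. A hypothetical handler showing the new calling pattern (io_example_recv(), do_transfer() and the buf_group value are placeholders; the net.c conversion below is the real thing):

	static int io_example_recv(struct io_kiocb *req, unsigned int issue_flags)
	{
		struct io_br_sel sel = { };
		size_t len = 0;
		int ret;

		if (io_do_buffer_select(req)) {
			/* buf_group 0 is illustrative; real opcodes store it in their cmd */
			sel = io_buffer_select(req, &len, 0, issue_flags);
			if (!sel.addr)
				return -ENOBUFS;
		}

		ret = do_transfer(req, sel.addr, len);	/* placeholder */
		if (ret == -EAGAIN) {
			/* recycling needs the list handle from sel, not the request */
			io_kbuf_recycle(req, sel.buf_list, issue_flags);
			return IOU_RETRY;
		}
		io_req_set_res(req, ret, io_put_kbuf(req, ret, sel.buf_list));
		return IOU_COMPLETE;
	}
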
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 4c2578f2efcb..5e5b94236d72 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -11,7 +11,6 @@
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
-#include "alloc_cache.h"
#include "msg_ring.h"
/* All valid masks for MSG_RING */
@@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
struct io_ring_ctx *ctx = req->ctx;
io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
- if (spin_trylock(&ctx->msg_lock)) {
- if (io_alloc_cache_put(&ctx->msg_cache, req))
- req = NULL;
- spin_unlock(&ctx->msg_lock);
- }
- if (req)
- kfree_rcu(req, rcu_head);
+ kfree_rcu(req, rcu_head);
percpu_ref_put(&ctx->refs);
}
@@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
return 0;
}
-static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
-{
- struct io_kiocb *req = NULL;
-
- if (spin_trylock(&ctx->msg_lock)) {
- req = io_alloc_cache_get(&ctx->msg_cache);
- spin_unlock(&ctx->msg_lock);
- if (req)
- return req;
- }
- return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
-}
-
static int io_msg_data_remote(struct io_ring_ctx *target_ctx,
struct io_msg *msg)
{
struct io_kiocb *target;
u32 flags = 0;
- target = io_msg_get_kiocb(target_ctx);
+ target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
if (unlikely(!target))
return -ENOMEM;
diff --git a/io_uring/net.c b/io_uring/net.c
index d69f2afa4f7a..f99b90c762fc 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -10,6 +10,7 @@
#include <uapi/linux/io_uring.h>
+#include "filetable.h"
#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
@@ -178,10 +179,8 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
io_vec_free(&hdr->vec);
- if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
- req->async_data = NULL;
- req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
- }
+ if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
+ io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
}
static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
@@ -433,7 +432,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->opcode == IORING_OP_SENDMSG)
return -EINVAL;
sr->msg_flags |= MSG_WAITALL;
- req->buf_list = NULL;
req->flags |= REQ_F_MULTISHOT;
}
@@ -494,29 +492,29 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
return nbufs;
}
-static int io_net_kbuf_recyle(struct io_kiocb *req,
+static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
struct io_async_msghdr *kmsg, int len)
{
req->flags |= REQ_F_BL_NO_RECYCLE;
if (req->flags & REQ_F_BUFFERS_COMMIT)
- io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len));
+ io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
return IOU_RETRY;
}
-static inline bool io_send_finish(struct io_kiocb *req, int *ret,
+static inline bool io_send_finish(struct io_kiocb *req,
struct io_async_msghdr *kmsg,
- unsigned issue_flags)
+ struct io_br_sel *sel)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- bool bundle_finished = *ret <= 0;
+ bool bundle_finished = sel->val <= 0;
unsigned int cflags;
if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
- cflags = io_put_kbuf(req, *ret, issue_flags);
+ cflags = io_put_kbuf(req, sel->val, sel->buf_list);
goto finish;
}
- cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);
+ cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
goto finish;
@@ -525,15 +523,15 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
* Fill CQE for this receive and see if we should keep trying to
* receive from this socket.
*/
- if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
+ if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
io_mshot_prep_retry(req, kmsg);
return false;
}
/* Otherwise stop bundle and use the current result. */
finish:
- io_req_set_res(req, *ret, cflags);
- *ret = IOU_COMPLETE;
+ io_req_set_res(req, sel->val, cflags);
+ sel->val = IOU_COMPLETE;
return true;
}
@@ -571,7 +569,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg->msg.msg_controllen = 0;
kmsg->msg.msg_control = NULL;
sr->done_io += ret;
- return io_net_kbuf_recyle(req, kmsg, ret);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -587,17 +585,16 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
}
static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
- struct io_async_msghdr *kmsg)
+ struct io_br_sel *sel, struct io_async_msghdr *kmsg)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-
- int ret;
struct buf_sel_arg arg = {
.iovs = &kmsg->fast_iov,
.max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1,
.buf_group = sr->buf_group,
};
+ int ret;
if (kmsg->vec.iovec) {
arg.nr_iovs = kmsg->vec.nr;
@@ -610,7 +607,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
else
arg.mode |= KBUF_MODE_EXPAND;
- ret = io_buffers_select(req, &arg, issue_flags);
+ ret = io_buffers_select(req, &arg, sel, issue_flags);
if (unlikely(ret < 0))
return ret;
@@ -639,6 +636,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
+ struct io_br_sel sel = { };
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -657,8 +655,9 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
flags |= MSG_DONTWAIT;
retry_bundle:
+ sel.buf_list = NULL;
if (io_do_buffer_select(req)) {
- ret = io_send_select_buffer(req, issue_flags, kmsg);
+ ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
if (ret)
return ret;
}
@@ -682,7 +681,7 @@ retry_bundle:
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
- return io_net_kbuf_recyle(req, kmsg, ret);
+ return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -693,11 +692,12 @@ retry_bundle:
else if (sr->done_io)
ret = sr->done_io;
- if (!io_send_finish(req, &ret, kmsg, issue_flags))
+ sel.val = ret;
+ if (!io_send_finish(req, kmsg, &sel))
goto retry_bundle;
io_req_msg_cleanup(req, issue_flags);
- return ret;
+ return sel.val;
}
static int io_recvmsg_mshot_prep(struct io_kiocb *req,
@@ -794,18 +794,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->flags |= REQ_F_NOWAIT;
if (sr->msg_flags & MSG_ERRQUEUE)
req->flags |= REQ_F_CLEAR_POLLIN;
- if (req->flags & REQ_F_BUFFER_SELECT) {
- /*
- * Store the buffer group for this multishot receive separately,
- * as if we end up doing an io-wq based issue that selects a
- * buffer, it has to be committed immediately and that will
- * clear ->buf_list. This means we lose the link to the buffer
- * list, and the eventual buffer put on completion then cannot
- * restore it.
- */
+ if (req->flags & REQ_F_BUFFER_SELECT)
sr->buf_group = req->buf_index;
- req->buf_list = NULL;
- }
sr->mshot_total_len = sr->mshot_len = 0;
if (sr->flags & IORING_RECV_MULTISHOT) {
if (!(req->flags & REQ_F_BUFFER_SELECT))
@@ -846,9 +836,10 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
* Returns true if it is actually finished, or false if it should run
* again (for multishot).
*/
-static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
+static inline bool io_recv_finish(struct io_kiocb *req,
struct io_async_msghdr *kmsg,
- bool mshot_finished, unsigned issue_flags)
+ struct io_br_sel *sel, bool mshot_finished,
+ unsigned issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
unsigned int cflags = 0;
@@ -856,13 +847,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
- if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
+ if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
/*
* If sr->len hits zero, the limit has been reached. Mark
* mshot as finished, and flag MSHOT_DONE as well to prevent
* a potential bundle from being retried.
*/
- sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len);
+ sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
if (!sr->mshot_total_len) {
sr->flags |= IORING_RECV_MSHOT_DONE;
mshot_finished = true;
@@ -870,13 +861,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
}
if (sr->flags & IORING_RECVSEND_BUNDLE) {
- size_t this_ret = *ret - sr->done_io;
+ size_t this_ret = sel->val - sr->done_io;
- cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
- issue_flags);
+ cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
if (sr->flags & IORING_RECV_RETRY)
cflags = req->cqe.flags | (cflags & CQE_F_MASK);
- if (sr->mshot_len && *ret >= sr->mshot_len)
+ if (sr->mshot_len && sel->val >= sr->mshot_len)
sr->flags |= IORING_RECV_MSHOT_CAP;
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
@@ -895,7 +885,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
return false;
}
} else {
- cflags |= io_put_kbuf(req, *ret, issue_flags);
+ cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
}
/*
@@ -903,8 +893,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
* receive from this socket.
*/
if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
- io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
- *ret = IOU_RETRY;
+ io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
+ sel->val = IOU_RETRY;
io_mshot_prep_retry(req, kmsg);
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
@@ -916,15 +906,15 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
sr->nr_multishot_loops = 0;
sr->flags &= ~IORING_RECV_MSHOT_CAP;
if (issue_flags & IO_URING_F_MULTISHOT)
- *ret = IOU_REQUEUE;
+ sel->val = IOU_REQUEUE;
}
return true;
}
/* Finish the request / stop multishot. */
finish:
- io_req_set_res(req, *ret, cflags);
- *ret = IOU_COMPLETE;
+ io_req_set_res(req, sel->val, cflags);
+ sel->val = IOU_COMPLETE;
io_req_msg_cleanup(req, issue_flags);
return true;
}
@@ -1017,6 +1007,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
+ struct io_br_sel sel = { };
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
@@ -1036,23 +1027,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
flags |= MSG_DONTWAIT;
retry_multishot:
+ sel.buf_list = NULL;
if (io_do_buffer_select(req)) {
- void __user *buf;
size_t len = sr->len;
- buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
- if (!buf)
+ sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
+ if (!sel.addr)
return -ENOBUFS;
if (req->flags & REQ_F_APOLL_MULTISHOT) {
- ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
+ ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
if (ret) {
- io_kbuf_recycle(req, issue_flags);
+ io_kbuf_recycle(req, sel.buf_list, issue_flags);
return ret;
}
}
- iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
+ iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
}
kmsg->msg.msg_get_inq = 1;
@@ -1071,14 +1062,12 @@ retry_multishot:
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
- if (issue_flags & IO_URING_F_MULTISHOT)
- io_kbuf_recycle(req, issue_flags);
-
+ io_kbuf_recycle(req, sel.buf_list, issue_flags);
return IOU_RETRY;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
- return io_net_kbuf_recyle(req, kmsg, ret);
+ return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1092,16 +1081,17 @@ retry_multishot:
else if (sr->done_io)
ret = sr->done_io;
else
- io_kbuf_recycle(req, issue_flags);
+ io_kbuf_recycle(req, sel.buf_list, issue_flags);
- if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
+ sel.val = ret;
+ if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
goto retry_multishot;
- return ret;
+ return sel.val;
}
static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
- size_t *len, unsigned int issue_flags)
+ struct io_br_sel *sel, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
int ret;
@@ -1126,15 +1116,15 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
arg.mode |= KBUF_MODE_FREE;
}
- if (*len)
- arg.max_len = *len;
+ if (sel->val)
+ arg.max_len = sel->val;
else if (kmsg->msg.msg_inq > 1)
- arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq);
+ arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);
/* if mshot limited, ensure we don't go over */
if (sr->flags & IORING_RECV_MSHOT_LIM)
arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
- ret = io_buffers_peek(req, &arg);
+ ret = io_buffers_peek(req, &arg, sel);
if (unlikely(ret < 0))
return ret;
@@ -1155,14 +1145,13 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
arg.out_len);
} else {
- void __user *buf;
+ size_t len = sel->val;
- *len = sr->len;
- buf = io_buffer_select(req, len, sr->buf_group, issue_flags);
- if (!buf)
+ *sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
+ if (!sel->addr)
return -ENOBUFS;
- sr->buf = buf;
- sr->len = *len;
+ sr->buf = sel->addr;
+ sr->len = len;
map_ubuf:
ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
&kmsg->msg.msg_iter);
@@ -1177,11 +1166,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
+ struct io_br_sel sel;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- size_t len = sr->len;
bool mshot_finished;
if (!(req->flags & REQ_F_POLLED) &&
@@ -1197,9 +1186,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
flags |= MSG_DONTWAIT;
retry_multishot:
+ sel.buf_list = NULL;
if (io_do_buffer_select(req)) {
- ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
- if (unlikely(ret)) {
+ sel.val = sr->len;
+ ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
+ if (unlikely(ret < 0)) {
kmsg->msg.msg_inq = -1;
goto out_free;
}
@@ -1215,16 +1206,14 @@ retry_multishot:
ret = sock_recvmsg(sock, &kmsg->msg, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
- if (issue_flags & IO_URING_F_MULTISHOT)
- io_kbuf_recycle(req, issue_flags);
-
+ io_kbuf_recycle(req, sel.buf_list, issue_flags);
return IOU_RETRY;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
- return io_net_kbuf_recyle(req, kmsg, ret);
+ return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1240,12 +1229,13 @@ out_free:
else if (sr->done_io)
ret = sr->done_io;
else
- io_kbuf_recycle(req, issue_flags);
+ io_kbuf_recycle(req, sel.buf_list, issue_flags);
- if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
+ sel.val = ret;
+ if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
goto retry_multishot;
- return ret;
+ return sel.val;
}
int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1505,7 +1495,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
zc->len -= ret;
zc->buf += ret;
zc->done_io += ret;
- return io_net_kbuf_recyle(req, kmsg, ret);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1575,7 +1565,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
- return io_net_kbuf_recyle(req, kmsg, ret);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 20ed0f85b1c2..3caf07878f8a 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -17,11 +17,13 @@ struct io_nop {
int result;
int fd;
unsigned int flags;
+ __u64 extra1;
+ __u64 extra2;
};
#define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \
IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \
- IORING_NOP_TW)
+ IORING_NOP_TW | IORING_NOP_CQE32)
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -41,6 +43,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
nop->fd = -1;
if (nop->flags & IORING_NOP_FIXED_BUFFER)
req->buf_index = READ_ONCE(sqe->buf_index);
+ if (nop->flags & IORING_NOP_CQE32) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+ return -EINVAL;
+ nop->extra1 = READ_ONCE(sqe->off);
+ nop->extra2 = READ_ONCE(sqe->addr);
+ }
return 0;
}
@@ -68,7 +78,10 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
done:
if (ret < 0)
req_set_fail(req);
- io_req_set_res(req, nop->result, 0);
+ if (nop->flags & IORING_NOP_CQE32)
+ io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2);
+ else
+ io_req_set_res(req, nop->result, 0);
if (nop->flags & IORING_NOP_TW) {
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 9a6f6e92d742..d8ba1165c949 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -14,10 +14,15 @@ static const struct ubuf_info_ops io_ubuf_ops;
static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw)
{
struct io_notif_data *nd = io_notif_to_data(notif);
+ struct io_ring_ctx *ctx = notif->ctx;
+
+ lockdep_assert_held(&ctx->uring_lock);
do {
notif = cmd_to_io_kiocb(nd);
+ if (WARN_ON_ONCE(ctx != notif->ctx))
+ return;
lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0);
if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used))
@@ -85,7 +90,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
return -EEXIST;
prev_nd = container_of(prev_uarg, struct io_notif_data, uarg);
- prev_notif = cmd_to_io_kiocb(nd);
+ prev_notif = cmd_to_io_kiocb(prev_nd);
/* make sure all notifications can be finished in the same task_work */
if (unlikely(notif->ctx != prev_notif->ctx ||
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9568785810d9..932319633eac 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -413,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = {
#endif
},
[IORING_OP_URING_CMD] = {
+ .buffer_select = 1,
.needs_file = 1,
.plug = 1,
.iopoll = 1,
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index d70700e5cef8..bfeb91b31bba 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -14,6 +14,7 @@
#include "../fs/internal.h"
+#include "filetable.h"
#include "io_uring.h"
#include "rsrc.h"
#include "openclose.h"
diff --git a/io_uring/poll.c b/io_uring/poll.c
index c786e587563b..b9681d0f9f13 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
{
int v;
- if (unlikely(io_should_terminate_tw()))
+ if (unlikely(io_should_terminate_tw(req->ctx)))
return -ECANCELED;
do {
@@ -316,10 +316,8 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw)
ret = io_poll_check_events(req, tw);
if (ret == IOU_POLL_NO_ACTION) {
- io_kbuf_recycle(req, 0);
return;
} else if (ret == IOU_POLL_REQUEUE) {
- io_kbuf_recycle(req, 0);
__io_poll_execute(req, 0);
return;
}
@@ -686,8 +684,6 @@ int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask)
req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
- io_kbuf_recycle(req, issue_flags);
-
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
if (ret)
return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED;
diff --git a/io_uring/query.c b/io_uring/query.c
new file mode 100644
index 000000000000..645301bd2c82
--- /dev/null
+++ b/io_uring/query.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "linux/io_uring/query.h"
+
+#include "query.h"
+#include "io_uring.h"
+
+#define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode))
+#define IO_MAX_QUERY_ENTRIES 1000
+
+static ssize_t io_query_ops(void *data)
+{
+ struct io_uring_query_opcode *e = data;
+
+ BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE);
+
+ e->nr_request_opcodes = IORING_OP_LAST;
+ e->nr_register_opcodes = IORING_REGISTER_LAST;
+ e->feature_flags = IORING_FEAT_FLAGS;
+ e->ring_setup_flags = IORING_SETUP_FLAGS;
+ e->enter_flags = IORING_ENTER_FLAGS;
+ e->sqe_flags = SQE_VALID_FLAGS;
+ return sizeof(*e);
+}
+
+static int io_handle_query_entry(struct io_ring_ctx *ctx,
+ void *data, void __user *uhdr,
+ u64 *next_entry)
+{
+ struct io_uring_query_hdr hdr;
+ size_t usize, res_size = 0;
+ ssize_t ret = -EINVAL;
+ void __user *udata;
+
+ if (copy_from_user(&hdr, uhdr, sizeof(hdr)))
+ return -EFAULT;
+ usize = hdr.size;
+ hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE);
+ udata = u64_to_user_ptr(hdr.query_data);
+
+ if (hdr.query_op >= __IO_URING_QUERY_MAX) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size)
+ goto out;
+ if (copy_from_user(data, udata, hdr.size))
+ return -EFAULT;
+
+ switch (hdr.query_op) {
+ case IO_URING_QUERY_OPCODES:
+ ret = io_query_ops(data);
+ break;
+ }
+
+ if (ret >= 0) {
+ if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE))
+ return -EFAULT;
+ res_size = ret;
+ ret = 0;
+ }
+out:
+ hdr.result = ret;
+ hdr.size = min_t(size_t, usize, res_size);
+
+ if (copy_struct_to_user(udata, usize, data, hdr.size, NULL))
+ return -EFAULT;
+ if (copy_to_user(uhdr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+ *next_entry = hdr.next_entry;
+ return 0;
+}
+
+int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+ char entry_buffer[IO_MAX_QUERY_SIZE];
+ void __user *uhdr = arg;
+ int ret, nr = 0;
+
+ memset(entry_buffer, 0, sizeof(entry_buffer));
+
+ if (nr_args)
+ return -EINVAL;
+
+ while (uhdr) {
+ u64 next_hdr;
+
+ ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr);
+ if (ret)
+ return ret;
+ uhdr = u64_to_user_ptr(next_hdr);
+
+ /* Have some limit to avoid a potential cycle */
+ if (++nr >= IO_MAX_QUERY_ENTRIES)
+ return -ERANGE;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ cond_resched();
+ }
+ return 0;
+}
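
[Illustrative sketch, not part of the patch.] IORING_REGISTER_QUERY takes a user pointer to a chain of struct io_uring_query_hdr entries (linked via next_entry) and requires nr_args == 0; it is also wired up as a "blind" opcode in register.c below, so it can be issued with a ring fd of -1. A hedged userspace sketch of a single IO_URING_QUERY_OPCODES lookup — the uapi header path, field layout and query_opcodes() helper are assumptions inferred from the kernel-side usage above:

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>
	#include <linux/io_uring/query.h>	/* assumed uapi header location */

	static long query_opcodes(struct io_uring_query_opcode *out)
	{
		struct io_uring_query_hdr hdr;

		memset(&hdr, 0, sizeof(hdr));
		hdr.query_op = IO_URING_QUERY_OPCODES;
		hdr.query_data = (__u64)(unsigned long)out;
		hdr.size = sizeof(*out);
		/* hdr.next_entry == 0 terminates the chain */

		/* nr_args must be 0; fd -1 takes the blind (ringless) register path */
		if (syscall(__NR_io_uring_register, -1, IORING_REGISTER_QUERY, &hdr, 0))
			return -1;
		return hdr.result;	/* 0 on success, negative error otherwise */
	}
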
diff --git a/io_uring/query.h b/io_uring/query.h
new file mode 100644
index 000000000000..171d47ccaaba
--- /dev/null
+++ b/io_uring/query.h
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IORING_QUERY_H
+#define IORING_QUERY_H
+
+#include <linux/io_uring_types.h>
+
+int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args);
+
+#endif
diff --git a/io_uring/register.c b/io_uring/register.c
index a59589249fce..43f04c47522c 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -18,6 +18,7 @@
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
+#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
@@ -31,6 +32,7 @@
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
+#include "query.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -46,13 +48,9 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
nr_args = IORING_OP_LAST;
size = struct_size(p, ops, nr_args);
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
+ p = memdup_user(arg, size);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = -EINVAL;
if (memchr_inv(p, 0, size))
goto out;
@@ -396,7 +394,8 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
- IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+ IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
+ IORING_SETUP_CQE_MIXED)
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
@@ -407,10 +406,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_params p;
int ret;
- /* for single issuer, must be owner resizing */
- if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
- current != ctx->submitter_task)
- return -EEXIST;
/* limited to DEFER_TASKRUN for now */
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
return -EINVAL;
@@ -835,6 +830,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_mem_region(ctx, arg);
break;
+ case IORING_REGISTER_QUERY:
+ ret = io_query(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_ZCRX_REFILL:
+ ret = io_zcrx_return_bufs(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
@@ -877,6 +878,23 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered)
return ERR_PTR(-EOPNOTSUPP);
}
+static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
+{
+ struct io_uring_sqe sqe;
+
+ if (!arg || nr_args != 1)
+ return -EINVAL;
+ if (copy_from_user(&sqe, arg, sizeof(sqe)))
+ return -EFAULT;
+ /* no flags supported */
+ if (sqe.flags)
+ return -EINVAL;
+ if (sqe.opcode != IORING_OP_MSG_RING)
+ return -EINVAL;
+
+ return io_uring_sync_msg_ring(&sqe);
+}
+
/*
* "blind" registration opcodes are ones where there's no ring given, and
* hence the source fd must be -1.
@@ -885,21 +903,11 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
unsigned int nr_args)
{
switch (opcode) {
- case IORING_REGISTER_SEND_MSG_RING: {
- struct io_uring_sqe sqe;
-
- if (!arg || nr_args != 1)
- return -EINVAL;
- if (copy_from_user(&sqe, arg, sizeof(sqe)))
- return -EFAULT;
- /* no flags supported */
- if (sqe.flags)
- return -EINVAL;
- if (sqe.opcode == IORING_OP_MSG_RING)
- return io_uring_sync_msg_ring(&sqe);
- }
+ case IORING_REGISTER_SEND_MSG_RING:
+ return io_uring_register_send_msg_ring(arg, nr_args);
+ case IORING_REGISTER_QUERY:
+ return io_query(NULL, arg, nr_args);
}
-
return -EINVAL;
}
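
Because the blind branch above passes a NULL ctx to io_query(), the query opcode needs no ring at all; as the surrounding comment notes, the source fd is simply -1 in that case. Assuming the same raw syscall and hdr setup as the sketch after query.c:

	/* Blind registration: no io_uring instance required. */
	syscall(__NR_io_uring_register, -1, IORING_REGISTER_QUERY, &hdr, 0);
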
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f75f5e43fa4a..d787c16dc1c3 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -13,6 +13,7 @@
#include <uapi/linux/io_uring.h>
+#include "filetable.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
@@ -1299,10 +1300,17 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
if (src_ctx != ctx) {
mutex_unlock(&ctx->uring_lock);
lock_two_rings(ctx, src_ctx);
+
+ if (src_ctx->submitter_task &&
+ src_ctx->submitter_task != current) {
+ ret = -EEXIST;
+ goto out;
+ }
}
ret = io_clone_buffers(ctx, src_ctx, &buf);
+out:
if (src_ctx != ctx)
mutex_unlock(&src_ctx->uring_lock);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 52a5b950b2e5..08882648d569 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -15,6 +15,7 @@
#include <uapi/linux/io_uring.h>
+#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
@@ -107,34 +108,35 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
}
static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
- struct io_async_rw *io,
- unsigned int issue_flags)
+ struct io_async_rw *io, struct io_br_sel *sel,
+ unsigned int issue_flags)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- void __user *buf = u64_to_user_ptr(rw->addr);
size_t sqe_len = rw->len;
+ sel->addr = u64_to_user_ptr(rw->addr);
if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT))
- return io_import_vec(ddir, req, io, buf, sqe_len);
+ return io_import_vec(ddir, req, io, sel->addr, sqe_len);
if (io_do_buffer_select(req)) {
- buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
- if (!buf)
+ *sel = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
+ if (!sel->addr)
return -ENOBUFS;
- rw->addr = (unsigned long) buf;
+ rw->addr = (unsigned long) sel->addr;
rw->len = sqe_len;
}
- return import_ubuf(ddir, buf, sqe_len, &io->iter);
+ return import_ubuf(ddir, sel->addr, sqe_len, &io->iter);
}
static inline int io_import_rw_buffer(int rw, struct io_kiocb *req,
struct io_async_rw *io,
+ struct io_br_sel *sel,
unsigned int issue_flags)
{
int ret;
- ret = __io_import_rw_buffer(rw, req, io, issue_flags);
+ ret = __io_import_rw_buffer(rw, req, io, sel, issue_flags);
if (unlikely(ret < 0))
return ret;
@@ -153,10 +155,8 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP)
io_vec_free(&rw->vec);
- if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
- req->async_data = NULL;
- req->flags &= ~REQ_F_ASYNC_DATA;
- }
+ if (io_alloc_cache_put(&req->ctx->rw_cache, rw))
+ io_req_async_data_clear(req, 0);
}
static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
@@ -306,10 +306,12 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_rw_do_import(struct io_kiocb *req, int ddir)
{
+ struct io_br_sel sel = { };
+
if (io_do_buffer_select(req))
return 0;
- return io_import_rw_buffer(ddir, req, req->async_data, 0);
+ return io_import_rw_buffer(ddir, req, req->async_data, &sel, 0);
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -576,7 +578,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
- req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0);
+ req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
io_req_rw_cleanup(req, 0);
io_req_task_complete(req, tw);
@@ -645,7 +647,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
}
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
- unsigned int issue_flags)
+ struct io_br_sel *sel, unsigned int issue_flags)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned final_ret = io_fixup_rw_res(req, ret);
@@ -659,7 +661,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
* from the submission path.
*/
io_req_io_end(req);
- io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags));
+ io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list));
io_req_rw_cleanup(req, issue_flags);
return IOU_COMPLETE;
} else {
@@ -886,6 +888,9 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
if (req->flags & REQ_F_HAS_METADATA) {
struct io_async_rw *io = req->async_data;
+ if (!(file->f_mode & FMODE_HAS_METADATA))
+ return -EINVAL;
+
/*
* We have a union of meta fields with wpq used for buffered-io
* in io_async_rw, so fail it here.
@@ -899,7 +904,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
return 0;
}
-static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
+ unsigned int issue_flags)
{
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -913,7 +919,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(ret))
return ret;
} else if (io_do_buffer_select(req)) {
- ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags);
+ ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags);
if (unlikely(ret < 0))
return ret;
}
@@ -1015,18 +1021,22 @@ done:
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
+ struct io_br_sel sel = { };
int ret;
- ret = __io_read(req, issue_flags);
+ ret = __io_read(req, &sel, issue_flags);
if (ret >= 0)
- return kiocb_done(req, ret, issue_flags);
+ return kiocb_done(req, ret, &sel, issue_flags);
+ if (req->flags & REQ_F_BUFFERS_COMMIT)
+ io_kbuf_recycle(req, sel.buf_list, issue_flags);
return ret;
}
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_br_sel sel = { };
unsigned int cflags = 0;
int ret;
@@ -1038,7 +1048,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
/* make it sync, multishot doesn't support async execution */
rw->kiocb.ki_complete = NULL;
- ret = __io_read(req, issue_flags);
+ ret = __io_read(req, &sel, issue_flags);
/*
* If we get -EAGAIN, recycle our buffer and just let normal poll
@@ -1049,15 +1059,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
* Reset rw->len to 0 again to avoid clamping future mshot
* reads, in case the buffer size varies.
*/
- if (io_kbuf_recycle(req, issue_flags))
+ if (io_kbuf_recycle(req, sel.buf_list, issue_flags))
rw->len = 0;
return IOU_RETRY;
} else if (ret <= 0) {
- io_kbuf_recycle(req, issue_flags);
+ io_kbuf_recycle(req, sel.buf_list, issue_flags);
if (ret < 0)
req_set_fail(req);
} else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
- cflags = io_put_kbuf(req, ret, issue_flags);
+ cflags = io_put_kbuf(req, ret, sel.buf_list);
} else {
/*
* Any successful return value will keep the multishot read
@@ -1065,7 +1075,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
* we fail to post a CQE, or multishot is no longer set, then
* jump to the termination path. This request is then done.
*/
- cflags = io_put_kbuf(req, ret, issue_flags);
+ cflags = io_put_kbuf(req, ret, sel.buf_list);
rw->len = 0; /* similarly to above, reset len to 0 */
if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
@@ -1194,7 +1204,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
}
done:
- return kiocb_done(req, ret2, issue_flags);
+ return kiocb_done(req, ret2, NULL, issue_flags);
} else {
ret_eagain:
iov_iter_restore(&io->iter, &io->iter_state);
@@ -1362,7 +1372,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (!smp_load_acquire(&req->iopoll_completed))
break;
nr_events++;
- req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0);
+ req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
if (req->opcode != IORING_OP_URING_CMD)
io_req_rw_cleanup(req, 0);
}
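
The rw.c changes above thread the buffer-selection result through a struct io_br_sel instead of re-deriving it from issue_flags at completion time: sel.addr carries the selected user address, sel.val a result or error, and sel.buf_list the buffer list later handed to io_put_kbuf()/io_kbuf_recycle(). The shape below is only inferred from those accesses, as a reading aid; the authoritative definition lives in the kbuf changes of this series.

	/* Inferred shape, for reading this diff only. */
	struct io_br_sel {
		struct io_buffer_list *buf_list;	/* provided-buffer list, may be NULL */
		union {
			void __user *addr;		/* selected buffer address */
			ssize_t val;			/* result or negative error */
		};
	};
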
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 35ce4e60b495..e81ebbb91925 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -11,6 +11,7 @@
#include <uapi/linux/io_uring.h>
+#include "filetable.h"
#include "io_uring.h"
#include "splice.h"
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 7f13bfa9f2b6..17e3aab0af36 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw)
int ret;
if (prev) {
- if (!io_should_terminate_tw()) {
+ if (!io_should_terminate_tw(req->ctx)) {
struct io_cancel_data cd = {
.ctx = req->ctx,
.data = prev->cqe.user_data,
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 053bac89b6c0..d1e3ba62ee8e 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -11,6 +11,7 @@
#include "io_uring.h"
#include "alloc_cache.h"
#include "rsrc.h"
+#include "kbuf.h"
#include "uring_cmd.h"
#include "poll.h"
@@ -36,8 +37,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) {
ioucmd->sqe = NULL;
- req->async_data = NULL;
- req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
+ io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
}
}
@@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw)
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
unsigned int flags = IO_URING_F_COMPLETE_DEFER;
- if (io_should_terminate_tw())
+ if (io_should_terminate_tw(req->ctx))
flags |= IO_URING_F_TASK_DEAD;
/* task_work executor checks the deferred list completion */
@@ -126,7 +126,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw)
}
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
- void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+ io_uring_cmd_tw_t task_work_cb,
unsigned flags)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
@@ -151,8 +151,8 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
* Called by consumers of io_uring_cmd, if they originally returned
* -EIOCBQUEUED upon receiving the command.
*/
-void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
- unsigned issue_flags)
+void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
+ unsigned issue_flags, bool is_cqe32)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
@@ -165,8 +165,11 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
req_set_fail(req);
io_req_set_res(req, ret, 0);
- if (req->ctx->flags & IORING_SETUP_CQE32)
+ if (is_cqe32) {
+ if (req->ctx->flags & IORING_SETUP_CQE_MIXED)
+ req->cqe.flags |= IORING_CQE_F_32;
io_req_set_cqe32_extra(req, res2, 0);
+ }
io_req_uring_cleanup(req, issue_flags);
if (req->ctx->flags & IORING_SETUP_IOPOLL) {
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
@@ -180,7 +183,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
io_req_task_work_add(req);
}
}
-EXPORT_SYMBOL_GPL(io_uring_cmd_done);
+EXPORT_SYMBOL_GPL(__io_uring_cmd_done);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -194,8 +197,15 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (ioucmd->flags & ~IORING_URING_CMD_MASK)
return -EINVAL;
- if (ioucmd->flags & IORING_URING_CMD_FIXED)
+ if (ioucmd->flags & IORING_URING_CMD_FIXED) {
+ if (ioucmd->flags & IORING_URING_CMD_MULTISHOT)
+ return -EINVAL;
req->buf_index = READ_ONCE(sqe->buf_index);
+ }
+
+ if (!!(ioucmd->flags & IORING_URING_CMD_MULTISHOT) !=
+ !!(req->flags & REQ_F_BUFFER_SELECT))
+ return -EINVAL;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
@@ -234,7 +244,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
if (ctx->flags & IORING_SETUP_SQE128)
issue_flags |= IO_URING_F_SQE128;
- if (ctx->flags & IORING_SETUP_CQE32)
+ if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
issue_flags |= IO_URING_F_CQE32;
if (io_is_compat(ctx))
issue_flags |= IO_URING_F_COMPAT;
@@ -251,6 +261,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
+ if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) {
+ if (ret >= 0)
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
if (ret == -EAGAIN) {
ioucmd->flags |= IORING_URING_CMD_REISSUE;
return ret;
@@ -333,3 +347,54 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd,
return false;
return io_req_post_cqe32(req, cqe);
}
+
+/*
+ * Works together with io_uring_mshot_cmd_post_cqe() to commit the
+ * provided buffer up front
+ */
+struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
+ unsigned buf_group, size_t *len,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+
+ if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT))
+ return (struct io_br_sel) { .val = -EINVAL };
+
+ if (WARN_ON_ONCE(!io_do_buffer_select(req)))
+ return (struct io_br_sel) { .val = -EINVAL };
+
+ return io_buffer_select(req, len, buf_group, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select);
+
+/*
+ * Return true if this multishot uring_cmd needs to be completed; otherwise
+ * the event CQE has been posted successfully.
+ *
+ * This function must be passed the `struct io_br_sel` returned from
+ * io_uring_cmd_buffer_select(), so that the buffer is committed in the same
+ * uring_cmd submission context.
+ */
+bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
+ struct io_br_sel *sel, unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ unsigned int cflags = 0;
+
+ if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT))
+ return true;
+
+ if (sel->val > 0) {
+ cflags = io_put_kbuf(req, sel->val, sel->buf_list);
+ if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE))
+ return false;
+ }
+
+ io_kbuf_recycle(req, sel->buf_list, issue_flags);
+ if (sel->val < 0)
+ req_set_fail(req);
+ io_req_set_res(req, sel->val, cflags);
+ return true;
+}
+EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe);
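
A hedged sketch of how a driver-side multishot uring_cmd handler might use the two helpers above: pick a provided buffer, fill it, and post a CQE carrying IORING_CQE_F_MORE. Only io_uring_cmd_buffer_select(), io_uring_mshot_cmd_post_cqe() and the io_br_sel fields used above come from this patch; demo_post_event(), the source data and the choice of buffer group 0 are placeholders, and the header exposing struct io_br_sel to modules is assumed.

	#include <linux/io_uring/cmd.h>
	#include <linux/uaccess.h>

	/* Returns true when the multishot command should be terminated. */
	static bool demo_post_event(struct io_uring_cmd *cmd, const void *src,
				    size_t len, unsigned int issue_flags)
	{
		struct io_br_sel sel;

		/* len may be clamped to the size of the provided buffer. */
		sel = io_uring_cmd_buffer_select(cmd, 0, &len, issue_flags);
		if (!sel.addr)
			return true;	/* no buffer available */

		/* sel.val becomes the CQE result; a negative value marks failure. */
		if (copy_to_user(sel.addr, src, len))
			sel.val = -EFAULT;
		else
			sel.val = len;

		/* false: CQE posted with IORING_CQE_F_MORE, multishot stays armed. */
		return io_uring_mshot_cmd_post_cqe(cmd, &sel, issue_flags);
	}

On the submission side, io_uring_cmd_prep() above requires IORING_URING_CMD_MULTISHOT to be paired with a buffer-select SQE and rejects combining it with IORING_URING_CMD_FIXED.
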
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index e07a94694397..26c118f3918d 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -37,9 +37,7 @@ static void io_waitid_free(struct io_kiocb *req)
struct io_waitid_async *iwa = req->async_data;
put_pid(iwa->wo.wo_pid);
- kfree(req->async_data);
- req->async_data = NULL;
- req->flags &= ~REQ_F_ASYNC_DATA;
+ io_req_async_data_free(req);
}
static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index e5ff49f3425e..723e4266b91f 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -12,6 +12,7 @@
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
+#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>
@@ -26,6 +27,8 @@
#include "zcrx.h"
#include "rsrc.h"
+#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF)
+
#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
@@ -43,38 +46,42 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio
static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+ unsigned niov_pages_shift;
lockdep_assert(!area->mem.is_dmabuf);
- return area->mem.pages[net_iov_idx(niov)];
+ niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT;
+ return area->mem.pages[net_iov_idx(niov) << niov_pages_shift];
}
static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area,
- struct sg_table *sgt, unsigned long off)
+ struct io_zcrx_area *area)
{
+ unsigned niov_size = 1U << ifq->niov_shift;
+ struct sg_table *sgt = area->mem.sgt;
struct scatterlist *sg;
unsigned i, niov_idx = 0;
for_each_sgtable_dma_sg(sgt, sg, i) {
dma_addr_t dma = sg_dma_address(sg);
unsigned long sg_len = sg_dma_len(sg);
- unsigned long sg_off = min(sg_len, off);
- off -= sg_off;
- sg_len -= sg_off;
- dma += sg_off;
+ if (WARN_ON_ONCE(sg_len % niov_size))
+ return -EINVAL;
while (sg_len && niov_idx < area->nia.num_niovs) {
struct net_iov *niov = &area->nia.niovs[niov_idx];
if (net_mp_niov_set_dma_addr(niov, dma))
return -EFAULT;
- sg_len -= PAGE_SIZE;
- dma += PAGE_SIZE;
+ sg_len -= niov_size;
+ dma += niov_size;
niov_idx++;
}
}
+
+ if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs))
+ return -EFAULT;
return 0;
}
@@ -144,7 +151,6 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
goto err;
}
- mem->dmabuf_offset = off;
mem->size = len;
return 0;
err:
@@ -152,14 +158,6 @@ err:
return ret;
}
-static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
-{
- if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
- return -EINVAL;
- return io_populate_area_dma(ifq, area, area->mem.sgt,
- area->mem.dmabuf_offset);
-}
-
static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
{
struct folio *last_folio = NULL;
@@ -206,6 +204,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
if (ret < 0)
mem->account_pages = 0;
+ mem->sgt = &mem->page_sg_table;
mem->pages = pages;
mem->nr_folios = nr_pages;
mem->size = area_reg->len;
@@ -220,7 +219,8 @@ static void io_release_area_mem(struct io_zcrx_mem *mem)
}
if (mem->pages) {
unpin_user_pages(mem->pages, mem->nr_folios);
- sg_free_table(&mem->page_sg_table);
+ sg_free_table(mem->sgt);
+ mem->sgt = NULL;
kvfree(mem->pages);
}
}
@@ -231,6 +231,13 @@ static int io_import_area(struct io_zcrx_ifq *ifq,
{
int ret;
+ if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
+ return -EINVAL;
+ if (area_reg->rq_area_token)
+ return -EINVAL;
+ if (area_reg->__resv2[0] || area_reg->__resv2[1])
+ return -EINVAL;
+
ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
if (ret)
return ret;
@@ -247,7 +254,7 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
{
int i;
- guard(mutex)(&ifq->dma_lock);
+ guard(mutex)(&ifq->pp_lock);
if (!area->is_mapped)
return;
area->is_mapped = false;
@@ -263,47 +270,42 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
}
}
-static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
-{
- int ret;
-
- ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
- if (ret < 0)
- return ret;
- return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0);
-}
-
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
int ret;
- guard(mutex)(&ifq->dma_lock);
+ guard(mutex)(&ifq->pp_lock);
if (area->is_mapped)
return 0;
- if (area->mem.is_dmabuf)
- ret = io_zcrx_map_area_dmabuf(ifq, area);
- else
- ret = io_zcrx_map_area_umem(ifq, area);
+ if (!area->mem.is_dmabuf) {
+ ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
+ DMA_FROM_DEVICE, IO_DMA_ATTR);
+ if (ret < 0)
+ return ret;
+ }
+ ret = io_populate_area_dma(ifq, area);
if (ret == 0)
area->is_mapped = true;
return ret;
}
-static void io_zcrx_sync_for_device(const struct page_pool *pool,
+static void io_zcrx_sync_for_device(struct page_pool *pool,
struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
dma_addr_t dma_addr;
+ unsigned niov_size;
+
if (!dma_dev_need_sync(pool->p.dev))
return;
+ niov_size = 1U << io_pp_to_ifq(pool)->niov_shift;
dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
- PAGE_SIZE, pool->p.dma_dir);
+ niov_size, pool->p.dma_dir);
#endif
}
@@ -352,7 +354,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
void *ptr;
int ret;
- off = sizeof(struct io_uring);
+ off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
if (size > rd->size)
return -EINVAL;
@@ -367,6 +369,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
ptr = io_region_get_ptr(&ifq->region);
ifq->rq_ring = (struct io_uring *)ptr;
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
+
+ reg->offsets.head = offsetof(struct io_uring, head);
+ reg->offsets.tail = offsetof(struct io_uring, tail);
+ reg->offsets.rqes = off;
return 0;
}
@@ -391,23 +397,22 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
kfree(area);
}
-#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF)
+static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area)
+{
+ if (ifq->area)
+ return -EINVAL;
+ ifq->area = area;
+ return 0;
+}
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area **res,
struct io_uring_zcrx_area_reg *area_reg)
{
struct io_zcrx_area *area;
unsigned nr_iovs;
int i, ret;
- if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
- return -EINVAL;
- if (area_reg->rq_area_token)
- return -EINVAL;
- if (area_reg->__resv2[0] || area_reg->__resv2[1])
- return -EINVAL;
-
ret = -ENOMEM;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
@@ -418,22 +423,23 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
if (ret)
goto err;
- nr_iovs = area->mem.size >> PAGE_SHIFT;
+ ifq->niov_shift = PAGE_SHIFT;
+ nr_iovs = area->mem.size >> ifq->niov_shift;
area->nia.num_niovs = nr_iovs;
ret = -ENOMEM;
area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
- GFP_KERNEL | __GFP_ZERO);
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!area->nia.niovs)
goto err;
area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
- GFP_KERNEL | __GFP_ZERO);
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!area->freelist)
goto err;
area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
- GFP_KERNEL | __GFP_ZERO);
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!area->user_refs)
goto err;
@@ -451,8 +457,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
area->area_id = 0;
area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
spin_lock_init(&area->freelist_lock);
- *res = area;
- return 0;
+
+ ret = io_zcrx_append_area(ifq, area);
+ if (!ret)
+ return 0;
err:
if (area)
io_zcrx_free_area(area);
@@ -469,20 +477,19 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
ifq->if_rxq = -1;
ifq->ctx = ctx;
- spin_lock_init(&ifq->lock);
spin_lock_init(&ifq->rq_lock);
- mutex_init(&ifq->dma_lock);
+ mutex_init(&ifq->pp_lock);
return ifq;
}
static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
- spin_lock(&ifq->lock);
- if (ifq->netdev) {
- netdev_put(ifq->netdev, &ifq->netdev_tracker);
- ifq->netdev = NULL;
- }
- spin_unlock(&ifq->lock);
+ guard(mutex)(&ifq->pp_lock);
+
+ if (!ifq->netdev)
+ return;
+ netdev_put(ifq->netdev, &ifq->netdev_tracker);
+ ifq->netdev = NULL;
}
static void io_close_queue(struct io_zcrx_ifq *ifq)
@@ -497,11 +504,11 @@ static void io_close_queue(struct io_zcrx_ifq *ifq)
if (ifq->if_rxq == -1)
return;
- spin_lock(&ifq->lock);
- netdev = ifq->netdev;
- netdev_tracker = ifq->netdev_tracker;
- ifq->netdev = NULL;
- spin_unlock(&ifq->lock);
+ scoped_guard(mutex, &ifq->pp_lock) {
+ netdev = ifq->netdev;
+ netdev_tracker = ifq->netdev_tracker;
+ ifq->netdev = NULL;
+ }
if (netdev) {
net_mp_close_rxq(netdev, ifq->if_rxq, &p);
@@ -513,7 +520,6 @@ static void io_close_queue(struct io_zcrx_ifq *ifq)
static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
io_close_queue(ifq);
- io_zcrx_drop_netdev(ifq);
if (ifq->area)
io_zcrx_free_area(ifq->area);
@@ -521,7 +527,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
put_device(ifq->dev);
io_free_rbuf_ring(ifq);
- mutex_destroy(&ifq->dma_lock);
+ mutex_destroy(&ifq->pp_lock);
kfree(ifq);
}
@@ -554,14 +560,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
return -EPERM;
/* mandatory io_uring features for zc rx */
- if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
- ctx->flags & IORING_SETUP_CQE32))
+ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+ return -EINVAL;
+ if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
return -EINVAL;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
return -EFAULT;
- if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
+ if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
reg.__resv2 || reg.zcrx_id)
return -EINVAL;
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
@@ -599,14 +606,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
goto err;
}
- ifq->dev = ifq->netdev->dev.parent;
+ ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
if (!ifq->dev) {
ret = -EOPNOTSUPP;
goto err;
}
get_device(ifq->dev);
- ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+ ret = io_zcrx_create_area(ifq, &area);
if (ret)
goto err;
@@ -617,9 +624,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
goto err;
ifq->if_rxq = reg.if_rxq;
- reg.offsets.rqes = sizeof(struct io_uring);
- reg.offsets.head = offsetof(struct io_uring, head);
- reg.offsets.tail = offsetof(struct io_uring, tail);
reg.zcrx_id = id;
scoped_guard(mutex, &ctx->mmap_lock) {
@@ -747,45 +751,53 @@ static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
return &ifq->rqes[idx];
}
+static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
+ struct io_zcrx_ifq *ifq,
+ struct net_iov **ret_niov)
+{
+ unsigned niov_idx, area_idx;
+ struct io_zcrx_area *area;
+
+ area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+ niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift;
+
+ if (unlikely(rqe->__pad || area_idx))
+ return false;
+ area = ifq->area;
+
+ if (unlikely(niov_idx >= area->nia.num_niovs))
+ return false;
+ niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+ *ret_niov = &area->nia.niovs[niov_idx];
+ return true;
+}
+
static void io_zcrx_ring_refill(struct page_pool *pp,
struct io_zcrx_ifq *ifq)
{
unsigned int mask = ifq->rq_entries - 1;
unsigned int entries;
- netmem_ref netmem;
- spin_lock_bh(&ifq->rq_lock);
+ guard(spinlock_bh)(&ifq->rq_lock);
entries = io_zcrx_rqring_entries(ifq);
- entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
- if (unlikely(!entries)) {
- spin_unlock_bh(&ifq->rq_lock);
+ entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL);
+ if (unlikely(!entries))
return;
- }
do {
struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
- struct io_zcrx_area *area;
struct net_iov *niov;
- unsigned niov_idx, area_idx;
-
- area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
- niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;
+ netmem_ref netmem;
- if (unlikely(rqe->__pad || area_idx))
+ if (!io_parse_rqe(rqe, ifq, &niov))
continue;
- area = ifq->area;
-
- if (unlikely(niov_idx >= area->nia.num_niovs))
- continue;
- niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
-
- niov = &area->nia.niovs[niov_idx];
if (!io_zcrx_put_niov_uref(niov))
continue;
netmem = net_iov_to_netmem(niov);
- if (page_pool_unref_netmem(netmem, 1) != 0)
+ if (!page_pool_unref_and_test(netmem))
continue;
if (unlikely(niov->pp != pp)) {
@@ -798,7 +810,6 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
} while (--entries);
smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
- spin_unlock_bh(&ifq->rq_lock);
}
static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
@@ -860,8 +871,8 @@ static int io_pp_zc_init(struct page_pool *pp)
return -EINVAL;
if (WARN_ON_ONCE(!pp->dma_map))
return -EOPNOTSUPP;
- if (pp->p.order != 0)
- return -EOPNOTSUPP;
+ if (pp->p.order + PAGE_SHIFT != ifq->niov_shift)
+ return -EINVAL;
if (pp->p.dma_dir != DMA_FROM_DEVICE)
return -EOPNOTSUPP;
@@ -917,33 +928,108 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
.uninstall = io_pp_uninstall,
};
+#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16)
+#define IO_ZCRX_SYS_REFILL_BATCH 32
+
+static void io_return_buffers(struct io_zcrx_ifq *ifq,
+ struct io_uring_zcrx_rqe *rqes, unsigned nr)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ struct net_iov *niov;
+ netmem_ref netmem;
+
+ if (!io_parse_rqe(&rqes[i], ifq, &niov))
+ continue;
+
+ scoped_guard(spinlock_bh, &ifq->rq_lock) {
+ if (!io_zcrx_put_niov_uref(niov))
+ continue;
+ }
+
+ netmem = net_iov_to_netmem(niov);
+ if (!page_pool_unref_and_test(netmem))
+ continue;
+ io_zcrx_return_niov(niov);
+ }
+}
+
+int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned nr_arg)
+{
+ struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH];
+ struct io_uring_zcrx_rqe __user *user_rqes;
+ struct io_uring_zcrx_sync_refill zr;
+ struct io_zcrx_ifq *ifq;
+ unsigned nr, i;
+
+ if (nr_arg)
+ return -EINVAL;
+ if (copy_from_user(&zr, arg, sizeof(zr)))
+ return -EFAULT;
+ if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS)
+ return -EINVAL;
+ if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv)))
+ return -EINVAL;
+
+ ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id);
+ if (!ifq)
+ return -EINVAL;
+ nr = zr.nr_entries;
+ user_rqes = u64_to_user_ptr(zr.rqes);
+
+ for (i = 0; i < nr;) {
+ unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH);
+ size_t size = batch * sizeof(rqes[0]);
+
+ if (copy_from_user(rqes, user_rqes + i, size))
+ return i ? i : -EFAULT;
+ io_return_buffers(ifq, rqes, batch);
+
+ i += batch;
+
+ if (fatal_signal_pending(current))
+ return i;
+ cond_resched();
+ }
+ return nr;
+}
+
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
struct io_zcrx_ifq *ifq, int off, int len)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_uring_zcrx_cqe *rcqe;
struct io_zcrx_area *area;
struct io_uring_cqe *cqe;
u64 offset;
- if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
+ if (!io_defer_get_uncommited_cqe(ctx, &cqe))
return false;
cqe->user_data = req->cqe.user_data;
cqe->res = len;
cqe->flags = IORING_CQE_F_MORE;
+ if (ctx->flags & IORING_SETUP_CQE_MIXED)
+ cqe->flags |= IORING_CQE_F_32;
area = io_zcrx_iov_to_area(niov);
- offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
+ offset = off + (net_iov_idx(niov) << ifq->niov_shift);
rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
rcqe->__pad = 0;
return true;
}
-static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
+static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
{
+ struct io_zcrx_area *area = ifq->area;
struct net_iov *niov = NULL;
+ if (area->mem.is_dmabuf)
+ return NULL;
+
spin_lock_bh(&area->freelist_lock);
if (area->free_count)
niov = __io_zcrx_get_free_niov(area);
@@ -975,9 +1061,9 @@ static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page,
if (folio_test_partial_kmap(page_folio(dst_page)) ||
folio_test_partial_kmap(page_folio(src_page))) {
- dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE);
+ dst_page += dst_offset / PAGE_SIZE;
dst_offset = offset_in_page(dst_offset);
- src_page = nth_page(src_page, src_offset / PAGE_SIZE);
+ src_page += src_offset / PAGE_SIZE;
src_offset = offset_in_page(src_offset);
n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset);
n = min(n, len);
@@ -1003,19 +1089,15 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct page *src_page, unsigned int src_offset,
size_t len)
{
- struct io_zcrx_area *area = ifq->area;
size_t copied = 0;
int ret = 0;
- if (area->mem.is_dmabuf)
- return -EFAULT;
-
while (len) {
struct io_copy_cache cc;
struct net_iov *niov;
size_t n;
- niov = io_zcrx_alloc_fallback(area);
+ niov = io_alloc_fallback_niov(ifq);
if (!niov) {
ret = -ENOMEM;
break;
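
A userspace sketch of the new IORING_REGISTER_ZCRX_REFILL opcode serviced by io_zcrx_return_bufs() above. The io_uring_zcrx_sync_refill layout is assumed from the uapi header in this series, and the raw register syscall is used rather than a liburing helper; each rqe->off uses the same encoding as the io_uring_zcrx_cqe->off values previously received.

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <stdint.h>
	#include <unistd.h>

	/* Hand a batch of consumed zcrx buffers back outside of the refill ring. */
	static int zcrx_sync_return(int ring_fd, __u32 zcrx_id,
				    struct io_uring_zcrx_rqe *rqes, __u32 nr)
	{
		struct io_uring_zcrx_sync_refill zr = {
			.zcrx_id    = zcrx_id,
			.nr_entries = nr,	/* must be 1..65536 */
			.rqes       = (__u64)(uintptr_t)rqes,
		};

		/* Returns the number of entries processed; -1/errno on failure. */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_ZCRX_REFILL, &zr, 0);
	}
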
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 109c4ca36434..33ef61503092 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -16,11 +16,10 @@ struct io_zcrx_mem {
unsigned long nr_folios;
struct sg_table page_sg_table;
unsigned long account_pages;
+ struct sg_table *sgt;
struct dma_buf_attachment *attach;
struct dma_buf *dmabuf;
- struct sg_table *sgt;
- unsigned long dmabuf_offset;
};
struct io_zcrx_area {
@@ -42,6 +41,7 @@ struct io_zcrx_area {
struct io_zcrx_ifq {
struct io_ring_ctx *ctx;
struct io_zcrx_area *area;
+ unsigned niov_shift;
spinlock_t rq_lock ____cacheline_aligned_in_smp;
struct io_uring *rq_ring;
@@ -53,12 +53,18 @@ struct io_zcrx_ifq {
struct device *dev;
struct net_device *netdev;
netdevice_tracker netdev_tracker;
- spinlock_t lock;
- struct mutex dma_lock;
+
+ /*
+ * Page pool and net configuration lock; it may be acquired from
+ * deeper in the net stack.
+ */
+ struct mutex pp_lock;
struct io_mapped_region region;
};
#if defined(CONFIG_IO_URING_ZCRX)
+int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned nr_arg);
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -91,6 +97,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct
{
return NULL;
}
+static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned nr_arg)
+{
+ return -EOPNOTSUPP;
+}
#endif
int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);