Diffstat (limited to 'io_uring')
 io_uring/Makefile    |   2
 io_uring/cancel.c    |   1
 io_uring/cmd_net.c   |   3
 io_uring/fdinfo.c    |  24
 io_uring/futex.c     |  13
 io_uring/io-wq.c     |   6
 io_uring/io_uring.c  | 155
 io_uring/io_uring.h  | 124
 io_uring/kbuf.c      |  67
 io_uring/kbuf.h      |  39
 io_uring/msg_ring.c  |  24
 io_uring/net.c       | 160
 io_uring/nop.c       |  17
 io_uring/notif.c     |   7
 io_uring/opdef.c     |   1
 io_uring/openclose.c |   1
 io_uring/poll.c      |   6
 io_uring/query.c     | 101
 io_uring/query.h     |   9
 io_uring/register.c  |  60
 io_uring/rsrc.c      |   8
 io_uring/rw.c        |  66
 io_uring/splice.c    |   1
 io_uring/timeout.c   |   2
 io_uring/uring_cmd.c |  85
 io_uring/waitid.c    |   4
 io_uring/zcrx.c      | 302
 io_uring/zcrx.h      |  19
28 files changed, 861 insertions, 446 deletions
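
A minimal userspace sketch of the IORING_REGISTER_QUERY interface added below in io_uring/query.c. The structure and field names (io_uring_query_hdr, io_uring_query_opcode, IO_URING_QUERY_OPCODES) are taken from the patch; header paths, exact uapi field types, and the use of the raw syscall instead of a liburing helper are assumptions, not part of the patch itself. The fd of -1 follows the "blind" registration path shown in register.c, and nr_args must be 0 per io_query().

/* Hedged sketch, not from the patch: query io_uring capabilities without a ring. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>          /* IORING_REGISTER_QUERY (assumed uapi location) */
#include <linux/io_uring/query.h>    /* query structs (assumed uapi location) */

int main(void)
{
	struct io_uring_query_opcode ops;
	struct io_uring_query_hdr hdr;

	memset(&ops, 0, sizeof(ops));
	memset(&hdr, 0, sizeof(hdr));    /* __resv and result must be zero on input */
	hdr.query_op = IO_URING_QUERY_OPCODES;
	hdr.query_data = (unsigned long long)(uintptr_t)&ops;
	hdr.size = sizeof(ops);
	hdr.next_entry = 0;              /* single entry, no chaining */

	/* fd == -1 takes the blind registration path; nr_args must be 0 */
	if (syscall(__NR_io_uring_register, -1, IORING_REGISTER_QUERY, &hdr, 0) < 0) {
		perror("io_uring_register(QUERY)");
		return 1;
	}
	if ((long long)hdr.result < 0) {
		fprintf(stderr, "query entry failed: %lld\n", (long long)hdr.result);
		return 1;
	}
	printf("request opcodes: %u, register opcodes: %u\n",
	       (unsigned)ops.nr_request_opcodes, (unsigned)ops.nr_register_opcodes);
	printf("setup flags: 0x%llx, enter flags: 0x%llx, sqe flags: 0x%llx\n",
	       (unsigned long long)ops.ring_setup_flags,
	       (unsigned long long)ops.enter_flags,
	       (unsigned long long)ops.sqe_flags);
	return 0;
}
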
diff --git a/io_uring/Makefile b/io_uring/Makefile index b3f1bd492804..bc4e4a3fa0a5 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ sync.o msg_ring.o advise.o openclose.o \ statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ - memmap.o alloc_cache.o + memmap.o alloc_cache.o query.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 6d57602304df..64b51e82baa2 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -11,6 +11,7 @@ #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "tctx.h" #include "poll.h" diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 3866fe6ff541..27a09aa4c9d0 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -4,6 +4,7 @@ #include <net/sock.h> #include "uring_cmd.h" +#include "io_uring.h" static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, @@ -73,7 +74,7 @@ static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk, cqe->user_data = 0; cqe->res = tskey; - cqe->flags = IORING_CQE_F_MORE; + cqe->flags = IORING_CQE_F_MORE | ctx_cqe32_flags(cmd_to_io_kiocb(cmd)->ctx); cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; if (ret == SOF_TIMESTAMPING_TX_HARDWARE) cqe->flags |= IORING_CQE_F_TSTAMP_HW; diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 9798d6fb4ec7..ff3364531c77 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -9,7 +9,7 @@ #include <uapi/linux/io_uring.h> -#include "io_uring.h" +#include "filetable.h" #include "sqpoll.h" #include "fdinfo.h" #include "cancel.h" @@ -65,15 +65,12 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) unsigned int sq_tail = READ_ONCE(r->sq.tail); unsigned int cq_head = READ_ONCE(r->cq.head); unsigned int cq_tail = READ_ONCE(r->cq.tail); - unsigned int cq_shift = 0; unsigned int sq_shift = 0; - unsigned int sq_entries, cq_entries; + unsigned int sq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; unsigned int i; - if (ctx->flags & IORING_SETUP_CQE32) - cq_shift = 1; if (ctx->flags & IORING_SETUP_SQE128) sq_shift = 1; @@ -125,18 +122,23 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "\n"); } seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); - for (i = 0; i < cq_entries; i++) { - unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; + while (cq_head < cq_tail) { + struct io_uring_cqe *cqe; + bool cqe32 = false; + cqe = &r->cqes[(cq_head & cq_mask)]; + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) + cqe32 = true; seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", - entry & cq_mask, cqe->user_data, cqe->res, + cq_head & cq_mask, cqe->user_data, cqe->res, cqe->flags); - if (cq_shift) + if (cqe32) seq_printf(m, ", extra1:%llu, extra2:%llu\n", cqe->big_cqe[0], cqe->big_cqe[1]); seq_printf(m, "\n"); + cq_head++; + if (cqe32) + cq_head++; } if (ctx->flags & IORING_SETUP_SQPOLL) { diff --git a/io_uring/futex.c b/io_uring/futex.c index 9113a44984f3..64f3bd51c84c 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -43,7 +43,6 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { - req->async_data = 
NULL; hlist_del_init(&req->hash_node); io_req_task_complete(req, tw); } @@ -54,6 +53,7 @@ static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) io_tw_lock(ctx, tw); io_cache_free(&ctx->futex_cache, req->async_data); + io_req_async_data_clear(req, 0); __io_futex_complete(req, tw); } @@ -72,8 +72,7 @@ static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) io_req_set_res(req, res, 0); } - kfree(req->async_data); - req->flags &= ~REQ_F_ASYNC_DATA; + io_req_async_data_free(req); __io_futex_complete(req, tw); } @@ -232,9 +231,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_unlock(ctx, issue_flags); req_set_fail(req); io_req_set_res(req, ret, 0); - kfree(futexv); - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + io_req_async_data_free(req); return IOU_COMPLETE; } @@ -310,9 +307,7 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; - kfree(ifd); + io_req_async_data_free(req); return IOU_COMPLETE; } diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 17dfaa0395c4..1d03b2fc4b25 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -352,16 +352,16 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wq_acct *acct; - bool do_create = false; + bool activated_free_worker, do_create = false; worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; rcu_read_lock(); - do_create = !io_acct_activate_free_worker(acct); + activated_free_worker = io_acct_activate_free_worker(acct); rcu_read_unlock(); - if (!do_create) + if (activated_free_worker) goto no_need_create; raw_spin_lock(&acct->workers_lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 93633613a165..49ebdeb5b2d9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -79,6 +79,7 @@ #include "io-wq.h" +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" @@ -108,9 +109,6 @@ #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) - #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ @@ -179,6 +177,26 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif +static void io_poison_cached_req(struct io_kiocb *req) +{ + req->ctx = IO_URING_PTR_POISON; + req->tctx = IO_URING_PTR_POISON; + req->file = IO_URING_PTR_POISON; + req->creds = IO_URING_PTR_POISON; + req->io_task_work.func = IO_URING_PTR_POISON; + req->apoll = IO_URING_PTR_POISON; +} + +static void io_poison_req(struct io_kiocb *req) +{ + io_poison_cached_req(req); + req->async_data = IO_URING_PTR_POISON; + req->kbuf = IO_URING_PTR_POISON; + req->comp_list.next = IO_URING_PTR_POISON; + req->file_node = IO_URING_PTR_POISON; + req->link = IO_URING_PTR_POISON; +} + static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -235,6 +253,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { + if (IS_ENABLED(CONFIG_KASAN)) + io_poison_cached_req(req); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } @@ -290,7 +310,6 @@ static void io_free_alloc_caches(struct io_ring_ctx 
*ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); - io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); } @@ -337,9 +356,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_cmd), sizeof(struct io_async_cmd)); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); ret |= io_rsrc_cache_init(ctx); if (ret) @@ -598,27 +614,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { - size_t cqe_size = sizeof(struct io_uring_cqe); - lockdep_assert_held(&ctx->uring_lock); /* don't abort if we're dying, entries must get freed */ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { + size_t cqe_size = sizeof(struct io_uring_cqe); struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; + bool is_cqe32 = false; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); + if (ocqe->cqe.flags & IORING_CQE_F_32 || + ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + cqe_size <<= 1; + } if (!dying) { - if (!io_get_cqe_overflow(ctx, &cqe, true)) + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) break; memcpy(cqe, &ocqe->cqe, cqe_size); } @@ -730,10 +748,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + bool is_cqe32 = false; - if (is_cqe32) + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; ocq_size += sizeof(struct io_uring_cqe); + } ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); @@ -752,11 +772,29 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, } /* + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE + * because the ring is a single 16b entry away from wrapping. + */ +static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) +{ + if (__io_cqring_events(ctx) < ctx->cq_entries) { + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; + + cqe->user_data = 0; + cqe->res = 0; + cqe->flags = IORING_CQE_F_SKIP; + ctx->cached_cq_tail++; + return true; + } + return false; +} + +/* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -770,12 +808,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; + /* + * Post dummy CQE if a 32b CQE is needed and there's only room for a + * 16b CQE before the ring wraps. 
+ */ + if (cqe32 && off + 1 == ctx->cq_entries) { + if (!io_fill_nop_cqe(ctx, off)) + return false; + off = 0; + } + /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); - if (!len) + if (len < (cqe32 + 1)) return false; if (ctx->flags & IORING_SETUP_CQE32) { @@ -793,9 +841,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) return false; - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, true))) return false; memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); @@ -806,14 +854,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; - if (likely(io_get_cqe(ctx, &cqe))) { + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - if (ctx->flags & IORING_SETUP_CQE32) { + if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } @@ -985,7 +1034,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); if (def->fail) def->fail(req); io_req_complete_defer(req); @@ -1406,8 +1455,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, tw); - if (unlikely(io_should_terminate_tw())) + struct io_ring_ctx *ctx = req->ctx; + + io_tw_lock(ctx, tw); + if (unlikely(io_should_terminate_tw(ctx))) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); @@ -2003,11 +2054,9 @@ fail: switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: - io_kbuf_recycle(req, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: - io_kbuf_recycle(req, 0); io_queue_iowq(req); break; case IO_APOLL_OK: @@ -2736,6 +2785,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } + if (flags & IORING_SETUP_CQE_MIXED) { + if (cq_entries < 2) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -2767,6 +2820,7 @@ static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); + io_poison_req(req); kmem_cache_free(req_cachep, req); nr++; } @@ -3047,10 +3101,10 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* - * Use system_unbound_wq to avoid spawning tons of event kworkers + * Use system_dfl_wq to avoid spawning tons of event kworkers * if we're exiting a ton of rings at the same time. It just adds * noise and overhead, there's no discernable change in runtime - * over using system_wq. + * over using system_percpu_wq. 
*/ queue_work(iou_wq, &ctx->exit_work); } @@ -3404,12 +3458,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, struct file *file; long ret; - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING | - IORING_ENTER_ABS_TIMER | - IORING_ENTER_EXT_ARG_REG | - IORING_ENTER_NO_IOWAIT))) + if (unlikely(flags & ~IORING_ENTER_FLAGS)) return -EINVAL; /* @@ -3659,6 +3708,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p) !(flags & IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; + /* + * Nonsensical to ask for CQE32 and mixed CQE support, it's not + * supported to post 16b CQEs on a ring setup with CQE32. + */ + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) + return -EINVAL; + return 0; } @@ -3809,15 +3866,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | - IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT; + p->features = IORING_FEAT_FLAGS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3825,8 +3874,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER - && !(ctx->flags & IORING_SETUP_R_DISABLED)) - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + /* + * Unlike io_register_enable_rings(), don't need WRITE_ONCE() + * since ctx isn't yet accessible from other tasks + */ + ctx->submitter_task = get_task_struct(current); + } file = io_uring_get_file(ctx); if (IS_ERR(file)) { @@ -3877,17 +3931,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) return -EINVAL; } - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) + if (p.flags & ~IORING_SETUP_FLAGS) return -EINVAL; - return io_uring_create(entries, &p, params); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index abc6de227f74..46d9141d772a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -11,13 +11,69 @@ #include "alloc_cache.h" #include "io-wq.h" #include "slist.h" -#include "filetable.h" #include "opdef.h" #ifndef CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #endif +#define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ + IORING_FEAT_NODROP |\ + IORING_FEAT_SUBMIT_STABLE |\ + IORING_FEAT_RW_CUR_POS |\ + IORING_FEAT_CUR_PERSONALITY |\ + IORING_FEAT_FAST_POLL |\ + IORING_FEAT_POLL_32BITS |\ + IORING_FEAT_SQPOLL_NONFIXED |\ + IORING_FEAT_EXT_ARG |\ + IORING_FEAT_NATIVE_WORKERS |\ + IORING_FEAT_RSRC_TAGS |\ + IORING_FEAT_CQE_SKIP |\ + 
IORING_FEAT_LINKED_FILE |\ + IORING_FEAT_REG_REG_RING |\ + IORING_FEAT_RECVSEND_BUNDLE |\ + IORING_FEAT_MIN_TIMEOUT |\ + IORING_FEAT_RW_ATTR |\ + IORING_FEAT_NO_IOWAIT) + +#define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\ + IORING_SETUP_SQPOLL |\ + IORING_SETUP_SQ_AFF |\ + IORING_SETUP_CQSIZE |\ + IORING_SETUP_CLAMP |\ + IORING_SETUP_ATTACH_WQ |\ + IORING_SETUP_R_DISABLED |\ + IORING_SETUP_SUBMIT_ALL |\ + IORING_SETUP_COOP_TASKRUN |\ + IORING_SETUP_TASKRUN_FLAG |\ + IORING_SETUP_SQE128 |\ + IORING_SETUP_CQE32 |\ + IORING_SETUP_SINGLE_ISSUER |\ + IORING_SETUP_DEFER_TASKRUN |\ + IORING_SETUP_NO_MMAP |\ + IORING_SETUP_REGISTERED_FD_ONLY |\ + IORING_SETUP_NO_SQARRAY |\ + IORING_SETUP_HYBRID_IOPOLL |\ + IORING_SETUP_CQE_MIXED) + +#define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ + IORING_ENTER_SQ_WAKEUP |\ + IORING_ENTER_SQ_WAIT |\ + IORING_ENTER_EXT_ARG |\ + IORING_ENTER_REGISTERED_RING |\ + IORING_ENTER_ABS_TIMER |\ + IORING_ENTER_EXT_ARG_REG |\ + IORING_ENTER_NO_IOWAIT) + + +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE |\ + IOSQE_IO_DRAIN |\ + IOSQE_IO_LINK |\ + IOSQE_IO_HARDLINK |\ + IOSQE_ASYNC |\ + IOSQE_BUFFER_SELECT |\ + IOSQE_CQE_SKIP_SUCCESS) + enum { IOU_COMPLETE = 0, @@ -75,7 +131,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset); int io_uring_fill_params(unsigned entries, struct io_uring_params *p); -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -169,25 +225,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, - bool overflow) + bool overflow, bool cqe32) { io_lockdep_assert_cq_locked(ctx); - if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { - if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) return false; } *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) + if (ctx->flags & IORING_SETUP_CQE32) { + ctx->cqe_cached++; + } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { ctx->cqe_cached++; + ctx->cached_cq_tail++; + } + WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); return true; } -static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, + bool cqe32) { - return io_get_cqe_overflow(ctx, ret, false); + return io_get_cqe_overflow(ctx, ret, false, cqe32); } static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, @@ -196,25 +258,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, io_lockdep_assert_cq_locked(ctx); ctx->submit_state.cq_flush = true; - return io_get_cqe(ctx, cqe_ret); + return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); } static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { + bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; struct io_uring_cqe *cqe; /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a 
lot). Increment the overflow count in - * the ring. + * If we can't get a cq entry, userspace overflowed the submission + * (by quite a lot). */ - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) return false; - memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { + if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } @@ -239,6 +300,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_CQE_MIXED) + return IORING_CQE_F_32; + return 0; +} + +static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, + __u64 extra1, __u64 extra2) +{ + req->cqe.res = res; + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; +} + static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { @@ -260,6 +337,19 @@ static inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } +static inline void io_req_async_data_clear(struct io_kiocb *req, + io_req_flags_t extra_flags) +{ + req->flags &= ~(REQ_F_ASYNC_DATA|extra_flags); + req->async_data = NULL; +} + +static inline void io_req_async_data_free(struct io_kiocb *req) +{ + kfree(req->async_data); + io_req_async_data_clear(req, 0); +} + static inline void io_put_file(struct io_kiocb *req) { if (!(req->flags & REQ_F_FIXED_FILE) && req->file) @@ -476,9 +566,9 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) * 2) PF_KTHREAD is set, in which case the invoker of the task_work is * our fallback task_work. 
*/ -static inline bool io_should_terminate_tw(void) +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) { - return current->flags & (PF_KTHREAD | PF_EXITING); + return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); } static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 19a8bde5e1e1..aad655e38672 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -155,19 +155,19 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, return 1; } -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) +static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) { struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; + struct io_br_sel sel = { }; struct io_uring_buf *buf; - void __user *ret; u32 buf_len; tail = smp_load_acquire(&br->tail); if (unlikely(tail == head)) - return NULL; + return sel; if (head + 1 == tail) req->flags |= REQ_F_BL_EMPTY; @@ -177,9 +177,9 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (*len == 0 || *len > buf_len) *len = buf_len; req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; - req->buf_list = bl; req->buf_index = buf->bid; - ret = u64_to_user_ptr(buf->addr); + sel.buf_list = bl; + sel.addr = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* @@ -192,30 +192,30 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). */ - io_kbuf_commit(req, bl, *len, 1); - req->buf_list = NULL; + io_kbuf_commit(req, sel.buf_list, *len, 1); + sel.buf_list = NULL; } - return ret; + return sel; } -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned buf_group, unsigned int issue_flags) +struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_br_sel sel = { }; struct io_buffer_list *bl; - void __user *ret = NULL; io_ring_submit_lock(req->ctx, issue_flags); bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { if (bl->flags & IOBL_BUF_RING) - ret = io_ring_buffer_select(req, len, bl, issue_flags); + sel = io_ring_buffer_select(req, len, bl, issue_flags); else - ret = io_provided_buffer_select(req, len, bl); + sel.addr = io_provided_buffer_select(req, len, bl); } io_ring_submit_unlock(req->ctx, issue_flags); - return ret; + return sel; } /* cap it at a reasonable 256, will be one page even for 4K */ @@ -300,24 +300,22 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->flags |= REQ_F_BL_EMPTY; req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; return iov - arg->iovs; } int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, - unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, arg->buf_group); - if (unlikely(!bl)) + sel->buf_list = io_buffer_get_list(ctx, arg->buf_group); + if (unlikely(!sel->buf_list)) goto out_unlock; - if (bl->flags & IOBL_BUF_RING) { - ret = io_ring_buffers_peek(req, arg, bl); + if (sel->buf_list->flags & 
IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, sel->buf_list); /* * Don't recycle these buffers if we need to go through poll. * Nobody else can use them anyway, and holding on to provided @@ -327,17 +325,21 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, */ if (ret > 0) { req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; - io_kbuf_commit(req, bl, arg->out_len, ret); + io_kbuf_commit(req, sel->buf_list, arg->out_len, ret); } } else { - ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); } out_unlock: - io_ring_submit_unlock(ctx, issue_flags); + if (issue_flags & IO_URING_F_UNLOCKED) { + sel->buf_list = NULL; + mutex_unlock(&ctx->uring_lock); + } return ret; } -int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_br_sel *sel) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -353,16 +355,18 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) ret = io_ring_buffers_peek(req, arg, bl); if (ret > 0) req->flags |= REQ_F_BUFFERS_COMMIT; + sel->buf_list = bl; return ret; } /* don't support multiple buffer selections for legacy */ + sel->buf_list = NULL; return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } -static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) { - struct io_buffer_list *bl = req->buf_list; bool ret = true; if (bl) @@ -372,7 +376,8 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) return ret; } -unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) +unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, + int len, int nbufs) { unsigned int ret; @@ -383,7 +388,7 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) return ret; } - if (!__io_put_kbuf_ring(req, len, nbufs)) + if (!__io_put_kbuf_ring(req, bl, len, nbufs)) ret |= IORING_CQE_F_BUF_MORE; return ret; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 723d0361898e..ada382ff38d7 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -62,11 +62,12 @@ struct buf_sel_arg { unsigned short partial_map; }; -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned buf_group, unsigned int issue_flags); +struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, - unsigned int issue_flags); -int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); + struct io_br_sel *sel, unsigned int issue_flags); +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_br_sel *sel); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -80,23 +81,18 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_drop_legacy(struct io_kiocb *req); -unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); +unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, + int len, int nbufs); bool io_kbuf_commit(struct io_kiocb *req, struct io_buffer_list *bl, int len, int 
nr); struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, unsigned int bgid); -static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) +static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, + struct io_buffer_list *bl) { - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - * The exception is partial io, that case we should increment bl->head - * to monopolize the buffer. - */ - if (req->buf_list) { + if (bl) { req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } @@ -110,30 +106,31 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } -static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +static inline bool io_kbuf_recycle(struct io_kiocb *req, struct io_buffer_list *bl, + unsigned issue_flags) { if (req->flags & REQ_F_BL_NO_RECYCLE) return false; + if (req->flags & REQ_F_BUFFER_RING) + return io_kbuf_recycle_ring(req, bl); if (req->flags & REQ_F_BUFFER_SELECTED) return io_kbuf_recycle_legacy(req, issue_flags); - if (req->flags & REQ_F_BUFFER_RING) - return io_kbuf_recycle_ring(req); return false; } static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, - unsigned issue_flags) + struct io_buffer_list *bl) { if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbufs(req, len, 1); + return __io_put_kbufs(req, bl, len, 1); } static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, - int nbufs, unsigned issue_flags) + struct io_buffer_list *bl, int nbufs) { if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbufs(req, len, nbufs); + return __io_put_kbufs(req, bl, len, nbufs); } #endif diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4c2578f2efcb..5e5b94236d72 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,7 +11,6 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" -#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - if (spin_trylock(&ctx->msg_lock)) { - if (io_alloc_cache_put(&ctx->msg_cache, req)) - req = NULL; - spin_unlock(&ctx->msg_lock); - } - if (req) - kfree_rcu(req, rcu_head); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } -static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req = NULL; - - if (spin_trylock(&ctx->msg_lock)) { - req = io_alloc_cache_get(&ctx->msg_cache); - spin_unlock(&ctx->msg_lock); - if (req) - return req; - } - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); -} - static int io_msg_data_remote(struct io_ring_ctx *target_ctx, struct io_msg *msg) { struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(target_ctx); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO) ; if (unlikely(!target)) return -ENOMEM; diff --git a/io_uring/net.c b/io_uring/net.c index d69f2afa4f7a..f99b90c762fc 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -10,6 +10,7 @@ #include 
<uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "kbuf.h" #include "alloc_cache.h" @@ -178,10 +179,8 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP) io_vec_free(&hdr->vec); - if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { - req->async_data = NULL; - req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); - } + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); } static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) @@ -433,7 +432,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; sr->msg_flags |= MSG_WAITALL; - req->buf_list = NULL; req->flags |= REQ_F_MULTISHOT; } @@ -494,29 +492,29 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) return nbufs; } -static int io_net_kbuf_recyle(struct io_kiocb *req, +static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl, struct io_async_msghdr *kmsg, int len) { req->flags |= REQ_F_BL_NO_RECYCLE; if (req->flags & REQ_F_BUFFERS_COMMIT) - io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len)); + io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len)); return IOU_RETRY; } -static inline bool io_send_finish(struct io_kiocb *req, int *ret, +static inline bool io_send_finish(struct io_kiocb *req, struct io_async_msghdr *kmsg, - unsigned issue_flags) + struct io_br_sel *sel) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - bool bundle_finished = *ret <= 0; + bool bundle_finished = sel->val <= 0; unsigned int cflags; if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { - cflags = io_put_kbuf(req, *ret, issue_flags); + cflags = io_put_kbuf(req, sel->val, sel->buf_list); goto finish; } - cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); + cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); if (bundle_finished || req->flags & REQ_F_BL_EMPTY) goto finish; @@ -525,15 +523,15 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, * Fill CQE for this receive and see if we should keep trying to * receive from this socket. */ - if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { io_mshot_prep_retry(req, kmsg); return false; } /* Otherwise stop bundle and use the current result. 
*/ finish: - io_req_set_res(req, *ret, cflags); - *ret = IOU_COMPLETE; + io_req_set_res(req, sel->val, cflags); + sel->val = IOU_COMPLETE; return true; } @@ -571,7 +569,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; - return io_net_kbuf_recyle(req, kmsg, ret); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -587,17 +585,16 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) } static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, - struct io_async_msghdr *kmsg) + struct io_br_sel *sel, struct io_async_msghdr *kmsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - - int ret; struct buf_sel_arg arg = { .iovs = &kmsg->fast_iov, .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, .buf_group = sr->buf_group, }; + int ret; if (kmsg->vec.iovec) { arg.nr_iovs = kmsg->vec.nr; @@ -610,7 +607,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, else arg.mode |= KBUF_MODE_EXPAND; - ret = io_buffers_select(req, &arg, issue_flags); + ret = io_buffers_select(req, &arg, sel, issue_flags); if (unlikely(ret < 0)) return ret; @@ -639,6 +636,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct io_br_sel sel = { }; struct socket *sock; unsigned flags; int min_ret = 0; @@ -657,8 +655,9 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_bundle: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { - ret = io_send_select_buffer(req, issue_flags, kmsg); + ret = io_send_select_buffer(req, issue_flags, &sel, kmsg); if (ret) return ret; } @@ -682,7 +681,7 @@ retry_bundle: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - return io_net_kbuf_recyle(req, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -693,11 +692,12 @@ retry_bundle: else if (sr->done_io) ret = sr->done_io; - if (!io_send_finish(req, &ret, kmsg, issue_flags)) + sel.val = ret; + if (!io_send_finish(req, kmsg, &sel)) goto retry_bundle; io_req_msg_cleanup(req, issue_flags); - return ret; + return sel.val; } static int io_recvmsg_mshot_prep(struct io_kiocb *req, @@ -794,18 +794,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (req->flags & REQ_F_BUFFER_SELECT) { - /* - * Store the buffer group for this multishot receive separately, - * as if we end up doing an io-wq based issue that selects a - * buffer, it has to be committed immediately and that will - * clear ->buf_list. This means we lose the link to the buffer - * list, and the eventual buffer put on completion then cannot - * restore it. - */ + if (req->flags & REQ_F_BUFFER_SELECT) sr->buf_group = req->buf_index; - req->buf_list = NULL; - } sr->mshot_total_len = sr->mshot_len = 0; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -846,9 +836,10 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * Returns true if it is actually finished, or false if it should run * again (for multishot). 
*/ -static inline bool io_recv_finish(struct io_kiocb *req, int *ret, +static inline bool io_recv_finish(struct io_kiocb *req, struct io_async_msghdr *kmsg, - bool mshot_finished, unsigned issue_flags) + struct io_br_sel *sel, bool mshot_finished, + unsigned issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); unsigned int cflags = 0; @@ -856,13 +847,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { + if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { /* * If sr->len hits zero, the limit has been reached. Mark * mshot as finished, and flag MSHOT_DONE as well to prevent * a potential bundle from being retried. */ - sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len); + sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len); if (!sr->mshot_total_len) { sr->flags |= IORING_RECV_MSHOT_DONE; mshot_finished = true; @@ -870,13 +861,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (sr->flags & IORING_RECVSEND_BUNDLE) { - size_t this_ret = *ret - sr->done_io; + size_t this_ret = sel->val - sr->done_io; - cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), - issue_flags); + cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret)); if (sr->flags & IORING_RECV_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); - if (sr->mshot_len && *ret >= sr->mshot_len) + if (sr->mshot_len && sel->val >= sr->mshot_len) sr->flags |= IORING_RECV_MSHOT_CAP; /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) @@ -895,7 +885,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, return false; } } else { - cflags |= io_put_kbuf(req, *ret, issue_flags); + cflags |= io_put_kbuf(req, sel->val, sel->buf_list); } /* @@ -903,8 +893,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * receive from this socket. */ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && - io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { - *ret = IOU_RETRY; + io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { + sel->val = IOU_RETRY; io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { @@ -916,15 +906,15 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, sr->nr_multishot_loops = 0; sr->flags &= ~IORING_RECV_MSHOT_CAP; if (issue_flags & IO_URING_F_MULTISHOT) - *ret = IOU_REQUEUE; + sel->val = IOU_REQUEUE; } return true; } /* Finish the request / stop multishot. 
*/ finish: - io_req_set_res(req, *ret, cflags); - *ret = IOU_COMPLETE; + io_req_set_res(req, sel->val, cflags); + sel->val = IOU_COMPLETE; io_req_msg_cleanup(req, issue_flags); return true; } @@ -1017,6 +1007,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct io_br_sel sel = { }; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -1036,23 +1027,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { - void __user *buf; size_t len = sr->len; - buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); - if (!buf) + sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); + if (!sel.addr) return -ENOBUFS; if (req->flags & REQ_F_APOLL_MULTISHOT) { - ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); + ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len); if (ret) { - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } } - iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); + iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len); } kmsg->msg.msg_get_inq = 1; @@ -1071,14 +1062,12 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) - io_kbuf_recycle(req, issue_flags); - + io_kbuf_recycle(req, sel.buf_list, issue_flags); return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - return io_net_kbuf_recyle(req, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1092,16 +1081,17 @@ retry_multishot: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) + sel.val = ret; + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) goto retry_multishot; - return ret; + return sel.val; } static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, - size_t *len, unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int ret; @@ -1126,15 +1116,15 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg arg.mode |= KBUF_MODE_FREE; } - if (*len) - arg.max_len = *len; + if (sel->val) + arg.max_len = sel->val; else if (kmsg->msg.msg_inq > 1) - arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq); + arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq); /* if mshot limited, ensure we don't go over */ if (sr->flags & IORING_RECV_MSHOT_LIM) arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len); - ret = io_buffers_peek(req, &arg); + ret = io_buffers_peek(req, &arg, sel); if (unlikely(ret < 0)) return ret; @@ -1155,14 +1145,13 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); } else { - void __user *buf; + size_t len = sel->val; - *len = sr->len; - buf = io_buffer_select(req, len, sr->buf_group, issue_flags); - if (!buf) + *sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); + if (!sel->addr) return -ENOBUFS; - sr->buf = buf; - sr->len = *len; + sr->buf = sel->addr; + sr->len = len; map_ubuf: 
ret = import_ubuf(ITER_DEST, sr->buf, sr->len, &kmsg->msg.msg_iter); @@ -1177,11 +1166,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + struct io_br_sel sel; struct socket *sock; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - size_t len = sr->len; bool mshot_finished; if (!(req->flags & REQ_F_POLLED) && @@ -1197,9 +1186,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { - ret = io_recv_buf_select(req, kmsg, &len, issue_flags); - if (unlikely(ret)) { + sel.val = sr->len; + ret = io_recv_buf_select(req, kmsg, &sel, issue_flags); + if (unlikely(ret < 0)) { kmsg->msg.msg_inq = -1; goto out_free; } @@ -1215,16 +1206,14 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) - io_kbuf_recycle(req, issue_flags); - + io_kbuf_recycle(req, sel.buf_list, issue_flags); return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; - return io_net_kbuf_recyle(req, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1240,12 +1229,13 @@ out_free: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) + sel.val = ret; + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) goto retry_multishot; - return ret; + return sel.val; } int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1505,7 +1495,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) zc->len -= ret; zc->buf += ret; zc->done_io += ret; - return io_net_kbuf_recyle(req, kmsg, ret); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1575,7 +1565,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - return io_net_kbuf_recyle(req, kmsg, ret); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; diff --git a/io_uring/nop.c b/io_uring/nop.c index 20ed0f85b1c2..3caf07878f8a 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -17,11 +17,13 @@ struct io_nop { int result; int fd; unsigned int flags; + __u64 extra1; + __u64 extra2; }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ - IORING_NOP_TW) + IORING_NOP_TW | IORING_NOP_CQE32) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -41,6 +43,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) req->buf_index = READ_ONCE(sqe->buf_index); + if (nop->flags & IORING_NOP_CQE32) { + struct io_ring_ctx *ctx = req->ctx; + + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + nop->extra1 = READ_ONCE(sqe->off); + nop->extra2 = READ_ONCE(sqe->addr); + } return 0; } @@ -68,7 +78,10 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - io_req_set_res(req, nop->result, 0); + if (nop->flags & IORING_NOP_CQE32) + io_req_set_res32(req, nop->result, 0, nop->extra1, 
nop->extra2); + else + io_req_set_res(req, nop->result, 0); if (nop->flags & IORING_NOP_TW) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); diff --git a/io_uring/notif.c b/io_uring/notif.c index 9a6f6e92d742..d8ba1165c949 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -14,10 +14,15 @@ static const struct ubuf_info_ops io_ubuf_ops; static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) { struct io_notif_data *nd = io_notif_to_data(notif); + struct io_ring_ctx *ctx = notif->ctx; + + lockdep_assert_held(&ctx->uring_lock); do { notif = cmd_to_io_kiocb(nd); + if (WARN_ON_ONCE(ctx != notif->ctx)) + return; lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) @@ -85,7 +90,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) return -EEXIST; prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); - prev_notif = cmd_to_io_kiocb(nd); + prev_notif = cmd_to_io_kiocb(prev_nd); /* make sure all noifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9568785810d9..932319633eac 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -413,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_URING_CMD] = { + .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, diff --git a/io_uring/openclose.c b/io_uring/openclose.c index d70700e5cef8..bfeb91b31bba 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -14,6 +14,7 @@ #include "../fs/internal.h" +#include "filetable.h" #include "io_uring.h" #include "rsrc.h" #include "openclose.h" diff --git a/io_uring/poll.c b/io_uring/poll.c index c786e587563b..b9681d0f9f13 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - if (unlikely(io_should_terminate_tw())) + if (unlikely(io_should_terminate_tw(req->ctx))) return -ECANCELED; do { @@ -316,10 +316,8 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { - io_kbuf_recycle(req, 0); return; } else if (ret == IOU_POLL_REQUEUE) { - io_kbuf_recycle(req, 0); __io_poll_execute(req, 0); return; } @@ -686,8 +684,6 @@ int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask) req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_kbuf_recycle(req, issue_flags); - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? 
IO_APOLL_READY : IO_APOLL_ABORTED; diff --git a/io_uring/query.c b/io_uring/query.c new file mode 100644 index 000000000000..645301bd2c82 --- /dev/null +++ b/io_uring/query.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "linux/io_uring/query.h" + +#include "query.h" +#include "io_uring.h" + +#define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) +#define IO_MAX_QUERY_ENTRIES 1000 + +static ssize_t io_query_ops(void *data) +{ + struct io_uring_query_opcode *e = data; + + BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); + + e->nr_request_opcodes = IORING_OP_LAST; + e->nr_register_opcodes = IORING_REGISTER_LAST; + e->feature_flags = IORING_FEAT_FLAGS; + e->ring_setup_flags = IORING_SETUP_FLAGS; + e->enter_flags = IORING_ENTER_FLAGS; + e->sqe_flags = SQE_VALID_FLAGS; + return sizeof(*e); +} + +static int io_handle_query_entry(struct io_ring_ctx *ctx, + void *data, void __user *uhdr, + u64 *next_entry) +{ + struct io_uring_query_hdr hdr; + size_t usize, res_size = 0; + ssize_t ret = -EINVAL; + void __user *udata; + + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + return -EFAULT; + usize = hdr.size; + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); + udata = u64_to_user_ptr(hdr.query_data); + + if (hdr.query_op >= __IO_URING_QUERY_MAX) { + ret = -EOPNOTSUPP; + goto out; + } + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) + goto out; + if (copy_from_user(data, udata, hdr.size)) + return -EFAULT; + + switch (hdr.query_op) { + case IO_URING_QUERY_OPCODES: + ret = io_query_ops(data); + break; + } + + if (ret >= 0) { + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) + return -EFAULT; + res_size = ret; + ret = 0; + } +out: + hdr.result = ret; + hdr.size = min_t(size_t, usize, res_size); + + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) + return -EFAULT; + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) + return -EFAULT; + *next_entry = hdr.next_entry; + return 0; +} + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + char entry_buffer[IO_MAX_QUERY_SIZE]; + void __user *uhdr = arg; + int ret, nr = 0; + + memset(entry_buffer, 0, sizeof(entry_buffer)); + + if (nr_args) + return -EINVAL; + + while (uhdr) { + u64 next_hdr; + + ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); + if (ret) + return ret; + uhdr = u64_to_user_ptr(next_hdr); + + /* Have some limit to avoid a potential cycle */ + if (++nr >= IO_MAX_QUERY_ENTRIES) + return -ERANGE; + if (fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } + return 0; +} diff --git a/io_uring/query.h b/io_uring/query.h new file mode 100644 index 000000000000..171d47ccaaba --- /dev/null +++ b/io_uring/query.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IORING_QUERY_H +#define IORING_QUERY_H + +#include <linux/io_uring_types.h> + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); + +#endif diff --git a/io_uring/register.c b/io_uring/register.c index a59589249fce..43f04c47522c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -18,6 +18,7 @@ #include <linux/io_uring.h> #include <linux/io_uring_types.h> +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "tctx.h" @@ -31,6 +32,7 @@ #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" +#include "query.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -46,13 +48,9 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, nr_args = IORING_OP_LAST; 
size = struct_size(p, ops, nr_args); - p = kzalloc(size, GFP_KERNEL); - if (!p) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(p, arg, size)) - goto out; + p = memdup_user(arg, size); + if (IS_ERR(p)) + return PTR_ERR(p); ret = -EINVAL; if (memchr_inv(p, 0, size)) goto out; @@ -396,7 +394,8 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ + IORING_SETUP_CQE_MIXED) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { @@ -407,10 +406,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) struct io_uring_params p; int ret; - /* for single issuer, must be owner resizing */ - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && - current != ctx->submitter_task) - return -EEXIST; /* limited to DEFER_TASKRUN for now */ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; @@ -835,6 +830,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_mem_region(ctx, arg); break; + case IORING_REGISTER_QUERY: + ret = io_query(ctx, arg, nr_args); + break; + case IORING_REGISTER_ZCRX_REFILL: + ret = io_zcrx_return_bufs(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -877,6 +878,23 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered) return ERR_PTR(-EOPNOTSUPP); } +static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args) +{ + struct io_uring_sqe sqe; + + if (!arg || nr_args != 1) + return -EINVAL; + if (copy_from_user(&sqe, arg, sizeof(sqe))) + return -EFAULT; + /* no flags supported */ + if (sqe.flags) + return -EINVAL; + if (sqe.opcode != IORING_OP_MSG_RING) + return -EINVAL; + + return io_uring_sync_msg_ring(&sqe); +} + /* * "blind" registration opcodes are ones where there's no ring given, and * hence the source fd must be -1. 
@@ -885,21 +903,11 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg, unsigned int nr_args) { switch (opcode) { - case IORING_REGISTER_SEND_MSG_RING: { - struct io_uring_sqe sqe; - - if (!arg || nr_args != 1) - return -EINVAL; - if (copy_from_user(&sqe, arg, sizeof(sqe))) - return -EFAULT; - /* no flags supported */ - if (sqe.flags) - return -EINVAL; - if (sqe.opcode == IORING_OP_MSG_RING) - return io_uring_sync_msg_ring(&sqe); - } + case IORING_REGISTER_SEND_MSG_RING: + return io_uring_register_send_msg_ring(arg, nr_args); + case IORING_REGISTER_QUERY: + return io_query(NULL, arg, nr_args); } - return -EINVAL; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f75f5e43fa4a..d787c16dc1c3 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -13,6 +13,7 @@ #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "openclose.h" #include "rsrc.h" @@ -1299,10 +1300,17 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) if (src_ctx != ctx) { mutex_unlock(&ctx->uring_lock); lock_two_rings(ctx, src_ctx); + + if (src_ctx->submitter_task && + src_ctx->submitter_task != current) { + ret = -EEXIST; + goto out; + } } ret = io_clone_buffers(ctx, src_ctx, &buf); +out: if (src_ctx != ctx) mutex_unlock(&src_ctx->uring_lock); diff --git a/io_uring/rw.c b/io_uring/rw.c index 52a5b950b2e5..08882648d569 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -15,6 +15,7 @@ #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "opdef.h" #include "kbuf.h" @@ -107,34 +108,35 @@ static int io_import_vec(int ddir, struct io_kiocb *req, } static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) + struct io_async_rw *io, struct io_br_sel *sel, + unsigned int issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - void __user *buf = u64_to_user_ptr(rw->addr); size_t sqe_len = rw->len; + sel->addr = u64_to_user_ptr(rw->addr); if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) - return io_import_vec(ddir, req, io, buf, sqe_len); + return io_import_vec(ddir, req, io, sel->addr, sqe_len); if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); - if (!buf) + *sel = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); + if (!sel->addr) return -ENOBUFS; - rw->addr = (unsigned long) buf; + rw->addr = (unsigned long) sel->addr; rw->len = sqe_len; } - return import_ubuf(ddir, buf, sqe_len, &io->iter); + return import_ubuf(ddir, sel->addr, sqe_len, &io->iter); } static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, struct io_async_rw *io, + struct io_br_sel *sel, unsigned int issue_flags) { int ret; - ret = __io_import_rw_buffer(rw, req, io, issue_flags); + ret = __io_import_rw_buffer(rw, req, io, sel, issue_flags); if (unlikely(ret < 0)) return ret; @@ -153,10 +155,8 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) io_vec_free(&rw->vec); - if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; - } + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) + io_req_async_data_clear(req, 0); } static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) @@ -306,10 +306,12 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int 
io_rw_do_import(struct io_kiocb *req, int ddir) { + struct io_br_sel sel = { }; + if (io_do_buffer_select(req)) return 0; - return io_import_rw_buffer(ddir, req, req->async_data, 0); + return io_import_rw_buffer(ddir, req, req->async_data, &sel, 0); } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -576,7 +578,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); io_req_rw_cleanup(req, 0); io_req_task_complete(req, tw); @@ -645,7 +647,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } static int kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned final_ret = io_fixup_rw_res(req, ret); @@ -659,7 +661,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, * from the submission path. */ io_req_io_end(req); - io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list)); io_req_rw_cleanup(req, issue_flags); return IOU_COMPLETE; } else { @@ -886,6 +888,9 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; + if (!(file->f_mode & FMODE_HAS_METADATA)) + return -EINVAL; + /* * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. @@ -899,7 +904,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) return 0; } -static int __io_read(struct io_kiocb *req, unsigned int issue_flags) +static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, + unsigned int issue_flags) { bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); @@ -913,7 +919,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret)) return ret; } else if (io_do_buffer_select(req)) { - ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); + ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags); if (unlikely(ret < 0)) return ret; } @@ -1015,18 +1021,22 @@ done: int io_read(struct io_kiocb *req, unsigned int issue_flags) { + struct io_br_sel sel = { }; int ret; - ret = __io_read(req, issue_flags); + ret = __io_read(req, &sel, issue_flags); if (ret >= 0) - return kiocb_done(req, ret, issue_flags); + return kiocb_done(req, ret, &sel, issue_flags); + if (req->flags & REQ_F_BUFFERS_COMMIT) + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_br_sel sel = { }; unsigned int cflags = 0; int ret; @@ -1038,7 +1048,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) /* make it sync, multishot doesn't support async execution */ rw->kiocb.ki_complete = NULL; - ret = __io_read(req, issue_flags); + ret = __io_read(req, &sel, issue_flags); /* * If we get -EAGAIN, recycle our buffer and just let normal poll @@ -1049,15 +1059,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * Reset rw->len to 0 again to avoid clamping future mshot * reads, in case the buffer size varies. 
*/ - if (io_kbuf_recycle(req, issue_flags)) + if (io_kbuf_recycle(req, sel.buf_list, issue_flags)) rw->len = 0; return IOU_RETRY; } else if (ret <= 0) { - io_kbuf_recycle(req, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); if (ret < 0) req_set_fail(req); } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - cflags = io_put_kbuf(req, ret, issue_flags); + cflags = io_put_kbuf(req, ret, sel.buf_list); } else { /* * Any successful return value will keep the multishot read @@ -1065,7 +1075,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * we fail to post a CQE, or multishot is no longer set, then * jump to the termination path. This request is then done. */ - cflags = io_put_kbuf(req, ret, issue_flags); + cflags = io_put_kbuf(req, ret, sel.buf_list); rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { @@ -1194,7 +1204,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } done: - return kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, NULL, issue_flags); } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); @@ -1362,7 +1372,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); + req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); if (req->opcode != IORING_OP_URING_CMD) io_req_rw_cleanup(req, 0); } diff --git a/io_uring/splice.c b/io_uring/splice.c index 35ce4e60b495..e81ebbb91925 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -11,6 +11,7 @@ #include <uapi/linux/io_uring.h> +#include "filetable.h" #include "io_uring.h" #include "splice.h" diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7f13bfa9f2b6..17e3aab0af36 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) int ret; if (prev) { - if (!io_should_terminate_tw()) { + if (!io_should_terminate_tw(req->ctx)) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0..d1e3ba62ee8e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" +#include "kbuf.h" #include "uring_cmd.h" #include "poll.h" @@ -36,8 +37,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) { ioucmd->sqe = NULL; - req->async_data = NULL; - req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); } } @@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (io_should_terminate_tw()) + if (io_should_terminate_tw(req->ctx)) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ @@ -126,7 +126,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_uring_cmd_tw_t task_work_cb, unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -151,8 +151,8 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, 
* Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, - unsigned issue_flags) +void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, + unsigned issue_flags, bool is_cqe32) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -165,8 +165,11 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, req_set_fail(req); io_req_set_res(req, ret, 0); - if (req->ctx->flags & IORING_SETUP_CQE32) + if (is_cqe32) { + if (req->ctx->flags & IORING_SETUP_CQE_MIXED) + req->cqe.flags |= IORING_CQE_F_32; io_req_set_cqe32_extra(req, res2, 0); + } io_req_uring_cleanup(req, issue_flags); if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ @@ -180,7 +183,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, io_req_task_work_add(req); } } -EXPORT_SYMBOL_GPL(io_uring_cmd_done); +EXPORT_SYMBOL_GPL(__io_uring_cmd_done); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -194,8 +197,15 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) + return -EINVAL; req->buf_index = READ_ONCE(sqe->buf_index); + } + + if (!!(ioucmd->flags & IORING_URING_CMD_MULTISHOT) != + !!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); @@ -234,7 +244,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ctx->flags & IORING_SETUP_SQE128) issue_flags |= IO_URING_F_SQE128; - if (ctx->flags & IORING_SETUP_CQE32) + if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)) issue_flags |= IO_URING_F_CQE32; if (io_is_compat(ctx)) issue_flags |= IO_URING_F_COMPAT; @@ -251,6 +261,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ret >= 0) + return IOU_ISSUE_SKIP_COMPLETE; + } if (ret == -EAGAIN) { ioucmd->flags |= IORING_URING_CMD_REISSUE; return ret; @@ -333,3 +347,54 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, return false; return io_req_post_cqe32(req, cqe); } + +/* + * Work with io_uring_mshot_cmd_post_cqe() together for committing the + * provided buffer upfront + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return (struct io_br_sel) { .val = -EINVAL }; + + if (WARN_ON_ONCE(!io_do_buffer_select(req))) + return (struct io_br_sel) { .val = -EINVAL }; + + return io_buffer_select(req, len, buf_group, issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); + +/* + * Return true if this multishot uring_cmd needs to be completed, otherwise + * the event CQE is posted successfully. + * + * This function must use `struct io_br_sel` returned from + * io_uring_cmd_buffer_select() for committing the buffer in the same + * uring_cmd submission context. 
+ */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + unsigned int cflags = 0; + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return true; + + if (sel->val > 0) { + cflags = io_put_kbuf(req, sel->val, sel->buf_list); + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) + return false; + } + + io_kbuf_recycle(req, sel->buf_list, issue_flags); + if (sel->val < 0) + req_set_fail(req); + io_req_set_res(req, sel->val, cflags); + return true; +} +EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index e07a94694397..26c118f3918d 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -37,9 +37,7 @@ static void io_waitid_free(struct io_kiocb *req) struct io_waitid_async *iwa = req->async_data; put_pid(iwa->wo.wo_pid); - kfree(req->async_data); - req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + io_req_async_data_free(req); } static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e5ff49f3425e..723e4266b91f 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -12,6 +12,7 @@ #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> #include <net/netlink.h> +#include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/tcp.h> #include <net/rps.h> @@ -26,6 +27,8 @@ #include "zcrx.h" #include "rsrc.h" +#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) @@ -43,38 +46,42 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + unsigned niov_pages_shift; lockdep_assert(!area->mem.is_dmabuf); - return area->mem.pages[net_iov_idx(niov)]; + niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT; + return area->mem.pages[net_iov_idx(niov) << niov_pages_shift]; } static int io_populate_area_dma(struct io_zcrx_ifq *ifq, - struct io_zcrx_area *area, - struct sg_table *sgt, unsigned long off) + struct io_zcrx_area *area) { + unsigned niov_size = 1U << ifq->niov_shift; + struct sg_table *sgt = area->mem.sgt; struct scatterlist *sg; unsigned i, niov_idx = 0; for_each_sgtable_dma_sg(sgt, sg, i) { dma_addr_t dma = sg_dma_address(sg); unsigned long sg_len = sg_dma_len(sg); - unsigned long sg_off = min(sg_len, off); - off -= sg_off; - sg_len -= sg_off; - dma += sg_off; + if (WARN_ON_ONCE(sg_len % niov_size)) + return -EINVAL; while (sg_len && niov_idx < area->nia.num_niovs) { struct net_iov *niov = &area->nia.niovs[niov_idx]; if (net_mp_niov_set_dma_addr(niov, dma)) return -EFAULT; - sg_len -= PAGE_SIZE; - dma += PAGE_SIZE; + sg_len -= niov_size; + dma += niov_size; niov_idx++; } } + + if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs)) + return -EFAULT; return 0; } @@ -144,7 +151,6 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq, goto err; } - mem->dmabuf_offset = off; mem->size = len; return 0; err: @@ -152,14 +158,6 @@ err: return ret; } -static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) -{ - if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) - return -EINVAL; - return io_populate_area_dma(ifq, area, area->mem.sgt, - area->mem.dmabuf_offset); -} - static unsigned long 
io_count_account_pages(struct page **pages, unsigned nr_pages) { struct folio *last_folio = NULL; @@ -206,6 +204,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, if (ret < 0) mem->account_pages = 0; + mem->sgt = &mem->page_sg_table; mem->pages = pages; mem->nr_folios = nr_pages; mem->size = area_reg->len; @@ -220,7 +219,8 @@ static void io_release_area_mem(struct io_zcrx_mem *mem) } if (mem->pages) { unpin_user_pages(mem->pages, mem->nr_folios); - sg_free_table(&mem->page_sg_table); + sg_free_table(mem->sgt); + mem->sgt = NULL; kvfree(mem->pages); } } @@ -231,6 +231,13 @@ static int io_import_area(struct io_zcrx_ifq *ifq, { int ret; + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) + return -EINVAL; + if (area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); if (ret) return ret; @@ -247,7 +254,7 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, { int i; - guard(mutex)(&ifq->dma_lock); + guard(mutex)(&ifq->pp_lock); if (!area->is_mapped) return; area->is_mapped = false; @@ -263,47 +270,42 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, } } -static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) -{ - int ret; - - ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, - DMA_FROM_DEVICE, IO_DMA_ATTR); - if (ret < 0) - return ret; - return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0); -} - static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { int ret; - guard(mutex)(&ifq->dma_lock); + guard(mutex)(&ifq->pp_lock); if (area->is_mapped) return 0; - if (area->mem.is_dmabuf) - ret = io_zcrx_map_area_dmabuf(ifq, area); - else - ret = io_zcrx_map_area_umem(ifq, area); + if (!area->mem.is_dmabuf) { + ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (ret < 0) + return ret; + } + ret = io_populate_area_dma(ifq, area); if (ret == 0) area->is_mapped = true; return ret; } -static void io_zcrx_sync_for_device(const struct page_pool *pool, +static void io_zcrx_sync_for_device(struct page_pool *pool, struct net_iov *niov) { #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) dma_addr_t dma_addr; + unsigned niov_size; + if (!dma_dev_need_sync(pool->p.dev)) return; + niov_size = 1U << io_pp_to_ifq(pool)->niov_shift; dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, - PAGE_SIZE, pool->p.dma_dir); + niov_size, pool->p.dma_dir); #endif } @@ -352,7 +354,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, void *ptr; int ret; - off = sizeof(struct io_uring); + off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; if (size > rd->size) return -EINVAL; @@ -367,6 +369,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, ptr = io_region_get_ptr(&ifq->region); ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + + reg->offsets.head = offsetof(struct io_uring, head); + reg->offsets.tail = offsetof(struct io_uring, tail); + reg->offsets.rqes = off; return 0; } @@ -391,23 +397,22 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) kfree(area); } -#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) +static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) +{ + if (ifq->area) + 
return -EINVAL; + ifq->area = area; + return 0; +} static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, - struct io_zcrx_area **res, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; unsigned nr_iovs; int i, ret; - if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) - return -EINVAL; - if (area_reg->rq_area_token) - return -EINVAL; - if (area_reg->__resv2[0] || area_reg->__resv2[1]) - return -EINVAL; - ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) @@ -418,22 +423,23 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (ret) goto err; - nr_iovs = area->mem.size >> PAGE_SHIFT; + ifq->niov_shift = PAGE_SHIFT; + nr_iovs = area->mem.size >> ifq->niov_shift; area->nia.num_niovs = nr_iovs; ret = -ENOMEM; area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!area->nia.niovs) goto err; area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!area->freelist) goto err; area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!area->user_refs) goto err; @@ -451,8 +457,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, area->area_id = 0; area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; spin_lock_init(&area->freelist_lock); - *res = area; - return 0; + + ret = io_zcrx_append_area(ifq, area); + if (!ret) + return 0; err: if (area) io_zcrx_free_area(area); @@ -469,20 +477,19 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) ifq->if_rxq = -1; ifq->ctx = ctx; - spin_lock_init(&ifq->lock); spin_lock_init(&ifq->rq_lock); - mutex_init(&ifq->dma_lock); + mutex_init(&ifq->pp_lock); return ifq; } static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) { - spin_lock(&ifq->lock); - if (ifq->netdev) { - netdev_put(ifq->netdev, &ifq->netdev_tracker); - ifq->netdev = NULL; - } - spin_unlock(&ifq->lock); + guard(mutex)(&ifq->pp_lock); + + if (!ifq->netdev) + return; + netdev_put(ifq->netdev, &ifq->netdev_tracker); + ifq->netdev = NULL; } static void io_close_queue(struct io_zcrx_ifq *ifq) @@ -497,11 +504,11 @@ static void io_close_queue(struct io_zcrx_ifq *ifq) if (ifq->if_rxq == -1) return; - spin_lock(&ifq->lock); - netdev = ifq->netdev; - netdev_tracker = ifq->netdev_tracker; - ifq->netdev = NULL; - spin_unlock(&ifq->lock); + scoped_guard(mutex, &ifq->pp_lock) { + netdev = ifq->netdev; + netdev_tracker = ifq->netdev_tracker; + ifq->netdev = NULL; + } if (netdev) { net_mp_close_rxq(netdev, ifq->if_rxq, &p); @@ -513,7 +520,6 @@ static void io_close_queue(struct io_zcrx_ifq *ifq) static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) { io_close_queue(ifq); - io_zcrx_drop_netdev(ifq); if (ifq->area) io_zcrx_free_area(ifq->area); @@ -521,7 +527,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) put_device(ifq->dev); io_free_rbuf_ring(ifq); - mutex_destroy(&ifq->dma_lock); + mutex_destroy(&ifq->pp_lock); kfree(ifq); } @@ -554,14 +560,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EPERM; /* mandatory io_uring features for zc rx */ - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && - ctx->flags & IORING_SETUP_CQE32)) + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) return -EINVAL; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (copy_from_user(&rd, 
u64_to_user_ptr(reg.region_ptr), sizeof(rd))) return -EFAULT; - if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)) || + if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.__resv2 || reg.zcrx_id) return -EINVAL; if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) @@ -599,14 +606,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; } - ifq->dev = ifq->netdev->dev.parent; + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); if (!ifq->dev) { ret = -EOPNOTSUPP; goto err; } get_device(ifq->dev); - ret = io_zcrx_create_area(ifq, &ifq->area, &area); + ret = io_zcrx_create_area(ifq, &area); if (ret) goto err; @@ -617,9 +624,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; ifq->if_rxq = reg.if_rxq; - reg.offsets.rqes = sizeof(struct io_uring); - reg.offsets.head = offsetof(struct io_uring, head); - reg.offsets.tail = offsetof(struct io_uring, tail); reg.zcrx_id = id; scoped_guard(mutex, &ctx->mmap_lock) { @@ -747,45 +751,53 @@ static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, return &ifq->rqes[idx]; } +static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, + struct io_zcrx_ifq *ifq, + struct net_iov **ret_niov) +{ + unsigned niov_idx, area_idx; + struct io_zcrx_area *area; + + area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; + niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; + + if (unlikely(rqe->__pad || area_idx)) + return false; + area = ifq->area; + + if (unlikely(niov_idx >= area->nia.num_niovs)) + return false; + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); + + *ret_niov = &area->nia.niovs[niov_idx]; + return true; +} + static void io_zcrx_ring_refill(struct page_pool *pp, struct io_zcrx_ifq *ifq) { unsigned int mask = ifq->rq_entries - 1; unsigned int entries; - netmem_ref netmem; - spin_lock_bh(&ifq->rq_lock); + guard(spinlock_bh)(&ifq->rq_lock); entries = io_zcrx_rqring_entries(ifq); - entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count); - if (unlikely(!entries)) { - spin_unlock_bh(&ifq->rq_lock); + entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL); + if (unlikely(!entries)) return; - } do { struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); - struct io_zcrx_area *area; struct net_iov *niov; - unsigned niov_idx, area_idx; - - area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; - niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT; + netmem_ref netmem; - if (unlikely(rqe->__pad || area_idx)) + if (!io_parse_rqe(rqe, ifq, &niov)) continue; - area = ifq->area; - - if (unlikely(niov_idx >= area->nia.num_niovs)) - continue; - niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); - - niov = &area->nia.niovs[niov_idx]; if (!io_zcrx_put_niov_uref(niov)) continue; netmem = net_iov_to_netmem(niov); - if (page_pool_unref_netmem(netmem, 1) != 0) + if (!page_pool_unref_and_test(netmem)) continue; if (unlikely(niov->pp != pp)) { @@ -798,7 +810,6 @@ static void io_zcrx_ring_refill(struct page_pool *pp, } while (--entries); smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); - spin_unlock_bh(&ifq->rq_lock); } static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) @@ -860,8 +871,8 @@ static int io_pp_zc_init(struct page_pool *pp) return -EINVAL; if (WARN_ON_ONCE(!pp->dma_map)) return -EOPNOTSUPP; - if (pp->p.order != 0) - return -EOPNOTSUPP; + if (pp->p.order + PAGE_SHIFT != ifq->niov_shift) + return -EINVAL; if (pp->p.dma_dir != DMA_FROM_DEVICE) return -EOPNOTSUPP; @@ -917,33 +928,108 @@ static const struct 
memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; +#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) +#define IO_ZCRX_SYS_REFILL_BATCH 32 + +static void io_return_buffers(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_rqe *rqes, unsigned nr) +{ + int i; + + for (i = 0; i < nr; i++) { + struct net_iov *niov; + netmem_ref netmem; + + if (!io_parse_rqe(&rqes[i], ifq, &niov)) + continue; + + scoped_guard(spinlock_bh, &ifq->rq_lock) { + if (!io_zcrx_put_niov_uref(niov)) + continue; + } + + netmem = net_iov_to_netmem(niov); + if (!page_pool_unref_and_test(netmem)) + continue; + io_zcrx_return_niov(niov); + } +} + +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; + struct io_uring_zcrx_rqe __user *user_rqes; + struct io_uring_zcrx_sync_refill zr; + struct io_zcrx_ifq *ifq; + unsigned nr, i; + + if (nr_arg) + return -EINVAL; + if (copy_from_user(&zr, arg, sizeof(zr))) + return -EFAULT; + if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) + return -EINVAL; + if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) + return -EINVAL; + + ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); + if (!ifq) + return -EINVAL; + nr = zr.nr_entries; + user_rqes = u64_to_user_ptr(zr.rqes); + + for (i = 0; i < nr;) { + unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); + size_t size = batch * sizeof(rqes[0]); + + if (copy_from_user(rqes, user_rqes + i, size)) + return i ? i : -EFAULT; + io_return_buffers(ifq, rqes, batch); + + i += batch; + + if (fatal_signal_pending(current)) + return i; + cond_resched(); + } + return nr; +} + static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { + struct io_ring_ctx *ctx = req->ctx; struct io_uring_zcrx_cqe *rcqe; struct io_zcrx_area *area; struct io_uring_cqe *cqe; u64 offset; - if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) + if (!io_defer_get_uncommited_cqe(ctx, &cqe)) return false; cqe->user_data = req->cqe.user_data; cqe->res = len; cqe->flags = IORING_CQE_F_MORE; + if (ctx->flags & IORING_SETUP_CQE_MIXED) + cqe->flags |= IORING_CQE_F_32; area = io_zcrx_iov_to_area(niov); - offset = off + (net_iov_idx(niov) << PAGE_SHIFT); + offset = off + (net_iov_idx(niov) << ifq->niov_shift); rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); rcqe->__pad = 0; return true; } -static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) +static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) { + struct io_zcrx_area *area = ifq->area; struct net_iov *niov = NULL; + if (area->mem.is_dmabuf) + return NULL; + spin_lock_bh(&area->freelist_lock); if (area->free_count) niov = __io_zcrx_get_free_niov(area); @@ -975,9 +1061,9 @@ static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, if (folio_test_partial_kmap(page_folio(dst_page)) || folio_test_partial_kmap(page_folio(src_page))) { - dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE); + dst_page += dst_offset / PAGE_SIZE; dst_offset = offset_in_page(dst_offset); - src_page = nth_page(src_page, src_offset / PAGE_SIZE); + src_page += src_offset / PAGE_SIZE; src_offset = offset_in_page(src_offset); n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); n = min(n, len); @@ -1003,19 +1089,15 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct page *src_page, unsigned int src_offset, size_t len) { - 
struct io_zcrx_area *area = ifq->area; size_t copied = 0; int ret = 0; - if (area->mem.is_dmabuf) - return -EFAULT; - while (len) { struct io_copy_cache cc; struct net_iov *niov; size_t n; - niov = io_zcrx_alloc_fallback(area); + niov = io_alloc_fallback_niov(ifq); if (!niov) { ret = -ENOMEM; break; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 109c4ca36434..33ef61503092 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -16,11 +16,10 @@ struct io_zcrx_mem { unsigned long nr_folios; struct sg_table page_sg_table; unsigned long account_pages; + struct sg_table *sgt; struct dma_buf_attachment *attach; struct dma_buf *dmabuf; - struct sg_table *sgt; - unsigned long dmabuf_offset; }; struct io_zcrx_area { @@ -42,6 +41,7 @@ struct io_zcrx_area { struct io_zcrx_ifq { struct io_ring_ctx *ctx; struct io_zcrx_area *area; + unsigned niov_shift; spinlock_t rq_lock ____cacheline_aligned_in_smp; struct io_uring *rq_ring; @@ -53,12 +53,18 @@ struct io_zcrx_ifq { struct device *dev; struct net_device *netdev; netdevice_tracker netdev_tracker; - spinlock_t lock; - struct mutex dma_lock; + + /* + * Page pool and net configuration lock, can be taken deeper in the + * net stack. + */ + struct mutex pp_lock; struct io_mapped_region region; }; #if defined(CONFIG_IO_URING_ZCRX) +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); @@ -91,6 +97,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct { return NULL; } +static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + return -EOPNOTSUPP; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); |
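Tying the uring_cmd.c hunks above together: io_uring_cmd_buffer_select() and io_uring_mshot_cmd_post_cqe() are meant to be paired by a driver handling an IORING_URING_CMD_MULTISHOT command. The following kernel-side sketch is an assumption about the intended call pattern; drv_copy_event(), the buffer group value, and the completion call on the termination path are all hypothetical, only the two helpers and struct io_br_sel come from this patch.

/* struct io_br_sel and the helpers are assumed to live in <linux/io_uring/cmd.h> */
#include <linux/io_uring/cmd.h>

/* hypothetical driver helper: copy_to_user() one event, return bytes or -errno */
static ssize_t drv_copy_event(struct io_uring_cmd *cmd, void __user *buf,
			      size_t len);

static void drv_post_event(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	size_t len = 0;
	struct io_br_sel sel;

	/* pick a provided buffer; the helper rejects non-multishot commands */
	sel = io_uring_cmd_buffer_select(cmd, 0 /* assumed buf_group */,
					 &len, issue_flags);
	if (sel.val >= 0)
		/* fill the selected user buffer, record bytes produced */
		sel.val = drv_copy_event(cmd, sel.addr, len);

	/*
	 * False: a CQE with IORING_CQE_F_MORE was posted and the command
	 * stays armed.  True: the multishot command needs to be completed,
	 * presumably via io_uring_cmd_done() or equivalent.
	 */
	if (io_uring_mshot_cmd_post_cqe(cmd, &sel, issue_flags))
		io_uring_cmd_done(cmd, sel.val, 0, issue_flags);
}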
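Finally, the IORING_REGISTER_ZCRX_REFILL opcode handled by io_zcrx_return_bufs() above gives userspace a syscall path for returning zcrx buffers in bulk, independent of the shared refill ring. A minimal sketch under the following assumptions: the io_uring_zcrx_sync_refill layout matches the fields used by the kernel code (zcrx_id, nr_entries, rqes pointer, reserved bytes), and each rqe simply echoes back the off value received in the corresponding io_uring_zcrx_cqe.

/*
 * Hypothetical userspace sketch: hand a batch of consumed zcrx buffers
 * back to the kernel via IORING_REGISTER_ZCRX_REFILL.
 */
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int zcrx_return_bufs(int ring_fd, uint32_t zcrx_id,
			    struct io_uring_zcrx_rqe *rqes, uint32_t nr)
{
	struct io_uring_zcrx_sync_refill zr;

	memset(&zr, 0, sizeof(zr));	/* __resv must be zero */
	zr.zcrx_id = zcrx_id;
	zr.nr_entries = nr;		/* 1 .. 65536 per the kernel-side cap */
	zr.rqes = (uint64_t)(uintptr_t)rqes;

	/*
	 * Returns the number of entries processed (possibly short if a
	 * fatal signal arrived); nr_args must be 0.
	 */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_REFILL, &zr, 0);
}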