Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r-- | io_uring/io_uring.c | 155 |
1 file changed, 100 insertions, 55 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 93633613a165..49ebdeb5b2d9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -79,6 +79,7 @@
 #include "io-wq.h"
+#include "filetable.h"
 #include "io_uring.h"
 #include "opdef.h"
 #include "refs.h"
@@ -108,9 +109,6 @@
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 
-#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
-			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
-
 #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
 
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
@@ -179,6 +177,26 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = {
 };
 #endif
 
+static void io_poison_cached_req(struct io_kiocb *req)
+{
+	req->ctx = IO_URING_PTR_POISON;
+	req->tctx = IO_URING_PTR_POISON;
+	req->file = IO_URING_PTR_POISON;
+	req->creds = IO_URING_PTR_POISON;
+	req->io_task_work.func = IO_URING_PTR_POISON;
+	req->apoll = IO_URING_PTR_POISON;
+}
+
+static void io_poison_req(struct io_kiocb *req)
+{
+	io_poison_cached_req(req);
+	req->async_data = IO_URING_PTR_POISON;
+	req->kbuf = IO_URING_PTR_POISON;
+	req->comp_list.next = IO_URING_PTR_POISON;
+	req->file_node = IO_URING_PTR_POISON;
+	req->link = IO_URING_PTR_POISON;
+}
+
 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 {
 	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
@@ -235,6 +253,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
 
 static inline void io_req_add_to_cache(struct io_kiocb *req,
				       struct io_ring_ctx *ctx)
 {
+	if (IS_ENABLED(CONFIG_KASAN))
+		io_poison_cached_req(req);
 	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 }
@@ -290,7 +310,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
 	io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
-	io_alloc_cache_free(&ctx->msg_cache, kfree);
 	io_futex_cache_free(ctx);
 	io_rsrc_cache_free(ctx);
 }
@@ -337,9 +356,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
				   sizeof(struct io_async_cmd),
				   sizeof(struct io_async_cmd));
-	spin_lock_init(&ctx->msg_lock);
-	ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
-				   sizeof(struct io_kiocb), 0);
 	ret |= io_futex_cache_init(ctx);
 	ret |= io_rsrc_cache_init(ctx);
 	if (ret)
@@ -598,27 +614,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
 
 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 {
-	size_t cqe_size = sizeof(struct io_uring_cqe);
-
 	lockdep_assert_held(&ctx->uring_lock);
 
 	/* don't abort if we're dying, entries must get freed */
 	if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
 		return;
 
-	if (ctx->flags & IORING_SETUP_CQE32)
-		cqe_size <<= 1;
-
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
+		size_t cqe_size = sizeof(struct io_uring_cqe);
 		struct io_uring_cqe *cqe;
 		struct io_overflow_cqe *ocqe;
+		bool is_cqe32 = false;
 
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
+		if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+		    ctx->flags & IORING_SETUP_CQE32) {
+			is_cqe32 = true;
+			cqe_size <<= 1;
+		}
 
 		if (!dying) {
-			if (!io_get_cqe_overflow(ctx, &cqe, true))
+			if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
				break;
			memcpy(cqe, &ocqe->cqe, cqe_size);
		}
@@ -730,10 +748,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
-	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
 
-	if (is_cqe32)
+	bool is_cqe32 = false;
+
+	if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+		is_cqe32 = true;
 		ocq_size += sizeof(struct io_uring_cqe);
+	}
 
 	ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
 	trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
@@ -752,11 +772,29 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 }
 
 /*
+ * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
+ * because the ring is a single 16b entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+	if (__io_cqring_events(ctx) < ctx->cq_entries) {
+		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+		cqe->user_data = 0;
+		cqe->res = 0;
+		cqe->flags = IORING_CQE_F_SKIP;
+		ctx->cached_cq_tail++;
+		return true;
+	}
+	return false;
+}
+
+/*
  * writes to the cq entry need to come after reading head; the
  * control dependency is enough as we're using WRITE_ONCE to
  * fill the cq entry
  */
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
@@ -770,12 +808,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
 	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
 		return false;
 
+	/*
+	 * Post dummy CQE if a 32b CQE is needed and there's only room for a
+	 * 16b CQE before the ring wraps.
+	 */
+	if (cqe32 && off + 1 == ctx->cq_entries) {
+		if (!io_fill_nop_cqe(ctx, off))
+			return false;
+		off = 0;
+	}
+
 	/* userspace may cheat modifying the tail, be safe and do min */
 	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
 	free = ctx->cq_entries - queued;
 	/* we need a contiguous range, limit based on the current array offset */
 	len = min(free, ctx->cq_entries - off);
-	if (!len)
+	if (len < (cqe32 + 1))
 		return false;
 
 	if (ctx->flags & IORING_SETUP_CQE32) {
@@ -793,9 +841,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
 {
 	struct io_uring_cqe *cqe;
 
-	if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+	if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
 		return false;
-	if (unlikely(!io_get_cqe(ctx, &cqe)))
+	if (unlikely(!io_get_cqe(ctx, &cqe, true)))
 		return false;
 
 	memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
@@ -806,14 +854,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
 static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			    u32 cflags)
 {
+	bool cqe32 = cflags & IORING_CQE_F_32;
 	struct io_uring_cqe *cqe;
 
-	if (likely(io_get_cqe(ctx, &cqe))) {
+	if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);
 
-		if (ctx->flags & IORING_SETUP_CQE32) {
+		if (cqe32) {
 			WRITE_ONCE(cqe->big_cqe[0], 0);
 			WRITE_ONCE(cqe->big_cqe[1], 0);
 		}
@@ -985,7 +1034,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
 	lockdep_assert_held(&req->ctx->uring_lock);
 
 	req_set_fail(req);
-	io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED));
+	io_req_set_res(req, res, io_put_kbuf(req, res, NULL));
 	if (def->fail)
 		def->fail(req);
 	io_req_complete_defer(req);
@@ -1406,8 +1455,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw)
 
 void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
 {
-	io_tw_lock(req->ctx, tw);
-	if (unlikely(io_should_terminate_tw()))
+	struct io_ring_ctx *ctx = req->ctx;
+
+	io_tw_lock(ctx, tw);
+	if (unlikely(io_should_terminate_tw(ctx)))
 		io_req_defer_failed(req, -EFAULT);
 	else if (req->flags & REQ_F_FORCE_ASYNC)
 		io_queue_iowq(req);
@@ -2003,11 +2054,9 @@ fail:
 
 	switch (io_arm_poll_handler(req, 0)) {
 	case IO_APOLL_READY:
-		io_kbuf_recycle(req, 0);
 		io_req_task_queue(req);
 		break;
 	case IO_APOLL_ABORTED:
-		io_kbuf_recycle(req, 0);
 		io_queue_iowq(req);
 		break;
 	case IO_APOLL_OK:
@@ -2736,6 +2785,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
 		if (check_shl_overflow(off, 1, &off))
 			return SIZE_MAX;
 	}
+	if (flags & IORING_SETUP_CQE_MIXED) {
+		if (cq_entries < 2)
+			return SIZE_MAX;
+	}
 
 #ifdef CONFIG_SMP
 	off = ALIGN(off, SMP_CACHE_BYTES);
@@ -2767,6 +2820,7 @@ static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
 
 	while (!io_req_cache_empty(ctx)) {
 		req = io_extract_req(ctx);
+		io_poison_req(req);
 		kmem_cache_free(req_cachep, req);
 		nr++;
 	}
@@ -3047,10 +3101,10 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 
 	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 	/*
-	 * Use system_unbound_wq to avoid spawning tons of event kworkers
+	 * Use system_dfl_wq to avoid spawning tons of event kworkers
 	 * if we're exiting a ton of rings at the same time. It just adds
 	 * noise and overhead, there's no discernable change in runtime
-	 * over using system_wq.
+	 * over using system_percpu_wq.
 	 */
 	queue_work(iou_wq, &ctx->exit_work);
 }
@@ -3404,12 +3458,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	struct file *file;
 	long ret;
 
-	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
-			       IORING_ENTER_REGISTERED_RING |
-			       IORING_ENTER_ABS_TIMER |
-			       IORING_ENTER_EXT_ARG_REG |
-			       IORING_ENTER_NO_IOWAIT)))
+	if (unlikely(flags & ~IORING_ENTER_FLAGS))
 		return -EINVAL;
 
 	/*
@@ -3659,6 +3708,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
 	    !(flags & IORING_SETUP_SINGLE_ISSUER))
 		return -EINVAL;
 
+	/*
+	 * Nonsensical to ask for CQE32 and mixed CQE support, it's not
+	 * supported to post 16b CQEs on a ring setup with CQE32.
+	 */
+	if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+	    (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+		return -EINVAL;
+
 	return 0;
 }
@@ -3809,15 +3866,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (ret)
 		goto err;
 
-	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
-			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
-			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
-			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
-			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
-			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
-			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
-			IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
-			IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT;
+	p->features = IORING_FEAT_FLAGS;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
@@ -3825,8 +3874,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	}
 
 	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
-	    && !(ctx->flags & IORING_SETUP_R_DISABLED))
-		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
+	    && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
+		/*
+		 * Unlike io_register_enable_rings(), don't need WRITE_ONCE()
+		 * since ctx isn't yet accessible from other tasks
+		 */
+		ctx->submitter_task = get_task_struct(current);
+	}
 
 	file = io_uring_get_file(ctx);
 	if (IS_ERR(file)) {
@@ -3877,17 +3931,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			return -EINVAL;
 	}
 
-	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
-			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
-			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
-			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
-			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
-			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
-			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
-			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+	if (p.flags & ~IORING_SETUP_FLAGS)
 		return -EINVAL;
-
 	return io_uring_create(entries, &p, params);
 }
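Not part of the diff above: a minimal, hypothetical userspace sketch of how a consumer might reap completions from a CQ ring created with IORING_SETUP_CQE_MIXED, following the semantics the patch introduces. On a mixed ring, a CQE flagged IORING_CQE_F_32 occupies two 16b slots, and the dummy CQE the kernel posts with IORING_CQE_F_SKIP (to keep a 32b CQE from wrapping) is simply skipped. The struct my_cq layout, reap_mixed_cqes() and handle_cqe() are illustrative assumptions, not liburing or kernel API; the flag definitions are assumed to come from the updated <linux/io_uring.h> uapi header.

/*
 * Hypothetical consumer-side sketch for a mixed-size CQ ring; not from the
 * patch or from liburing. Assumes khead/ktail/cqes point into the mmap'ed
 * CQ ring of a ring set up with IORING_SETUP_CQE_MIXED.
 */
#include <linux/io_uring.h>
#include <stdatomic.h>

struct my_cq {
	_Atomic unsigned *khead;	/* CQ head, shared with the kernel */
	_Atomic unsigned *ktail;	/* CQ tail, shared with the kernel */
	unsigned ring_mask;		/* cq_entries - 1 */
	struct io_uring_cqe *cqes;	/* mmap'ed array of 16b CQE slots */
};

static void reap_mixed_cqes(struct my_cq *cq,
			    void (*handle_cqe)(struct io_uring_cqe *))
{
	unsigned head = atomic_load_explicit(cq->khead, memory_order_relaxed);
	unsigned tail = atomic_load_explicit(cq->ktail, memory_order_acquire);

	while (head != tail) {
		struct io_uring_cqe *cqe = &cq->cqes[head & cq->ring_mask];
		/* a 32b CQE consumes two ring slots on a mixed ring */
		unsigned nr_slots = (cqe->flags & IORING_CQE_F_32) ? 2 : 1;

		/* ignore the dummy CQE posted to avoid wrapping a 32b CQE */
		if (!(cqe->flags & IORING_CQE_F_SKIP))
			handle_cqe(cqe);
		head += nr_slots;
	}
	/* publish the new head so the kernel can reuse the slots */
	atomic_store_explicit(cq->khead, head, memory_order_release);
}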