Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--  io_uring/io_uring.c  155
1 file changed, 100 insertions(+), 55 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 93633613a165..49ebdeb5b2d9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -79,6 +79,7 @@
#include "io-wq.h"
+#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
@@ -108,9 +109,6 @@
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
-#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
- IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
-
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
@@ -179,6 +177,26 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = {
};
#endif
+static void io_poison_cached_req(struct io_kiocb *req)
+{
+ req->ctx = IO_URING_PTR_POISON;
+ req->tctx = IO_URING_PTR_POISON;
+ req->file = IO_URING_PTR_POISON;
+ req->creds = IO_URING_PTR_POISON;
+ req->io_task_work.func = IO_URING_PTR_POISON;
+ req->apoll = IO_URING_PTR_POISON;
+}
+
+static void io_poison_req(struct io_kiocb *req)
+{
+ io_poison_cached_req(req);
+ req->async_data = IO_URING_PTR_POISON;
+ req->kbuf = IO_URING_PTR_POISON;
+ req->comp_list.next = IO_URING_PTR_POISON;
+ req->file_node = IO_URING_PTR_POISON;
+ req->link = IO_URING_PTR_POISON;
+}
+
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
@@ -235,6 +253,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
+ if (IS_ENABLED(CONFIG_KASAN))
+ io_poison_cached_req(req);
wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}
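
The new io_poison_cached_req()/io_poison_req() helpers scribble IO_URING_PTR_POISON over a request's pointer fields when it is recycled into the free list (only under CONFIG_KASAN, to keep the fast path cheap) or handed back to the slab, so any later use of a stale field faults at an obviously bogus address instead of silently reusing old state. Below is a minimal user-space sketch of the same pattern; the struct and poison constant are hypothetical stand-ins for io_kiocb and IO_URING_PTR_POISON, not the kernel definitions.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for io_kiocb and IO_URING_PTR_POISON. */
#define PTR_POISON	((void *)(uintptr_t)0xdead4ead)

struct request {
	void *ctx;
	void *file;
	void *link;
};

/* Scribble over the pointer fields before the object is recycled, so a
 * stale dereference crashes at an obviously bogus address. */
static void poison_request(struct request *req)
{
	req->ctx  = PTR_POISON;
	req->file = PTR_POISON;
	req->link = PTR_POISON;
}

int main(void)
{
	struct request req = { .ctx = &req };

	poison_request(&req);			/* as if returning it to a cache */
	printf("ctx after recycle: %p\n", req.ctx);
	return 0;
}

Note that the cached variant only poisons fields that a recycled request never trusts again, while the full io_poison_req(), called right before kmem_cache_free() in __io_req_caches_free(), also covers comp_list.next, file_node and link.
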
@@ -290,7 +310,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
- io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_rsrc_cache_free(ctx);
}
@@ -337,9 +356,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_cmd),
sizeof(struct io_async_cmd));
- spin_lock_init(&ctx->msg_lock);
- ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_kiocb), 0);
ret |= io_futex_cache_init(ctx);
ret |= io_rsrc_cache_init(ctx);
if (ret)
@@ -598,27 +614,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{
- size_t cqe_size = sizeof(struct io_uring_cqe);
-
lockdep_assert_held(&ctx->uring_lock);
/* don't abort if we're dying, entries must get freed */
if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
return;
- if (ctx->flags & IORING_SETUP_CQE32)
- cqe_size <<= 1;
-
io_cq_lock(ctx);
while (!list_empty(&ctx->cq_overflow_list)) {
+ size_t cqe_size = sizeof(struct io_uring_cqe);
struct io_uring_cqe *cqe;
struct io_overflow_cqe *ocqe;
+ bool is_cqe32 = false;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
+ if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+ ctx->flags & IORING_SETUP_CQE32) {
+ is_cqe32 = true;
+ cqe_size <<= 1;
+ }
if (!dying) {
- if (!io_get_cqe_overflow(ctx, &cqe, true))
+ if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
break;
memcpy(cqe, &ocqe->cqe, cqe_size);
}
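
With IORING_SETUP_CQE_MIXED a ring can hold both 16b and 32b completions, so neither io_alloc_ocqe() nor the overflow flush can size entries once per ring any more: each overflowed CQE carries IORING_CQE_F_32 and its copy size is decided per entry. A stand-alone sketch of that per-entry size selection follows; the struct layout and flag bit values are illustrative stand-ins, not the uapi definitions.

#include <stddef.h>
#include <stdio.h>

/* Illustrative flag bits; the real values come from <linux/io_uring.h>. */
#define SETUP_CQE32	(1U << 0)	/* whole ring uses 32b CQEs */
#define CQE_F_32	(1U << 1)	/* this one completion is a 32b CQE */

struct cqe16 {
	unsigned long long user_data;
	int res;
	unsigned int flags;
};

/* One completion is 32 bytes if the ring is CQE32-only, or if this
 * particular entry was posted as a big CQE on a mixed ring. */
static size_t cqe_copy_size(unsigned int ring_flags, unsigned int cqe_flags)
{
	size_t size = sizeof(struct cqe16);

	if ((cqe_flags & CQE_F_32) || (ring_flags & SETUP_CQE32))
		size <<= 1;
	return size;
}

int main(void)
{
	printf("%zu\n", cqe_copy_size(0, 0));		/* typically 16 */
	printf("%zu\n", cqe_copy_size(0, CQE_F_32));	/* 32 */
	printf("%zu\n", cqe_copy_size(SETUP_CQE32, 0));	/* 32 */
	return 0;
}
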
@@ -730,10 +748,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
{
struct io_overflow_cqe *ocqe;
size_t ocq_size = sizeof(struct io_overflow_cqe);
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+ bool is_cqe32 = false;
- if (is_cqe32)
+ if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+ is_cqe32 = true;
ocq_size += sizeof(struct io_uring_cqe);
+ }
ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
@@ -752,11 +772,29 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
}
/*
+ * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
+ * because the ring is a single 16b entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+ if (__io_cqring_events(ctx) < ctx->cq_entries) {
+ struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+ cqe->user_data = 0;
+ cqe->res = 0;
+ cqe->flags = IORING_CQE_F_SKIP;
+ ctx->cached_cq_tail++;
+ return true;
+ }
+ return false;
+}
+
+/*
* writes to the cq entry need to come after reading head; the
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
@@ -770,12 +808,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
return false;
+ /*
+ * Post dummy CQE if a 32b CQE is needed and there's only room for a
+ * 16b CQE before the ring wraps.
+ */
+ if (cqe32 && off + 1 == ctx->cq_entries) {
+ if (!io_fill_nop_cqe(ctx, off))
+ return false;
+ off = 0;
+ }
+
/* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
free = ctx->cq_entries - queued;
/* we need a contiguous range, limit based on the current array offset */
len = min(free, ctx->cq_entries - off);
- if (!len)
+ if (len < (cqe32 + 1))
return false;
if (ctx->flags & IORING_SETUP_CQE32) {
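
The case io_fill_nop_cqe() handles: on a mixed ring a 32b CQE needs two contiguous 16b slots, so if the cached tail sits on the very last slot before the wrap, a 16b nop CQE (user_data 0, flagged IORING_CQE_F_SKIP) pads that slot and the big CQE starts at index 0. This is also why the rings_size() change further down rejects CQE_MIXED rings with fewer than two CQ entries. A toy walk-through of the index arithmetic, with made-up numbers:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the tail/index math: cq_entries is a power of two and the
 * slot index is the cached tail masked by (entries - 1), as in
 * io_cqe_cache_refill(). The values here are only for illustration. */
int main(void)
{
	unsigned int cq_entries = 8;	/* ring holds 8 x 16b slots */
	unsigned int cached_tail = 15;	/* next slot is index 7, the last one */
	unsigned int off = cached_tail & (cq_entries - 1);
	bool need_cqe32 = true;		/* posting a 32b CQE on a mixed ring */

	if (need_cqe32 && off + 1 == cq_entries) {
		/* Only one 16b slot left before the wrap: pad it with a nop
		 * CQE and start the 32b CQE at index 0 instead. */
		printf("nop CQE at index %u\n", off);
		cached_tail++;
		off = 0;
	}
	printf("32b CQE occupies indexes %u and %u\n", off, off + 1);
	return 0;
}
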
@@ -793,9 +841,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
{
struct io_uring_cqe *cqe;
- if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+ if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
return false;
- if (unlikely(!io_get_cqe(ctx, &cqe)))
+ if (unlikely(!io_get_cqe(ctx, &cqe, true)))
return false;
memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
@@ -806,14 +854,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
+ bool cqe32 = cflags & IORING_CQE_F_32;
struct io_uring_cqe *cqe;
- if (likely(io_get_cqe(ctx, &cqe))) {
+ if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (cqe32) {
WRITE_ONCE(cqe->big_cqe[0], 0);
WRITE_ONCE(cqe->big_cqe[1], 0);
}
@@ -985,7 +1034,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
lockdep_assert_held(&req->ctx->uring_lock);
req_set_fail(req);
- io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED));
+ io_req_set_res(req, res, io_put_kbuf(req, res, NULL));
if (def->fail)
def->fail(req);
io_req_complete_defer(req);
@@ -1406,8 +1455,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw)
void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
{
- io_tw_lock(req->ctx, tw);
- if (unlikely(io_should_terminate_tw()))
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_tw_lock(ctx, tw);
+ if (unlikely(io_should_terminate_tw(ctx)))
io_req_defer_failed(req, -EFAULT);
else if (req->flags & REQ_F_FORCE_ASYNC)
io_queue_iowq(req);
@@ -2003,11 +2054,9 @@ fail:
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
- io_kbuf_recycle(req, 0);
io_req_task_queue(req);
break;
case IO_APOLL_ABORTED:
- io_kbuf_recycle(req, 0);
io_queue_iowq(req);
break;
case IO_APOLL_OK:
@@ -2736,6 +2785,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
if (check_shl_overflow(off, 1, &off))
return SIZE_MAX;
}
+ if (flags & IORING_SETUP_CQE_MIXED) {
+ if (cq_entries < 2)
+ return SIZE_MAX;
+ }
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
@@ -2767,6 +2820,7 @@ static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
+ io_poison_req(req);
kmem_cache_free(req_cachep, req);
nr++;
}
@@ -3047,10 +3101,10 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
- * Use system_unbound_wq to avoid spawning tons of event kworkers
+ * Use system_dfl_wq to avoid spawning tons of event kworkers
* if we're exiting a ton of rings at the same time. It just adds
* noise and overhead, there's no discernable change in runtime
- * over using system_wq.
+ * over using system_percpu_wq.
*/
queue_work(iou_wq, &ctx->exit_work);
}
@@ -3404,12 +3458,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
struct file *file;
long ret;
- if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
- IORING_ENTER_REGISTERED_RING |
- IORING_ENTER_ABS_TIMER |
- IORING_ENTER_EXT_ARG_REG |
- IORING_ENTER_NO_IOWAIT)))
+ if (unlikely(flags & ~IORING_ENTER_FLAGS))
return -EINVAL;
/*
@@ -3659,6 +3708,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
!(flags & IORING_SETUP_SINGLE_ISSUER))
return -EINVAL;
+ /*
+ * Nonsensical to ask for CQE32 and mixed CQE support, it's not
+ * supported to post 16b CQEs on a ring setup with CQE32.
+ */
+ if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+ (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+ return -EINVAL;
+
return 0;
}
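
Since a CQE32 ring already posts nothing but 32b entries, asking for IORING_SETUP_CQE32 and IORING_SETUP_CQE_MIXED together is rejected here with -EINVAL. A hedged user-space sketch of picking one of the two modes with liburing, assuming headers and a liburing build new enough to define IORING_SETUP_CQE_MIXED:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	int ret;

	/* Either flag alone is fine; passing both gets -EINVAL from the kernel. */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_CQE_MIXED);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}
	io_uring_queue_exit(&ring);
	return 0;
}

On kernels without mixed-CQE support the setup simply fails with -EINVAL, so an application can fall back to a plain or CQE32-only ring.
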
@@ -3809,15 +3866,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
- p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
- IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
- IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
- IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
- IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT;
+ p->features = IORING_FEAT_FLAGS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -3825,8 +3874,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
}
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
- && !(ctx->flags & IORING_SETUP_R_DISABLED))
- WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
+ && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
+ /*
+ * Unlike io_register_enable_rings(), don't need WRITE_ONCE()
+ * since ctx isn't yet accessible from other tasks
+ */
+ ctx->submitter_task = get_task_struct(current);
+ }
file = io_uring_get_file(ctx);
if (IS_ERR(file)) {
@@ -3877,17 +3931,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
return -EINVAL;
}
- if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
- IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
- IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
- IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
- IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
- IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
- IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+ if (p.flags & ~IORING_SETUP_FLAGS)
return -EINVAL;
-
return io_uring_create(entries, &p, params);
}