From 1292e972fff2b2d81e139e0c2fe5f50249e78c58 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 15 Jan 2020 17:35:38 +0100 Subject: io_uring: fix compat for IORING_REGISTER_FILES_UPDATE fds field of struct io_uring_files_update is problematic with regards to compat user space, as pointer size is different in 32-bit, 32-on-64-bit, and 64-bit user space. In order to avoid custom handling of compat in the syscall implementation, make fds __u64 and use u64_to_user_ptr in order to retrieve it. Also, align the field naturally and check that no garbage is passed there. Fixes: c3a31e605620c279 ("io_uring: add support for IORING_REGISTER_FILES_UPDATE") Signed-off-by: Eugene Syromiatnikov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a3300e1b9a01..55cfcb71606d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -178,7 +178,8 @@ struct io_uring_params { struct io_uring_files_update { __u32 offset; - __s32 *fds; + __u32 resv; + __aligned_u64 /* __s32 * */ fds; }; #endif -- cgit v1.2.3 From d63d1b5edb7b832210bfde587ba9e7549fa064eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Dec 2019 10:38:56 -0700 Subject: io_uring: add support for fallocate() This exposes fallocate(2) through io_uring. Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 61 insertions(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5953d7f13690..7a4e00ef02be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -319,6 +319,7 @@ struct io_sync { loff_t len; loff_t off; int flags; + int mode; }; struct io_cancel { @@ -2101,6 +2102,54 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fallocate_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); + req->sync.mode = READ_ONCE(sqe->len); + return 0; +} + +static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fallocate always requiring blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fallocate_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fallocate_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + + return 0; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3123,6 +3172,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_ACCEPT: ret = io_accept_prep(req, sqe); break; + case IORING_OP_FALLOCATE: + ret = io_fallocate_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3277,6 +3329,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_async_cancel(req, nxt); break; + case IORING_OP_FALLOCATE: + if (sqe) { + ret = io_fallocate_prep(req, sqe); + if (ret) + break; + } + ret = io_fallocate(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 55cfcb71606d..ad1574f35eb3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -76,6 +76,7 @@ enum { IORING_OP_ASYNC_CANCEL, IORING_OP_LINK_TIMEOUT, IORING_OP_CONNECT, + IORING_OP_FALLOCATE, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 15b71abe7b52df214785dde0de9f581cc0216d17 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 11:20:36 -0700 Subject: io_uring: add support for IORING_OP_OPENAT This works just like openat(2), except it can be performed async. For the normal case of a non-blocking path lookup this will complete inline. If we have to do IO to perform the open, it'll be done from async context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 95 ++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/io_uring.h | 2 + 2 files changed, 95 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a4e00ef02be..34cbce622fcd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -70,6 +70,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -353,6 +355,15 @@ struct io_sr_msg { int msg_flags; }; +struct io_open { + struct file *file; + int dfd; + umode_t mode; + const char __user *fname; + struct filename *filename; + int flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -371,12 +382,17 @@ struct io_async_rw { ssize_t size; }; +struct io_async_open { + struct filename *filename; +}; + struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; + struct io_async_open open; }; }; @@ -397,6 +413,7 @@ struct io_kiocb { struct io_timeout timeout; struct io_connect connect; struct io_sr_msg sr_msg; + struct io_open open; }; struct io_async_ctx *io; @@ -2150,6 +2167,67 @@ static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mode = READ_ONCE(sqe->len); + req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.flags = READ_ONCE(sqe->open_flags); + + req->open.filename = getname(req->open.fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct open_flags op; + struct open_how how; + struct file *file; + int ret; + + if (force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + + how = build_open_how(req->open.flags, req->open.mode); + ret = build_open_flags(&how, &op); + if (ret) + goto err; + + ret = get_unused_fd_flags(how.flags); + if (ret < 0) + goto err; + + file = do_filp_open(req->open.dfd, req->open.filename, &op); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + } else { + fsnotify_open(file); + fd_install(ret, file); + } +err: + putname(req->open.filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3175,6 +3253,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FALLOCATE: ret = io_fallocate_prep(req, sqe); break; + case IORING_OP_OPENAT: + ret = io_openat_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3337,6 +3418,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_fallocate(req, nxt, force_nonblock); break; + case IORING_OP_OPENAT: + if (sqe) { + ret = io_openat_prep(req, sqe); + if (ret) + break; + } + ret = io_openat(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3409,7 +3498,7 @@ static bool io_req_op_valid(int op) return op >= IORING_OP_NOP && op < IORING_OP_LAST; } -static int io_req_needs_file(struct io_kiocb *req) +static int io_req_needs_file(struct io_kiocb *req, int fd) { switch (req->opcode) { case IORING_OP_NOP: @@ -3419,6 +3508,8 @@ static int io_req_needs_file(struct io_kiocb *req) case IORING_OP_ASYNC_CANCEL: case IORING_OP_LINK_TIMEOUT: return 0; + case IORING_OP_OPENAT: + return fd != -1; default: if (io_req_op_valid(req->opcode)) return 1; @@ -3448,7 +3539,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - ret = io_req_needs_file(req); + ret = io_req_needs_file(req, fd); if (ret <= 0) return ret; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ad1574f35eb3..c1a7c1c65eaf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -34,6 +34,7 @@ struct io_uring_sqe { __u32 timeout_flags; __u32 accept_flags; __u32 cancel_flags; + __u32 open_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -77,6 +78,7 @@ enum { IORING_OP_LINK_TIMEOUT, IORING_OP_CONNECT, IORING_OP_FALLOCATE, + IORING_OP_OPENAT, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From b5dba59e0cf7e2cc4d3b3b1ac5fe81ddf21959eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 14:02:38 -0700 Subject: io_uring: add support for IORING_OP_CLOSE This works just like close(2), unsurprisingly. We remove the file descriptor and post the completion inline, then offload the actual (potential) last file put to async context. Mark the async part of this work as uncancellable, as we really must guarantee that the latter part of the close is run. Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 110 insertions(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index fe227650efd6..6aaff7bfe8b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -301,6 +301,12 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_close { + struct file *file; + struct file *put_file; + int fd; +}; + struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -414,6 +420,7 @@ struct io_kiocb { struct io_connect connect; struct io_sr_msg sr_msg; struct io_open open; + struct io_close close; }; struct io_async_ctx *io; @@ -2228,6 +2235,94 @@ err: return 0; } +static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If we queue this for async, it must not be cancellable. That would + * leave the 'file' in an undeterminate state. + */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EINVAL; + + req->close.fd = READ_ONCE(sqe->fd); + if (req->file->f_op == &io_uring_fops || + req->close.fd == req->ring_fd) + return -EBADF; + + return 0; +} + +static void io_close_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + /* Invoked with files, we need to do the close */ + if (req->work.files) { + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) { + req_set_fail_links(req); + } + io_cqring_add_event(req, ret); + } + + fput(req->close.put_file); + + /* we bypassed the re-issue, drop the submission reference */ + io_put_req(req); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + int ret; + + req->close.put_file = NULL; + ret = __close_fd_get_file(req->close.fd, &req->close.put_file); + if (ret < 0) + return ret; + + /* if the file has a flush method, be safe and punt to async */ + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + goto eagain; + } + + /* + * No ->flush(), safely close from here and just punt the + * fput() to async context. + */ + ret = filp_close(req->close.put_file, current->files); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + + if (io_wq_current_is_worker()) { + struct io_wq_work *old_work, *work; + + old_work = work = &req->work; + io_close_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; + } + +eagain: + req->work.func = io_close_finish; + return -EAGAIN; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -3256,6 +3351,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_OPENAT: ret = io_openat_prep(req, sqe); break; + case IORING_OP_CLOSE: + ret = io_close_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3426,6 +3524,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_openat(req, nxt, force_nonblock); break; + case IORING_OP_CLOSE: + if (sqe) { + ret = io_close_prep(req, sqe); + if (ret) + break; + } + ret = io_close(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3572,6 +3678,9 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (!req->ring_file) + return -EBADF; + rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index c1a7c1c65eaf..084dea85b838 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -79,6 +79,7 @@ enum { IORING_OP_CONNECT, IORING_OP_FALLOCATE, IORING_OP_OPENAT, + IORING_OP_CLOSE, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 05f3fb3c5397524feae2e73ee8e150a9090a7da2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Dec 2019 11:22:50 -0700 Subject: io_uring: avoid ring quiesce for fixed file set unregister and update We currently fully quiesce the ring before an unregister or update of the fixed fileset. This is very expensive, and we can be a bit smarter about this. Add a percpu refcount for the file tables as a whole. Grab a percpu ref when we use a registered file, and put it on completion. This is cheap to do. Upon removal of a file from a set, switch the ref count to atomic mode. When we hit zero ref on the completion side, then we know we can drop the previously registered files. When the old files have been dropped, switch the ref back to percpu mode for normal operation. Since there's a period between doing the update and the kernel being done with it, add a IORING_OP_FILES_UPDATE opcode that can perform the same action. The application knows the update has completed when it gets the CQE for it. Between doing the update and receiving this completion, the application must continue to use the unregistered fd if submitting IO on this particular file. This takes the runtime of test/file-register from liburing from 14s to about 0.7s. Signed-off-by: Jens Axboe --- fs/io_uring.c | 485 ++++++++++++++++++++++++++++++------------ include/uapi/linux/io_uring.h | 1 + 2 files changed, 351 insertions(+), 135 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6aaff7bfe8b5..4325068324b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -179,6 +179,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,7 +246,7 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -370,6 +385,13 @@ struct io_open { int flags; }; +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -421,6 +443,7 @@ struct io_kiocb { struct io_sr_msg sr_msg; struct io_open open; struct io_close close; + struct io_files_update files_update; }; struct io_async_ctx *io; @@ -496,6 +519,9 @@ static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); static struct kmem_cache *req_cachep; @@ -945,6 +971,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); percpu_ref_put_many(&ctx->refs, *nr); + percpu_ref_put_many(&ctx->file_data->refs, *nr); *nr = 0; } } @@ -955,8 +982,12 @@ static void __io_free_req(struct io_kiocb *req) if (req->io) kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); + } if (req->flags & REQ_F_INFLIGHT) { unsigned long flags; @@ -3293,6 +3324,45 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) return 0; } +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_req_defer_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3354,6 +3424,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_CLOSE: ret = io_close_prep(req, sqe); break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3532,6 +3605,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_close(req, nxt, force_nonblock); break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3631,8 +3712,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -3653,7 +3734,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, return ret; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3661,6 +3742,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -4338,19 +4420,37 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + flush_work(&data->ref_work); + percpu_ref_get(&data->refs); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -4381,16 +4481,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -4438,7 +4528,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -4500,7 +4590,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -4515,36 +4605,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; unsigned nr_tables; + struct file *file; int fd, ret = 0; unsigned i; - if (ctx->file_table) + if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), GFP_KERNEL); - if (!ctx->file_table) + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; } @@ -4561,13 +4774,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, continue; } - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); + file = fget(fd); ret = -EBADF; - if (!table->files[index]) + if (!file) break; + /* * Don't allow io_uring instances to be registered. If UNIX * isn't enabled, then this causes a reference cycle and this @@ -4575,26 +4789,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); + if (file->f_op == &io_uring_fops) { + fput(file); break; } ret = 0; + table->files[index] = file; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - file = io_file_from_index(ctx, i); if (file) fput(file); } for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); + kfree(ctx->file_data->table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return ret; } @@ -4606,69 +4820,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) -{ -#if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(io_file_from_index(ctx, index)); -#endif -} - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4712,29 +4863,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. + */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = u64_to_user_ptr(up.fds); + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4744,16 +4931,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4779,11 +4966,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -5546,7 +5754,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -5710,18 +5917,22 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + wait_for_completion(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5762,9 +5973,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + + if (opcode != IORING_UNREGISTER_FILES && + opcode != IORING_REGISTER_FILES_UPDATE) { + /* bring the ctx back to life */ + reinit_completion(&ctx->completions[0]); + percpu_ref_reinit(&ctx->refs); + } return ret; } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 084dea85b838..ca436b9d4921 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -80,6 +80,7 @@ enum { IORING_OP_FALLOCATE, IORING_OP_OPENAT, IORING_OP_CLOSE, + IORING_OP_FILES_UPDATE, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From eddc7ef52a6b37b7ba3d1c8a8fbb63d5d9914f8a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Dec 2019 21:18:10 -0700 Subject: io_uring: add support for IORING_OP_STATX This provides support for async statx(2) through io_uring. Signed-off-by: Jens Axboe --- fs/io_uring.c | 86 ++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/io_uring.h | 2 + 2 files changed, 87 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4325068324b7..115c7d413372 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -379,9 +379,13 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; - umode_t mode; + union { + umode_t mode; + unsigned mask; + }; const char __user *fname; struct filename *filename; + struct statx __user *buffer; int flags; }; @@ -2266,6 +2270,74 @@ err: return 0; } +static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned lookup_flags; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mask = READ_ONCE(sqe->len); + req->open.fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->open.flags = READ_ONCE(sqe->statx_flags); + + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.flags)) + return -EINVAL; + + req->open.filename = getname_flags(req->open.fname, lookup_flags, NULL); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_open *ctx = &req->open; + unsigned lookup_flags; + struct path path; + struct kstat stat; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->flags)) + return -EINVAL; + +retry: + /* filename_lookup() drops it, keep a reference */ + ctx->filename->refcnt++; + + ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, + NULL); + if (ret) + goto err; + + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->flags); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + if (!ret) + ret = cp_statx(&stat, ctx->buffer); +err: + putname(ctx->filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { /* @@ -3427,6 +3499,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FILES_UPDATE: ret = io_files_update_prep(req, sqe); break; + case IORING_OP_STATX: + ret = io_statx_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3613,6 +3688,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_files_update(req, force_nonblock); break; + case IORING_OP_STATX: + if (sqe) { + ret = io_statx_prep(req, sqe); + if (ret) + break; + } + ret = io_statx(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3699,6 +3782,7 @@ static int io_req_needs_file(struct io_kiocb *req, int fd) case IORING_OP_LINK_TIMEOUT: return 0; case IORING_OP_OPENAT: + case IORING_OP_STATX: return fd != -1; default: if (io_req_op_valid(req->opcode)) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ca436b9d4921..3f45f7c543de 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -35,6 +35,7 @@ struct io_uring_sqe { __u32 accept_flags; __u32 cancel_flags; __u32 open_flags; + __u32 statx_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -81,6 +82,7 @@ enum { IORING_OP_OPENAT, IORING_OP_CLOSE, IORING_OP_FILES_UPDATE, + IORING_OP_STATX, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From ce35a47a3a0208a77b4d31b7f2e8ed57d624093d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2019 08:04:44 -0700 Subject: io_uring: add IOSQE_ASYNC io_uring defaults to always doing inline submissions, if at all possible. But for larger copies, even if the data is fully cached, that can take a long time. Add an IOSQE_ASYNC flag that the application can set on the SQE - if set, it'll ensure that we always go async for those kinds of requests. Use the io-wq IO_WQ_WORK_CONCURRENT flag to ensure we get the concurrency we desire for this case. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++++++-- include/uapi/linux/io_uring.h | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 115c7d413372..9ffcfdc6382b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -483,6 +483,7 @@ struct io_kiocb { #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ +#define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ u64 user_data; u32 result; u32 sequence; @@ -4017,8 +4018,17 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) req_set_fail_links(req); io_double_put_req(req); } - } else + } else if ((req->flags & REQ_F_FORCE_ASYNC) && + !io_wq_current_is_worker()) { + /* + * Never try inline submit of IOSQE_ASYNC is set, go straight + * to async execution. + */ + req->work.flags |= IO_WQ_WORK_CONCURRENT; + io_queue_async_work(req); + } else { __io_queue_sqe(req, sqe); + } } static inline void io_queue_link_head(struct io_kiocb *req) @@ -4031,7 +4041,7 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK) + IOSQE_IO_HARDLINK | IOSQE_ASYNC) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) @@ -4044,6 +4054,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } + if (sqe->flags & IOSQE_ASYNC) + req->flags |= REQ_F_FORCE_ASYNC; ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 3f45f7c543de..d7ec50247a3a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -51,6 +51,7 @@ struct io_uring_sqe { #define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ #define IOSQE_IO_LINK (1U << 2) /* links next sqe */ #define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */ +#define IOSQE_ASYNC (1U << 4) /* always go async */ /* * io_uring_setup() flags -- cgit v1.2.3 From 3a6820f2bb8a079975109c25a5d1f29f46bce5d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Dec 2019 15:19:35 -0700 Subject: io_uring: add non-vectored read/write commands For uses cases that don't already naturally have an iovec, it's easier (or more convenient) to just use a buffer address + length. This is particular true if the use case is from languages that want to create a memory safe abstraction on top of io_uring, and where introducing the need for the iovec may impose an ownership issue. For those cases, they currently need an indirection buffer, which means allocating data just for this purpose. Add basic read/write that don't require the iovec. Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++++++++++++ include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 25 insertions(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index c54a8bd37b54..407ba3388e14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -654,6 +654,18 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .fd_non_neg = 1, }, + { + /* IORING_OP_READ */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_WRITE */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -1867,6 +1879,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->rw.kiocb.private) return -EINVAL; + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + ssize_t ret; + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); + *iovec = NULL; + return ret; + } + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -3634,10 +3653,12 @@ static int io_req_defer_prep(struct io_kiocb *req, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: @@ -3741,6 +3762,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: if (sqe) { ret = io_read_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3750,6 +3772,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: if (sqe) { ret = io_write_prep(req, sqe, force_nonblock); if (ret < 0) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d7ec50247a3a..7fdf994f3313 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -84,6 +84,8 @@ enum { IORING_OP_CLOSE, IORING_OP_FILES_UPDATE, IORING_OP_STATX, + IORING_OP_READ, + IORING_OP_WRITE, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From ba04291eb66ed895f194ae5abd3748d72bf8aaea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 2019 16:33:42 -0700 Subject: io_uring: allow use of offset == -1 to mean file position MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This behaves like preadv2/pwritev2 with offset == -1, it'll use (and update) the current file position. This obviously comes with the caveat that if the application has multiple read/writes in flight, then the end result will not be as expected. This is similar to threads sharing a file descriptor and doing IO using the current file position. Since this feature isn't easily detectable by doing a read or write, add a feature flags, IORING_FEAT_RW_CUR_POS, to allow applications to detect presence of this feature. Reported-by: 李通洲 Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++++- include/uapi/linux/io_uring.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 407ba3388e14..5286620e4e46 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -495,6 +495,7 @@ struct io_kiocb { #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ #define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ +#define REQ_F_CUR_POS 262144 /* read/write uses file position */ u64 user_data; u32 result; u32 sequence; @@ -1711,6 +1712,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); + if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1782,6 +1787,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = kiocb->ki_pos; if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else @@ -6154,7 +6163,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7fdf994f3313..1f96136eb6ee 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -174,6 +174,7 @@ struct io_uring_params { #define IORING_FEAT_SINGLE_MMAP (1U << 0) #define IORING_FEAT_NODROP (1U << 1) #define IORING_FEAT_SUBMIT_STABLE (1U << 2) +#define IORING_FEAT_RW_CUR_POS (1U << 3) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From 4840e418c2fc533d55ff6caa5b9313eed1d26cfd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 2019 22:03:45 -0700 Subject: io_uring: add IORING_OP_FADVISE This adds support for doing fadvise through io_uring. We assume that WILLNEED doesn't block, but that DONTNEED may block. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 55 insertions(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5286620e4e46..9ca12b900b42 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -72,6 +72,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -400,6 +401,13 @@ struct io_files_update { u32 offset; }; +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -452,6 +460,7 @@ struct io_kiocb { struct io_open open; struct io_close close; struct io_files_update files_update; + struct io_fadvise fadvise; }; struct io_async_ctx *io; @@ -667,6 +676,10 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, }, + { + /* IORING_OP_FADVISE */ + .needs_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2436,6 +2449,35 @@ err: return 0; } +static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); + req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_fadvise *fa = &req->fadvise; + int ret; + + /* DONTNEED may block, others _should_ not */ + if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) + return -EAGAIN; + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned lookup_flags; @@ -3721,6 +3763,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_STATX: ret = io_statx_prep(req, sqe); break; + case IORING_OP_FADVISE: + ret = io_fadvise_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3917,6 +3962,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_statx(req, nxt, force_nonblock); break; + case IORING_OP_FADVISE: + if (sqe) { + ret = io_fadvise_prep(req, sqe); + if (ret) + break; + } + ret = io_fadvise(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1f96136eb6ee..f86d1c776078 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -36,6 +36,7 @@ struct io_uring_sqe { __u32 cancel_flags; __u32 open_flags; __u32 statx_flags; + __u32 fadvise_advice; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -86,6 +87,7 @@ enum { IORING_OP_STATX, IORING_OP_READ, IORING_OP_WRITE, + IORING_OP_FADVISE, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From c1ca757bd6f4632c510714631ddcc2d13030fe1e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Dec 2019 22:18:28 -0700 Subject: io_uring: add IORING_OP_MADVISE This adds support for doing madvise(2) through io_uring. We assume that any operation can block, and hence punt everything async. This could be improved, but hard to make bullet proof. The async punt ensures it's safe. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 59 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 60 insertions(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9ca12b900b42..17a199c99c7c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -408,6 +408,13 @@ struct io_fadvise { u32 advice; }; +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -461,6 +468,7 @@ struct io_kiocb { struct io_close close; struct io_files_update files_update; struct io_fadvise fadvise; + struct io_madvise madvise; }; struct io_async_ctx *io; @@ -680,6 +688,10 @@ static const struct io_op_def io_op_defs[] = { /* IORING_OP_FADVISE */ .needs_file = 1, }, + { + /* IORING_OP_MADVISE */ + .needs_mm = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2449,6 +2461,42 @@ err: return 0; } +static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); + req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = &req->madvise; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_madvise(ma->addr, ma->len, ma->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (sqe->ioprio || sqe->buf_index || sqe->addr) @@ -3766,6 +3814,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_FADVISE: ret = io_fadvise_prep(req, sqe); break; + case IORING_OP_MADVISE: + ret = io_madvise_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3970,6 +4021,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_fadvise(req, nxt, force_nonblock); break; + case IORING_OP_MADVISE: + if (sqe) { + ret = io_madvise_prep(req, sqe); + if (ret) + break; + } + ret = io_madvise(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f86d1c776078..8ad3cece5440 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -88,6 +88,7 @@ enum { IORING_OP_READ, IORING_OP_WRITE, IORING_OP_FADVISE, + IORING_OP_MADVISE, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 8110c1a6212e430a84edd2b83fe9043def8b743e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Dec 2019 15:39:54 -0700 Subject: io_uring: add support for IORING_SETUP_CLAMP Some applications like to start small in terms of ring size, and then ramp up as needed. This is a bit tricky to do currently, since we don't advertise the max ring size. This adds IORING_SETUP_CLAMP. If set, and the values for SQ or CQ ring size exceed what we support, then clamp them at the max values instead of returning -EINVAL. Since we return the chosen ring sizes after setup, no further changes are needed on the application side. io_uring already changes the ring sizes if the application doesn't ask for power-of-two sizes, for example. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1fbe3eb78d08..7c44b0ef10d7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6234,8 +6234,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) bool account_mem; int ret; - if (!entries || entries > IORING_MAX_ENTRIES) + if (!entries) return -EINVAL; + if (entries > IORING_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = IORING_MAX_ENTRIES; + } /* * Use twice as many entries for the CQ ring. It's possible for the @@ -6252,8 +6257,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ - if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + if (p->cq_entries < p->sq_entries) return -EINVAL; + if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + p->cq_entries = IORING_MAX_CQ_ENTRIES; + } p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; @@ -6345,7 +6355,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP)) return -EINVAL; ret = io_uring_create(entries, &p); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8ad3cece5440..29fae13395a8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -61,6 +61,7 @@ struct io_uring_sqe { #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ +#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ enum { IORING_OP_NOP, -- cgit v1.2.3 From fddafacee287b3140212c92464077e971401f860 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 4 Jan 2020 20:19:44 -0700 Subject: io_uring: add support for send(2) and recv(2) This adds IORING_OP_SEND for send(2) support, and IORING_OP_RECV for recv(2) support. Signed-off-by: Jens Axboe --- fs/io_uring.c | 140 ++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 2 + 2 files changed, 137 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d083811ccb9..edf072b6cb8f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -377,8 +377,12 @@ struct io_connect { struct io_sr_msg { struct file *file; - struct user_msghdr __user *msg; + union { + struct user_msghdr __user *msg; + void __user *buf; + }; int msg_flags; + size_t len; }; struct io_open { @@ -692,6 +696,18 @@ static const struct io_op_def io_op_defs[] = { /* IORING_OP_MADVISE */ .needs_mm = 1, }, + { + /* IORING_OP_SEND */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + { + /* IORING_OP_RECV */ + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2802,8 +2818,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); - if (!io) + if (!io || req->opcode == IORING_OP_SEND) return 0; io->msg.iov = io->msg.fast_iov; @@ -2883,6 +2900,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -2893,7 +2960,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (!io) + if (!io || req->opcode == IORING_OP_RECV) return 0; io->msg.iov = io->msg.fast_iov; @@ -2975,6 +3042,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(READ, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -3811,9 +3931,11 @@ static int io_req_defer_prep(struct io_kiocb *req, ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: @@ -3956,20 +4078,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: if (sqe) { ret = io_sendmsg_prep(req, sqe); if (ret < 0) break; } - ret = io_sendmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_SENDMSG) + ret = io_sendmsg(req, nxt, force_nonblock); + else + ret = io_send(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: if (sqe) { ret = io_recvmsg_prep(req, sqe); if (ret) break; } - ret = io_recvmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_RECVMSG) + ret = io_recvmsg(req, nxt, force_nonblock); + else + ret = io_recv(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 29fae13395a8..0fe270ab191c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -90,6 +90,8 @@ enum { IORING_OP_WRITE, IORING_OP_FADVISE, IORING_OP_MADVISE, + IORING_OP_SEND, + IORING_OP_RECV, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From f2842ab5b72d7ee5f7f8385c2d4f32c133f5837b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 11:04:00 -0700 Subject: io_uring: enable option to only trigger eventfd for async completions If an application is using eventfd notifications with poll to know when new SQEs can be issued, it's expecting the following read/writes to complete inline. And with that, it knows that there are events available, and don't want spurious wakeups on the eventfd for those requests. This adds IORING_REGISTER_EVENTFD_ASYNC, which works just like IORING_REGISTER_EVENTFD, except it only triggers notifications for events that happen from async completions (IRQ, or io-wq worker completions). Any completions inline from the submission itself will not trigger notifications. Suggested-by: Mark Papadakis Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++++- include/uapi/linux/io_uring.h | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 42bf83b3fbd5..70656762244f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -206,6 +206,7 @@ struct io_ring_ctx { int account_mem: 1; int cq_overflow_flushed: 1; int drain_next: 1; + int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -963,13 +964,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +{ + if (!ctx->eventfd_async) + return true; + return io_wq_current_is_worker() || in_interrupt(); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) + if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -6556,10 +6564,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg); + if (ret) + break; + if (opcode == IORING_REGISTER_EVENTFD_ASYNC) + ctx->eventfd_async = 1; + else + ctx->eventfd_async = 0; break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0fe270ab191c..66772a90a7f2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -192,6 +192,7 @@ struct io_uring_params { #define IORING_REGISTER_EVENTFD 4 #define IORING_UNREGISTER_EVENTFD 5 #define IORING_REGISTER_FILES_UPDATE 6 +#define IORING_REGISTER_EVENTFD_ASYNC 7 struct io_uring_files_update { __u32 offset; -- cgit v1.2.3 From cebdb98617ae3e842c81c73758a185248b37cfd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 17:59:24 -0700 Subject: io_uring: add support for IORING_OP_OPENAT2 Add support for the new openat2(2) system call. It's trivial to do, as we can have openat(2) just be wrapped around it. Suggested-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 +++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 1 + 2 files changed, 64 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3a57ea98fe3a..0b30b0cf8af5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -707,6 +707,11 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, }, + { + /* IORING_OP_OPENAT2 */ + .needs_file = 1, + .fd_non_neg = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2487,11 +2492,46 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct open_how __user *how; + const char __user *fname; + size_t len; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, + len); + if (ret) + return ret; + + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { struct open_flags op; - struct open_how how; struct file *file; int ret; @@ -2500,12 +2540,11 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; } - how = build_open_how(req->open.how.flags, req->open.how.mode); - ret = build_open_flags(&how, &op); + ret = build_open_flags(&req->open.how, &op); if (ret) goto err; - ret = get_unused_fd_flags(how.flags); + ret = get_unused_fd_flags(req->open.how.flags); if (ret < 0) goto err; @@ -2526,6 +2565,13 @@ err: return 0; } +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); + return io_openat2(req, nxt, force_nonblock); +} + static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) @@ -3984,6 +4030,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_MADVISE: ret = io_madvise_prep(req, sqe); break; + case IORING_OP_OPENAT2: + ret = io_openat2_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4204,6 +4253,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_madvise(req, nxt, force_nonblock); break; + case IORING_OP_OPENAT2: + if (sqe) { + ret = io_openat2_prep(req, sqe); + if (ret) + break; + } + ret = io_openat2(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 66772a90a7f2..fea7da182851 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -92,6 +92,7 @@ enum { IORING_OP_MADVISE, IORING_OP_SEND, IORING_OP_RECV, + IORING_OP_OPENAT2, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 66f4af93da5761d2fa05c0dc673a47003cdb9cfe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Jan 2020 15:36:52 -0700 Subject: io_uring: add support for probing opcodes The application currently has no way of knowing if a given opcode is supported or not without having to try and issue one and see if we get -EINVAL or not. And even this approach is fraught with peril, as maybe we're getting -EINVAL due to some fields being missing, or maybe it's just not that easy to issue that particular command without doing some other leg work in terms of setup first. This adds IORING_REGISTER_PROBE, which fills in a structure with info on what it supported or not. This will work even with sparse opcode fields, which may happen in the future or even today if someone backports specific features to older kernels. Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 18 +++++++++++++++ 2 files changed, 69 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a645a37b4c7..7715a729271a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -561,6 +561,8 @@ struct io_op_def { unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; }; static const struct io_op_def io_op_defs[] = { @@ -6566,6 +6568,45 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -6582,7 +6623,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return -ENXIO; if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { percpu_ref_kill(&ctx->refs); /* @@ -6644,6 +6686,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -6651,7 +6699,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index fea7da182851..955fd477e530 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -194,6 +194,7 @@ struct io_uring_params { #define IORING_UNREGISTER_EVENTFD 5 #define IORING_REGISTER_FILES_UPDATE 6 #define IORING_REGISTER_EVENTFD_ASYNC 7 +#define IORING_REGISTER_PROBE 8 struct io_uring_files_update { __u32 offset; @@ -201,4 +202,21 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +#define IO_URING_OP_SUPPORTED (1U << 0) + +struct io_uring_probe_op { + __u8 op; + __u8 resv; + __u16 flags; /* IO_URING_OP_* flags */ + __u32 resv2; +}; + +struct io_uring_probe { + __u8 last_op; /* last opcode supported */ + __u8 ops_len; /* length of ops[] array below */ + __u16 resv; + __u32 resv2[3]; + struct io_uring_probe_op ops[0]; +}; + #endif -- cgit v1.2.3 From 6b47ee6ecab142f938a40bf3b297abac74218ee2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jan 2020 20:22:41 +0300 Subject: io_uring: optimise sqe-to-req flags translation For each IOSQE_* flag there is a corresponding REQ_F_* flag. And there is a repetitive pattern of their translation: e.g. if (sqe->flags & SQE_FLAG*) req->flags |= REQ_F_FLAG* Use same numeric values/bits for them and copy instead of manual handling. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 92 +++++++++++++++++++++++++++++-------------- include/uapi/linux/io_uring.h | 23 ++++++++--- 2 files changed, 81 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index ed1adeda370e..cf5bad51f752 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -452,6 +453,65 @@ struct io_async_ctx { }; }; +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + + REQ_F_LINK_NEXT_BIT, + REQ_F_FAIL_LINK_BIT, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_IOPOLL_COMPLETED_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_TIMEOUT_BIT, + REQ_F_ISREG_BIT, + REQ_F_MUST_PUNT_BIT, + REQ_F_TIMEOUT_NOSEQ_BIT, + REQ_F_COMP_LOCKED_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + + /* already grabbed next link */ + REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* fail rest of links */ + REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + /* on inflight list */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* polled IO has completed */ + REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT), + /* has linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* timeout request */ + REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* must be punted even for NONBLOCK */ + REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), + /* no timeout sequence */ + REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), + /* completion under lock */ + REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -494,23 +554,6 @@ struct io_kiocb { struct list_head link_list; unsigned int flags; refcount_t refs; -#define REQ_F_NOWAIT 1 /* must not punt to workers */ -#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ -#define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ -#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ -#define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_TIMEOUT 1024 /* timeout request */ -#define REQ_F_ISREG 2048 /* regular file */ -#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ -#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ -#define REQ_F_INFLIGHT 16384 /* on inflight list */ -#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ -#define REQ_F_FORCE_ASYNC 131072 /* IOSQE_ASYNC */ -#define REQ_F_CUR_POS 262144 /* read/write uses file position */ u64 user_data; u32 result; u32 sequence; @@ -4355,9 +4398,6 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - if (flags & IOSQE_IO_DRAIN) - req->flags |= REQ_F_IO_DRAIN; - if (!io_req_needs_file(req, fd)) return 0; @@ -4593,8 +4633,9 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } - if (sqe_flags & IOSQE_ASYNC) - req->flags |= REQ_F_FORCE_ASYNC; + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| + IOSQE_ASYNC); ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { @@ -4618,10 +4659,6 @@ err_req: head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; @@ -4648,9 +4685,6 @@ err_req: } if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (sqe_flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - INIT_LIST_HEAD(&req->link_list); ret = io_req_defer_prep(req, sqe); if (ret) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 955fd477e530..57d05cc5e271 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -45,14 +45,27 @@ struct io_uring_sqe { }; }; +enum { + IOSQE_FIXED_FILE_BIT, + IOSQE_IO_DRAIN_BIT, + IOSQE_IO_LINK_BIT, + IOSQE_IO_HARDLINK_BIT, + IOSQE_ASYNC_BIT, +}; + /* * sqe->flags */ -#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ -#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ -#define IOSQE_IO_LINK (1U << 2) /* links next sqe */ -#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */ -#define IOSQE_ASYNC (1U << 4) /* always go async */ +/* use fixed fileset */ +#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT) +/* issue after inflight IO */ +#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT) +/* links next sqe */ +#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT) +/* like LINK, but stronger */ +#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) +/* always go async */ +#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) /* * io_uring_setup() flags -- cgit v1.2.3 From cccf0ee834559ae0b327b40290e14f6a2a017177 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Jan 2020 16:34:48 -0700 Subject: io_uring/io-wq: don't use static creds/mm assignments We currently setup the io_wq with a static set of mm and creds. Even for a single-use io-wq per io_uring, this is suboptimal as we have may have multiple enters of the ring. For sharing the io-wq backend, it doesn't work at all. Switch to passing in the creds and mm when the work item is setup. This means that async work is no longer deferred to the io_uring mm and creds, it is done with the current mm and creds. Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know they can rely on the current personality (mm and creds) being the same for direct issue and async issue. Reviewed-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io-wq.c | 68 +++++++++++++++++++++++++++++-------------- fs/io-wq.h | 7 +++-- fs/io_uring.c | 36 +++++++++++++++++++---- include/uapi/linux/io_uring.h | 1 + 4 files changed, 82 insertions(+), 30 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io-wq.c b/fs/io-wq.c index 54e270ae12ab..7ccedc82a703 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -56,7 +56,8 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; - const struct cred *creds; + const struct cred *cur_creds; + const struct cred *saved_creds; struct files_struct *restore_files; }; @@ -109,8 +110,6 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - const struct cred *creds; - struct mm_struct *mm; refcount_t refs; struct completion done; @@ -137,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { bool dropped_lock = false; - if (worker->creds) { - revert_creds(worker->creds); - worker->creds = NULL; + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; } if (current->files != worker->restore_files) { @@ -398,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) return NULL; } +static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) +{ + if (worker->mm) { + unuse_mm(worker->mm); + mmput(worker->mm); + worker->mm = NULL; + } + if (!work->mm) { + set_fs(KERNEL_DS); + return; + } + if (mmget_not_zero(work->mm)) { + use_mm(work->mm); + if (!worker->mm) + set_fs(USER_DS); + worker->mm = work->mm; + /* hang on to this mm */ + work->mm = NULL; + return; + } + + /* failed grabbing mm, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; +} + +static void io_wq_switch_creds(struct io_worker *worker, + struct io_wq_work *work) +{ + const struct cred *old_creds = override_creds(work->creds); + + worker->cur_creds = work->creds; + if (worker->saved_creds) + put_cred(old_creds); /* creds set by previous switch */ + else + worker->saved_creds = old_creds; +} + static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { @@ -446,18 +482,10 @@ next: current->files = work->files; task_unlock(current); } - if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && - wq->mm) { - if (mmget_not_zero(wq->mm)) { - use_mm(wq->mm); - set_fs(USER_DS); - worker->mm = wq->mm; - } else { - work->flags |= IO_WQ_WORK_CANCEL; - } - } - if (!worker->creds) - worker->creds = override_creds(wq->creds); + if (work->mm != worker->mm) + io_wq_switch_mm(worker, work); + if (worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); /* * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, * the worker function will do the right thing. @@ -1037,7 +1065,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; - wq->creds = data->creds; for_each_node(node) { struct io_wqe *wqe; @@ -1064,9 +1091,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) init_completion(&wq->done); - /* caller must have already done mmgrab() on this mm */ - wq->mm = data->mm; - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { wake_up_process(wq->manager); diff --git a/fs/io-wq.h b/fs/io-wq.h index 1cd039af8813..167316ad447e 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -7,7 +7,6 @@ enum { IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_NEEDS_USER = 8, IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_INTERNAL = 64, @@ -74,6 +73,8 @@ struct io_wq_work { }; void (*func)(struct io_wq_work **); struct files_struct *files; + struct mm_struct *mm; + const struct cred *creds; unsigned flags; }; @@ -83,15 +84,15 @@ struct io_wq_work { (work)->func = _func; \ (work)->flags = 0; \ (work)->files = NULL; \ + (work)->mm = NULL; \ + (work)->creds = NULL; \ } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { - struct mm_struct *mm; struct user_struct *user; - const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; diff --git a/fs/io_uring.c b/fs/io_uring.c index 1dd20305c664..0ea36911745d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -875,6 +875,29 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static inline void io_req_work_grab_env(struct io_kiocb *req, + const struct io_op_def *def) +{ + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); +} + +static inline void io_req_work_drop_env(struct io_kiocb *req) +{ + if (req->work.mm) { + mmdrop(req->work.mm); + req->work.mm = NULL; + } + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } +} + static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { @@ -888,8 +911,8 @@ static inline bool io_prep_async_work(struct io_kiocb *req, if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - if (def->needs_mm) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + + io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); return do_hashed; @@ -1180,6 +1203,8 @@ static void __io_req_aux_free(struct io_kiocb *req) else fput(req->file); } + + io_req_work_drop_env(req); } static void __io_free_req(struct io_kiocb *req) @@ -3963,6 +3988,8 @@ static int io_req_defer_prep(struct io_kiocb *req, { ssize_t ret = 0; + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + switch (req->opcode) { case IORING_OP_NOP: break; @@ -5725,9 +5752,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.mm = ctx->sqo_mm; data.user = ctx->user; - data.creds = ctx->creds; data.get_work = io_get_work; data.put_work = io_put_work; @@ -6535,7 +6560,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 57d05cc5e271..9988e82f858b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -195,6 +195,7 @@ struct io_uring_params { #define IORING_FEAT_NODROP (1U << 1) #define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_RW_CUR_POS (1U << 3) +#define IORING_FEAT_CUR_PERSONALITY (1U << 4) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From 24369c2e3bb06d8c4e71fd6ceaf4f8a01ae79b7c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2020 03:15:48 +0300 Subject: io_uring: add io-wq workqueue sharing If IORING_SETUP_ATTACH_WQ is set, it expects wq_fd in io_uring_params to be a valid io_uring fd io-wq of which will be shared with the newly created io_uring instance. If the flag is set but it can't share io-wq, it fails. This allows creation of "sibling" io_urings, where we prefer to keep the SQ/CQ private, but want to share the async backend to minimize the amount of overhead associated with having multiple rings that belong to the same backend. Reported-by: Jens Axboe Reported-by: Daurnimator Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 +++++++++++++++++++++++++++++++++---------- include/uapi/linux/io_uring.h | 4 ++- 2 files changed, 53 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0ea36911745d..275355bd3a64 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5704,11 +5704,56 @@ static void io_get_work(struct io_wq_work *work) refcount_inc(&req->refs); } +static int io_init_wq_offload(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_wq_data data; + struct fd f; + struct io_ring_ctx *ctx_attach; + unsigned int concurrency; + int ret = 0; + + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + + if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; + } + return ret; + } + + f = fdget(p->wq_fd); + if (!f.file) + return -EBADF; + + if (f.file->f_op != &io_uring_fops) { + ret = -EINVAL; + goto out_fput; + } + + ctx_attach = f.file->private_data; + /* @io_wq is protected by holding the fd */ + if (!io_wq_get(ctx_attach->io_wq, &data)) { + ret = -EINVAL; + goto out_fput; + } + + ctx->io_wq = ctx_attach->io_wq; +out_fput: + fdput(f); + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_wq_data data; - unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -5752,18 +5797,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.user = ctx->user; - data.get_work = io_get_work; - data.put_work = io_put_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; + ret = io_init_wq_offload(ctx, p); + if (ret) goto err; - } return 0; err: @@ -6589,7 +6625,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP)) + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; ret = io_uring_create(entries, &p); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 9988e82f858b..e067b92af5ad 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -75,6 +75,7 @@ enum { #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ +#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ enum { IORING_OP_NOP, @@ -183,7 +184,8 @@ struct io_uring_params { __u32 sq_thread_cpu; __u32 sq_thread_idle; __u32 features; - __u32 resv[4]; + __u32 wq_fd; + __u32 resv[3]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; -- cgit v1.2.3 From 071698e13ac6ba786dfa22349a7b62deb5a9464d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jan 2020 10:04:42 -0700 Subject: io_uring: allow registering credentials If an application wants to use a ring with different kinds of credentials, it can register them upfront. We don't lookup credentials, the credentials of the task calling IORING_REGISTER_PERSONALITY is used. An 'id' is returned for the application to use in subsequent personality support. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 70 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 275355bd3a64..d74567fc9628 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -273,6 +273,8 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr personality_idr; + struct { unsigned cached_cq_tail; unsigned cq_entries; @@ -796,6 +798,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); @@ -6177,6 +6180,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) return fasync_helper(fd, file, on, &ctx->cq_fasync); } +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + const struct cred *cred; + + cred = idr_remove(&ctx->personality_idr, id); + if (cred) + put_cred(cred); + return 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -6193,6 +6207,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -6683,6 +6698,45 @@ out: return ret; } +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds = get_current_cred(); + int id; + + id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (id < 0) + put_cred(creds); + return id; +} + +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *old_creds; + + old_creds = idr_remove(&ctx->personality_idr, id); + if (old_creds) { + put_cred(old_creds); + return 0; + } + + return -EINVAL; +} + +static bool io_register_op_must_quiesce(int op) +{ + switch (op) { + case IORING_UNREGISTER_FILES: + case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_PROBE: + case IORING_REGISTER_PERSONALITY: + case IORING_UNREGISTER_PERSONALITY: + return false; + default: + return true; + } +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -6698,9 +6752,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE && - opcode != IORING_REGISTER_PROBE) { + if (io_register_op_must_quiesce(opcode)) { percpu_ref_kill(&ctx->refs); /* @@ -6768,15 +6820,24 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_probe(ctx, arg, nr_args); break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; default: ret = -EINVAL; break; } - - if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE && - opcode != IORING_REGISTER_PROBE) { + if (io_register_op_must_quiesce(opcode)) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e067b92af5ad..b4ccf31db2d1 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -211,6 +211,8 @@ struct io_uring_params { #define IORING_REGISTER_FILES_UPDATE 6 #define IORING_REGISTER_EVENTFD_ASYNC 7 #define IORING_REGISTER_PROBE 8 +#define IORING_REGISTER_PERSONALITY 9 +#define IORING_UNREGISTER_PERSONALITY 10 struct io_uring_files_update { __u32 offset; -- cgit v1.2.3 From 75c6a03904e0dd414a4d99a3072075cb5117e5bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jan 2020 10:15:23 -0700 Subject: io_uring: support using a registered personality for commands For personalities previously registered via IORING_REGISTER_PERSONALITY, allow any command to select them. This is done through setting sqe->personality to the id returned from registration, and then flagging sqe->flags with IOSQE_PERSONALITY. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 +++++++++++++++++++- include/uapi/linux/io_uring.h | 7 ++++++- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index d74567fc9628..8bcf0538e2e1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4626,9 +4626,10 @@ static inline void io_queue_link_head(struct io_kiocb *req) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { + const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; unsigned int sqe_flags; - int ret; + int ret, id; sqe_flags = READ_ONCE(sqe->flags); @@ -4637,6 +4638,19 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EINVAL; goto err_req; } + + id = READ_ONCE(sqe->personality); + if (id) { + const struct cred *personality_creds; + + personality_creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!personality_creds)) { + ret = -EINVAL; + goto err_req; + } + old_creds = override_creds(personality_creds); + } + /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| IOSQE_ASYNC); @@ -4646,6 +4660,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); + if (old_creds) + revert_creds(old_creds); return false; } @@ -4706,6 +4722,8 @@ err_req: } } + if (old_creds) + revert_creds(old_creds); return true; } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b4ccf31db2d1..98105ff8d3e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -40,7 +40,12 @@ struct io_uring_sqe { }; __u64 user_data; /* data to be passed back at completion time */ union { - __u16 buf_index; /* index into fixed buffers, if used */ + struct { + /* index into fixed buffers, if used */ + __u16 buf_index; + /* personality to use, if used */ + __u16 personality; + }; __u64 __pad2[3]; }; }; -- cgit v1.2.3 From 3e4827b05d2ac2d377ed136a52829ec46787bf4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jan 2020 15:18:09 -0700 Subject: io_uring: add support for epoll_ctl(2) This adds IORING_OP_EPOLL_CTL, which can perform the same work as the epoll_ctl(2) system call. Signed-off-by: Jens Axboe --- fs/io_uring.c | 71 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 72 insertions(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d8d0e217847..c5ca84a305d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -74,6 +74,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -423,6 +424,14 @@ struct io_madvise { u32 advice; }; +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -536,6 +545,7 @@ struct io_kiocb { struct io_files_update files_update; struct io_fadvise fadvise; struct io_madvise madvise; + struct io_epoll epoll; }; struct io_async_ctx *io; @@ -728,6 +738,10 @@ static const struct io_op_def io_op_defs[] = { .fd_non_neg = 1, .file_table = 1, }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .file_table = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2611,6 +2625,52 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, return io_openat2(req, nxt, force_nonblock); } +static int io_epoll_ctl_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); + req->epoll.fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(req->epoll.op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_EPOLL) + struct io_epoll *ie = &req->epoll; + int ret; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) @@ -4075,6 +4135,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_OPENAT2: ret = io_openat2_prep(req, sqe); break; + case IORING_OP_EPOLL_CTL: + ret = io_epoll_ctl_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4303,6 +4366,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_openat2(req, nxt, force_nonblock); break; + case IORING_OP_EPOLL_CTL: + if (sqe) { + ret = io_epoll_ctl_prep(req, sqe); + if (ret) + break; + } + ret = io_epoll_ctl(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 98105ff8d3e6..3f7961c1c243 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -112,6 +112,7 @@ enum { IORING_OP_SEND, IORING_OP_RECV, IORING_OP_OPENAT2, + IORING_OP_EPOLL_CTL, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3