From b9445598d8c60a1379887b957024b71343965f74 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 25 Aug 2021 12:25:45 +0100
Subject: io_uring: openat directly into fixed fd table

Instead of opening a file into a process's file table as usual and then
registering the fd within io_uring, some users may want to skip the first
step and place it directly into io_uring's fixed file table. This patch
adds such a capability for IORING_OP_OPENAT and IORING_OP_OPENAT2.

The behaviour is controlled by setting sqe->file_index, where 0 implies
the old behaviour using normal file tables. If a non-zero value is
specified, then it will behave as described and place the file into the
fixed file slot sqe->file_index - 1. A file table should already have
been created, and the slot should be valid and empty, otherwise the
operation will fail.

Keep the error codes consistent with IORING_OP_FILES_UPDATE: ENXIO and
EINVAL on an inappropriate fixed table, and EBADF on collision with an
already registered file.

Note: IOSQE_FIXED_FILE can't be used to switch between modes, because
accept takes a file, and it already uses the flag with a different
meaning.

Suggested-by: Josh Triplett
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/e9b33d1163286f51ea707f87d95bd596dada1e65.1629888991.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 79126d5cd289..45a4f2373694 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -55,7 +55,10 @@ struct io_uring_sqe {
 	} __attribute__((packed));
 	/* personality to use, if used */
 	__u16	personality;
-	__s32	splice_fd_in;
+	union {
+		__s32	splice_fd_in;
+		__u32	file_index;
+	};
 	__u64	__pad2[2];
 };
--
cgit v1.2.3
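Usage sketch (an illustration, not part of the commit): opening a file
directly into fixed slot 3 of a ring whose file table has already been
registered (e.g. via io_uring_register_files() with a sparse set).
liburing is assumed only for the setup/submission boilerplate and for a
header recent enough to carry the new sqe->file_index field; the off-by-one
slot numbering is as described in the commit message above.

#include <fcntl.h>
#include <liburing.h>

static int open_into_slot(struct io_uring *ring, const char *path,
			  unsigned int slot)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_openat(sqe, AT_FDCWD, path, O_RDONLY, 0);
	/* non-zero file_index selects fixed slot file_index - 1 */
	sqe->file_index = slot + 1;

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	/* cqe->res: 0 on success, -EBADF if the slot was already occupied */
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}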
From 2e480058ddc21ec53a10e8b41623e245e908bdbc Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 27 Aug 2021 11:33:19 -0600
Subject: io-wq: provide a way to limit max number of workers

io-wq divides work into two categories:

1) Work that completes in a bounded time, like reading from a regular
   file or a block device. This type of work is limited based on the
   size of the SQ ring.

2) Work that may never complete; we call this unbounded work. The number
   of workers here is limited only by RLIMIT_NPROC.

For various use cases, it's handy to have the kernel limit the maximum
number of pending workers for both categories. Provide a way to do that
with a new IORING_REGISTER_IOWQ_MAX_WORKERS operation.

IORING_REGISTER_IOWQ_MAX_WORKERS takes an array of two integers and sets
the max worker count to what is being passed in for each category. The
old values are returned into that same array. If 0 is being passed in for
either category, it simply returns the current value.

The value is capped at RLIMIT_NPROC. This actually isn't that important
as it's more of a hint; if we're exceeding the value, then our attempt
to fork a new worker will fail. This happens naturally already if more
than one node is in the system, as these values are per-node internally
for io-wq.

Reported-by: Johannes Lundberg
Link: https://github.com/axboe/liburing/issues/420
Signed-off-by: Jens Axboe
---
 fs/io-wq.c                    | 29 +++++++++++++++++++++++++++++
 fs/io-wq.h                    |  1 +
 fs/io_uring.c                 | 32 ++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  3 +++
 4 files changed, 65 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 8da9bb103916..4b5fc621ab39 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1152,6 +1152,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
 	return 0;
 }
 
+/*
+ * Set max number of unbounded workers, returns old value. If new_count is 0,
+ * then just return the old value.
+ */
+int io_wq_max_workers(struct io_wq *wq, int *new_count)
+{
+	int i, node, prev = 0;
+
+	for (i = 0; i < 2; i++) {
+		if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
+			new_count[i] = task_rlimit(current, RLIMIT_NPROC);
+	}
+
+	rcu_read_lock();
+	for_each_node(node) {
+		struct io_wqe_acct *acct;
+
+		for (i = 0; i < 2; i++) {
+			acct = &wq->wqes[node]->acct[i];
+			prev = max_t(int, acct->max_workers, prev);
+			if (new_count[i])
+				acct->max_workers = new_count[i];
+			new_count[i] = prev;
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static __init int io_wq_init(void)
 {
 	int ret;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 308af3928424..bf5c4c533760 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
 void io_wq_hash_work(struct io_wq_work *work, void *val);
 
 int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
+int io_wq_max_workers(struct io_wq *wq, int *new_count);
 
 static inline bool io_wq_is_hashed(struct io_wq_work *work)
 {
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 53326449d685..edbda88142f9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -10233,6 +10233,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
 	return io_wq_cpu_affinity(tctx->io_wq, NULL);
 }
 
+static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+					void __user *arg)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	__u32 new_count[2];
+	int i, ret;
+
+	if (!tctx || !tctx->io_wq)
+		return -EINVAL;
+	if (copy_from_user(new_count, arg, sizeof(new_count)))
+		return -EFAULT;
+	for (i = 0; i < ARRAY_SIZE(new_count); i++)
+		if (new_count[i] > INT_MAX)
+			return -EINVAL;
+
+	ret = io_wq_max_workers(tctx->io_wq, new_count);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(arg, new_count, sizeof(new_count)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static bool io_register_op_must_quiesce(int op)
 {
 	switch (op) {
@@ -10250,6 +10275,7 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_REGISTER_BUFFERS_UPDATE:
 	case IORING_REGISTER_IOWQ_AFF:
 	case IORING_UNREGISTER_IOWQ_AFF:
+	case IORING_REGISTER_IOWQ_MAX_WORKERS:
 		return false;
 	default:
 		return true;
@@ -10406,6 +10432,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_unregister_iowq_aff(ctx);
 		break;
+	case IORING_REGISTER_IOWQ_MAX_WORKERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 2)
+			break;
+		ret = io_register_iowq_max_workers(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 45a4f2373694..64fe809c4e36 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -309,6 +309,9 @@ enum {
 	IORING_REGISTER_IOWQ_AFF		= 17,
 	IORING_UNREGISTER_IOWQ_AFF		= 18,
 
+	/* set/get max number of workers */
+	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
--
cgit v1.2.3
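A hedged usage sketch (not part of the commit): capping a ring's io-wq at
8 bounded and 4 unbounded workers via the raw register syscall, then
reading back the previous limits, which the kernel writes into the same
array. The ring_fd parameter is assumed to be an already created io_uring
instance.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int cap_iowq_workers(int ring_fd)
{
	/* [0] = bounded, [1] = unbounded; 0 just reports the current value */
	unsigned int counts[2] = { 8, 4 };

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2) < 0)
		return -1;
	/* the old limits were returned in place of the inputs */
	printf("old limits: bounded=%u unbounded=%u\n", counts[0], counts[1]);
	return 0;
}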
From 50c1df2b56e0f581b1dbf334dbf807d6fb8f77b2 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 27 Aug 2021 17:11:06 -0600
Subject: io_uring: support CLOCK_BOOTTIME/REALTIME for timeouts

Certain use cases want to use CLOCK_BOOTTIME or CLOCK_REALTIME rather
than the default CLOCK_MONOTONIC. Add IORING_TIMEOUT_BOOTTIME and
IORING_TIMEOUT_REALTIME flags that allow timeouts and linked timeouts
to use the selected clock source.

Only one clock source may be selected, and we -EINVAL the request if
more than one is given. If neither BOOTTIME nor REALTIME is selected,
the previous default of MONOTONIC is used.

Link: https://github.com/axboe/liburing/issues/369
Signed-off-by: Jens Axboe
---
 fs/io_uring.c                 | 27 ++++++++++++++++++++++++---
 include/uapi/linux/io_uring.h |  3 +++
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index edbda88142f9..1c99f0143b57 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -508,6 +508,7 @@ struct io_timeout_data {
 	struct hrtimer			timer;
 	struct timespec64		ts;
 	enum hrtimer_mode		mode;
+	u32				flags;
 };
 
 struct io_accept {
@@ -5712,6 +5713,22 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	return 0;
 }
 
+static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
+{
+	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
+	case IORING_TIMEOUT_BOOTTIME:
+		return CLOCK_BOOTTIME;
+	case IORING_TIMEOUT_REALTIME:
+		return CLOCK_REALTIME;
+	default:
+		/* can't happen, vetted at prep time */
+		WARN_ON_ONCE(1);
+		fallthrough;
+	case 0:
+		return CLOCK_MONOTONIC;
+	}
+}
+
 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 			     struct timespec64 *ts, enum hrtimer_mode mode)
 	__must_hold(&ctx->timeout_lock)
@@ -5725,7 +5742,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 	req->timeout.off = 0;		/* noseq */
 	data = req->async_data;
 	list_add_tail(&req->timeout.list, &ctx->timeout_list);
-	hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
 	data->timer.function = io_timeout_fn;
 	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
 	return 0;
@@ -5807,7 +5824,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (off && is_timeout_link)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->timeout_flags);
-	if (flags & ~IORING_TIMEOUT_ABS)
+	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+		return -EINVAL;
+	/* more than one clock specified is invalid, obviously */
+	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
 		return -EINVAL;
 
 	req->timeout.off = off;
@@ -5819,12 +5839,13 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	data = req->async_data;
 	data->req = req;
+	data->flags = flags;
 
 	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 		return -EFAULT;
 
 	data->mode = io_translate_timeout_mode(flags);
-	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
 
 	if (is_timeout_link) {
 		struct io_submit_link *link = &req->ctx->submit_state.link;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 64fe809c4e36..b6d28d927a3f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -151,6 +151,9 @@ enum {
  */
 #define IORING_TIMEOUT_ABS	(1U << 0)
 #define IORING_TIMEOUT_UPDATE	(1U << 1)
+#define IORING_TIMEOUT_BOOTTIME	(1U << 2)
+#define IORING_TIMEOUT_REALTIME	(1U << 3)
+#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
 
 /*
  * sqe->splice_flags
--
cgit v1.2.3
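A hedged usage sketch (not part of the commit): arming a 5-second timeout
measured against CLOCK_BOOTTIME, so time spent in suspend counts toward
expiry. liburing is assumed for the boilerplate; setting both
IORING_TIMEOUT_BOOTTIME and IORING_TIMEOUT_REALTIME would fail with
-EINVAL, as vetted in io_timeout_prep() above.

#include <liburing.h>

static int arm_boottime_timeout(struct io_uring *ring)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct __kernel_timespec ts = { .tv_sec = 5, .tv_nsec = 0 };

	if (!sqe)
		return -EBUSY;
	/* count == 0: a pure timer, not tied to a completion count */
	io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_BOOTTIME);
	return io_uring_submit(ring);
}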
From f1042b6ccb887f07301f6b096b3d0cfcf9189323 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sat, 28 Aug 2021 19:54:39 -0600
Subject: io_uring: allow updating linked timeouts

We already allow updating normal timeouts; add support for adjusting the
timings of linked timeouts as well.

Reported-by: Victor Stewart
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c                 | 44 +++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/io_uring.h | 11 ++++++-----
 2 files changed, 46 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index aa978292f34b..7cc458e0b636 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -552,6 +552,7 @@ struct io_timeout_rem {
 	/* timeout update */
 	struct timespec64		ts;
 	u32				flags;
+	bool				ltimeout;
 };
 
 struct io_rw {
@@ -1069,6 +1070,7 @@ static int io_req_prep_async(struct io_kiocb *req);
 
 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 				 unsigned int issue_flags, u32 slot_index);
+static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
 
 static struct kmem_cache *req_cachep;
 
@@ -5732,6 +5734,31 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
 	}
 }
 
+static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+				    struct timespec64 *ts, enum hrtimer_mode mode)
+	__must_hold(&ctx->timeout_lock)
+{
+	struct io_timeout_data *io;
+	struct io_kiocb *req;
+	bool found = false;
+
+	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
+		found = user_data == req->user_data;
+		if (found)
+			break;
+	}
+	if (!found)
+		return -ENOENT;
+
+	io = req->async_data;
+	if (hrtimer_try_to_cancel(&io->timer) == -1)
+		return -EALREADY;
+	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
+	io->timer.function = io_link_timeout_fn;
+	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
+	return 0;
+}
+
 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 			     struct timespec64 *ts, enum hrtimer_mode mode)
 	__must_hold(&ctx->timeout_lock)
@@ -5763,10 +5790,15 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
 	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
 		return -EINVAL;
 
+	tr->ltimeout = false;
 	tr->addr = READ_ONCE(sqe->addr);
 	tr->flags = READ_ONCE(sqe->timeout_flags);
-	if (tr->flags & IORING_TIMEOUT_UPDATE) {
-		if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
+		if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
+			return -EINVAL;
+		if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
+			tr->ltimeout = true;
+		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
 			return -EINVAL;
 		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
 			return -EFAULT;
@@ -5800,9 +5832,13 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
 		spin_unlock_irq(&ctx->timeout_lock);
 		spin_unlock(&ctx->completion_lock);
 	} else {
+		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
+
 		spin_lock_irq(&ctx->timeout_lock);
-		ret = io_timeout_update(ctx, tr->addr, &tr->ts,
-					io_translate_timeout_mode(tr->flags));
+		if (tr->ltimeout)
+			ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
+		else
+			ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
 		spin_unlock_irq(&ctx->timeout_lock);
 	}
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b6d28d927a3f..3caec9199658 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -149,12 +149,13 @@ enum {
 /*
  * sqe->timeout_flags
  */
-#define IORING_TIMEOUT_ABS	(1U << 0)
-#define IORING_TIMEOUT_UPDATE	(1U << 1)
-#define IORING_TIMEOUT_BOOTTIME	(1U << 2)
-#define IORING_TIMEOUT_REALTIME	(1U << 3)
+#define IORING_TIMEOUT_ABS		(1U << 0)
+#define IORING_TIMEOUT_UPDATE		(1U << 1)
+#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
+#define IORING_TIMEOUT_REALTIME		(1U << 3)
+#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
 #define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
-
+#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
 /*
  * sqe->splice_flags
  * extends splice(2) flags
--
cgit v1.2.3
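A hedged usage sketch (not part of the commit): retargeting an armed
linked timeout to one second. The timeout is identified by the user_data
of the original linked-timeout SQE, and IORING_LINK_TIMEOUT_UPDATE routes
the update to the linked-timeout list rather than the normal timeout list.
liburing's io_uring_prep_timeout_update() is assumed for the encoding.

#include <liburing.h>

static int bump_link_timeout(struct io_uring *ring, __u64 timeout_user_data)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	if (!sqe)
		return -EBUSY;
	/* completes with -ENOENT if not found, -EALREADY if already firing */
	io_uring_prep_timeout_update(sqe, &ts, timeout_user_data,
				     IORING_LINK_TIMEOUT_UPDATE);
	return io_uring_submit(ring);
}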