From 2f3893437a4ebf2e892ca172e9e122841319d675 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Nov 2022 16:57:41 +0000 Subject: io_uring: cmpxchg for poll arm refs release Replace atomically substracting the ownership reference at the end of arming a poll with a cmpxchg. We try to release ownership by setting 0 assuming that poll_refs didn't change while we were arming. If it did change, we keep the ownership and use it to queue a tw, which is fully capable to process all events and (even tolerates spurious wake ups). It's a bit more elegant as we reduce races b/w setting the cancellation flag and getting refs with this release, and with that we don't have to worry about any kinds of underflows. It's not the fastest path for polling. The performance difference b/w cmpxchg and atomic dec is usually negligible and it's not the fastest path. Cc: stable@vger.kernel.org Fixes: aa43477b04025 ("io_uring: poll rework") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0c95251624397ea6def568ff040cad2d7926fd51.1668963050.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 055632e9092a..1b78b527075d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -518,7 +518,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - int v; INIT_HLIST_NODE(&req->hash_node); req->work.cancel_seq = atomic_read(&ctx->cancel_seq); @@ -586,11 +585,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req, if (ipt->owning) { /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. + * Try to release ownership. If we see a change of state, e.g. + * poll was waken up, queue up a tw, it'll deal with it. */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) + if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1) __io_poll_execute(req, 0); } return 0; -- cgit v1.2.3 From a26a35e9019fd70bf3cf647dcfdae87abc7bacea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Nov 2022 16:57:42 +0000 Subject: io_uring: make poll refs more robust poll_refs carry two functions, the first is ownership over the request. The second is notifying the io_poll_check_events() that there was an event but wake up couldn't grab the ownership, so io_poll_check_events() should retry. We want to make poll_refs more robust against overflows. Instead of always incrementing it, which covers two purposes with one atomic, check if poll_refs is elevated enough and if so set a retry flag without attempts to grab ownership. The gap between the bias check and following atomics may seem racy, but we don't need it to be strict. Moreover there might only be maximum 4 parallel updates: by the first and the second poll entries, __io_arm_poll_handler() and cancellation. From those four, only poll wake ups may be executed multiple times, but they're protected by a spin. Cc: stable@vger.kernel.org Reported-by: Lin Ma Fixes: aa43477b04025 ("io_uring: poll rework") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c762bc31f8683b3270f3587691348a7119ef9c9d.1668963050.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 1b78b527075d..b444b7d87697 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -40,7 +40,14 @@ struct io_poll_table { }; #define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK GENMASK(30, 0) +#define IO_POLL_RETRY_FLAG BIT(30) +#define IO_POLL_REF_MASK GENMASK(29, 0) + +/* + * We usually have 1-2 refs taken, 128 is more than enough and we want to + * maximise the margin between this amount and the moment when it overflows. + */ +#define IO_POLL_REF_BIAS 128 #define IO_WQE_F_DOUBLE 1 @@ -58,6 +65,21 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe) return priv & IO_WQE_F_DOUBLE; } +static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) +{ + int v; + + /* + * poll_refs are already elevated and we don't have much hope for + * grabbing the ownership. Instead of incrementing set a retry flag + * to notify the loop that there might have been some change. + */ + v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs); + if (v & IO_POLL_REF_MASK) + return false; + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); +} + /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can * bump it and acquire ownership. It's disallowed to modify requests while not @@ -66,6 +88,8 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe) */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { + if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) + return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } @@ -235,6 +259,16 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) */ if ((v & IO_POLL_REF_MASK) != 1) req->cqe.res = 0; + if (v & IO_POLL_RETRY_FLAG) { + req->cqe.res = 0; + /* + * We won't find new events that came in between + * vfs_poll and the ref put unless we clear the flag + * in advance. + */ + atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); + v &= ~IO_POLL_RETRY_FLAG; + } /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { -- cgit v1.2.3 From 9d94c04c0db024922e886c9fd429659f22f48ea4 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 23 Nov 2022 02:40:15 +0800 Subject: io_uring/filetable: fix file reference underflow There is an interesting reference bug when -ENOMEM occurs in calling of io_install_fixed_file(). KASan report like below: [ 14.057131] ================================================================== [ 14.059161] BUG: KASAN: use-after-free in unix_get_socket+0x10/0x90 [ 14.060975] Read of size 8 at addr ffff88800b09cf20 by task kworker/u8:2/45 [ 14.062684] [ 14.062768] CPU: 2 PID: 45 Comm: kworker/u8:2 Not tainted 6.1.0-rc4 #1 [ 14.063099] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 14.063666] Workqueue: events_unbound io_ring_exit_work [ 14.063936] Call Trace: [ 14.064065] [ 14.064175] dump_stack_lvl+0x34/0x48 [ 14.064360] print_report+0x172/0x475 [ 14.064547] ? _raw_spin_lock_irq+0x83/0xe0 [ 14.064758] ? __virt_addr_valid+0xef/0x170 [ 14.064975] ? unix_get_socket+0x10/0x90 [ 14.065167] kasan_report+0xad/0x130 [ 14.065353] ? unix_get_socket+0x10/0x90 [ 14.065553] unix_get_socket+0x10/0x90 [ 14.065744] __io_sqe_files_unregister+0x87/0x1e0 [ 14.065989] ? io_rsrc_refs_drop+0x1c/0xd0 [ 14.066199] io_ring_exit_work+0x388/0x6a5 [ 14.066410] ? io_uring_try_cancel_requests+0x5bf/0x5bf [ 14.066674] ? try_to_wake_up+0xdb/0x910 [ 14.066873] ? virt_to_head_page+0xbe/0xbe [ 14.067080] ? __schedule+0x574/0xd20 [ 14.067273] ? read_word_at_a_time+0xe/0x20 [ 14.067492] ? strscpy+0xb5/0x190 [ 14.067665] process_one_work+0x423/0x710 [ 14.067879] worker_thread+0x2a2/0x6f0 [ 14.068073] ? process_one_work+0x710/0x710 [ 14.068284] kthread+0x163/0x1a0 [ 14.068454] ? kthread_complete_and_exit+0x20/0x20 [ 14.068697] ret_from_fork+0x22/0x30 [ 14.068886] [ 14.069000] [ 14.069088] Allocated by task 289: [ 14.069269] kasan_save_stack+0x1e/0x40 [ 14.069463] kasan_set_track+0x21/0x30 [ 14.069652] __kasan_slab_alloc+0x58/0x70 [ 14.069899] kmem_cache_alloc+0xc5/0x200 [ 14.070100] __alloc_file+0x20/0x160 [ 14.070283] alloc_empty_file+0x3b/0xc0 [ 14.070479] path_openat+0xc3/0x1770 [ 14.070689] do_filp_open+0x150/0x270 [ 14.070888] do_sys_openat2+0x113/0x270 [ 14.071081] __x64_sys_openat+0xc8/0x140 [ 14.071283] do_syscall_64+0x3b/0x90 [ 14.071466] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 14.071791] [ 14.071874] Freed by task 0: [ 14.072027] kasan_save_stack+0x1e/0x40 [ 14.072224] kasan_set_track+0x21/0x30 [ 14.072415] kasan_save_free_info+0x2a/0x50 [ 14.072627] __kasan_slab_free+0x106/0x190 [ 14.072858] kmem_cache_free+0x98/0x340 [ 14.073075] rcu_core+0x427/0xe50 [ 14.073249] __do_softirq+0x110/0x3cd [ 14.073440] [ 14.073523] Last potentially related work creation: [ 14.073801] kasan_save_stack+0x1e/0x40 [ 14.074017] __kasan_record_aux_stack+0x97/0xb0 [ 14.074264] call_rcu+0x41/0x550 [ 14.074436] task_work_run+0xf4/0x170 [ 14.074619] exit_to_user_mode_prepare+0x113/0x120 [ 14.074858] syscall_exit_to_user_mode+0x1d/0x40 [ 14.075092] do_syscall_64+0x48/0x90 [ 14.075272] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 14.075529] [ 14.075612] Second to last potentially related work creation: [ 14.075900] kasan_save_stack+0x1e/0x40 [ 14.076098] __kasan_record_aux_stack+0x97/0xb0 [ 14.076325] task_work_add+0x72/0x1b0 [ 14.076512] fput+0x65/0xc0 [ 14.076657] filp_close+0x8e/0xa0 [ 14.076825] __x64_sys_close+0x15/0x50 [ 14.077019] do_syscall_64+0x3b/0x90 [ 14.077199] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 14.077448] [ 14.077530] The buggy address belongs to the object at ffff88800b09cf00 [ 14.077530] which belongs to the cache filp of size 232 [ 14.078105] The buggy address is located 32 bytes inside of [ 14.078105] 232-byte region [ffff88800b09cf00, ffff88800b09cfe8) [ 14.078685] [ 14.078771] The buggy address belongs to the physical page: [ 14.079046] page:000000001bd520e7 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88800b09de00 pfn:0xb09c [ 14.079575] head:000000001bd520e7 order:1 compound_mapcount:0 compound_pincount:0 [ 14.079946] flags: 0x100000000010200(slab|head|node=0|zone=1) [ 14.080244] raw: 0100000000010200 0000000000000000 dead000000000001 ffff88800493cc80 [ 14.080629] raw: ffff88800b09de00 0000000080190018 00000001ffffffff 0000000000000000 [ 14.081016] page dumped because: kasan: bad access detected [ 14.081293] [ 14.081376] Memory state around the buggy address: [ 14.081618] ffff88800b09ce00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 14.081974] ffff88800b09ce80: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc [ 14.082336] >ffff88800b09cf00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 14.082690] ^ [ 14.082909] ffff88800b09cf80: fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc [ 14.083266] ffff88800b09d000: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb [ 14.083622] ================================================================== The actual tracing of this bug is shown below: commit 8c71fe750215 ("io_uring: ensure fput() called correspondingly when direct install fails") adds an additional fput() in io_fixed_fd_install() when io_file_bitmap_get() returns error values. In that case, the routine will never make it to io_install_fixed_file() due to an early return. static int io_fixed_fd_install(...) { if (alloc_slot) { ... ret = io_file_bitmap_get(ctx); if (unlikely(ret < 0)) { io_ring_submit_unlock(ctx, issue_flags); fput(file); return ret; } ... } ... ret = io_install_fixed_file(req, file, issue_flags, file_slot); ... } In the above scenario, the reference is okay as io_fixed_fd_install() ensures the fput() is called when something bad happens, either via bitmap or via inner io_install_fixed_file(). However, the commit 61c1b44a21d7 ("io_uring: fix deadlock on iowq file slot alloc") breaks the balance because it places fput() into the common path for both io_file_bitmap_get() and io_install_fixed_file(). Since io_install_fixed_file() handles the fput() itself, the reference underflow come across then. There are some extra commits make the current code into io_fixed_fd_install() -> __io_fixed_fd_install() -> io_install_fixed_file() However, the fact that there is an extra fput() is called if io_install_fixed_file() calls fput(). Traversing through the code, I find that the existing two callers to __io_fixed_fd_install(): io_fixed_fd_install() and io_msg_send_fd() have fput() when handling error return, this patch simply removes the fput() in io_install_fixed_file() to fix the bug. Fixes: 61c1b44a21d7 ("io_uring: fix deadlock on iowq file slot alloc") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/be4ba4b.5d44.184a0a406a4.Coremail.linma@zju.edu.cn Signed-off-by: Jens Axboe --- io_uring/filetable.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 7b473259f3f4..68dfc6936aa7 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -101,8 +101,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); return ret; } -- cgit v1.2.3 From 12ad3d2d6c5b0131a6052de91360849e3e154846 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Fri, 25 Nov 2022 07:15:54 -0700 Subject: io_uring/poll: fix poll_refs race with cancelation There is an interesting race condition of poll_refs which could result in a NULL pointer dereference. The crash trace is like: KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 PID: 30781 Comm: syz-executor.2 Not tainted 6.0.0-g493ffd6605b2 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:io_poll_remove_entry io_uring/poll.c:154 [inline] RIP: 0010:io_poll_remove_entries+0x171/0x5b4 io_uring/poll.c:190 Code: ... RSP: 0018:ffff88810dfefba0 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000040000 RDX: ffffc900030c4000 RSI: 000000000003ffff RDI: 0000000000040000 RBP: 0000000000000008 R08: ffffffff9764d3dd R09: fffffbfff3836781 R10: fffffbfff3836781 R11: 0000000000000000 R12: 1ffff11003422d60 R13: ffff88801a116b04 R14: ffff88801a116ac0 R15: dffffc0000000000 FS: 00007f9c07497700(0000) GS:ffff88811a600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffb5c00ea98 CR3: 0000000105680005 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: io_apoll_task_func+0x3f/0xa0 io_uring/poll.c:299 handle_tw_list io_uring/io_uring.c:1037 [inline] tctx_task_work+0x37e/0x4f0 io_uring/io_uring.c:1090 task_work_run+0x13a/0x1b0 kernel/task_work.c:177 get_signal+0x2402/0x25a0 kernel/signal.c:2635 arch_do_signal_or_restart+0x3b/0x660 arch/x86/kernel/signal.c:869 exit_to_user_mode_loop kernel/entry/common.c:166 [inline] exit_to_user_mode_prepare+0xc2/0x160 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:283 [inline] syscall_exit_to_user_mode+0x58/0x160 kernel/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x63/0xcd The root cause for this is a tiny overlooking in io_poll_check_events() when cocurrently run with poll cancel routine io_poll_cancel_req(). The interleaving to trigger use-after-free: CPU0 | CPU1 | io_apoll_task_func() | io_poll_cancel_req() io_poll_check_events() | // do while first loop | v = atomic_read(...) | // v = poll_refs = 1 | ... | io_poll_mark_cancelled() | atomic_or() | // poll_refs = IO_POLL_CANCEL_FLAG | 1 | atomic_sub_return(...) | // poll_refs = IO_POLL_CANCEL_FLAG | // loop continue | | | io_poll_execute() | io_poll_get_ownership() | // poll_refs = IO_POLL_CANCEL_FLAG | 1 | // gets the ownership v = atomic_read(...) | // poll_refs not change | | if (v & IO_POLL_CANCEL_FLAG) | return -ECANCELED; | // io_poll_check_events return | // will go into | // io_req_complete_failed() free req | | | io_apoll_task_func() | // also go into io_req_complete_failed() And the interleaving to trigger the kernel WARNING: CPU0 | CPU1 | io_apoll_task_func() | io_poll_cancel_req() io_poll_check_events() | // do while first loop | v = atomic_read(...) | // v = poll_refs = 1 | ... | io_poll_mark_cancelled() | atomic_or() | // poll_refs = IO_POLL_CANCEL_FLAG | 1 | atomic_sub_return(...) | // poll_refs = IO_POLL_CANCEL_FLAG | // loop continue | | v = atomic_read(...) | // v = IO_POLL_CANCEL_FLAG | | io_poll_execute() | io_poll_get_ownership() | // poll_refs = IO_POLL_CANCEL_FLAG | 1 | // gets the ownership | WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) | // v & IO_POLL_REF_MASK = 0 WARN | | | io_apoll_task_func() | // also go into io_req_complete_failed() By looking up the source code and communicating with Pavel, the implementation of this atomic poll refs should continue the loop of io_poll_check_events() just to avoid somewhere else to grab the ownership. Therefore, this patch simply adds another AND operation to make sure the loop will stop if it finds the poll_refs is exactly equal to IO_POLL_CANCEL_FLAG. Since io_poll_cancel_req() grabs ownership and will finally make its way to io_req_complete_failed(), the req will be reclaimed as expected. Fixes: aa43477b0402 ("io_uring: poll rework") Signed-off-by: Lin Ma Reviewed-by: Pavel Begunkov [axboe: tweak description and code style] Signed-off-by: Jens Axboe --- io_uring/poll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index b444b7d87697..d9bf1767867e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -308,7 +308,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) * Release all references, retry if someone tried to restart * task_work while we were executing it. */ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); + } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) & + IO_POLL_REF_MASK); return IOU_POLL_NO_ACTION; } -- cgit v1.2.3 From 7cfe7a09489c1cefee7181e07b5f2bcbaebd9f41 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Nov 2022 09:36:29 -0700 Subject: io_uring: clear TIF_NOTIFY_SIGNAL if set and task_work not available With how task_work is added and signaled, we can have TIF_NOTIFY_SIGNAL set and no task_work pending as it got run in a previous loop. Treat TIF_NOTIFY_SIGNAL like get_signal(), always clear it if set regardless of whether or not task_work is pending to run. Cc: stable@vger.kernel.org Fixes: 46a525e199e4 ("io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL") Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cef5ff924e63..50bc3af44953 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -238,9 +238,14 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline int io_run_task_work(void) { + /* + * Always check-and-clear the task_work notification signal. With how + * signaling works for task_work, we can find it set with nothing to + * run. We need to clear it for that case, like get_signal() does. + */ + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) + clear_notify_signal(); if (task_work_pending(current)) { - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) - clear_notify_signal(); __set_current_state(TASK_RUNNING); task_work_run(); return 1; -- cgit v1.2.3