From a0e44c64b6061dda7e00b7c458e4523e2331b739 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Mon, 1 Aug 2022 18:25:11 +0000 Subject: binder: fix UAF of ref->proc caused by race condition A transaction of type BINDER_TYPE_WEAK_HANDLE can fail to increment the reference for a node. In this case, the target proc normally releases the failed reference upon close as expected. However, if the target is dying in parallel the call will race with binder_deferred_release(), so the target could have released all of its references by now leaving the cleanup of the new failed reference unhandled. The transaction then ends and the target proc gets released making the ref->proc now a dangling pointer. Later on, ref->node is closed and we attempt to take spin_lock(&ref->proc->inner_lock), which leads to the use-after-free bug reported below. Let's fix this by cleaning up the failed reference on the spot instead of relying on the target to do so. ================================================================== BUG: KASAN: use-after-free in _raw_spin_lock+0xa8/0x150 Write of size 4 at addr ffff5ca207094238 by task kworker/1:0/590 CPU: 1 PID: 590 Comm: kworker/1:0 Not tainted 5.19.0-rc8 #10 Hardware name: linux,dummy-virt (DT) Workqueue: events binder_deferred_func Call trace: dump_backtrace.part.0+0x1d0/0x1e0 show_stack+0x18/0x70 dump_stack_lvl+0x68/0x84 print_report+0x2e4/0x61c kasan_report+0xa4/0x110 kasan_check_range+0xfc/0x1a4 __kasan_check_write+0x3c/0x50 _raw_spin_lock+0xa8/0x150 binder_deferred_func+0x5e0/0x9b0 process_one_work+0x38c/0x5f0 worker_thread+0x9c/0x694 kthread+0x188/0x190 ret_from_fork+0x10/0x20 Acked-by: Christian Brauner (Microsoft) Signed-off-by: Carlos Llamas Cc: stable # 4.14+ Link: https://lore.kernel.org/r/20220801182511.3371447-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'drivers/android') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index c964d7c8c384..6428f6be69e3 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1385,6 +1385,18 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, } ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; + if (ret && ref == new_ref) { + /* + * Cleanup the failed reference here as the target + * could now be dead and have already released its + * references by now. Calling on the new reference + * with strong=0 and a tmp_refs will not decrement + * the node. The new_ref gets kfree'd below. + */ + binder_cleanup_ref_olocked(new_ref); + ref = NULL; + } + binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* -- cgit v1.2.3 From d6f35446d0769a98e9d761593d267cdd24f09ecd Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 10 Aug 2022 16:02:25 +0000 Subject: binder_alloc: Add missing mmap_lock calls when using the VMA Take the mmap_read_lock() when using the VMA in binder_alloc_print_pages() and when checking for a VMA in binder_alloc_new_buf_locked(). It is worth noting binder_alloc_new_buf_locked() drops the VMA read lock after it verifies a VMA exists, but may be taken again deeper in the call stack, if necessary. Fixes: a43cfc87caaf (android: binder: stop saving a pointer to the VMA) Cc: stable Reported-by: Ondrej Mosnacek Reported-by: syzbot+a7b60a176ec13cafb793@syzkaller.appspotmail.com Tested-by: Ondrej Mosnacek Acked-by: Carlos Llamas Signed-off-by: Liam R. Howlett Link: https://lore.kernel.org/r/20220810160209.1630707-1-Liam.Howlett@oracle.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 1014beb12802..51f4e1c5cd01 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -402,12 +402,15 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; + mmap_read_lock(alloc->vma_vm_mm); if (!binder_alloc_get_vma(alloc)) { + mmap_read_unlock(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: binder_alloc_buf, no vma\n", alloc->pid); return ERR_PTR(-ESRCH); } + mmap_read_unlock(alloc->vma_vm_mm); data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); @@ -929,17 +932,25 @@ void binder_alloc_print_pages(struct seq_file *m, * Make sure the binder_alloc is fully initialized, otherwise we might * read inconsistent state. */ - if (binder_alloc_get_vma(alloc) != NULL) { - for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { - page = &alloc->pages[i]; - if (!page->page_ptr) - free++; - else if (list_empty(&page->lru)) - active++; - else - lru++; - } + + mmap_read_lock(alloc->vma_vm_mm); + if (binder_alloc_get_vma(alloc) == NULL) { + mmap_read_unlock(alloc->vma_vm_mm); + goto uninitialized; } + + mmap_read_unlock(alloc->vma_vm_mm); + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + page = &alloc->pages[i]; + if (!page->page_ptr) + free++; + else if (list_empty(&page->lru)) + active++; + else + lru++; + } + +uninitialized: mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); -- cgit v1.2.3 From db7e5c10351e3dd58e6bef237c8fa74282e5d59e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 19 Aug 2022 10:55:54 +0200 Subject: Revert "binder_alloc: Add missing mmap_lock calls when using the VMA" This reverts commit d6f35446d0769a98e9d761593d267cdd24f09ecd. It is coming in through Andrew's tree instead, and for some reason we have different versions. I trust the version from Andrew more as the original offending commit came through his tree. Fixes: d6f35446d076 ("binder_alloc: Add missing mmap_lock calls when using the VMA") Cc: stable Cc: Ondrej Mosnacek Cc: Ondrej Mosnacek Cc: Carlos Llamas Cc: Liam R. Howlett Reported-by: Stephen Rothwell Link: https://lore.kernel.org/r/20220819184027.7b3fda3e@canb.auug.org.au Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 51f4e1c5cd01..1014beb12802 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -402,15 +402,12 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; - mmap_read_lock(alloc->vma_vm_mm); if (!binder_alloc_get_vma(alloc)) { - mmap_read_unlock(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: binder_alloc_buf, no vma\n", alloc->pid); return ERR_PTR(-ESRCH); } - mmap_read_unlock(alloc->vma_vm_mm); data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); @@ -932,25 +929,17 @@ void binder_alloc_print_pages(struct seq_file *m, * Make sure the binder_alloc is fully initialized, otherwise we might * read inconsistent state. */ - - mmap_read_lock(alloc->vma_vm_mm); - if (binder_alloc_get_vma(alloc) == NULL) { - mmap_read_unlock(alloc->vma_vm_mm); - goto uninitialized; - } - - mmap_read_unlock(alloc->vma_vm_mm); - for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { - page = &alloc->pages[i]; - if (!page->page_ptr) - free++; - else if (list_empty(&page->lru)) - active++; - else - lru++; + if (binder_alloc_get_vma(alloc) != NULL) { + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + page = &alloc->pages[i]; + if (!page->page_ptr) + free++; + else if (list_empty(&page->lru)) + active++; + else + lru++; + } } - -uninitialized: mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); -- cgit v1.2.3 From 44e602b4e52f70f04620bbbf4fe46ecb40170bde Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 10 Aug 2022 16:02:25 +0000 Subject: binder_alloc: add missing mmap_lock calls when using the VMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Take the mmap_read_lock() when using the VMA in binder_alloc_print_pages() and when checking for a VMA in binder_alloc_new_buf_locked(). It is worth noting binder_alloc_new_buf_locked() drops the VMA read lock after it verifies a VMA exists, but may be taken again deeper in the call stack, if necessary. Link: https://lkml.kernel.org/r/20220810160209.1630707-1-Liam.Howlett@oracle.com Fixes: a43cfc87caaf (android: binder: stop saving a pointer to the VMA) Signed-off-by: Liam R. Howlett Reported-by: Ondrej Mosnacek Reported-by: Acked-by: Carlos Llamas Tested-by: Ondrej Mosnacek Cc: Minchan Kim Cc: Christian Brauner (Microsoft) Cc: Greg Kroah-Hartman Cc: Hridya Valsaraju Cc: Joel Fernandes Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Matthew Wilcox (Oracle) Cc: "Arve Hjønnevåg" Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 1014beb12802..51f4e1c5cd01 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -402,12 +402,15 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; + mmap_read_lock(alloc->vma_vm_mm); if (!binder_alloc_get_vma(alloc)) { + mmap_read_unlock(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: binder_alloc_buf, no vma\n", alloc->pid); return ERR_PTR(-ESRCH); } + mmap_read_unlock(alloc->vma_vm_mm); data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); @@ -929,17 +932,25 @@ void binder_alloc_print_pages(struct seq_file *m, * Make sure the binder_alloc is fully initialized, otherwise we might * read inconsistent state. */ - if (binder_alloc_get_vma(alloc) != NULL) { - for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { - page = &alloc->pages[i]; - if (!page->page_ptr) - free++; - else if (list_empty(&page->lru)) - active++; - else - lru++; - } + + mmap_read_lock(alloc->vma_vm_mm); + if (binder_alloc_get_vma(alloc) == NULL) { + mmap_read_unlock(alloc->vma_vm_mm); + goto uninitialized; } + + mmap_read_unlock(alloc->vma_vm_mm); + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + page = &alloc->pages[i]; + if (!page->page_ptr) + free++; + else if (list_empty(&page->lru)) + active++; + else + lru++; + } + +uninitialized: mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); -- cgit v1.2.3 From 1da52815d5f1b654c89044db0cdc6adce43da1f1 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Mon, 29 Aug 2022 20:12:48 +0000 Subject: binder: fix alloc->vma_vm_mm null-ptr dereference Syzbot reported a couple issues introduced by commit 44e602b4e52f ("binder_alloc: add missing mmap_lock calls when using the VMA"), in which we attempt to acquire the mmap_lock when alloc->vma_vm_mm has not been initialized yet. This can happen if a binder_proc receives a transaction without having previously called mmap() to setup the binder_proc->alloc space in [1]. Also, a similar issue occurs via binder_alloc_print_pages() when we try to dump the debugfs binder stats file in [2]. Sample of syzbot's crash report: ================================================================== KASAN: null-ptr-deref in range [0x0000000000000128-0x000000000000012f] CPU: 0 PID: 3755 Comm: syz-executor229 Not tainted 6.0.0-rc1-next-20220819-syzkaller #0 syz-executor229[3755] cmdline: ./syz-executor2294415195 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022 RIP: 0010:__lock_acquire+0xd83/0x56d0 kernel/locking/lockdep.c:4923 [...] Call Trace: lock_acquire kernel/locking/lockdep.c:5666 [inline] lock_acquire+0x1ab/0x570 kernel/locking/lockdep.c:5631 down_read+0x98/0x450 kernel/locking/rwsem.c:1499 mmap_read_lock include/linux/mmap_lock.h:117 [inline] binder_alloc_new_buf_locked drivers/android/binder_alloc.c:405 [inline] binder_alloc_new_buf+0xa5/0x19e0 drivers/android/binder_alloc.c:593 binder_transaction+0x242e/0x9a80 drivers/android/binder.c:3199 binder_thread_write+0x664/0x3220 drivers/android/binder.c:3986 binder_ioctl_write_read drivers/android/binder.c:5036 [inline] binder_ioctl+0x3470/0x6d00 drivers/android/binder.c:5323 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] ================================================================== Fix these issues by setting up alloc->vma_vm_mm pointer during open() and caching directly from current->mm. This guarantees we have a valid reference to take the mmap_lock during scenarios described above. [1] https://syzkaller.appspot.com/bug?extid=f7dc54e5be28950ac459 [2] https://syzkaller.appspot.com/bug?extid=a75ebe0452711c9e56d9 Fixes: 44e602b4e52f ("binder_alloc: add missing mmap_lock calls when using the VMA") Cc: # v5.15+ Cc: Liam R. Howlett Reported-by: syzbot+f7dc54e5be28950ac459@syzkaller.appspotmail.com Reported-by: syzbot+a75ebe0452711c9e56d9@syzkaller.appspotmail.com Reviewed-by: Liam R. Howlett Acked-by: Todd Kjos Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220829201254.1814484-2-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 1014beb12802..a61df6e1103a 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -322,7 +322,6 @@ static inline void binder_alloc_set_vma(struct binder_alloc *alloc, */ if (vma) { vm_start = vma->vm_start; - alloc->vma_vm_mm = vma->vm_mm; mmap_assert_write_locked(alloc->vma_vm_mm); } else { mmap_assert_locked(alloc->vma_vm_mm); @@ -792,7 +791,6 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, binder_insert_free_buffer(alloc, buffer); alloc->free_async_space = alloc->buffer_size / 2; binder_alloc_set_vma(alloc, vma); - mmgrab(alloc->vma_vm_mm); return 0; @@ -1080,6 +1078,8 @@ static struct shrinker binder_shrinker = { void binder_alloc_init(struct binder_alloc *alloc) { alloc->pid = current->group_leader->pid; + alloc->vma_vm_mm = current->mm; + mmgrab(alloc->vma_vm_mm); mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); } -- cgit v1.2.3 From 76ff33468beaabbd0fe141b3ea1c6f514a8ef7d4 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Mon, 29 Aug 2022 20:12:49 +0000 Subject: binder: fix trivial kernel-doc typo Correct the misspelling of 'invariant' in kernel-doc section. No functional changes in this patch. Reviewed-by: Christian Brauner (Microsoft) Acked-by: Todd Kjos Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220829201254.1814484-3-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 1e4fd37af5e0..0c37935ff7a2 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -75,10 +75,10 @@ struct binder_lru_page { /** * struct binder_alloc - per-binder proc state for binder allocator * @vma: vm_area_struct passed to mmap_handler - * (invarient after mmap) + * (invariant after mmap) * @tsk: tid for task that called init for this proc * (invariant after init) - * @vma_vm_mm: copy of vma->vm_mm (invarient after mmap) + * @vma_vm_mm: copy of vma->vm_mm (invariant after mmap) * @buffer: base of per-proc address space mapped via mmap * @buffers: list of all buffers for this proc * @free_buffers: rb tree of buffers available for allocation -- cgit v1.2.3 From 22534a44cb8ca660a14d62e320e45fde962e9410 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Mon, 29 Aug 2022 20:12:52 +0000 Subject: binder: remove unused binder_alloc->buffer_free The ->buffer_free member was introduced in the first revision of the driver under staging but it appears like it was never actually used according to git's history. Remove it from binder_alloc. Reviewed-by: Christian Brauner (Microsoft) Acked-by: Todd Kjos Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220829201254.1814484-6-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 0c37935ff7a2..f61a12d5c1e7 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -109,7 +109,6 @@ struct binder_alloc { size_t free_async_space; struct binder_lru_page *pages; size_t buffer_size; - uint32_t buffer_free; int pid; size_t pages_high; bool oneway_spam_detected; -- cgit v1.2.3 From eaf271ea844b8dea5256bd3c73e642ef13ce68a2 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Mon, 29 Aug 2022 20:12:54 +0000 Subject: binderfs: remove unused INTSTRLEN macro Fix the following W=1 build error: drivers/android/binderfs.c:42: error: macro "INTSTRLEN" is not used [-Werror=unused-macros] 42 | #define INTSTRLEN 21 | No functional changes in this patch. Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220829201254.1814484-8-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binderfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/android') diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 588d753a7a19..44939ea1874f 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -39,7 +39,6 @@ #define FIRST_INODE 1 #define SECOND_INODE 2 #define INODE_OFFSET 3 -#define INTSTRLEN 21 #define BINDERFS_MAX_MINOR (1U << MINORBITS) /* Ensure that the initial ipc namespace always has devices available. */ #define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4) -- cgit v1.2.3 From 9d64d2405f7d30d49818f6682acd0392348f0fdb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 23 Aug 2022 11:53:39 +0200 Subject: binderfs: rework superblock destruction So far we relied on .put_super = binderfs_put_super() to destroy info we stashed in sb->s_fs_info. This gave us the required ordering between ->evict_inode() and sb->s_fs_info destruction. But the current implementation of binderfs_fill_super() has a memory leak in the rare circumstance that d_make_root() fails because ->put_super() is only called when sb->s_root is initialized. Fix this by removing ->put_super() and simply do all that work in binderfs_kill_super(). Reported-by: Dongliang Mu Signed-off-by: Al Viro Signed-off-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220823095339.853371-1-brauner@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/android/binderfs.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 44939ea1874f..09b2ce7e4c34 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -339,22 +339,10 @@ static int binderfs_show_options(struct seq_file *seq, struct dentry *root) return 0; } -static void binderfs_put_super(struct super_block *sb) -{ - struct binderfs_info *info = sb->s_fs_info; - - if (info && info->ipc_ns) - put_ipc_ns(info->ipc_ns); - - kfree(info); - sb->s_fs_info = NULL; -} - static const struct super_operations binderfs_super_ops = { .evict_inode = binderfs_evict_inode, .show_options = binderfs_show_options, .statfs = simple_statfs, - .put_super = binderfs_put_super, }; static inline bool is_binderfs_control_device(const struct dentry *dentry) @@ -784,11 +772,27 @@ static int binderfs_init_fs_context(struct fs_context *fc) return 0; } +static void binderfs_kill_super(struct super_block *sb) +{ + struct binderfs_info *info = sb->s_fs_info; + + /* + * During inode eviction struct binderfs_info is needed. + * So first wipe the super_block then free struct binderfs_info. + */ + kill_litter_super(sb); + + if (info && info->ipc_ns) + put_ipc_ns(info->ipc_ns); + + kfree(info); +} + static struct file_system_type binder_fs_type = { .name = "binder", .init_fs_context = binderfs_init_fs_context, .parameters = binderfs_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = binderfs_kill_super, .fs_flags = FS_USERNS_MOUNT, }; -- cgit v1.2.3 From e66b77e50522845d494a4a61ea2ad8f0d046a56f Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 6 Sep 2022 13:59:45 +0000 Subject: binder: rename alloc->vma_vm_mm to alloc->mm Rename ->vma_vm_mm to ->mm to reflect the fact that we no longer cache this reference from vma->vm_mm but from current->mm instead. No functional changes in this patch. Reviewed-by: Christian Brauner (Microsoft) Acked-by: Todd Kjos Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220906135948.3048225-2-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 34 +++++++++++++++++----------------- drivers/android/binder_alloc.h | 4 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9b1778c00610..749a4cd30a83 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -208,8 +208,8 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (need_mm && mmget_not_zero(alloc->vma_vm_mm)) - mm = alloc->vma_vm_mm; + if (need_mm && mmget_not_zero(alloc->mm)) + mm = alloc->mm; if (mm) { mmap_read_lock(mm); @@ -322,9 +322,9 @@ static inline void binder_alloc_set_vma(struct binder_alloc *alloc, */ if (vma) { vm_start = vma->vm_start; - mmap_assert_write_locked(alloc->vma_vm_mm); + mmap_assert_write_locked(alloc->mm); } else { - mmap_assert_locked(alloc->vma_vm_mm); + mmap_assert_locked(alloc->mm); } alloc->vma_addr = vm_start; @@ -336,7 +336,7 @@ static inline struct vm_area_struct *binder_alloc_get_vma( struct vm_area_struct *vma = NULL; if (alloc->vma_addr) - vma = vma_lookup(alloc->vma_vm_mm, alloc->vma_addr); + vma = vma_lookup(alloc->mm, alloc->vma_addr); return vma; } @@ -401,15 +401,15 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; - mmap_read_lock(alloc->vma_vm_mm); + mmap_read_lock(alloc->mm); if (!binder_alloc_get_vma(alloc)) { - mmap_read_unlock(alloc->vma_vm_mm); + mmap_read_unlock(alloc->mm); binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: binder_alloc_buf, no vma\n", alloc->pid); return ERR_PTR(-ESRCH); } - mmap_read_unlock(alloc->vma_vm_mm); + mmap_read_unlock(alloc->mm); data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); @@ -823,7 +823,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers = 0; mutex_lock(&alloc->mutex); BUG_ON(alloc->vma_addr && - vma_lookup(alloc->vma_vm_mm, alloc->vma_addr)); + vma_lookup(alloc->mm, alloc->vma_addr)); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -873,8 +873,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) kfree(alloc->pages); } mutex_unlock(&alloc->mutex); - if (alloc->vma_vm_mm) - mmdrop(alloc->vma_vm_mm); + if (alloc->mm) + mmdrop(alloc->mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -931,13 +931,13 @@ void binder_alloc_print_pages(struct seq_file *m, * read inconsistent state. */ - mmap_read_lock(alloc->vma_vm_mm); + mmap_read_lock(alloc->mm); if (binder_alloc_get_vma(alloc) == NULL) { - mmap_read_unlock(alloc->vma_vm_mm); + mmap_read_unlock(alloc->mm); goto uninitialized; } - mmap_read_unlock(alloc->vma_vm_mm); + mmap_read_unlock(alloc->mm); for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { page = &alloc->pages[i]; if (!page->page_ptr) @@ -1020,7 +1020,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - mm = alloc->vma_vm_mm; + mm = alloc->mm; if (!mmget_not_zero(mm)) goto err_mmget; if (!mmap_read_trylock(mm)) @@ -1089,8 +1089,8 @@ static struct shrinker binder_shrinker = { void binder_alloc_init(struct binder_alloc *alloc) { alloc->pid = current->group_leader->pid; - alloc->vma_vm_mm = current->mm; - mmgrab(alloc->vma_vm_mm); + alloc->mm = current->mm; + mmgrab(alloc->mm); mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); } diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index f61a12d5c1e7..ab3b027bcd29 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -78,7 +78,7 @@ struct binder_lru_page { * (invariant after mmap) * @tsk: tid for task that called init for this proc * (invariant after init) - * @vma_vm_mm: copy of vma->vm_mm (invariant after mmap) + * @mm: copy of task->mm (invariant after open) * @buffer: base of per-proc address space mapped via mmap * @buffers: list of all buffers for this proc * @free_buffers: rb tree of buffers available for allocation @@ -101,7 +101,7 @@ struct binder_lru_page { struct binder_alloc { struct mutex mutex; unsigned long vma_addr; - struct mm_struct *vma_vm_mm; + struct mm_struct *mm; void __user *buffer; struct list_head buffers; struct rb_root free_buffers; -- cgit v1.2.3 From d6d04d71daae9377129b13641f97ba2a961b2b26 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 6 Sep 2022 13:59:46 +0000 Subject: binder: remove binder_alloc_set_vma() The mmap_locked asserts here are not needed since this is only called back from the mmap stack in ->mmap() and ->close() which always acquire the lock first. Remove these asserts along with binder_alloc_set_vma() altogether since it's trivial enough to be consumed by callers. Cc: Liam R. Howlett Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220906135948.3048225-3-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 749a4cd30a83..1c39cfce32fa 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -309,27 +309,6 @@ err_no_vma: return vma ? -ENOMEM : -ESRCH; } - -static inline void binder_alloc_set_vma(struct binder_alloc *alloc, - struct vm_area_struct *vma) -{ - unsigned long vm_start = 0; - - /* - * Allow clearing the vma with holding just the read lock to allow - * munmapping downgrade of the write lock before freeing and closing the - * file using binder_alloc_vma_close(). - */ - if (vma) { - vm_start = vma->vm_start; - mmap_assert_write_locked(alloc->mm); - } else { - mmap_assert_locked(alloc->mm); - } - - alloc->vma_addr = vm_start; -} - static inline struct vm_area_struct *binder_alloc_get_vma( struct binder_alloc *alloc) { @@ -793,7 +772,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, buffer->free = 1; binder_insert_free_buffer(alloc, buffer); alloc->free_async_space = alloc->buffer_size / 2; - binder_alloc_set_vma(alloc, vma); + alloc->vma_addr = vma->vm_start; return 0; @@ -983,7 +962,7 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) */ void binder_alloc_vma_close(struct binder_alloc *alloc) { - binder_alloc_set_vma(alloc, NULL); + alloc->vma_addr = 0; } /** -- cgit v1.2.3 From 7b0dbd94076567170f89172e0f07583915010ac6 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 6 Sep 2022 13:59:47 +0000 Subject: binder: fix binder_alloc kernel-doc warnings Update the kernel-doc section of struct binder_alloc to fix the following warnings reported by ./scripts/kernel-doc: warning: Function parameter or member 'mutex' not described in 'binder_alloc' warning: Function parameter or member 'vma_addr' not described in 'binder_alloc' No functional changes in this patch. Reviewed-by: Christian Brauner (Microsoft) Acked-by: Todd Kjos Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220906135948.3048225-4-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index ab3b027bcd29..0f811ac4bcff 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -74,10 +74,9 @@ struct binder_lru_page { /** * struct binder_alloc - per-binder proc state for binder allocator - * @vma: vm_area_struct passed to mmap_handler + * @mutex: protects binder_alloc fields + * @vma_addr: vm_area_struct->vm_start passed to mmap_handler * (invariant after mmap) - * @tsk: tid for task that called init for this proc - * (invariant after init) * @mm: copy of task->mm (invariant after open) * @buffer: base of per-proc address space mapped via mmap * @buffers: list of all buffers for this proc -- cgit v1.2.3 From f5d39b020809146cc28e6e73369bf8065e0310aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:22 +0200 Subject: freezer,sched: Rewrite core freezer logic Rewrite the core freezer to behave better wrt thawing and be simpler in general. By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is ensured frozen tasks stay frozen until thawed and don't randomly wake up early, as is currently possible. As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up two PF_flags (yay!). Specifically; the current scheme works a little like: freezer_do_not_count(); schedule(); freezer_count(); And either the task is blocked, or it lands in try_to_freezer() through freezer_count(). Now, when it is blocked, the freezer considers it frozen and continues. However, on thawing, once pm_freezing is cleared, freezer_count() stops working, and any random/spurious wakeup will let a task run before its time. That is, thawing tries to thaw things in explicit order; kernel threads and workqueues before doing bringing SMP back before userspace etc.. However due to the above mentioned races it is entirely possible for userspace tasks to thaw (by accident) before SMP is back. This can be a fatal problem in asymmetric ISA architectures (eg ARMv9) where the userspace task requires a special CPU to run. As said; replace this with a special task state TASK_FROZEN and add the following state transitions: TASK_FREEZABLE -> TASK_FROZEN __TASK_STOPPED -> TASK_FROZEN __TASK_TRACED -> TASK_FROZEN The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL (IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state is already required to deal with spurious wakeups and the freezer causes one such when thawing the task (since the original state is lost). The special __TASK_{STOPPED,TRACED} states *can* be restored since their canonical state is in ->jobctl. With this, frozen tasks need an explicit TASK_FROZEN wakeup and are free of undue (early / spurious) wakeups. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org --- drivers/android/binder.c | 4 +- drivers/media/pci/pt3/pt3.c | 4 +- fs/cifs/inode.c | 4 +- fs/cifs/transport.c | 5 +- fs/coredump.c | 5 +- fs/nfs/file.c | 3 +- fs/nfs/inode.c | 12 +- fs/nfs/nfs3proc.c | 3 +- fs/nfs/nfs4proc.c | 14 +-- fs/nfs/nfs4state.c | 3 +- fs/nfs/pnfs.c | 4 +- fs/xfs/xfs_trans_ail.c | 8 +- include/linux/freezer.h | 245 ++--------------------------------------- include/linux/sched.h | 13 ++- include/linux/sunrpc/sched.h | 7 +- include/linux/wait.h | 12 +- kernel/cgroup/legacy_freezer.c | 23 ++-- kernel/exit.c | 4 +- kernel/fork.c | 5 +- kernel/freezer.c | 133 +++++++++++++++------- kernel/futex/waitwake.c | 8 +- kernel/hung_task.c | 4 +- kernel/power/main.c | 6 +- kernel/power/process.c | 10 +- kernel/ptrace.c | 2 +- kernel/sched/core.c | 2 +- kernel/signal.c | 14 +-- kernel/time/hrtimer.c | 4 +- kernel/umh.c | 20 ++-- mm/khugepaged.c | 4 +- net/sunrpc/sched.c | 12 +- net/unix/af_unix.c | 8 +- 32 files changed, 210 insertions(+), 395 deletions(-) (limited to 'drivers/android') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index c964d7c8c384..50197ef4b787 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4247,10 +4247,9 @@ static int binder_wait_for_work(struct binder_thread *thread, struct binder_proc *proc = thread->proc; int ret = 0; - freezer_do_not_count(); binder_inner_proc_lock(proc); for (;;) { - prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE); if (binder_has_work_ilocked(thread, do_proc_work)) break; if (do_proc_work) @@ -4267,7 +4266,6 @@ static int binder_wait_for_work(struct binder_thread *thread, } finish_wait(&thread->wait, &wait); binder_inner_proc_unlock(proc); - freezer_count(); return ret; } diff --git a/drivers/media/pci/pt3/pt3.c b/drivers/media/pci/pt3/pt3.c index 0d51bdf01f43..f6deac85962e 100644 --- a/drivers/media/pci/pt3/pt3.c +++ b/drivers/media/pci/pt3/pt3.c @@ -445,8 +445,8 @@ static int pt3_fetch_thread(void *data) pt3_proc_dma(adap); delay = ktime_set(0, PT3_FETCH_DELAY * NSEC_PER_MSEC); - set_current_state(TASK_UNINTERRUPTIBLE); - freezable_schedule_hrtimeout_range(&delay, + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); + schedule_hrtimeout_range(&delay, PT3_FETCH_DELAY_DELTA * NSEC_PER_MSEC, HRTIMER_MODE_REL); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index eeeaba3dec05..54e6bfc6a5a6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2326,7 +2326,7 @@ cifs_invalidate_mapping(struct inode *inode) static int cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; @@ -2344,7 +2344,7 @@ cifs_revalidate_mapping(struct inode *inode) return 0; rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (rc) return rc; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index de7aeced7e16..eda09fba1550 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -757,8 +757,9 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_freezekillable_unsafe(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED); + error = wait_event_state(server->response_q, + midQ->mid_state != MID_REQUEST_SUBMITTED, + (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; diff --git a/fs/coredump.c b/fs/coredump.c index 3d9054c49a11..9ce59f212a4a 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -402,9 +402,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) if (core_waiters > 0) { struct core_thread *ptr; - freezer_do_not_count(); - wait_for_completion(&core_state->startup); - freezer_count(); + wait_for_completion_state(&core_state->startup, + TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); /* * Wait for all the threads to become inactive, so that * all the thread context (extended register state, like diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d2bcd4834c0e..e8301a372738 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -570,7 +570,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) } wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); lock_page(page); mapping = page_file_mapping(page); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b4e46b0ffa2d..a6058f5787a5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -static int nfs_wait_killable(int mode) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } - -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) -{ - return nfs_wait_killable(mode); -} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** @@ -1331,7 +1326,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; spin_lock(&inode->i_lock); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1597eef40d54..2e7579626cf0 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3ed14a2a84a4..4553803538e5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -416,8 +416,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); - freezable_schedule_timeout_killable_unsafe( - nfs4_update_delay(timeout)); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) return 0; return -EINTR; @@ -427,7 +427,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); - freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) return 0; return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS; @@ -7406,7 +7407,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; - freezable_schedule_timeout_interruptible(timeout); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(timeout); timeout *= 2; timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); status = -ERESTARTSYS; @@ -7474,10 +7476,8 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) break; status = -ERESTARTSYS; - freezer_do_not_count(); - wait_woken(&waiter.wait, TASK_INTERRUPTIBLE, + wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, NFS4_LOCK_MAXTIMEOUT); - freezer_count(); } while (!signalled()); remove_wait_queue(q, &waiter.wait); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9bab3e9c702a..7e185f7eb260 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1314,7 +1314,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (res) goto out; if (clp->cl_cons_state < 0) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 41a9b6b58fb9..70706db317f3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1908,7 +1908,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) pnfs_layoutcommit_inode(lo->plh_inode, false); return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) @@ -3193,7 +3193,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (status) goto out; } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d3a97a028560..16fbf2a1144c 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -602,9 +602,9 @@ xfsaild( while (1) { if (tout && tout <= 20) - set_current_state(TASK_KILLABLE); + set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); /* * Check kthread_should_stop() after we set the task state to @@ -653,14 +653,14 @@ xfsaild( ailp->ail_target == ailp->ail_target_prev && list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); - freezable_schedule(); + schedule(); tout = 0; continue; } spin_unlock(&ailp->ail_lock); if (tout) - freezable_schedule_timeout(msecs_to_jiffies(tout)); + schedule_timeout(msecs_to_jiffies(tout)); __set_current_state(TASK_RUNNING); diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 0621c5f86c39..b303472255be 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -8,9 +8,11 @@ #include #include #include +#include #ifdef CONFIG_FREEZER -extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */ +DECLARE_STATIC_KEY_FALSE(freezer_active); + extern bool pm_freezing; /* PM freezing in effect */ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ @@ -22,10 +24,7 @@ extern unsigned int freeze_timeout_msecs; /* * Check if a process has been frozen */ -static inline bool frozen(struct task_struct *p) -{ - return p->flags & PF_FROZEN; -} +extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); @@ -34,9 +33,10 @@ extern bool freezing_slow_path(struct task_struct *p); */ static inline bool freezing(struct task_struct *p) { - if (likely(!atomic_read(&system_freezing_cnt))) - return false; - return freezing_slow_path(p); + if (static_branch_unlikely(&freezer_active)) + return freezing_slow_path(p); + + return false; } /* Takes and releases task alloc lock using task_lock() */ @@ -48,23 +48,14 @@ extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); -/* - * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION - * If try_to_freeze causes a lockdep warning it means the caller may deadlock - */ -static inline bool try_to_freeze_unsafe(void) +static inline bool try_to_freeze(void) { might_sleep(); if (likely(!freezing(current))) return false; - return __refrigerator(false); -} - -static inline bool try_to_freeze(void) -{ if (!(current->flags & PF_NOFREEZE)) debug_check_no_locks_held(); - return try_to_freeze_unsafe(); + return __refrigerator(false); } extern bool freeze_task(struct task_struct *p); @@ -79,195 +70,6 @@ static inline bool cgroup_freezing(struct task_struct *task) } #endif /* !CONFIG_CGROUP_FREEZER */ -/* - * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it - * calls wait_for_completion(&vfork) and reset right after it returns from this - * function. Next, the parent should call try_to_freeze() to freeze itself - * appropriately in case the child has exited before the freezing of tasks is - * complete. However, we don't want kernel threads to be frozen in unexpected - * places, so we allow them to block freeze_processes() instead or to set - * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the - * parent won't really block freeze_processes(), since ____call_usermodehelper() - * (the child) does a little before exec/exit and it can't be frozen before - * waking up the parent. - */ - - -/** - * freezer_do_not_count - tell freezer to ignore %current - * - * Tell freezers to ignore the current task when determining whether the - * target frozen state is reached. IOW, the current task will be - * considered frozen enough by freezers. - * - * The caller shouldn't do anything which isn't allowed for a frozen task - * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair - * wrap a scheduling operation and nothing much else. - */ -static inline void freezer_do_not_count(void) -{ - current->flags |= PF_FREEZER_SKIP; -} - -/** - * freezer_count - tell freezer to stop ignoring %current - * - * Undo freezer_do_not_count(). It tells freezers that %current should be - * considered again and tries to freeze if freezing condition is already in - * effect. - */ -static inline void freezer_count(void) -{ - current->flags &= ~PF_FREEZER_SKIP; - /* - * If freezing is in progress, the following paired with smp_mb() - * in freezer_should_skip() ensures that either we see %true - * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP. - */ - smp_mb(); - try_to_freeze(); -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline void freezer_count_unsafe(void) -{ - current->flags &= ~PF_FREEZER_SKIP; - smp_mb(); - try_to_freeze_unsafe(); -} - -/** - * freezer_should_skip - whether to skip a task when determining frozen - * state is reached - * @p: task in quesion - * - * This function is used by freezers after establishing %true freezing() to - * test whether a task should be skipped when determining the target frozen - * state is reached. IOW, if this function returns %true, @p is considered - * frozen enough. - */ -static inline bool freezer_should_skip(struct task_struct *p) -{ - /* - * The following smp_mb() paired with the one in freezer_count() - * ensures that either freezer_count() sees %true freezing() or we - * see cleared %PF_FREEZER_SKIP and return %false. This makes it - * impossible for a task to slip frozen state testing after - * clearing %PF_FREEZER_SKIP. - */ - smp_mb(); - return p->flags & PF_FREEZER_SKIP; -} - -/* - * These functions are intended to be used whenever you want allow a sleeping - * task to be frozen. Note that neither return any clear indication of - * whether a freeze event happened while in this function. - */ - -/* Like schedule(), but should not block the freezer. */ -static inline void freezable_schedule(void) -{ - freezer_do_not_count(); - schedule(); - freezer_count(); -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline void freezable_schedule_unsafe(void) -{ - freezer_do_not_count(); - schedule(); - freezer_count_unsafe(); -} - -/* - * Like schedule_timeout(), but should not block the freezer. Do not - * call this with locks held. - */ -static inline long freezable_schedule_timeout(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout(timeout); - freezer_count(); - return __retval; -} - -/* - * Like schedule_timeout_interruptible(), but should not block the freezer. Do not - * call this with locks held. - */ -static inline long freezable_schedule_timeout_interruptible(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_interruptible(timeout); - freezer_count(); - return __retval; -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout) -{ - long __retval; - - freezer_do_not_count(); - __retval = schedule_timeout_interruptible(timeout); - freezer_count_unsafe(); - return __retval; -} - -/* Like schedule_timeout_killable(), but should not block the freezer. */ -static inline long freezable_schedule_timeout_killable(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_killable(timeout); - freezer_count(); - return __retval; -} - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -static inline long freezable_schedule_timeout_killable_unsafe(long timeout) -{ - long __retval; - freezer_do_not_count(); - __retval = schedule_timeout_killable(timeout); - freezer_count_unsafe(); - return __retval; -} - -/* - * Like schedule_hrtimeout_range(), but should not block the freezer. Do not - * call this with locks held. - */ -static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - u64 delta, const enum hrtimer_mode mode) -{ - int __retval; - freezer_do_not_count(); - __retval = schedule_hrtimeout_range(expires, delta, mode); - freezer_count(); - return __retval; -} - -/* - * Freezer-friendly wrappers around wait_event_interruptible(), - * wait_event_killable() and wait_event_interruptible_timeout(), originally - * defined in - */ - -/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ -#define wait_event_freezekillable_unsafe(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_killable(wq, (condition)); \ - freezer_count_unsafe(); \ - __retval; \ -}) - #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } @@ -281,35 +83,8 @@ static inline void thaw_kernel_threads(void) {} static inline bool try_to_freeze(void) { return false; } -static inline void freezer_do_not_count(void) {} -static inline void freezer_count(void) {} -static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} -#define freezable_schedule() schedule() - -#define freezable_schedule_unsafe() schedule() - -#define freezable_schedule_timeout(timeout) schedule_timeout(timeout) - -#define freezable_schedule_timeout_interruptible(timeout) \ - schedule_timeout_interruptible(timeout) - -#define freezable_schedule_timeout_interruptible_unsafe(timeout) \ - schedule_timeout_interruptible(timeout) - -#define freezable_schedule_timeout_killable(timeout) \ - schedule_timeout_killable(timeout) - -#define freezable_schedule_timeout_killable_unsafe(timeout) \ - schedule_timeout_killable(timeout) - -#define freezable_schedule_hrtimeout_range(expires, delta, mode) \ - schedule_hrtimeout_range(expires, delta, mode) - -#define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - #endif /* !CONFIG_FREEZER */ #endif /* FREEZER_H_INCLUDED */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9fc23a2d9813..4c91efd6df82 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -97,12 +97,19 @@ struct task_group; #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 -/* RT specific auxilliary flag to mark RT lock waiters */ #define TASK_RTLOCK_WAIT 0x00001000 -#define TASK_STATE_MAX 0x00002000 +#define TASK_FREEZABLE 0x00002000 +#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) +#define TASK_FROZEN 0x00008000 +#define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) +/* + * DO NOT ADD ANY NEW USERS ! + */ +#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) + /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) @@ -1716,7 +1723,6 @@ extern struct pid *cad_pid; #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ @@ -1727,7 +1733,6 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index acc62647317c..baeca2f564dc 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -252,7 +252,7 @@ int rpc_malloc(struct rpc_task *); void rpc_free(struct rpc_task *); int rpciod_up(void); void rpciod_down(void); -int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); +int rpc_wait_for_completion_task(struct rpc_task *task); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct net; void rpc_show_tasks(struct net *); @@ -264,11 +264,6 @@ extern struct workqueue_struct *xprtiod_workqueue; void rpc_prepare_task(struct rpc_task *task); gfp_t rpc_task_gfp_mask(void); -static inline int rpc_wait_for_completion_task(struct rpc_task *task) -{ - return __rpc_wait_for_completion_task(task, NULL); -} - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) static inline const char * rpc_qname(const struct rpc_wait_queue *q) { diff --git a/include/linux/wait.h b/include/linux/wait.h index b926eb9f3e26..14ad8a0e9fac 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -361,8 +361,8 @@ do { \ } while (0) #define __wait_event_freezable(wq_head, condition) \ - ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ - freezable_schedule()) + ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ + 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true @@ -420,8 +420,8 @@ do { \ #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, timeout, \ - __ret = freezable_schedule_timeout(__ret)) + (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ + __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid @@ -642,8 +642,8 @@ do { \ #define __wait_event_freezable_exclusive(wq, condition) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ - freezable_schedule()) + ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ + schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 08236798d173..1b6b21851e9d 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); } mutex_unlock(&freezer_mutex); @@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); freezer->state = 0; @@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset) __thaw_task(task); } else { freeze_task(task); + /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; @@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } + if (freezing(task) && !frozen(task)) + goto out_iter_end; } freezer->state |= CGROUP_FROZEN; @@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { @@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); freezer->state &= ~CGROUP_FROZEN; + if (was_freezing) + static_branch_dec(&freezer_active); unfreeze_cgroup(freezer); } } diff --git a/kernel/exit.c b/kernel/exit.c index 84021b24f79e..ba683f8d292a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -374,10 +374,10 @@ static void coredump_task_exit(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf69..a1de2f5e305d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1420,13 +1420,12 @@ static void complete_vfork_done(struct task_struct *tsk) static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { + unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; int killed; - freezer_do_not_count(); cgroup_enter_frozen(); - killed = wait_for_completion_killable(vfork); + killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); - freezer_count(); if (killed) { task_lock(child); diff --git a/kernel/freezer.c b/kernel/freezer.c index 45ab36ffd0e7..4fad0e6fca64 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -13,10 +13,11 @@ #include /* total number of freezing conditions in effect */ -atomic_t system_freezing_cnt = ATOMIC_INIT(0); -EXPORT_SYMBOL(system_freezing_cnt); +DEFINE_STATIC_KEY_FALSE(freezer_active); +EXPORT_SYMBOL(freezer_active); -/* indicate whether PM freezing is in effect, protected by +/* + * indicate whether PM freezing is in effect, protected by * system_transition_mutex */ bool pm_freezing; @@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock); * freezing_slow_path - slow path for testing whether a task needs to be frozen * @p: task to be tested * - * This function is called by freezing() if system_freezing_cnt isn't zero + * This function is called by freezing() if freezer_active isn't zero * and tests whether @p needs to enter and stay in frozen state. Can be * called under any context. The freezers are responsible for ensuring the * target tasks see the updated state. @@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_struct *p) } EXPORT_SYMBOL(freezing_slow_path); +bool frozen(struct task_struct *p) +{ + return READ_ONCE(p->__state) & TASK_FROZEN; +} + /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ + unsigned int state = get_current_state(); bool was_frozen = false; - unsigned int save = get_current_state(); pr_debug("%s entered refrigerator\n", current->comm); + WARN_ON_ONCE(state && !(state & TASK_NORMAL)); + for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + bool freeze; + + set_current_state(TASK_FROZEN); spin_lock_irq(&freezer_lock); - current->flags |= PF_FROZEN; - if (!freezing(current) || - (check_kthr_stop && kthread_should_stop())) - current->flags &= ~PF_FROZEN; + freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); spin_unlock_irq(&freezer_lock); - if (!(current->flags & PF_FROZEN)) + if (!freeze) break; + was_frozen = true; schedule(); } + __set_current_state(TASK_RUNNING); pr_debug("%s left refrigerator\n", current->comm); - /* - * Restore saved task state before returning. The mb'd version - * needs to be used; otherwise, it might silently break - * synchronization which depends on ordered task state change. - */ - set_current_state(save); - return was_frozen; } EXPORT_SYMBOL(__refrigerator); @@ -101,6 +101,44 @@ static void fake_signal_wake_up(struct task_struct *p) } } +static int __set_task_frozen(struct task_struct *p, void *arg) +{ + unsigned int state = READ_ONCE(p->__state); + + if (p->on_rq) + return 0; + + if (p != current && task_curr(p)) + return 0; + + if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) + return 0; + + /* + * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they + * can suffer spurious wakeups. + */ + if (state & TASK_FREEZABLE) + WARN_ON_ONCE(!(state & TASK_NORMAL)); + +#ifdef CONFIG_LOCKDEP + /* + * It's dangerous to freeze with locks held; there be dragons there. + */ + if (!(state & __TASK_FREEZABLE_UNSAFE)) + WARN_ON_ONCE(debug_locks && p->lockdep_depth); +#endif + + WRITE_ONCE(p->__state, TASK_FROZEN); + return TASK_FROZEN; +} + +static bool __freeze_task(struct task_struct *p) +{ + /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ + return task_call_func(p, __set_task_frozen, NULL); +} + /** * freeze_task - send a freeze request to given task * @p: task to send the request to @@ -116,20 +154,8 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; - /* - * This check can race with freezer_do_not_count, but worst case that - * will result in an extra wakeup being sent to the task. It does not - * race with freezer_count(), the barriers in freezer_count() and - * freezer_should_skip() ensure that either freezer_count() sees - * freezing == true in try_to_freeze() and freezes, or - * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task - * normally. - */ - if (freezer_should_skip(p)) - return false; - spin_lock_irqsave(&freezer_lock, flags); - if (!freezing(p) || frozen(p)) { + if (!freezing(p) || frozen(p) || __freeze_task(p)) { spin_unlock_irqrestore(&freezer_lock, flags); return false; } @@ -137,19 +163,52 @@ bool freeze_task(struct task_struct *p) if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else - wake_up_state(p, TASK_INTERRUPTIBLE); + wake_up_state(p, TASK_NORMAL); spin_unlock_irqrestore(&freezer_lock, flags); return true; } +/* + * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical + * state in p->jobctl. If either of them got a wakeup that was missed because + * TASK_FROZEN, then their canonical state reflects that and the below will + * refuse to restore the special state and instead issue the wakeup. + */ +static int __set_task_special(struct task_struct *p, void *arg) +{ + unsigned int state = 0; + + if (p->jobctl & JOBCTL_TRACED) + state = TASK_TRACED; + + else if (p->jobctl & JOBCTL_STOPPED) + state = TASK_STOPPED; + + if (state) + WRITE_ONCE(p->__state, state); + + return state; +} + void __thaw_task(struct task_struct *p) { - unsigned long flags; + unsigned long flags, flags2; spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) - wake_up_process(p); + if (WARN_ON_ONCE(freezing(p))) + goto unlock; + + if (lock_task_sighand(p, &flags2)) { + /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ + bool ret = task_call_func(p, __set_task_special, NULL); + unlock_task_sighand(p, &flags2); + if (ret) + goto unlock; + } + + wake_up_state(p, TASK_FROZEN); +unlock: spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 4ce0923f1ce3..ba01b9408203 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -334,7 +334,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); /* Arm the timer */ @@ -352,7 +352,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } @@ -430,7 +430,7 @@ retry: return ret; } - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; @@ -504,7 +504,7 @@ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, return; } - freezable_schedule(); + schedule(); } /** diff --git a/kernel/hung_task.c b/kernel/hung_task.c index bb2354f73ded..f1321c03c32a 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * Ensure the task is not frozen. * Also, skip vfork and any other user process that freezer should skip. */ - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) - return; + if (unlikely(READ_ONCE(t->__state) & (TASK_FREEZABLE | TASK_FROZEN))) + return; /* * When a freshly created task is scheduled once, changes its state to diff --git a/kernel/power/main.c b/kernel/power/main.c index dae3d17919e6..31ec4a9b9d70 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -24,7 +24,7 @@ unsigned int lock_system_sleep(void) { unsigned int flags = current->flags; - current->flags |= PF_FREEZER_SKIP; + current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); return flags; } @@ -48,8 +48,8 @@ void unlock_system_sleep(unsigned int flags) * Which means, if we use try_to_freeze() here, it would make them * enter the refrigerator, thus causing hibernation to lockup. */ - if (!(flags & PF_FREEZER_SKIP)) - current->flags &= ~PF_FREEZER_SKIP; + if (!(flags & PF_NOFREEZE)) + current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); diff --git a/kernel/power/process.c b/kernel/power/process.c index 3068601e585a..ddd9988327fe 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -50,8 +50,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - if (!freezer_should_skip(p)) - todo++; + todo++; } read_unlock(&tasklist_lock); @@ -96,8 +95,7 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) - && freezing(p) && !frozen(p)) + if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); @@ -129,7 +127,7 @@ int freeze_processes(void) current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); pm_wakeup_clear(0); pr_info("Freezing user space processes ... "); @@ -190,7 +188,7 @@ void thaw_processes(void) trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1893d909e45c..54482193e1ed 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -269,7 +269,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) read_unlock(&tasklist_lock); if (!ret && !ignore_state && - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN))) ret = -ESRCH; return ret; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea26526b45bb..2b85d1b5fe0c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6428,7 +6428,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && !(prev_state & TASK_NOLOAD) && - !(prev->flags & PF_FROZEN); + !(prev_state & TASK_FROZEN); if (prev->sched_contributes_to_load) rq->nr_uninterruptible++; diff --git a/kernel/signal.c b/kernel/signal.c index 6f86fda5e432..c2e90d6ca0d1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2304,7 +2304,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, read_unlock(&tasklist_lock); cgroup_enter_frozen(); preempt_enable_no_resched(); - freezable_schedule(); + schedule(); cgroup_leave_frozen(true); /* @@ -2473,7 +2473,7 @@ static bool do_signal_stop(int signr) /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); - freezable_schedule(); + schedule(); return true; } else { /* @@ -2548,11 +2548,11 @@ static void do_freezer_trap(void) * immediately (if there is a non-fatal signal pending), and * put the task into sleep. */ - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); - freezable_schedule(); + schedule(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) @@ -3600,9 +3600,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - __set_current_state(TASK_INTERRUPTIBLE); - ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, - HRTIMER_MODE_REL); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, + HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23af5eca11b1..3ae661ab6260 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2037,11 +2037,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod struct restart_block *restart; do { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) - freezable_schedule(); + schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; diff --git a/kernel/umh.c b/kernel/umh.c index 8945eaf4c671..850631518665 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -404,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { + unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; @@ -437,25 +438,22 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; + if (wait & UMH_KILLABLE) + state |= TASK_KILLABLE; + if (wait & UMH_FREEZABLE) - freezer_do_not_count(); + state |= TASK_FREEZABLE; - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; + retval = wait_for_completion_state(&done, state); + if (!retval) + goto wait_done; + if (wait & UMH_KILLABLE) { /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; - /* fallthrough, umh_complete() was already called */ } - wait_for_completion(&done); - - if (wait & UMH_FREEZABLE) - freezer_count(); - wait_done: retval = sub_info->retval; out: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 01f71786d530..77162fb7e90b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -730,8 +730,8 @@ static void khugepaged_alloc_sleep(void) DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); - freezable_schedule_timeout_interruptible( - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 25b9221950ff..46cbf151a50b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; @@ -333,14 +333,12 @@ static int rpc_complete_task(struct rpc_task *task) * to enforce taking of the wq->lock and hence avoid races with * rpc_complete_task(). */ -int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action) +int rpc_wait_for_completion_task(struct rpc_task *task) { - if (action == NULL) - action = rpc_wait_bit_killable; return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, - action, TASK_KILLABLE); + rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } -EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); +EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task); /* * Make an RPC task runnable. @@ -964,7 +962,7 @@ static void __rpc_execute(struct rpc_task *task) trace_rpc_task_sync_sleep(task, task->tk_action); status = out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_QUEUED, rpc_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE); if (status < 0) { /* * When a sync task receives a signal, it exits with diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bf338b782fc4..dda9eb1ab41f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2543,13 +2543,14 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { + unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, state); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || @@ -2562,10 +2563,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); - if (freezable) - timeo = freezable_schedule_timeout(timeo); - else - timeo = schedule_timeout(timeo); + timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) -- cgit v1.2.3