Diffstat (limited to 'kernel/fork.c')
 -rw-r--r--   kernel/fork.c | 208
 1 file changed, 117 insertions(+), 91 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..3da0f08615a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,7 +93,7 @@ #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> -#include <linux/stackleak.h> +#include <linux/kstack_erase.h> #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> @@ -105,6 +105,7 @@ #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> #include <linux/tick.h> +#include <linux/unwind_deferred.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -188,33 +189,33 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +/* + * Allocated stacks are cached and later reused by new threads, so memcg + * accounting is performed by the code assigning/releasing stacks to tasks. + * We need a zeroed memory without __GFP_ACCOUNT. + */ +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } return false; @@ -223,11 +224,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -240,32 +242,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -273,55 +275,47 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - 
memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (!vm_area) continue; + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); + return -ENOMEM; + } + /* Reset stack metadata. */ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); - return -ENOMEM; - } - - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } - /* - * Allocated stacks are cached and later reused by new threads, - * so memcg accounting is performed manually on assigning/releasing - * stacks to tasks. Drop __GFP_ACCOUNT. - */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, - THREADINFO_GFP & ~__GFP_ACCOUNT, + GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -330,7 +324,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -345,7 +339,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. 
+ */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -377,8 +377,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -417,7 +416,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -437,11 +437,11 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -457,12 +457,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } @@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (unlikely(x)) - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", - mm, resident_page_types[i], x); + if (unlikely(x)) { + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", + mm, resident_page_types[i], x, + current->comm, + task_pid_nr(current)); + } } if (mm_pgtables_bytes(mm)) @@ -732,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + unwind_task_free(tsk); sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); @@ -1010,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } @@ -1046,21 +1051,26 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); - futex_mm_init(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); + mm_flags_clear_all(mm); if (current->mm) { - mm->flags = mmf_init_flags(current->mm->flags); + unsigned long flags = __mm_flags_get_word(current->mm); + + __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - mm->flags = default_dump_filter; + __mm_flags_set_word(mm, default_dump_filter); mm->def_flags = 0; } + if (futex_mm_init(mm)) + goto fail_mm_init; + if (mm_alloc_pgd(mm)) goto fail_nopgd; @@ -1090,6 +1100,8 @@ fail_nocontext: fail_noid: mm_free_pgd(mm); fail_nopgd: + futex_hash_free(mm); +fail_mm_init: free_mm(mm); return NULL; } @@ -1145,7 +1157,7 @@ void 
mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, @@ -1499,7 +1511,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1537,19 +1549,19 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return -EAGAIN; } fs->users++; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return 0; } tsk->fs = copy_fs_struct(fs); @@ -1558,7 +1570,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1588,7 +1600,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, return 0; } -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) +static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -1637,7 +1649,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; @@ -1680,6 +1692,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->cgroup_threadgroup_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1876,7 +1892,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); - set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + mm_flags_set(MMF_MULTIPROCESS, tsk->mm); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1886,10 +1902,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { - int i; - - for (i = 0; i < RV_PER_TASK_MONITORS; i++) - p->rv[i].da_mon.monitoring = false; + memset(&p->rv, 0, sizeof(p->rv)); } #else #define rv_task_fork(p) do {} while (0) @@ -2119,13 +2132,10 @@ __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; -#ifdef CONFIG_LOCKDEP lockdep_init_task(p); -#endif -#ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ -#endif + #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -2135,6 +2145,8 @@ __latent_entropy struct task_struct *copy_process( p->bpf_ctx = 
NULL; #endif + unwind_task_init(p); + /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); if (retval) @@ -2289,7 +2301,7 @@ __latent_entropy struct task_struct *copy_process( if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) - goto bad_fork_core_free; + goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. We assume that another thread will be created @@ -2533,11 +2545,9 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| - CLONE_IO; + CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, @@ -2649,9 +2659,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, @@ -2667,9 +2676,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, }; @@ -2743,7 +2751,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif -noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, +static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { @@ -3149,13 +3157,13 @@ int ksys_unshare(unsigned long unshare_flags) if (new_fs) { fs = current->fs; - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } if (new_fd) @@ -3216,7 +3224,7 @@ int unshare_files(void) return 0; } -int sysctl_max_threads(const struct ctl_table *table, int write, +static int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -3238,3 +3246,21 @@ int sysctl_max_threads(const struct ctl_table *table, int write, return 0; } + +static const struct ctl_table fork_sysctl_table[] = { + { + .procname = "threads-max", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_max_threads, + }, +}; + +static int __init init_fork_sysctl(void) +{ + register_sysctl_init("kernel", fork_sysctl_table); + return 0; +} + +subsys_initcall(init_fork_sysctl); |
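Note on the stack-cache hunks near the top of this diff: the local vm/s variables become vm_area, the memcg-accounting comment moves next to a new GFP_VMAP_STACK mask (zeroed pages, no __GFP_ACCOUNT), and the CONFIG_VMAP_STACK code loses its outer THREAD_SIZE >= PAGE_SIZE guard. The caching scheme itself is untouched: freed stacks are parked in a small per-CPU array and handed to new tasks later. A minimal kernel-style sketch of that push/pop pattern, with placeholder demo_* names that are not part of the patch:

#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/vmalloc.h>

#define DEMO_NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, demo_cached_stacks[DEMO_NR_CACHED_STACKS]);

/* Park an unused stack in this CPU's cache instead of vfree()ing it. */
static bool demo_cache_stack(struct vm_struct *vm_area)
{
	unsigned int i;

	for (i = 0; i < DEMO_NR_CACHED_STACKS; i++) {
		struct vm_struct *expected = NULL;

		/* Only fill an empty slot; fails if another stack already sits there. */
		if (this_cpu_try_cmpxchg(demo_cached_stacks[i], &expected, vm_area))
			return true;
	}
	return false;	/* cache full: caller falls back to vfree(vm_area->addr) */
}

/* Take a cached stack for a new task, or NULL if this CPU's cache is empty. */
static struct vm_struct *demo_pop_cached_stack(void)
{
	unsigned int i;

	for (i = 0; i < DEMO_NR_CACHED_STACKS; i++) {
		struct vm_struct *vm_area = this_cpu_xchg(demo_cached_stacks[i], NULL);

		if (vm_area)
			return vm_area;
	}
	return NULL;
}

One small cleanup worth noting in the same region: thread_stack_free_rcu() now frees through vm_area->addr rather than the vm_stack pointer, spelling out that the vmalloc base address is what gets released when the cache is full.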
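In the alloc_thread_stack_node() hunk, the memcg charge is attempted before a cached stack is unpoisoned and wiped, so a failed charge simply frees the stack without touching its contents, and the vmalloc fallback now uses the GFP_VMAP_STACK mask directly. A condensed sketch of the resulting reuse path, assuming CONFIG_VMAP_STACK (hand-written, not lifted verbatim from the patch; memcg_charge_kernel_stack() is the static helper defined earlier in the patched file):

#include <linux/errno.h>
#include <linux/kasan.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Defined earlier in kernel/fork.c by this patch. */
static int memcg_charge_kernel_stack(struct vm_struct *vm_area);

/* Reuse one previously cached stack for @tsk: charge first, then scrub. */
static int demo_reuse_cached_stack(struct task_struct *tsk,
				   struct vm_struct *vm_area)
{
	void *stack;

	/* Charge up front: on failure the stack is freed, never handed out. */
	if (memcg_charge_kernel_stack(vm_area)) {
		vfree(vm_area->addr);
		return -ENOMEM;
	}

	/* Reset KASAN metadata and clear stale pointers left by the old owner. */
	kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
	stack = kasan_reset_tag(vm_area->addr);
	memset(stack, 0, THREAD_SIZE);

	tsk->stack_vm_area = vm_area;
	tsk->stack = stack;
	return 0;
}

The surrounding #ifdef restructuring is mechanical: CONFIG_VMAP_STACK gets its own top-level block, and the THREAD_SIZE >= PAGE_SIZE versus kmem_cache choice now only applies on the !CONFIG_VMAP_STACK side.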
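The mm_init() hunks make futex_mm_init() fallible: its failure jumps to a new fail_mm_init label, and the existing fail_nopgd path gains a futex_hash_free() call so the futex hash is also released when a later step fails. This is the usual reverse-order goto ladder; a toy sketch of the shape, using placeholder demo_* names rather than kernel interfaces:

#include <linux/errno.h>
#include <linux/types.h>

/*
 * Illustrative only: labels run in reverse order of the setup steps, so a
 * failure at any point unwinds exactly what had already succeeded.
 */
struct demo_ctx {
	bool hash_ready;
	bool pgd_ready;
};

static int  demo_hash_init(struct demo_ctx *c) { c->hash_ready = true; return 0; }
static void demo_hash_free(struct demo_ctx *c) { c->hash_ready = false; }
static int  demo_pgd_alloc(struct demo_ctx *c) { c->pgd_ready = true; return 0; }

static int demo_init(struct demo_ctx *c)
{
	if (demo_hash_init(c))		/* plays the role of futex_mm_init() */
		goto fail_hash;
	if (demo_pgd_alloc(c))		/* plays the role of mm_alloc_pgd() */
		goto fail_pgd;
	return 0;

fail_pgd:
	demo_hash_free(c);		/* plays the role of futex_hash_free() */
fail_hash:
	return -ENOMEM;
}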
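copy_mm(), copy_fs(), copy_files(), copy_sighand() and copy_signal() now take clone_flags as u64, matching the 64-bit flag word in struct kernel_clone_args, and the io-thread/kernel-thread helpers stop squeezing their flags through lower_32_bits(). This matters because clone3 defines flag bits above bit 31 (CLONE_CLEAR_SIGHAND, CLONE_INTO_CGROUP) that a 32-bit truncation silently drops; a tiny userspace arithmetic check (not kernel code) shows the loss:

#include <stdint.h>
#include <stdio.h>

/* clone flag values as in include/uapi/linux/sched.h. */
#define DEMO_CLONE_THREAD      0x00010000ULL
#define DEMO_CLONE_INTO_CGROUP 0x200000000ULL	/* needs more than 32 bits */

int main(void)
{
	uint64_t flags = DEMO_CLONE_THREAD | DEMO_CLONE_INTO_CGROUP;
	uint32_t lower = (uint32_t)flags;	/* what lower_32_bits() would keep */

	printf("full flags      : %#llx\n", (unsigned long long)flags);
	printf("lower 32 bits   : %#x\n", (unsigned)lower);
	printf("silently dropped: %#llx\n",
	       (unsigned long long)(flags & ~0xffffffffULL));
	return 0;
}

The copy_fs()/ksys_unshare() hunks in the same region also swap spin_lock(&fs->lock) for read_seqlock_excl(&fs->seq), the exclusive, non-sequence-bumping side of a seqlock, which presumes the matching fs_struct change made outside this file.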
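At the bottom of the diff, sysctl_max_threads() becomes static and fork.c registers its own one-entry fork_sysctl_table under "kernel" via register_sysctl_init() at subsys_initcall time, instead of relying on the central sysctl table. The userspace-visible path is unchanged; a trivial, purely illustrative reader:

#include <stdio.h>

/* Reads the same knob the patch now registers from fork.c itself. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/threads-max", "r");
	int max_threads;

	if (!f) {
		perror("/proc/sys/kernel/threads-max");
		return 1;
	}
	if (fscanf(f, "%d", &max_threads) == 1)
		printf("threads-max = %d\n", max_threads);
	fclose(f);
	return 0;
}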
